{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9836888331242157,
"eval_steps": 100,
"global_step": 796,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 0.1427878886461258,
"learning_rate": 2e-05,
"loss": 1.3359,
"step": 1
},
{
"epoch": 0.0,
"eval_loss": 1.324710488319397,
"eval_runtime": 82.0652,
"eval_samples_per_second": 31.67,
"eval_steps_per_second": 31.67,
"step": 1
},
{
"epoch": 0.01,
"grad_norm": 0.13537771999835968,
"learning_rate": 4e-05,
"loss": 1.2865,
"step": 2
},
{
"epoch": 0.01,
"grad_norm": 0.14623422920703888,
"learning_rate": 6e-05,
"loss": 1.3192,
"step": 3
},
{
"epoch": 0.01,
"grad_norm": 0.15388095378875732,
"learning_rate": 8e-05,
"loss": 1.3244,
"step": 4
},
{
"epoch": 0.01,
"grad_norm": 0.1628686636686325,
"learning_rate": 0.0001,
"loss": 1.3,
"step": 5
},
{
"epoch": 0.02,
"grad_norm": 0.20623335242271423,
"learning_rate": 0.00012,
"loss": 1.244,
"step": 6
},
{
"epoch": 0.02,
"grad_norm": 0.1510678231716156,
"learning_rate": 0.00014,
"loss": 1.2799,
"step": 7
},
{
"epoch": 0.02,
"grad_norm": 0.15237094461917877,
"learning_rate": 0.00016,
"loss": 1.2979,
"step": 8
},
{
"epoch": 0.02,
"grad_norm": 0.15166334807872772,
"learning_rate": 0.00018,
"loss": 1.28,
"step": 9
},
{
"epoch": 0.03,
"grad_norm": 0.17794868350028992,
"learning_rate": 0.0002,
"loss": 1.2298,
"step": 10
},
{
"epoch": 0.03,
"grad_norm": 0.25811436772346497,
"learning_rate": 0.0001999998028228211,
"loss": 1.1909,
"step": 11
},
{
"epoch": 0.03,
"grad_norm": 0.19142530858516693,
"learning_rate": 0.000199999211292062,
"loss": 1.178,
"step": 12
},
{
"epoch": 0.03,
"grad_norm": 0.1891462802886963,
"learning_rate": 0.00019999822541005537,
"loss": 1.1173,
"step": 13
},
{
"epoch": 0.04,
"grad_norm": 0.17077742516994476,
"learning_rate": 0.00019999684518068916,
"loss": 1.2092,
"step": 14
},
{
"epoch": 0.04,
"grad_norm": 0.15135815739631653,
"learning_rate": 0.00019999507060940625,
"loss": 1.1439,
"step": 15
},
{
"epoch": 0.04,
"grad_norm": 0.1767009049654007,
"learning_rate": 0.00019999290170320485,
"loss": 1.1408,
"step": 16
},
{
"epoch": 0.04,
"grad_norm": 0.1310850977897644,
"learning_rate": 0.00019999033847063811,
"loss": 1.2369,
"step": 17
},
{
"epoch": 0.05,
"grad_norm": 0.12432192265987396,
"learning_rate": 0.00019998738092181421,
"loss": 1.152,
"step": 18
},
{
"epoch": 0.05,
"grad_norm": 0.12430022656917572,
"learning_rate": 0.00019998402906839643,
"loss": 1.2111,
"step": 19
},
{
"epoch": 0.05,
"grad_norm": 0.12175025045871735,
"learning_rate": 0.00019998028292360286,
"loss": 1.1686,
"step": 20
},
{
"epoch": 0.05,
"grad_norm": 0.11878372728824615,
"learning_rate": 0.0001999761425022067,
"loss": 1.2452,
"step": 21
},
{
"epoch": 0.06,
"grad_norm": 0.11329779773950577,
"learning_rate": 0.00019997160782053578,
"loss": 1.0964,
"step": 22
},
{
"epoch": 0.06,
"grad_norm": 0.11987729370594025,
"learning_rate": 0.00019996667889647288,
"loss": 1.1809,
"step": 23
},
{
"epoch": 0.06,
"grad_norm": 0.12245086580514908,
"learning_rate": 0.00019996135574945544,
"loss": 1.1138,
"step": 24
},
{
"epoch": 0.06,
"grad_norm": 0.1399640142917633,
"learning_rate": 0.00019995563840047542,
"loss": 1.184,
"step": 25
},
{
"epoch": 0.07,
"grad_norm": 0.13597123324871063,
"learning_rate": 0.00019994952687207954,
"loss": 1.1872,
"step": 26
},
{
"epoch": 0.07,
"grad_norm": 0.13976556062698364,
"learning_rate": 0.00019994302118836883,
"loss": 1.1685,
"step": 27
},
{
"epoch": 0.07,
"grad_norm": 0.13106240332126617,
"learning_rate": 0.00019993612137499876,
"loss": 1.1872,
"step": 28
},
{
"epoch": 0.07,
"grad_norm": 0.12896399199962616,
"learning_rate": 0.00019992882745917902,
"loss": 1.1462,
"step": 29
},
{
"epoch": 0.08,
"grad_norm": 0.13873620331287384,
"learning_rate": 0.00019992113946967353,
"loss": 1.1742,
"step": 30
},
{
"epoch": 0.08,
"grad_norm": 0.14103546738624573,
"learning_rate": 0.00019991305743680013,
"loss": 1.1245,
"step": 31
},
{
"epoch": 0.08,
"grad_norm": 0.1377720981836319,
"learning_rate": 0.00019990458139243077,
"loss": 1.2045,
"step": 32
},
{
"epoch": 0.08,
"grad_norm": 0.13191157579421997,
"learning_rate": 0.000199895711369991,
"loss": 1.1716,
"step": 33
},
{
"epoch": 0.09,
"grad_norm": 0.13426551222801208,
"learning_rate": 0.00019988644740446022,
"loss": 1.1382,
"step": 34
},
{
"epoch": 0.09,
"grad_norm": 0.13733097910881042,
"learning_rate": 0.00019987678953237127,
"loss": 1.1677,
"step": 35
},
{
"epoch": 0.09,
"grad_norm": 0.12618272006511688,
"learning_rate": 0.00019986673779181033,
"loss": 1.2195,
"step": 36
},
{
"epoch": 0.09,
"grad_norm": 0.13636991381645203,
"learning_rate": 0.00019985629222241694,
"loss": 1.1577,
"step": 37
},
{
"epoch": 0.1,
"grad_norm": 0.13234035670757294,
"learning_rate": 0.0001998454528653836,
"loss": 1.1089,
"step": 38
},
{
"epoch": 0.1,
"grad_norm": 0.1395445317029953,
"learning_rate": 0.00019983421976345586,
"loss": 1.139,
"step": 39
},
{
"epoch": 0.1,
"grad_norm": 0.1284484714269638,
"learning_rate": 0.0001998225929609319,
"loss": 1.117,
"step": 40
},
{
"epoch": 0.1,
"grad_norm": 0.13304275274276733,
"learning_rate": 0.00019981057250366253,
"loss": 1.161,
"step": 41
},
{
"epoch": 0.11,
"grad_norm": 0.13184913992881775,
"learning_rate": 0.00019979815843905097,
"loss": 1.1826,
"step": 42
},
{
"epoch": 0.11,
"grad_norm": 0.12830235064029694,
"learning_rate": 0.0001997853508160526,
"loss": 1.0739,
"step": 43
},
{
"epoch": 0.11,
"grad_norm": 0.1346379965543747,
"learning_rate": 0.0001997721496851748,
"loss": 1.191,
"step": 44
},
{
"epoch": 0.11,
"grad_norm": 0.13036642968654633,
"learning_rate": 0.00019975855509847686,
"loss": 1.1361,
"step": 45
},
{
"epoch": 0.12,
"grad_norm": 0.12707848846912384,
"learning_rate": 0.00019974456710956964,
"loss": 1.101,
"step": 46
},
{
"epoch": 0.12,
"grad_norm": 0.12984970211982727,
"learning_rate": 0.00019973018577361536,
"loss": 1.1085,
"step": 47
},
{
"epoch": 0.12,
"grad_norm": 0.12627972662448883,
"learning_rate": 0.00019971541114732741,
"loss": 1.1607,
"step": 48
},
{
"epoch": 0.12,
"grad_norm": 0.13074152171611786,
"learning_rate": 0.00019970024328897022,
"loss": 1.1004,
"step": 49
},
{
"epoch": 0.13,
"grad_norm": 0.1309152990579605,
"learning_rate": 0.0001996846822583589,
"loss": 1.1378,
"step": 50
},
{
"epoch": 0.13,
"grad_norm": 0.1303664743900299,
"learning_rate": 0.000199668728116859,
"loss": 1.0956,
"step": 51
},
{
"epoch": 0.13,
"grad_norm": 0.13290388882160187,
"learning_rate": 0.00019965238092738643,
"loss": 1.1264,
"step": 52
},
{
"epoch": 0.13,
"grad_norm": 0.12805409729480743,
"learning_rate": 0.00019963564075440703,
"loss": 1.183,
"step": 53
},
{
"epoch": 0.14,
"grad_norm": 0.1399564892053604,
"learning_rate": 0.0001996185076639364,
"loss": 1.1102,
"step": 54
},
{
"epoch": 0.14,
"grad_norm": 0.12978173792362213,
"learning_rate": 0.00019960098172353962,
"loss": 1.1634,
"step": 55
},
{
"epoch": 0.14,
"grad_norm": 0.13925811648368835,
"learning_rate": 0.00019958306300233098,
"loss": 1.0636,
"step": 56
},
{
"epoch": 0.14,
"grad_norm": 0.13258852064609528,
"learning_rate": 0.00019956475157097378,
"loss": 1.1428,
"step": 57
},
{
"epoch": 0.15,
"grad_norm": 0.1285356879234314,
"learning_rate": 0.00019954604750167993,
"loss": 1.1664,
"step": 58
},
{
"epoch": 0.15,
"grad_norm": 0.1321210116147995,
"learning_rate": 0.00019952695086820975,
"loss": 1.1419,
"step": 59
},
{
"epoch": 0.15,
"grad_norm": 0.14086973667144775,
"learning_rate": 0.00019950746174587163,
"loss": 1.1827,
"step": 60
},
{
"epoch": 0.15,
"grad_norm": 0.1311366856098175,
"learning_rate": 0.0001994875802115218,
"loss": 1.1971,
"step": 61
},
{
"epoch": 0.16,
"grad_norm": 0.14063993096351624,
"learning_rate": 0.0001994673063435639,
"loss": 1.1945,
"step": 62
},
{
"epoch": 0.16,
"grad_norm": 0.12695981562137604,
"learning_rate": 0.00019944664022194885,
"loss": 1.0385,
"step": 63
},
{
"epoch": 0.16,
"grad_norm": 0.14170674979686737,
"learning_rate": 0.0001994255819281744,
"loss": 1.0883,
"step": 64
},
{
"epoch": 0.16,
"grad_norm": 0.13162197172641754,
"learning_rate": 0.0001994041315452849,
"loss": 1.153,
"step": 65
},
{
"epoch": 0.17,
"grad_norm": 0.1326906979084015,
"learning_rate": 0.0001993822891578708,
"loss": 1.1186,
"step": 66
},
{
"epoch": 0.17,
"grad_norm": 0.13306689262390137,
"learning_rate": 0.00019936005485206851,
"loss": 1.1587,
"step": 67
},
{
"epoch": 0.17,
"grad_norm": 0.13625258207321167,
"learning_rate": 0.00019933742871556,
"loss": 1.1339,
"step": 68
},
{
"epoch": 0.17,
"grad_norm": 0.13773800432682037,
"learning_rate": 0.00019931441083757245,
"loss": 1.1944,
"step": 69
},
{
"epoch": 0.18,
"grad_norm": 0.15291447937488556,
"learning_rate": 0.00019929100130887782,
"loss": 1.1028,
"step": 70
},
{
"epoch": 0.18,
"grad_norm": 0.15140767395496368,
"learning_rate": 0.0001992672002217926,
"loss": 1.1896,
"step": 71
},
{
"epoch": 0.18,
"grad_norm": 0.1344233751296997,
"learning_rate": 0.0001992430076701775,
"loss": 1.0561,
"step": 72
},
{
"epoch": 0.18,
"grad_norm": 0.13877920806407928,
"learning_rate": 0.0001992184237494368,
"loss": 1.1108,
"step": 73
},
{
"epoch": 0.19,
"grad_norm": 0.1359027922153473,
"learning_rate": 0.00019919344855651833,
"loss": 1.1563,
"step": 74
},
{
"epoch": 0.19,
"grad_norm": 0.14610135555267334,
"learning_rate": 0.0001991680821899128,
"loss": 1.1299,
"step": 75
},
{
"epoch": 0.19,
"grad_norm": 0.14259958267211914,
"learning_rate": 0.00019914232474965365,
"loss": 1.1021,
"step": 76
},
{
"epoch": 0.19,
"grad_norm": 0.14158602058887482,
"learning_rate": 0.00019911617633731638,
"loss": 1.0787,
"step": 77
},
{
"epoch": 0.2,
"grad_norm": 0.1418074518442154,
"learning_rate": 0.00019908963705601846,
"loss": 1.1359,
"step": 78
},
{
"epoch": 0.2,
"grad_norm": 0.12850767374038696,
"learning_rate": 0.0001990627070104187,
"loss": 1.1373,
"step": 79
},
{
"epoch": 0.2,
"grad_norm": 0.1312914341688156,
"learning_rate": 0.0001990353863067169,
"loss": 1.0832,
"step": 80
},
{
"epoch": 0.2,
"grad_norm": 0.13280583918094635,
"learning_rate": 0.0001990076750526534,
"loss": 1.0462,
"step": 81
},
{
"epoch": 0.21,
"grad_norm": 0.13617292046546936,
"learning_rate": 0.00019897957335750878,
"loss": 1.1059,
"step": 82
},
{
"epoch": 0.21,
"grad_norm": 0.15030132234096527,
"learning_rate": 0.00019895108133210335,
"loss": 1.0761,
"step": 83
},
{
"epoch": 0.21,
"grad_norm": 0.14291270077228546,
"learning_rate": 0.00019892219908879653,
"loss": 1.1217,
"step": 84
},
{
"epoch": 0.21,
"grad_norm": 0.1685461699962616,
"learning_rate": 0.00019889292674148682,
"loss": 1.1607,
"step": 85
},
{
"epoch": 0.22,
"grad_norm": 0.13756121695041656,
"learning_rate": 0.00019886326440561093,
"loss": 1.0914,
"step": 86
},
{
"epoch": 0.22,
"grad_norm": 0.13901358842849731,
"learning_rate": 0.0001988332121981436,
"loss": 1.1234,
"step": 87
},
{
"epoch": 0.22,
"grad_norm": 0.13816247880458832,
"learning_rate": 0.00019880277023759702,
"loss": 1.1583,
"step": 88
},
{
"epoch": 0.22,
"grad_norm": 0.13309679925441742,
"learning_rate": 0.00019877193864402038,
"loss": 1.163,
"step": 89
},
{
"epoch": 0.23,
"grad_norm": 0.13356180489063263,
"learning_rate": 0.0001987407175389994,
"loss": 1.1301,
"step": 90
},
{
"epoch": 0.23,
"grad_norm": 0.1388397067785263,
"learning_rate": 0.00019870910704565588,
"loss": 1.1326,
"step": 91
},
{
"epoch": 0.23,
"grad_norm": 0.13303454220294952,
"learning_rate": 0.0001986771072886472,
"loss": 1.0779,
"step": 92
},
{
"epoch": 0.23,
"grad_norm": 0.1316283941268921,
"learning_rate": 0.00019864471839416576,
"loss": 1.0935,
"step": 93
},
{
"epoch": 0.24,
"grad_norm": 0.1348309963941574,
"learning_rate": 0.00019861194048993863,
"loss": 1.1816,
"step": 94
},
{
"epoch": 0.24,
"grad_norm": 0.1341564655303955,
"learning_rate": 0.00019857877370522685,
"loss": 1.1187,
"step": 95
},
{
"epoch": 0.24,
"grad_norm": 0.13689687848091125,
"learning_rate": 0.0001985452181708251,
"loss": 1.1637,
"step": 96
},
{
"epoch": 0.24,
"grad_norm": 0.13348707556724548,
"learning_rate": 0.0001985112740190611,
"loss": 1.1026,
"step": 97
},
{
"epoch": 0.25,
"grad_norm": 0.13700643181800842,
"learning_rate": 0.00019847694138379506,
"loss": 1.1508,
"step": 98
},
{
"epoch": 0.25,
"grad_norm": 0.13654476404190063,
"learning_rate": 0.00019844222040041928,
"loss": 1.1668,
"step": 99
},
{
"epoch": 0.25,
"grad_norm": 0.15331624448299408,
"learning_rate": 0.0001984071112058574,
"loss": 1.1121,
"step": 100
},
{
"epoch": 0.25,
"eval_loss": 1.1294280290603638,
"eval_runtime": 81.6595,
"eval_samples_per_second": 31.827,
"eval_steps_per_second": 31.827,
"step": 100
},
{
"epoch": 0.25,
"grad_norm": 0.14425526559352875,
"learning_rate": 0.0001983716139385641,
"loss": 1.1447,
"step": 101
},
{
"epoch": 0.26,
"grad_norm": 0.13741208612918854,
"learning_rate": 0.00019833572873852444,
"loss": 1.1001,
"step": 102
},
{
"epoch": 0.26,
"grad_norm": 0.1282232254743576,
"learning_rate": 0.0001982994557472532,
"loss": 1.1199,
"step": 103
},
{
"epoch": 0.26,
"grad_norm": 0.13605354726314545,
"learning_rate": 0.00019826279510779454,
"loss": 1.154,
"step": 104
},
{
"epoch": 0.26,
"grad_norm": 0.13503985106945038,
"learning_rate": 0.00019822574696472126,
"loss": 1.0565,
"step": 105
},
{
"epoch": 0.27,
"grad_norm": 0.13878273963928223,
"learning_rate": 0.00019818831146413434,
"loss": 1.106,
"step": 106
},
{
"epoch": 0.27,
"grad_norm": 0.141740083694458,
"learning_rate": 0.00019815048875366234,
"loss": 1.0848,
"step": 107
},
{
"epoch": 0.27,
"grad_norm": 0.13799507915973663,
"learning_rate": 0.0001981122789824607,
"loss": 1.1582,
"step": 108
},
{
"epoch": 0.27,
"grad_norm": 0.1441466212272644,
"learning_rate": 0.0001980736823012114,
"loss": 1.0787,
"step": 109
},
{
"epoch": 0.28,
"grad_norm": 0.1377534121274948,
"learning_rate": 0.0001980346988621221,
"loss": 1.1092,
"step": 110
},
{
"epoch": 0.28,
"grad_norm": 0.1400901973247528,
"learning_rate": 0.00019799532881892564,
"loss": 1.0549,
"step": 111
},
{
"epoch": 0.28,
"grad_norm": 0.13621239364147186,
"learning_rate": 0.00019795557232687956,
"loss": 1.0991,
"step": 112
},
{
"epoch": 0.28,
"grad_norm": 0.1324262171983719,
"learning_rate": 0.0001979154295427653,
"loss": 1.0583,
"step": 113
},
{
"epoch": 0.29,
"grad_norm": 0.13273654878139496,
"learning_rate": 0.0001978749006248877,
"loss": 1.1504,
"step": 114
},
{
"epoch": 0.29,
"grad_norm": 0.14279481768608093,
"learning_rate": 0.00019783398573307428,
"loss": 1.0941,
"step": 115
},
{
"epoch": 0.29,
"grad_norm": 0.1432316154241562,
"learning_rate": 0.00019779268502867473,
"loss": 1.1111,
"step": 116
},
{
"epoch": 0.29,
"grad_norm": 0.14505276083946228,
"learning_rate": 0.00019775099867456013,
"loss": 1.0941,
"step": 117
},
{
"epoch": 0.3,
"grad_norm": 0.13935014605522156,
"learning_rate": 0.0001977089268351225,
"loss": 1.0597,
"step": 118
},
{
"epoch": 0.3,
"grad_norm": 0.14532430469989777,
"learning_rate": 0.0001976664696762739,
"loss": 1.1116,
"step": 119
},
{
"epoch": 0.3,
"grad_norm": 0.14096760749816895,
"learning_rate": 0.00019762362736544607,
"loss": 1.1381,
"step": 120
},
{
"epoch": 0.3,
"grad_norm": 0.1470746099948883,
"learning_rate": 0.00019758040007158948,
"loss": 1.1215,
"step": 121
},
{
"epoch": 0.31,
"grad_norm": 0.13610850274562836,
"learning_rate": 0.00019753678796517282,
"loss": 1.136,
"step": 122
},
{
"epoch": 0.31,
"grad_norm": 0.1399529129266739,
"learning_rate": 0.00019749279121818235,
"loss": 1.1035,
"step": 123
},
{
"epoch": 0.31,
"grad_norm": 0.13626012206077576,
"learning_rate": 0.00019744841000412123,
"loss": 1.1248,
"step": 124
},
{
"epoch": 0.31,
"grad_norm": 0.13053762912750244,
"learning_rate": 0.0001974036444980086,
"loss": 1.1286,
"step": 125
},
{
"epoch": 0.32,
"grad_norm": 0.14427675306797028,
"learning_rate": 0.00019735849487637929,
"loss": 1.2792,
"step": 126
},
{
"epoch": 0.32,
"grad_norm": 0.14464688301086426,
"learning_rate": 0.0001973129613172827,
"loss": 1.1091,
"step": 127
},
{
"epoch": 0.32,
"grad_norm": 0.12712322175502777,
"learning_rate": 0.0001972670440002825,
"loss": 1.1219,
"step": 128
},
{
"epoch": 0.32,
"grad_norm": 0.13343971967697144,
"learning_rate": 0.00019722074310645553,
"loss": 1.1296,
"step": 129
},
{
"epoch": 0.33,
"grad_norm": 0.15525247156620026,
"learning_rate": 0.00019717405881839145,
"loss": 1.159,
"step": 130
},
{
"epoch": 0.33,
"grad_norm": 0.12908953428268433,
"learning_rate": 0.0001971269913201918,
"loss": 1.0821,
"step": 131
},
{
"epoch": 0.33,
"grad_norm": 0.24165280163288116,
"learning_rate": 0.00019707954079746927,
"loss": 1.1388,
"step": 132
},
{
"epoch": 0.33,
"grad_norm": 0.1432817280292511,
"learning_rate": 0.00019703170743734706,
"loss": 1.1184,
"step": 133
},
{
"epoch": 0.34,
"grad_norm": 0.14007362723350525,
"learning_rate": 0.00019698349142845814,
"loss": 1.1576,
"step": 134
},
{
"epoch": 0.34,
"grad_norm": 0.14235983788967133,
"learning_rate": 0.00019693489296094443,
"loss": 1.0847,
"step": 135
},
{
"epoch": 0.34,
"grad_norm": 0.1430092453956604,
"learning_rate": 0.00019688591222645607,
"loss": 1.1562,
"step": 136
},
{
"epoch": 0.34,
"grad_norm": 0.13986627757549286,
"learning_rate": 0.00019683654941815077,
"loss": 1.124,
"step": 137
},
{
"epoch": 0.35,
"grad_norm": 0.13933469355106354,
"learning_rate": 0.00019678680473069293,
"loss": 1.1001,
"step": 138
},
{
"epoch": 0.35,
"grad_norm": 0.13476844131946564,
"learning_rate": 0.00019673667836025283,
"loss": 1.1186,
"step": 139
},
{
"epoch": 0.35,
"grad_norm": 0.13418316841125488,
"learning_rate": 0.00019668617050450603,
"loss": 1.1309,
"step": 140
},
{
"epoch": 0.35,
"grad_norm": 0.12794847786426544,
"learning_rate": 0.00019663528136263246,
"loss": 1.1142,
"step": 141
},
{
"epoch": 0.36,
"grad_norm": 0.1326293647289276,
"learning_rate": 0.00019658401113531565,
"loss": 1.0503,
"step": 142
},
{
"epoch": 0.36,
"grad_norm": 0.14793147146701813,
"learning_rate": 0.000196532360024742,
"loss": 1.2104,
"step": 143
},
{
"epoch": 0.36,
"grad_norm": 0.13718444108963013,
"learning_rate": 0.00019648032823459994,
"loss": 1.1685,
"step": 144
},
{
"epoch": 0.36,
"grad_norm": 0.14404018223285675,
"learning_rate": 0.00019642791597007902,
"loss": 1.09,
"step": 145
},
{
"epoch": 0.37,
"grad_norm": 0.14241506159305573,
"learning_rate": 0.00019637512343786937,
"loss": 1.1355,
"step": 146
},
{
"epoch": 0.37,
"grad_norm": 0.14581352472305298,
"learning_rate": 0.00019632195084616063,
"loss": 1.1005,
"step": 147
},
{
"epoch": 0.37,
"grad_norm": 0.14792676270008087,
"learning_rate": 0.00019626839840464119,
"loss": 1.1168,
"step": 148
},
{
"epoch": 0.37,
"grad_norm": 0.1484677940607071,
"learning_rate": 0.00019621446632449744,
"loss": 1.1138,
"step": 149
},
{
"epoch": 0.38,
"grad_norm": 0.15315671265125275,
"learning_rate": 0.0001961601548184129,
"loss": 1.1636,
"step": 150
},
{
"epoch": 0.38,
"grad_norm": 0.14746810495853424,
"learning_rate": 0.0001961054641005674,
"loss": 1.0881,
"step": 151
},
{
"epoch": 0.38,
"grad_norm": 0.1407732516527176,
"learning_rate": 0.00019605039438663614,
"loss": 1.0347,
"step": 152
},
{
"epoch": 0.38,
"grad_norm": 0.14150719344615936,
"learning_rate": 0.0001959949458937889,
"loss": 1.1112,
"step": 153
},
{
"epoch": 0.39,
"grad_norm": 0.16782569885253906,
"learning_rate": 0.0001959391188406893,
"loss": 1.0496,
"step": 154
},
{
"epoch": 0.39,
"grad_norm": 0.1452791690826416,
"learning_rate": 0.0001958829134474937,
"loss": 1.1185,
"step": 155
},
{
"epoch": 0.39,
"grad_norm": 0.145284965634346,
"learning_rate": 0.00019582632993585052,
"loss": 1.1431,
"step": 156
},
{
"epoch": 0.39,
"grad_norm": 0.15500612556934357,
"learning_rate": 0.00019576936852889936,
"loss": 1.1679,
"step": 157
},
{
"epoch": 0.4,
"grad_norm": 0.1416521966457367,
"learning_rate": 0.00019571202945126994,
"loss": 1.1322,
"step": 158
},
{
"epoch": 0.4,
"grad_norm": 0.1465340405702591,
"learning_rate": 0.00019565431292908146,
"loss": 1.0693,
"step": 159
},
{
"epoch": 0.4,
"grad_norm": 0.13601765036582947,
"learning_rate": 0.0001955962191899415,
"loss": 1.0676,
"step": 160
},
{
"epoch": 0.4,
"grad_norm": 0.14759162068367004,
"learning_rate": 0.0001955377484629453,
"loss": 1.0506,
"step": 161
},
{
"epoch": 0.41,
"grad_norm": 0.14839032292366028,
"learning_rate": 0.00019547890097867468,
"loss": 1.1245,
"step": 162
},
{
"epoch": 0.41,
"grad_norm": 0.1440214365720749,
"learning_rate": 0.0001954196769691973,
"loss": 1.1672,
"step": 163
},
{
"epoch": 0.41,
"grad_norm": 0.1372719258069992,
"learning_rate": 0.00019536007666806556,
"loss": 1.1084,
"step": 164
},
{
"epoch": 0.41,
"grad_norm": 0.14372558891773224,
"learning_rate": 0.00019530010031031586,
"loss": 1.1679,
"step": 165
},
{
"epoch": 0.42,
"grad_norm": 0.13789264857769012,
"learning_rate": 0.00019523974813246767,
"loss": 1.1253,
"step": 166
},
{
"epoch": 0.42,
"grad_norm": 0.14368915557861328,
"learning_rate": 0.0001951790203725223,
"loss": 1.085,
"step": 167
},
{
"epoch": 0.42,
"grad_norm": 0.1380469799041748,
"learning_rate": 0.00019511791726996243,
"loss": 1.1379,
"step": 168
},
{
"epoch": 0.42,
"grad_norm": 0.13288158178329468,
"learning_rate": 0.00019505643906575073,
"loss": 1.113,
"step": 169
},
{
"epoch": 0.43,
"grad_norm": 0.1390606164932251,
"learning_rate": 0.0001949945860023292,
"loss": 1.095,
"step": 170
},
{
"epoch": 0.43,
"grad_norm": 0.14271940290927887,
"learning_rate": 0.0001949323583236181,
"loss": 1.1063,
"step": 171
},
{
"epoch": 0.43,
"grad_norm": 0.13795693218708038,
"learning_rate": 0.00019486975627501502,
"loss": 1.0628,
"step": 172
},
{
"epoch": 0.43,
"grad_norm": 0.14073535799980164,
"learning_rate": 0.0001948067801033938,
"loss": 1.1192,
"step": 173
},
{
"epoch": 0.44,
"grad_norm": 0.138822540640831,
"learning_rate": 0.0001947434300571038,
"loss": 1.1299,
"step": 174
},
{
"epoch": 0.44,
"grad_norm": 0.13592712581157684,
"learning_rate": 0.0001946797063859686,
"loss": 1.0868,
"step": 175
},
{
"epoch": 0.44,
"grad_norm": 0.1379610300064087,
"learning_rate": 0.00019461560934128533,
"loss": 1.069,
"step": 176
},
{
"epoch": 0.44,
"grad_norm": 0.14286787807941437,
"learning_rate": 0.00019455113917582346,
"loss": 1.139,
"step": 177
},
{
"epoch": 0.45,
"grad_norm": 0.14168201386928558,
"learning_rate": 0.0001944862961438239,
"loss": 1.1405,
"step": 178
},
{
"epoch": 0.45,
"grad_norm": 0.1345077008008957,
"learning_rate": 0.000194421080500998,
"loss": 1.1039,
"step": 179
},
{
"epoch": 0.45,
"grad_norm": 0.1363426297903061,
"learning_rate": 0.00019435549250452645,
"loss": 1.1056,
"step": 180
},
{
"epoch": 0.45,
"grad_norm": 0.14109478890895844,
"learning_rate": 0.00019428953241305838,
"loss": 1.0927,
"step": 181
},
{
"epoch": 0.46,
"grad_norm": 0.14332321286201477,
"learning_rate": 0.0001942232004867103,
"loss": 1.0305,
"step": 182
},
{
"epoch": 0.46,
"grad_norm": 0.15956294536590576,
"learning_rate": 0.00019415649698706507,
"loss": 1.1245,
"step": 183
},
{
"epoch": 0.46,
"grad_norm": 0.14164718985557556,
"learning_rate": 0.0001940894221771708,
"loss": 1.0963,
"step": 184
},
{
"epoch": 0.46,
"grad_norm": 0.14296875894069672,
"learning_rate": 0.00019402197632153992,
"loss": 1.0853,
"step": 185
},
{
"epoch": 0.47,
"grad_norm": 0.12994709610939026,
"learning_rate": 0.00019395415968614813,
"loss": 1.0503,
"step": 186
},
{
"epoch": 0.47,
"grad_norm": 0.1399766504764557,
"learning_rate": 0.00019388597253843334,
"loss": 1.0623,
"step": 187
},
{
"epoch": 0.47,
"grad_norm": 0.14874404668807983,
"learning_rate": 0.00019381741514729443,
"loss": 1.0885,
"step": 188
},
{
"epoch": 0.47,
"grad_norm": 0.1453857719898224,
"learning_rate": 0.00019374848778309055,
"loss": 1.1702,
"step": 189
},
{
"epoch": 0.48,
"grad_norm": 0.14976643025875092,
"learning_rate": 0.0001936791907176397,
"loss": 1.0834,
"step": 190
},
{
"epoch": 0.48,
"grad_norm": 0.1418897956609726,
"learning_rate": 0.00019360952422421793,
"loss": 1.0918,
"step": 191
},
{
"epoch": 0.48,
"grad_norm": 0.14602817595005035,
"learning_rate": 0.00019353948857755803,
"loss": 1.0825,
"step": 192
},
{
"epoch": 0.48,
"grad_norm": 0.14669157564640045,
"learning_rate": 0.00019346908405384867,
"loss": 1.0973,
"step": 193
},
{
"epoch": 0.49,
"grad_norm": 0.14327263832092285,
"learning_rate": 0.00019339831093073318,
"loss": 1.1191,
"step": 194
},
{
"epoch": 0.49,
"grad_norm": 0.13806897401809692,
"learning_rate": 0.0001933271694873084,
"loss": 1.1504,
"step": 195
},
{
"epoch": 0.49,
"grad_norm": 0.13992969691753387,
"learning_rate": 0.00019325566000412376,
"loss": 1.0865,
"step": 196
},
{
"epoch": 0.49,
"grad_norm": 0.14395759999752045,
"learning_rate": 0.00019318378276318,
"loss": 1.1204,
"step": 197
},
{
"epoch": 0.5,
"grad_norm": 0.1409691572189331,
"learning_rate": 0.0001931115380479281,
"loss": 1.0766,
"step": 198
},
{
"epoch": 0.5,
"grad_norm": 0.1448824405670166,
"learning_rate": 0.00019303892614326836,
"loss": 1.1741,
"step": 199
},
{
"epoch": 0.5,
"grad_norm": 0.142364963889122,
"learning_rate": 0.00019296594733554892,
"loss": 1.1716,
"step": 200
},
{
"epoch": 0.5,
"eval_loss": 1.109603762626648,
"eval_runtime": 81.7249,
"eval_samples_per_second": 31.802,
"eval_steps_per_second": 31.802,
"step": 200
},
{
"epoch": 0.5,
"grad_norm": 0.1372615098953247,
"learning_rate": 0.00019289260191256483,
"loss": 1.1084,
"step": 201
},
{
"epoch": 0.51,
"grad_norm": 0.13863563537597656,
"learning_rate": 0.0001928188901635571,
"loss": 1.0546,
"step": 202
},
{
"epoch": 0.51,
"grad_norm": 0.13055531680583954,
"learning_rate": 0.00019274481237921114,
"loss": 1.018,
"step": 203
},
{
"epoch": 0.51,
"grad_norm": 0.14135099947452545,
"learning_rate": 0.00019267036885165588,
"loss": 1.1131,
"step": 204
},
{
"epoch": 0.51,
"grad_norm": 0.14308464527130127,
"learning_rate": 0.0001925955598744627,
"loss": 1.0723,
"step": 205
},
{
"epoch": 0.52,
"grad_norm": 0.13907764852046967,
"learning_rate": 0.00019252038574264405,
"loss": 1.1607,
"step": 206
},
{
"epoch": 0.52,
"grad_norm": 0.13771073520183563,
"learning_rate": 0.00019244484675265232,
"loss": 1.172,
"step": 207
},
{
"epoch": 0.52,
"grad_norm": 0.13774815201759338,
"learning_rate": 0.00019236894320237894,
"loss": 1.0622,
"step": 208
},
{
"epoch": 0.52,
"grad_norm": 0.1426474153995514,
"learning_rate": 0.0001922926753911527,
"loss": 1.0368,
"step": 209
},
{
"epoch": 0.53,
"grad_norm": 0.1380661278963089,
"learning_rate": 0.00019221604361973919,
"loss": 1.0873,
"step": 210
},
{
"epoch": 0.53,
"grad_norm": 0.14044702053070068,
"learning_rate": 0.00019213904819033903,
"loss": 1.0901,
"step": 211
},
{
"epoch": 0.53,
"grad_norm": 0.1415887176990509,
"learning_rate": 0.00019206168940658712,
"loss": 1.1061,
"step": 212
},
{
"epoch": 0.53,
"grad_norm": 0.1580592840909958,
"learning_rate": 0.00019198396757355118,
"loss": 1.1073,
"step": 213
},
{
"epoch": 0.54,
"grad_norm": 0.14094668626785278,
"learning_rate": 0.00019190588299773062,
"loss": 1.1781,
"step": 214
},
{
"epoch": 0.54,
"grad_norm": 0.14229640364646912,
"learning_rate": 0.00019182743598705542,
"loss": 1.1095,
"step": 215
},
{
"epoch": 0.54,
"grad_norm": 0.140314981341362,
"learning_rate": 0.00019174862685088472,
"loss": 1.1534,
"step": 216
},
{
"epoch": 0.54,
"grad_norm": 0.160028338432312,
"learning_rate": 0.00019166945590000584,
"loss": 1.087,
"step": 217
},
{
"epoch": 0.55,
"grad_norm": 0.14278572797775269,
"learning_rate": 0.0001915899234466328,
"loss": 1.1583,
"step": 218
},
{
"epoch": 0.55,
"grad_norm": 0.13695856928825378,
"learning_rate": 0.0001915100298044054,
"loss": 1.1151,
"step": 219
},
{
"epoch": 0.55,
"grad_norm": 0.14235751330852509,
"learning_rate": 0.00019142977528838762,
"loss": 1.1111,
"step": 220
},
{
"epoch": 0.55,
"grad_norm": 0.15174664556980133,
"learning_rate": 0.00019134916021506666,
"loss": 1.1438,
"step": 221
},
{
"epoch": 0.56,
"grad_norm": 0.15249325335025787,
"learning_rate": 0.0001912681849023516,
"loss": 1.1575,
"step": 222
},
{
"epoch": 0.56,
"grad_norm": 0.14303787052631378,
"learning_rate": 0.00019118684966957207,
"loss": 1.1302,
"step": 223
},
{
"epoch": 0.56,
"grad_norm": 0.1405183970928192,
"learning_rate": 0.00019110515483747716,
"loss": 1.1157,
"step": 224
},
{
"epoch": 0.56,
"grad_norm": 0.1475205421447754,
"learning_rate": 0.00019102310072823393,
"loss": 1.1175,
"step": 225
},
{
"epoch": 0.57,
"grad_norm": 0.14406634867191315,
"learning_rate": 0.0001909406876654264,
"loss": 1.0578,
"step": 226
},
{
"epoch": 0.57,
"grad_norm": 0.13999773561954498,
"learning_rate": 0.00019085791597405404,
"loss": 1.0865,
"step": 227
},
{
"epoch": 0.57,
"grad_norm": 0.1409848928451538,
"learning_rate": 0.00019077478598053063,
"loss": 1.1297,
"step": 228
},
{
"epoch": 0.57,
"grad_norm": 0.14548417925834656,
"learning_rate": 0.00019069129801268294,
"loss": 1.1524,
"step": 229
},
{
"epoch": 0.58,
"grad_norm": 0.13622736930847168,
"learning_rate": 0.00019060745239974936,
"loss": 1.0744,
"step": 230
},
{
"epoch": 0.58,
"grad_norm": 0.14302954077720642,
"learning_rate": 0.0001905232494723788,
"loss": 1.1469,
"step": 231
},
{
"epoch": 0.58,
"grad_norm": 0.15202221274375916,
"learning_rate": 0.0001904386895626291,
"loss": 1.0693,
"step": 232
},
{
"epoch": 0.58,
"grad_norm": 0.14072120189666748,
"learning_rate": 0.00019035377300396597,
"loss": 1.0584,
"step": 233
},
{
"epoch": 0.59,
"grad_norm": 0.13941141963005066,
"learning_rate": 0.00019026850013126157,
"loss": 1.1257,
"step": 234
},
{
"epoch": 0.59,
"grad_norm": 0.1389845460653305,
"learning_rate": 0.0001901828712807932,
"loss": 1.0003,
"step": 235
},
{
"epoch": 0.59,
"grad_norm": 0.1431329846382141,
"learning_rate": 0.0001900968867902419,
"loss": 1.0795,
"step": 236
},
{
"epoch": 0.59,
"grad_norm": 0.15022633969783783,
"learning_rate": 0.00019001054699869133,
"loss": 1.1427,
"step": 237
},
{
"epoch": 0.6,
"grad_norm": 0.1578160673379898,
"learning_rate": 0.00018992385224662623,
"loss": 1.13,
"step": 238
},
{
"epoch": 0.6,
"grad_norm": 0.13778769969940186,
"learning_rate": 0.00018983680287593105,
"loss": 1.0739,
"step": 239
},
{
"epoch": 0.6,
"grad_norm": 0.1454969048500061,
"learning_rate": 0.00018974939922988883,
"loss": 1.0864,
"step": 240
},
{
"epoch": 0.6,
"grad_norm": 0.13545964658260345,
"learning_rate": 0.00018966164165317966,
"loss": 1.0169,
"step": 241
},
{
"epoch": 0.61,
"grad_norm": 0.13648608326911926,
"learning_rate": 0.00018957353049187936,
"loss": 1.0732,
"step": 242
},
{
"epoch": 0.61,
"grad_norm": 0.14080677926540375,
"learning_rate": 0.00018948506609345813,
"loss": 1.0579,
"step": 243
},
{
"epoch": 0.61,
"grad_norm": 0.14503297209739685,
"learning_rate": 0.00018939624880677918,
"loss": 1.0755,
"step": 244
},
{
"epoch": 0.61,
"grad_norm": 0.15316741168498993,
"learning_rate": 0.00018930707898209733,
"loss": 1.0885,
"step": 245
},
{
"epoch": 0.62,
"grad_norm": 0.14839263260364532,
"learning_rate": 0.0001892175569710577,
"loss": 1.121,
"step": 246
},
{
"epoch": 0.62,
"grad_norm": 0.13919925689697266,
"learning_rate": 0.00018912768312669424,
"loss": 1.1039,
"step": 247
},
{
"epoch": 0.62,
"grad_norm": 0.13975974917411804,
"learning_rate": 0.00018903745780342839,
"loss": 1.1454,
"step": 248
},
{
"epoch": 0.62,
"grad_norm": 0.13851100206375122,
"learning_rate": 0.0001889468813570676,
"loss": 1.0905,
"step": 249
},
{
"epoch": 0.63,
"grad_norm": 0.14839564263820648,
"learning_rate": 0.00018885595414480405,
"loss": 1.1002,
"step": 250
},
{
"epoch": 0.63,
"grad_norm": 0.1421942263841629,
"learning_rate": 0.00018876467652521317,
"loss": 1.093,
"step": 251
},
{
"epoch": 0.63,
"grad_norm": 0.14453786611557007,
"learning_rate": 0.0001886730488582522,
"loss": 1.0278,
"step": 252
},
{
"epoch": 0.63,
"grad_norm": 0.13856688141822815,
"learning_rate": 0.0001885810715052589,
"loss": 1.079,
"step": 253
},
{
"epoch": 0.64,
"grad_norm": 0.14092479646205902,
"learning_rate": 0.00018848874482894993,
"loss": 1.0608,
"step": 254
},
{
"epoch": 0.64,
"grad_norm": 0.14616413414478302,
"learning_rate": 0.0001883960691934196,
"loss": 1.1097,
"step": 255
},
{
"epoch": 0.64,
"grad_norm": 0.1410474181175232,
"learning_rate": 0.00018830304496413822,
"loss": 1.0577,
"step": 256
},
{
"epoch": 0.64,
"grad_norm": 0.15473878383636475,
"learning_rate": 0.000188209672507951,
"loss": 1.1453,
"step": 257
},
{
"epoch": 0.65,
"grad_norm": 0.14370983839035034,
"learning_rate": 0.00018811595219307622,
"loss": 1.1732,
"step": 258
},
{
"epoch": 0.65,
"grad_norm": 0.14861780405044556,
"learning_rate": 0.00018802188438910405,
"loss": 1.1471,
"step": 259
},
{
"epoch": 0.65,
"grad_norm": 0.1523188352584839,
"learning_rate": 0.000187927469466995,
"loss": 1.129,
"step": 260
},
{
"epoch": 0.65,
"grad_norm": 0.14366289973258972,
"learning_rate": 0.00018783270779907838,
"loss": 1.0792,
"step": 261
},
{
"epoch": 0.66,
"grad_norm": 0.1363295018672943,
"learning_rate": 0.00018773759975905098,
"loss": 0.9848,
"step": 262
},
{
"epoch": 0.66,
"grad_norm": 0.1438857764005661,
"learning_rate": 0.00018764214572197552,
"loss": 1.1371,
"step": 263
},
{
"epoch": 0.66,
"grad_norm": 0.13751162588596344,
"learning_rate": 0.00018754634606427914,
"loss": 1.0557,
"step": 264
},
{
"epoch": 0.66,
"grad_norm": 0.1384708732366562,
"learning_rate": 0.00018745020116375197,
"loss": 1.0664,
"step": 265
},
{
"epoch": 0.67,
"grad_norm": 0.14196960628032684,
"learning_rate": 0.00018735371139954558,
"loss": 1.0828,
"step": 266
},
{
"epoch": 0.67,
"grad_norm": 0.15374121069908142,
"learning_rate": 0.00018725687715217163,
"loss": 1.073,
"step": 267
},
{
"epoch": 0.67,
"grad_norm": 0.14955537021160126,
"learning_rate": 0.0001871596988035001,
"loss": 1.1444,
"step": 268
},
{
"epoch": 0.68,
"grad_norm": 0.13760650157928467,
"learning_rate": 0.00018706217673675811,
"loss": 1.088,
"step": 269
},
{
"epoch": 0.68,
"grad_norm": 0.17072008550167084,
"learning_rate": 0.00018696431133652817,
"loss": 1.07,
"step": 270
},
{
"epoch": 0.68,
"grad_norm": 0.14745061099529266,
"learning_rate": 0.00018686610298874676,
"loss": 1.1105,
"step": 271
},
{
"epoch": 0.68,
"grad_norm": 0.14695587754249573,
"learning_rate": 0.00018676755208070275,
"loss": 1.0612,
"step": 272
},
{
"epoch": 0.69,
"grad_norm": 0.15686020255088806,
"learning_rate": 0.00018666865900103597,
"loss": 1.0933,
"step": 273
},
{
"epoch": 0.69,
"grad_norm": 0.14162233471870422,
"learning_rate": 0.00018656942413973555,
"loss": 1.0832,
"step": 274
},
{
"epoch": 0.69,
"grad_norm": 0.14662939310073853,
"learning_rate": 0.00018646984788813856,
"loss": 1.1175,
"step": 275
},
{
"epoch": 0.69,
"grad_norm": 0.13886839151382446,
"learning_rate": 0.0001863699306389282,
"loss": 1.1221,
"step": 276
},
{
"epoch": 0.7,
"grad_norm": 0.13897326588630676,
"learning_rate": 0.00018626967278613253,
"loss": 1.0767,
"step": 277
},
{
"epoch": 0.7,
"grad_norm": 0.13283655047416687,
"learning_rate": 0.0001861690747251228,
"loss": 1.1397,
"step": 278
},
{
"epoch": 0.7,
"grad_norm": 0.14036604762077332,
"learning_rate": 0.0001860681368526118,
"loss": 1.0965,
"step": 279
},
{
"epoch": 0.7,
"grad_norm": 0.1449379026889801,
"learning_rate": 0.00018596685956665245,
"loss": 1.1262,
"step": 280
},
{
"epoch": 0.71,
"grad_norm": 0.14264287054538727,
"learning_rate": 0.00018586524326663615,
"loss": 1.1317,
"step": 281
},
{
"epoch": 0.71,
"grad_norm": 0.14677459001541138,
"learning_rate": 0.00018576328835329117,
"loss": 1.0785,
"step": 282
},
{
"epoch": 0.71,
"grad_norm": 0.14834077656269073,
"learning_rate": 0.00018566099522868119,
"loss": 1.0892,
"step": 283
},
{
"epoch": 0.71,
"grad_norm": 0.15325355529785156,
"learning_rate": 0.00018555836429620358,
"loss": 1.0843,
"step": 284
},
{
"epoch": 0.72,
"grad_norm": 0.14825651049613953,
"learning_rate": 0.00018545539596058795,
"loss": 1.1288,
"step": 285
},
{
"epoch": 0.72,
"grad_norm": 0.14722499251365662,
"learning_rate": 0.00018535209062789433,
"loss": 1.1391,
"step": 286
},
{
"epoch": 0.72,
"grad_norm": 0.14388781785964966,
"learning_rate": 0.00018524844870551185,
"loss": 1.1013,
"step": 287
},
{
"epoch": 0.72,
"grad_norm": 0.1455835998058319,
"learning_rate": 0.00018514447060215698,
"loss": 1.0811,
"step": 288
},
{
"epoch": 0.73,
"grad_norm": 0.14625433087348938,
"learning_rate": 0.00018504015672787184,
"loss": 1.0854,
"step": 289
},
{
"epoch": 0.73,
"grad_norm": 0.13978470861911774,
"learning_rate": 0.00018493550749402278,
"loss": 1.1398,
"step": 290
},
{
"epoch": 0.73,
"grad_norm": 0.1447162628173828,
"learning_rate": 0.00018483052331329857,
"loss": 1.0553,
"step": 291
},
{
"epoch": 0.73,
"grad_norm": 0.13894303143024445,
"learning_rate": 0.00018472520459970898,
"loss": 1.0305,
"step": 292
},
{
"epoch": 0.74,
"grad_norm": 0.1372181624174118,
"learning_rate": 0.00018461955176858285,
"loss": 1.021,
"step": 293
},
{
"epoch": 0.74,
"grad_norm": 0.14599645137786865,
"learning_rate": 0.0001845135652365668,
"loss": 1.0808,
"step": 294
},
{
"epoch": 0.74,
"grad_norm": 0.1599220335483551,
"learning_rate": 0.00018440724542162328,
"loss": 1.1143,
"step": 295
},
{
"epoch": 0.74,
"grad_norm": 0.1450476050376892,
"learning_rate": 0.00018430059274302917,
"loss": 1.0508,
"step": 296
},
{
"epoch": 0.75,
"grad_norm": 0.1439283937215805,
"learning_rate": 0.00018419360762137395,
"loss": 1.0592,
"step": 297
},
{
"epoch": 0.75,
"grad_norm": 0.1410531848669052,
"learning_rate": 0.00018408629047855804,
"loss": 1.0632,
"step": 298
},
{
"epoch": 0.75,
"grad_norm": 0.1468774974346161,
"learning_rate": 0.00018397864173779133,
"loss": 1.056,
"step": 299
},
{
"epoch": 0.75,
"grad_norm": 0.1467033177614212,
"learning_rate": 0.00018387066182359133,
"loss": 1.1122,
"step": 300
},
{
"epoch": 0.75,
"eval_loss": 1.0955116748809814,
"eval_runtime": 81.7775,
"eval_samples_per_second": 31.781,
"eval_steps_per_second": 31.781,
"step": 300
},
{
"epoch": 0.76,
"grad_norm": 0.14950688183307648,
"learning_rate": 0.00018376235116178148,
"loss": 1.0698,
"step": 301
},
{
"epoch": 0.76,
"grad_norm": 0.142381951212883,
"learning_rate": 0.00018365371017948964,
"loss": 1.0528,
"step": 302
},
{
"epoch": 0.76,
"grad_norm": 0.1410701423883438,
"learning_rate": 0.0001835447393051463,
"loss": 1.0785,
"step": 303
},
{
"epoch": 0.76,
"grad_norm": 0.14708860218524933,
"learning_rate": 0.00018343543896848273,
"loss": 1.0142,
"step": 304
},
{
"epoch": 0.77,
"grad_norm": 0.1467617303133011,
"learning_rate": 0.00018332580960052965,
"loss": 1.0973,
"step": 305
},
{
"epoch": 0.77,
"grad_norm": 0.15761792659759521,
"learning_rate": 0.00018321585163361527,
"loss": 1.1745,
"step": 306
},
{
"epoch": 0.77,
"grad_norm": 0.13972119987010956,
"learning_rate": 0.00018310556550136357,
"loss": 1.0832,
"step": 307
},
{
"epoch": 0.77,
"grad_norm": 0.1481141895055771,
"learning_rate": 0.00018299495163869275,
"loss": 1.1573,
"step": 308
},
{
"epoch": 0.78,
"grad_norm": 0.14397870004177094,
"learning_rate": 0.0001828840104818134,
"loss": 1.171,
"step": 309
},
{
"epoch": 0.78,
"grad_norm": 0.14765049517154694,
"learning_rate": 0.0001827727424682268,
"loss": 1.0544,
"step": 310
},
{
"epoch": 0.78,
"grad_norm": 0.14956365525722504,
"learning_rate": 0.00018266114803672318,
"loss": 1.1755,
"step": 311
},
{
"epoch": 0.78,
"grad_norm": 0.15122386813163757,
"learning_rate": 0.00018254922762738008,
"loss": 1.1547,
"step": 312
},
{
"epoch": 0.79,
"grad_norm": 0.14254115521907806,
"learning_rate": 0.00018243698168156054,
"loss": 1.1075,
"step": 313
},
{
"epoch": 0.79,
"grad_norm": 0.14294452965259552,
"learning_rate": 0.00018232441064191125,
"loss": 1.1419,
"step": 314
},
{
"epoch": 0.79,
"grad_norm": 0.14777772128582,
"learning_rate": 0.0001822115149523611,
"loss": 1.1662,
"step": 315
},
{
"epoch": 0.79,
"grad_norm": 0.14944781363010406,
"learning_rate": 0.0001820982950581191,
"loss": 1.1497,
"step": 316
},
{
"epoch": 0.8,
"grad_norm": 0.1466801017522812,
"learning_rate": 0.00018198475140567287,
"loss": 1.1374,
"step": 317
},
{
"epoch": 0.8,
"grad_norm": 0.15346656739711761,
"learning_rate": 0.00018187088444278674,
"loss": 1.1356,
"step": 318
},
{
"epoch": 0.8,
"grad_norm": 0.15271005034446716,
"learning_rate": 0.00018175669461850005,
"loss": 1.0845,
"step": 319
},
{
"epoch": 0.8,
"grad_norm": 0.14452996850013733,
"learning_rate": 0.00018164218238312535,
"loss": 1.1162,
"step": 320
},
{
"epoch": 0.81,
"grad_norm": 0.14632536470890045,
"learning_rate": 0.00018152734818824658,
"loss": 1.0187,
"step": 321
},
{
"epoch": 0.81,
"grad_norm": 0.14935997128486633,
"learning_rate": 0.00018141219248671745,
"loss": 1.1167,
"step": 322
},
{
"epoch": 0.81,
"grad_norm": 0.14043933153152466,
"learning_rate": 0.0001812967157326595,
"loss": 1.0044,
"step": 323
},
{
"epoch": 0.81,
"grad_norm": 0.14850106835365295,
"learning_rate": 0.00018118091838146029,
"loss": 1.1226,
"step": 324
},
{
"epoch": 0.82,
"grad_norm": 0.14655061066150665,
"learning_rate": 0.00018106480088977172,
"loss": 1.0508,
"step": 325
},
{
"epoch": 0.82,
"grad_norm": 0.14721763134002686,
"learning_rate": 0.00018094836371550824,
"loss": 1.0659,
"step": 326
},
{
"epoch": 0.82,
"grad_norm": 0.1433349996805191,
"learning_rate": 0.00018083160731784486,
"loss": 1.147,
"step": 327
},
{
"epoch": 0.82,
"grad_norm": 0.13528144359588623,
"learning_rate": 0.00018071453215721554,
"loss": 1.0388,
"step": 328
},
{
"epoch": 0.83,
"grad_norm": 0.15466062724590302,
"learning_rate": 0.0001805971386953113,
"loss": 1.0649,
"step": 329
},
{
"epoch": 0.83,
"grad_norm": 0.15163114666938782,
"learning_rate": 0.00018047942739507836,
"loss": 1.1454,
"step": 330
},
{
"epoch": 0.83,
"grad_norm": 0.14693276584148407,
"learning_rate": 0.0001803613987207163,
"loss": 1.1137,
"step": 331
},
{
"epoch": 0.83,
"grad_norm": 0.14229321479797363,
"learning_rate": 0.00018024305313767646,
"loss": 1.0153,
"step": 332
},
{
"epoch": 0.84,
"grad_norm": 0.13863018155097961,
"learning_rate": 0.00018012439111265974,
"loss": 1.0491,
"step": 333
},
{
"epoch": 0.84,
"grad_norm": 0.1422068327665329,
"learning_rate": 0.000180005413113615,
"loss": 1.0952,
"step": 334
},
{
"epoch": 0.84,
"grad_norm": 0.1419857293367386,
"learning_rate": 0.00017988611960973713,
"loss": 1.0532,
"step": 335
},
{
"epoch": 0.84,
"grad_norm": 0.1446901261806488,
"learning_rate": 0.00017976651107146533,
"loss": 1.0477,
"step": 336
},
{
"epoch": 0.85,
"grad_norm": 0.14558811485767365,
"learning_rate": 0.00017964658797048108,
"loss": 1.1481,
"step": 337
},
{
"epoch": 0.85,
"grad_norm": 0.15488363802433014,
"learning_rate": 0.0001795263507797063,
"loss": 1.1302,
"step": 338
},
{
"epoch": 0.85,
"grad_norm": 0.14942613244056702,
"learning_rate": 0.00017940579997330165,
"loss": 1.0698,
"step": 339
},
{
"epoch": 0.85,
"grad_norm": 0.14417564868927002,
"learning_rate": 0.00017928493602666445,
"loss": 1.0867,
"step": 340
},
{
"epoch": 0.86,
"grad_norm": 0.14839497208595276,
"learning_rate": 0.0001791637594164269,
"loss": 1.0124,
"step": 341
},
{
"epoch": 0.86,
"grad_norm": 0.1415972113609314,
"learning_rate": 0.00017904227062045437,
"loss": 1.0958,
"step": 342
},
{
"epoch": 0.86,
"grad_norm": 0.143202543258667,
"learning_rate": 0.00017892047011784312,
"loss": 1.0808,
"step": 343
},
{
"epoch": 0.86,
"grad_norm": 0.14291773736476898,
"learning_rate": 0.00017879835838891875,
"loss": 1.1386,
"step": 344
},
{
"epoch": 0.87,
"grad_norm": 0.1504325121641159,
"learning_rate": 0.00017867593591523422,
"loss": 1.0804,
"step": 345
},
{
"epoch": 0.87,
"grad_norm": 0.1444767862558365,
"learning_rate": 0.00017855320317956784,
"loss": 1.1207,
"step": 346
},
{
"epoch": 0.87,
"grad_norm": 0.14493699371814728,
"learning_rate": 0.00017843016066592158,
"loss": 1.0954,
"step": 347
},
{
"epoch": 0.87,
"grad_norm": 0.14571166038513184,
"learning_rate": 0.00017830680885951887,
"loss": 1.0676,
"step": 348
},
{
"epoch": 0.88,
"grad_norm": 0.14583171904087067,
"learning_rate": 0.000178183148246803,
"loss": 1.0674,
"step": 349
},
{
"epoch": 0.88,
"grad_norm": 0.15080390870571136,
"learning_rate": 0.00017805917931543492,
"loss": 1.0757,
"step": 350
},
{
"epoch": 0.88,
"grad_norm": 0.14790864288806915,
"learning_rate": 0.00017793490255429157,
"loss": 1.1005,
"step": 351
},
{
"epoch": 0.88,
"grad_norm": 0.14861677587032318,
"learning_rate": 0.00017781031845346375,
"loss": 1.0645,
"step": 352
},
{
"epoch": 0.89,
"grad_norm": 0.15099036693572998,
"learning_rate": 0.00017768542750425426,
"loss": 1.1306,
"step": 353
},
{
"epoch": 0.89,
"grad_norm": 0.14353971183300018,
"learning_rate": 0.00017756023019917607,
"loss": 1.0834,
"step": 354
},
{
"epoch": 0.89,
"grad_norm": 0.14582550525665283,
"learning_rate": 0.00017743472703195015,
"loss": 1.0722,
"step": 355
},
{
"epoch": 0.89,
"grad_norm": 0.14268234372138977,
"learning_rate": 0.00017730891849750377,
"loss": 1.092,
"step": 356
},
{
"epoch": 0.9,
"grad_norm": 0.1424105316400528,
"learning_rate": 0.00017718280509196828,
"loss": 1.1355,
"step": 357
},
{
"epoch": 0.9,
"grad_norm": 0.13972117006778717,
"learning_rate": 0.0001770563873126775,
"loss": 1.0318,
"step": 358
},
{
"epoch": 0.9,
"grad_norm": 0.14622163772583008,
"learning_rate": 0.00017692966565816532,
"loss": 1.0985,
"step": 359
},
{
"epoch": 0.9,
"grad_norm": 0.13956372439861298,
"learning_rate": 0.0001768026406281642,
"loss": 1.102,
"step": 360
},
{
"epoch": 0.91,
"grad_norm": 0.14042189717292786,
"learning_rate": 0.0001766753127236029,
"loss": 1.0284,
"step": 361
},
{
"epoch": 0.91,
"grad_norm": 0.14376944303512573,
"learning_rate": 0.00017654768244660448,
"loss": 1.1452,
"step": 362
},
{
"epoch": 0.91,
"grad_norm": 0.14055544137954712,
"learning_rate": 0.00017641975030048454,
"loss": 1.0306,
"step": 363
},
{
"epoch": 0.91,
"grad_norm": 0.14599303901195526,
"learning_rate": 0.00017629151678974907,
"loss": 1.0838,
"step": 364
},
{
"epoch": 0.92,
"grad_norm": 0.1528831571340561,
"learning_rate": 0.00017616298242009251,
"loss": 1.1293,
"step": 365
},
{
"epoch": 0.92,
"grad_norm": 0.1404455453157425,
"learning_rate": 0.00017603414769839577,
"loss": 1.0425,
"step": 366
},
{
"epoch": 0.92,
"grad_norm": 0.14992842078208923,
"learning_rate": 0.00017590501313272415,
"loss": 1.0928,
"step": 367
},
{
"epoch": 0.92,
"grad_norm": 0.14540541172027588,
"learning_rate": 0.00017577557923232546,
"loss": 1.0366,
"step": 368
},
{
"epoch": 0.93,
"grad_norm": 0.1451583057641983,
"learning_rate": 0.00017564584650762793,
"loss": 1.1108,
"step": 369
},
{
"epoch": 0.93,
"grad_norm": 0.155447855591774,
"learning_rate": 0.00017551581547023819,
"loss": 1.1394,
"step": 370
},
{
"epoch": 0.93,
"grad_norm": 0.1441376656293869,
"learning_rate": 0.0001753854866329393,
"loss": 1.0264,
"step": 371
},
{
"epoch": 0.93,
"grad_norm": 0.13875485956668854,
"learning_rate": 0.00017525486050968875,
"loss": 1.0672,
"step": 372
},
{
"epoch": 0.94,
"grad_norm": 0.14158080518245697,
"learning_rate": 0.00017512393761561632,
"loss": 1.053,
"step": 373
},
{
"epoch": 0.94,
"grad_norm": 0.15505361557006836,
"learning_rate": 0.00017499271846702213,
"loss": 1.0713,
"step": 374
},
{
"epoch": 0.94,
"grad_norm": 0.14172373712062836,
"learning_rate": 0.0001748612035813747,
"loss": 1.0544,
"step": 375
},
{
"epoch": 0.94,
"grad_norm": 0.14016349613666534,
"learning_rate": 0.00017472939347730856,
"loss": 1.0382,
"step": 376
},
{
"epoch": 0.95,
"grad_norm": 0.15148378908634186,
"learning_rate": 0.00017459728867462275,
"loss": 1.1218,
"step": 377
},
{
"epoch": 0.95,
"grad_norm": 0.1416306346654892,
"learning_rate": 0.0001744648896942782,
"loss": 1.0895,
"step": 378
},
{
"epoch": 0.95,
"grad_norm": 0.14276988804340363,
"learning_rate": 0.00017433219705839616,
"loss": 1.0991,
"step": 379
},
{
"epoch": 0.95,
"grad_norm": 0.13922327756881714,
"learning_rate": 0.00017419921129025576,
"loss": 1.0883,
"step": 380
},
{
"epoch": 0.96,
"grad_norm": 0.1479676216840744,
"learning_rate": 0.00017406593291429217,
"loss": 1.1083,
"step": 381
},
{
"epoch": 0.96,
"grad_norm": 0.14659778773784637,
"learning_rate": 0.0001739323624560945,
"loss": 1.0863,
"step": 382
},
{
"epoch": 0.96,
"grad_norm": 0.14685633778572083,
"learning_rate": 0.00017379850044240368,
"loss": 1.1075,
"step": 383
},
{
"epoch": 0.96,
"grad_norm": 0.14316044747829437,
"learning_rate": 0.00017366434740111037,
"loss": 1.0584,
"step": 384
},
{
"epoch": 0.97,
"grad_norm": 0.14292864501476288,
"learning_rate": 0.00017352990386125292,
"loss": 1.1002,
"step": 385
},
{
"epoch": 0.97,
"grad_norm": 0.14412067830562592,
"learning_rate": 0.00017339517035301532,
"loss": 1.0671,
"step": 386
},
{
"epoch": 0.97,
"grad_norm": 0.14292089641094208,
"learning_rate": 0.000173260147407725,
"loss": 1.0958,
"step": 387
},
{
"epoch": 0.97,
"grad_norm": 0.1490335911512375,
"learning_rate": 0.00017312483555785086,
"loss": 1.1074,
"step": 388
},
{
"epoch": 0.98,
"grad_norm": 0.14249826967716217,
"learning_rate": 0.00017298923533700107,
"loss": 1.1546,
"step": 389
},
{
"epoch": 0.98,
"grad_norm": 0.14555396139621735,
"learning_rate": 0.000172853347279921,
"loss": 1.076,
"step": 390
},
{
"epoch": 0.98,
"grad_norm": 0.14374902844429016,
"learning_rate": 0.00017271717192249116,
"loss": 1.0767,
"step": 391
},
{
"epoch": 0.98,
"grad_norm": 0.14903804659843445,
"learning_rate": 0.00017258070980172494,
"loss": 1.0969,
"step": 392
},
{
"epoch": 0.99,
"grad_norm": 0.1533229798078537,
"learning_rate": 0.00017244396145576672,
"loss": 1.1206,
"step": 393
},
{
"epoch": 0.99,
"grad_norm": 0.14720167219638824,
"learning_rate": 0.0001723069274238895,
"loss": 1.0655,
"step": 394
},
{
"epoch": 0.99,
"grad_norm": 0.14380764961242676,
"learning_rate": 0.00017216960824649303,
"loss": 1.0123,
"step": 395
},
{
"epoch": 0.99,
"grad_norm": 0.14513961970806122,
"learning_rate": 0.0001720320044651014,
"loss": 1.0196,
"step": 396
},
{
"epoch": 1.0,
"grad_norm": 0.14310909807682037,
"learning_rate": 0.0001718941166223612,
"loss": 1.0278,
"step": 397
},
{
"epoch": 1.0,
"grad_norm": 0.14312389492988586,
"learning_rate": 0.00017175594526203905,
"loss": 1.0649,
"step": 398
},
{
"epoch": 1.0,
"grad_norm": 0.1408112645149231,
"learning_rate": 0.00017161749092901984,
"loss": 1.0793,
"step": 399
},
{
"epoch": 1.0,
"grad_norm": 0.14593806862831116,
"learning_rate": 0.00017147875416930416,
"loss": 1.0474,
"step": 400
},
{
"epoch": 1.0,
"eval_loss": 1.083612322807312,
"eval_runtime": 81.6893,
"eval_samples_per_second": 31.816,
"eval_steps_per_second": 31.816,
"step": 400
},
{
"epoch": 1.01,
"grad_norm": 0.14213843643665314,
"learning_rate": 0.00017133973553000654,
"loss": 1.0476,
"step": 401
},
{
"epoch": 1.01,
"grad_norm": 0.14211952686309814,
"learning_rate": 0.00017120043555935298,
"loss": 1.0386,
"step": 402
},
{
"epoch": 1.01,
"grad_norm": 0.15638479590415955,
"learning_rate": 0.00017106085480667903,
"loss": 1.1145,
"step": 403
},
{
"epoch": 1.01,
"grad_norm": 0.1525896191596985,
"learning_rate": 0.00017092099382242748,
"loss": 1.1124,
"step": 404
},
{
"epoch": 1.0,
"grad_norm": 0.13780884444713593,
"learning_rate": 0.0001707808531581462,
"loss": 1.0208,
"step": 405
},
{
"epoch": 1.01,
"grad_norm": 0.13917113840579987,
"learning_rate": 0.00017064043336648599,
"loss": 1.0143,
"step": 406
},
{
"epoch": 1.01,
"grad_norm": 0.14122170209884644,
"learning_rate": 0.00017049973500119845,
"loss": 0.9977,
"step": 407
},
{
"epoch": 1.01,
"grad_norm": 0.14243052899837494,
"learning_rate": 0.0001703587586171337,
"loss": 0.9933,
"step": 408
},
{
"epoch": 1.01,
"grad_norm": 0.14186780154705048,
"learning_rate": 0.0001702175047702382,
"loss": 0.9567,
"step": 409
},
{
"epoch": 1.02,
"grad_norm": 0.1524883359670639,
"learning_rate": 0.00017007597401755276,
"loss": 0.9874,
"step": 410
},
{
"epoch": 1.02,
"grad_norm": 0.15759988129138947,
"learning_rate": 0.00016993416691720998,
"loss": 1.0292,
"step": 411
},
{
"epoch": 1.02,
"grad_norm": 0.15617264807224274,
"learning_rate": 0.00016979208402843237,
"loss": 1.0168,
"step": 412
},
{
"epoch": 1.02,
"grad_norm": 0.15921927988529205,
"learning_rate": 0.00016964972591153,
"loss": 1.0209,
"step": 413
},
{
"epoch": 1.03,
"grad_norm": 0.1540677845478058,
"learning_rate": 0.00016950709312789833,
"loss": 1.0013,
"step": 414
},
{
"epoch": 1.03,
"grad_norm": 0.156731516122818,
"learning_rate": 0.00016936418624001592,
"loss": 1.0171,
"step": 415
},
{
"epoch": 1.03,
"grad_norm": 0.15679331123828888,
"learning_rate": 0.00016922100581144228,
"loss": 1.0137,
"step": 416
},
{
"epoch": 1.03,
"grad_norm": 0.15117546916007996,
"learning_rate": 0.00016907755240681577,
"loss": 0.9041,
"step": 417
},
{
"epoch": 1.04,
"grad_norm": 0.1581723839044571,
"learning_rate": 0.00016893382659185105,
"loss": 0.9891,
"step": 418
},
{
"epoch": 1.04,
"grad_norm": 0.15231919288635254,
"learning_rate": 0.00016878982893333717,
"loss": 0.9626,
"step": 419
},
{
"epoch": 1.04,
"grad_norm": 0.15532514452934265,
"learning_rate": 0.00016864555999913518,
"loss": 0.9639,
"step": 420
},
{
"epoch": 1.04,
"grad_norm": 0.16158603131771088,
"learning_rate": 0.00016850102035817588,
"loss": 1.0156,
"step": 421
},
{
"epoch": 1.05,
"grad_norm": 0.16860714554786682,
"learning_rate": 0.0001683562105804577,
"loss": 1.0279,
"step": 422
},
{
"epoch": 1.05,
"grad_norm": 0.1704617142677307,
"learning_rate": 0.00016821113123704424,
"loss": 1.0261,
"step": 423
},
{
"epoch": 1.05,
"grad_norm": 0.16520226001739502,
"learning_rate": 0.00016806578290006225,
"loss": 1.0307,
"step": 424
},
{
"epoch": 1.05,
"grad_norm": 0.16199736297130585,
"learning_rate": 0.00016792016614269924,
"loss": 0.9764,
"step": 425
},
{
"epoch": 1.06,
"grad_norm": 0.16184571385383606,
"learning_rate": 0.0001677742815392012,
"loss": 0.9958,
"step": 426
},
{
"epoch": 1.06,
"grad_norm": 0.16386933624744415,
"learning_rate": 0.00016762812966487044,
"loss": 1.0221,
"step": 427
},
{
"epoch": 1.06,
"grad_norm": 0.17046724259853363,
"learning_rate": 0.00016748171109606328,
"loss": 1.029,
"step": 428
},
{
"epoch": 1.06,
"grad_norm": 0.1638820469379425,
"learning_rate": 0.00016733502641018766,
"loss": 1.0175,
"step": 429
},
{
"epoch": 1.07,
"grad_norm": 0.16480222344398499,
"learning_rate": 0.00016718807618570106,
"loss": 1.033,
"step": 430
},
{
"epoch": 1.07,
"grad_norm": 0.1661783903837204,
"learning_rate": 0.00016704086100210815,
"loss": 0.9379,
"step": 431
},
{
"epoch": 1.07,
"grad_norm": 0.15570427477359772,
"learning_rate": 0.00016689338143995833,
"loss": 0.9877,
"step": 432
},
{
"epoch": 1.07,
"grad_norm": 0.170819491147995,
"learning_rate": 0.00016674563808084377,
"loss": 1.0738,
"step": 433
},
{
"epoch": 1.08,
"grad_norm": 0.16349053382873535,
"learning_rate": 0.00016659763150739677,
"loss": 0.9474,
"step": 434
},
{
"epoch": 1.08,
"grad_norm": 0.1703306883573532,
"learning_rate": 0.0001664493623032877,
"loss": 1.054,
"step": 435
},
{
"epoch": 1.08,
"grad_norm": 0.1705269068479538,
"learning_rate": 0.00016630083105322266,
"loss": 1.0175,
"step": 436
},
{
"epoch": 1.08,
"grad_norm": 0.15883858501911163,
"learning_rate": 0.00016615203834294119,
"loss": 1.0414,
"step": 437
},
{
"epoch": 1.09,
"grad_norm": 0.17120327055454254,
"learning_rate": 0.00016600298475921365,
"loss": 1.0222,
"step": 438
},
{
"epoch": 1.09,
"grad_norm": 0.1668461114168167,
"learning_rate": 0.00016585367088983946,
"loss": 0.9212,
"step": 439
},
{
"epoch": 1.09,
"grad_norm": 0.178915336728096,
"learning_rate": 0.00016570409732364437,
"loss": 1.0167,
"step": 440
},
{
"epoch": 1.09,
"grad_norm": 0.171407088637352,
"learning_rate": 0.00016555426465047823,
"loss": 0.9693,
"step": 441
},
{
"epoch": 1.1,
"grad_norm": 0.1687992811203003,
"learning_rate": 0.0001654041734612127,
"loss": 1.0257,
"step": 442
},
{
"epoch": 1.1,
"grad_norm": 0.17136409878730774,
"learning_rate": 0.00016525382434773894,
"loss": 0.9874,
"step": 443
},
{
"epoch": 1.1,
"grad_norm": 0.1806887686252594,
"learning_rate": 0.00016510321790296525,
"loss": 1.0684,
"step": 444
},
{
"epoch": 1.1,
"grad_norm": 0.17648373544216156,
"learning_rate": 0.00016495235472081468,
"loss": 0.9867,
"step": 445
},
{
"epoch": 1.11,
"grad_norm": 0.17426486313343048,
"learning_rate": 0.00016480123539622281,
"loss": 1.0439,
"step": 446
},
{
"epoch": 1.11,
"grad_norm": 0.17550793290138245,
"learning_rate": 0.0001646498605251352,
"loss": 1.0127,
"step": 447
},
{
"epoch": 1.11,
"grad_norm": 0.1805875450372696,
"learning_rate": 0.00016449823070450531,
"loss": 1.0317,
"step": 448
},
{
"epoch": 1.11,
"grad_norm": 0.17466574907302856,
"learning_rate": 0.00016434634653229199,
"loss": 0.9713,
"step": 449
},
{
"epoch": 1.12,
"grad_norm": 0.16918793320655823,
"learning_rate": 0.00016419420860745699,
"loss": 1.0376,
"step": 450
},
{
"epoch": 1.12,
"grad_norm": 0.16672617197036743,
"learning_rate": 0.00016404181752996289,
"loss": 0.9211,
"step": 451
},
{
"epoch": 1.12,
"grad_norm": 0.17270368337631226,
"learning_rate": 0.00016388917390077054,
"loss": 0.987,
"step": 452
},
{
"epoch": 1.12,
"grad_norm": 0.16792818903923035,
"learning_rate": 0.0001637362783218368,
"loss": 0.9782,
"step": 453
},
{
"epoch": 1.13,
"grad_norm": 0.1800449639558792,
"learning_rate": 0.00016358313139611195,
"loss": 0.9747,
"step": 454
},
{
"epoch": 1.13,
"grad_norm": 0.17128407955169678,
"learning_rate": 0.0001634297337275376,
"loss": 1.0312,
"step": 455
},
{
"epoch": 1.13,
"grad_norm": 0.17059966921806335,
"learning_rate": 0.0001632760859210442,
"loss": 1.0075,
"step": 456
},
{
"epoch": 1.13,
"grad_norm": 0.18244986236095428,
"learning_rate": 0.0001631221885825485,
"loss": 1.0161,
"step": 457
},
{
"epoch": 1.14,
"grad_norm": 0.17219580709934235,
"learning_rate": 0.00016296804231895142,
"loss": 1.0105,
"step": 458
},
{
"epoch": 1.14,
"grad_norm": 0.1736789494752884,
"learning_rate": 0.0001628136477381354,
"loss": 1.0128,
"step": 459
},
{
"epoch": 1.14,
"grad_norm": 0.2108864039182663,
"learning_rate": 0.00016265900544896225,
"loss": 0.9926,
"step": 460
},
{
"epoch": 1.14,
"grad_norm": 0.16976673901081085,
"learning_rate": 0.00016250411606127054,
"loss": 0.9633,
"step": 461
},
{
"epoch": 1.15,
"grad_norm": 0.1719416379928589,
"learning_rate": 0.00016234898018587337,
"loss": 1.0222,
"step": 462
},
{
"epoch": 1.15,
"grad_norm": 0.17205439507961273,
"learning_rate": 0.00016219359843455577,
"loss": 1.0328,
"step": 463
},
{
"epoch": 1.15,
"grad_norm": 0.17340464890003204,
"learning_rate": 0.0001620379714200725,
"loss": 0.9781,
"step": 464
},
{
"epoch": 1.15,
"grad_norm": 0.17654834687709808,
"learning_rate": 0.00016188209975614542,
"loss": 1.0151,
"step": 465
},
{
"epoch": 1.16,
"grad_norm": 0.17264829576015472,
"learning_rate": 0.00016172598405746124,
"loss": 0.9525,
"step": 466
},
{
"epoch": 1.16,
"grad_norm": 0.16847053170204163,
"learning_rate": 0.00016156962493966908,
"loss": 0.9202,
"step": 467
},
{
"epoch": 1.16,
"grad_norm": 0.18013043701648712,
"learning_rate": 0.00016141302301937786,
"loss": 1.0383,
"step": 468
},
{
"epoch": 1.16,
"grad_norm": 0.17866036295890808,
"learning_rate": 0.0001612561789141541,
"loss": 0.9682,
"step": 469
},
{
"epoch": 1.17,
"grad_norm": 0.17272624373435974,
"learning_rate": 0.0001610990932425194,
"loss": 1.0254,
"step": 470
},
{
"epoch": 1.17,
"grad_norm": 0.18053527176380157,
"learning_rate": 0.00016094176662394792,
"loss": 1.0435,
"step": 471
},
{
"epoch": 1.17,
"grad_norm": 0.17645591497421265,
"learning_rate": 0.00016078419967886402,
"loss": 0.9929,
"step": 472
},
{
"epoch": 1.17,
"grad_norm": 0.17896148562431335,
"learning_rate": 0.00016062639302863986,
"loss": 0.9597,
"step": 473
},
{
"epoch": 1.18,
"grad_norm": 0.1784675121307373,
"learning_rate": 0.0001604683472955928,
"loss": 0.9877,
"step": 474
},
{
"epoch": 1.18,
"grad_norm": 0.18384787440299988,
"learning_rate": 0.00016031006310298306,
"loss": 0.98,
"step": 475
},
{
"epoch": 1.18,
"grad_norm": 0.17336387932300568,
"learning_rate": 0.00016015154107501133,
"loss": 0.9813,
"step": 476
},
{
"epoch": 1.18,
"grad_norm": 0.1778045892715454,
"learning_rate": 0.00015999278183681604,
"loss": 0.9327,
"step": 477
},
{
"epoch": 1.19,
"grad_norm": 0.17641645669937134,
"learning_rate": 0.00015983378601447127,
"loss": 0.9955,
"step": 478
},
{
"epoch": 1.19,
"grad_norm": 0.18100661039352417,
"learning_rate": 0.00015967455423498387,
"loss": 1.0304,
"step": 479
},
{
"epoch": 1.19,
"grad_norm": 0.17939269542694092,
"learning_rate": 0.0001595150871262914,
"loss": 0.9129,
"step": 480
},
{
"epoch": 1.19,
"grad_norm": 0.18178121745586395,
"learning_rate": 0.00015935538531725927,
"loss": 1.0567,
"step": 481
},
{
"epoch": 1.2,
"grad_norm": 0.18156662583351135,
"learning_rate": 0.00015919544943767856,
"loss": 0.9731,
"step": 482
},
{
"epoch": 1.2,
"grad_norm": 0.18265368044376373,
"learning_rate": 0.00015903528011826335,
"loss": 1.0253,
"step": 483
},
{
"epoch": 1.2,
"grad_norm": 0.16867631673812866,
"learning_rate": 0.00015887487799064838,
"loss": 0.967,
"step": 484
},
{
"epoch": 1.2,
"grad_norm": 0.181188702583313,
"learning_rate": 0.0001587142436873864,
"loss": 1.0113,
"step": 485
},
{
"epoch": 1.21,
"grad_norm": 0.17186175286769867,
"learning_rate": 0.00015855337784194577,
"loss": 0.9987,
"step": 486
},
{
"epoch": 1.21,
"grad_norm": 0.16855312883853912,
"learning_rate": 0.000158392281088708,
"loss": 0.9623,
"step": 487
},
{
"epoch": 1.21,
"grad_norm": 0.1724013239145279,
"learning_rate": 0.00015823095406296514,
"loss": 0.922,
"step": 488
},
{
"epoch": 1.21,
"grad_norm": 0.18288518488407135,
"learning_rate": 0.00015806939740091734,
"loss": 0.9884,
"step": 489
},
{
"epoch": 1.22,
"grad_norm": 0.17419768869876862,
"learning_rate": 0.00015790761173967036,
"loss": 0.9246,
"step": 490
},
{
"epoch": 1.22,
"grad_norm": 0.1798882633447647,
"learning_rate": 0.00015774559771723298,
"loss": 0.9276,
"step": 491
},
{
"epoch": 1.22,
"grad_norm": 0.18484486639499664,
"learning_rate": 0.00015758335597251458,
"loss": 0.9967,
"step": 492
},
{
"epoch": 1.22,
"grad_norm": 0.17431318759918213,
"learning_rate": 0.00015742088714532247,
"loss": 0.9672,
"step": 493
},
{
"epoch": 1.23,
"grad_norm": 0.1722385287284851,
"learning_rate": 0.00015725819187635968,
"loss": 0.9561,
"step": 494
},
{
"epoch": 1.23,
"grad_norm": 0.19427751004695892,
"learning_rate": 0.00015709527080722202,
"loss": 0.969,
"step": 495
},
{
"epoch": 1.23,
"grad_norm": 0.1689085215330124,
"learning_rate": 0.00015693212458039584,
"loss": 0.9618,
"step": 496
},
{
"epoch": 1.23,
"grad_norm": 0.1696721762418747,
"learning_rate": 0.00015676875383925534,
"loss": 0.9686,
"step": 497
},
{
"epoch": 1.24,
"grad_norm": 0.17037516832351685,
"learning_rate": 0.00015660515922806027,
"loss": 0.956,
"step": 498
},
{
"epoch": 1.24,
"grad_norm": 0.17930398881435394,
"learning_rate": 0.000156441341391953,
"loss": 0.983,
"step": 499
},
{
"epoch": 1.24,
"grad_norm": 0.18172559142112732,
"learning_rate": 0.00015627730097695638,
"loss": 1.0447,
"step": 500
},
{
"epoch": 1.24,
"eval_loss": 1.0872775316238403,
"eval_runtime": 81.628,
"eval_samples_per_second": 31.84,
"eval_steps_per_second": 31.84,
"step": 500
},
{
"epoch": 1.24,
"grad_norm": 0.179900661110878,
"learning_rate": 0.0001561130386299709,
"loss": 0.9864,
"step": 501
},
{
"epoch": 1.25,
"grad_norm": 0.1860770583152771,
"learning_rate": 0.0001559485549987723,
"loss": 0.9963,
"step": 502
},
{
"epoch": 1.25,
"grad_norm": 0.17942041158676147,
"learning_rate": 0.00015578385073200895,
"loss": 1.0004,
"step": 503
},
{
"epoch": 1.25,
"grad_norm": 0.17420290410518646,
"learning_rate": 0.0001556189264791992,
"loss": 1.002,
"step": 504
},
{
"epoch": 1.25,
"grad_norm": 0.17478443682193756,
"learning_rate": 0.00015545378289072922,
"loss": 0.9624,
"step": 505
},
{
"epoch": 1.26,
"grad_norm": 0.18624065816402435,
"learning_rate": 0.0001552884206178498,
"loss": 1.0315,
"step": 506
},
{
"epoch": 1.26,
"grad_norm": 0.17450089752674103,
"learning_rate": 0.00015512284031267437,
"loss": 0.9906,
"step": 507
},
{
"epoch": 1.26,
"grad_norm": 0.1746608465909958,
"learning_rate": 0.00015495704262817597,
"loss": 0.9898,
"step": 508
},
{
"epoch": 1.26,
"grad_norm": 0.17796628177165985,
"learning_rate": 0.00015479102821818507,
"loss": 1.0194,
"step": 509
},
{
"epoch": 1.27,
"grad_norm": 0.17470288276672363,
"learning_rate": 0.0001546247977373867,
"loss": 0.9309,
"step": 510
},
{
"epoch": 1.27,
"grad_norm": 0.17829464375972748,
"learning_rate": 0.000154458351841318,
"loss": 1.0141,
"step": 511
},
{
"epoch": 1.27,
"grad_norm": 0.17732754349708557,
"learning_rate": 0.00015429169118636566,
"loss": 0.9817,
"step": 512
},
{
"epoch": 1.27,
"grad_norm": 0.1795651614665985,
"learning_rate": 0.00015412481642976318,
"loss": 0.9709,
"step": 513
},
{
"epoch": 1.28,
"grad_norm": 0.17974676191806793,
"learning_rate": 0.00015395772822958845,
"loss": 1.0243,
"step": 514
},
{
"epoch": 1.28,
"grad_norm": 0.18511098623275757,
"learning_rate": 0.0001537904272447611,
"loss": 1.0001,
"step": 515
},
{
"epoch": 1.28,
"grad_norm": 0.1780577152967453,
"learning_rate": 0.00015362291413503984,
"loss": 0.9829,
"step": 516
},
{
"epoch": 1.28,
"grad_norm": 0.17798136174678802,
"learning_rate": 0.0001534551895610199,
"loss": 0.9659,
"step": 517
},
{
"epoch": 1.29,
"grad_norm": 0.1870565563440323,
"learning_rate": 0.00015328725418413045,
"loss": 0.9749,
"step": 518
},
{
"epoch": 1.29,
"grad_norm": 0.18744368851184845,
"learning_rate": 0.00015311910866663196,
"loss": 1.015,
"step": 519
},
{
"epoch": 1.29,
"grad_norm": 0.18052896857261658,
"learning_rate": 0.00015295075367161367,
"loss": 1.0313,
"step": 520
},
{
"epoch": 1.29,
"grad_norm": 0.1779204159975052,
"learning_rate": 0.00015278218986299074,
"loss": 0.9496,
"step": 521
},
{
"epoch": 1.3,
"grad_norm": 0.1824800670146942,
"learning_rate": 0.00015261341790550196,
"loss": 1.0281,
"step": 522
},
{
"epoch": 1.3,
"grad_norm": 0.19057531654834747,
"learning_rate": 0.0001524444384647069,
"loss": 1.0271,
"step": 523
},
{
"epoch": 1.3,
"grad_norm": 0.19244614243507385,
"learning_rate": 0.0001522752522069833,
"loss": 0.9907,
"step": 524
},
{
"epoch": 1.3,
"grad_norm": 0.17696735262870789,
"learning_rate": 0.0001521058597995246,
"loss": 0.9331,
"step": 525
},
{
"epoch": 1.31,
"grad_norm": 0.17268431186676025,
"learning_rate": 0.00015193626191033712,
"loss": 0.9427,
"step": 526
},
{
"epoch": 1.31,
"grad_norm": 0.18662290275096893,
"learning_rate": 0.0001517664592082375,
"loss": 1.0074,
"step": 527
},
{
"epoch": 1.31,
"grad_norm": 0.17090214788913727,
"learning_rate": 0.0001515964523628501,
"loss": 0.9608,
"step": 528
},
{
"epoch": 1.31,
"grad_norm": 0.1795254349708557,
"learning_rate": 0.00015142624204460435,
"loss": 0.9439,
"step": 529
},
{
"epoch": 1.32,
"grad_norm": 0.18272066116333008,
"learning_rate": 0.00015125582892473204,
"loss": 0.9828,
"step": 530
},
{
"epoch": 1.32,
"grad_norm": 0.2021034061908722,
"learning_rate": 0.00015108521367526479,
"loss": 1.0375,
"step": 531
},
{
"epoch": 1.32,
"grad_norm": 0.18685071170330048,
"learning_rate": 0.00015091439696903115,
"loss": 1.0026,
"step": 532
},
{
"epoch": 1.32,
"grad_norm": 0.17936167120933533,
"learning_rate": 0.00015074337947965435,
"loss": 0.9296,
"step": 533
},
{
"epoch": 1.33,
"grad_norm": 0.18303433060646057,
"learning_rate": 0.00015057216188154928,
"loss": 0.9416,
"step": 534
},
{
"epoch": 1.33,
"grad_norm": 0.18212522566318512,
"learning_rate": 0.00015040074484992,
"loss": 0.9812,
"step": 535
},
{
"epoch": 1.33,
"grad_norm": 0.17352260649204254,
"learning_rate": 0.00015022912906075702,
"loss": 0.9766,
"step": 536
},
{
"epoch": 1.33,
"grad_norm": 0.17948494851589203,
"learning_rate": 0.0001500573151908347,
"loss": 1.006,
"step": 537
},
{
"epoch": 1.34,
"grad_norm": 0.18391214311122894,
"learning_rate": 0.00014988530391770856,
"loss": 1.0484,
"step": 538
},
{
"epoch": 1.34,
"grad_norm": 0.1719055324792862,
"learning_rate": 0.00014971309591971252,
"loss": 0.964,
"step": 539
},
{
"epoch": 1.34,
"grad_norm": 0.1985386312007904,
"learning_rate": 0.00014954069187595633,
"loss": 1.0035,
"step": 540
},
{
"epoch": 1.34,
"grad_norm": 0.18530823290348053,
"learning_rate": 0.0001493680924663228,
"loss": 1.0089,
"step": 541
},
{
"epoch": 1.35,
"grad_norm": 0.18150845170021057,
"learning_rate": 0.00014919529837146528,
"loss": 1.0586,
"step": 542
},
{
"epoch": 1.35,
"grad_norm": 0.19130894541740417,
"learning_rate": 0.00014902231027280486,
"loss": 1.0152,
"step": 543
},
{
"epoch": 1.35,
"grad_norm": 0.1798924058675766,
"learning_rate": 0.0001488491288525275,
"loss": 0.9548,
"step": 544
},
{
"epoch": 1.35,
"grad_norm": 0.17213404178619385,
"learning_rate": 0.0001486757547935818,
"loss": 1.0226,
"step": 545
},
{
"epoch": 1.36,
"grad_norm": 0.18383356928825378,
"learning_rate": 0.0001485021887796759,
"loss": 1.0291,
"step": 546
},
{
"epoch": 1.36,
"grad_norm": 0.19143284857273102,
"learning_rate": 0.0001483284314952749,
"loss": 1.0055,
"step": 547
},
{
"epoch": 1.36,
"grad_norm": 0.19124020636081696,
"learning_rate": 0.00014815448362559826,
"loss": 1.0231,
"step": 548
},
{
"epoch": 1.36,
"grad_norm": 0.18096496164798737,
"learning_rate": 0.00014798034585661695,
"loss": 1.0152,
"step": 549
},
{
"epoch": 1.37,
"grad_norm": 0.17621304094791412,
"learning_rate": 0.00014780601887505088,
"loss": 0.9718,
"step": 550
},
{
"epoch": 1.37,
"grad_norm": 0.18995219469070435,
"learning_rate": 0.00014763150336836604,
"loss": 1.0052,
"step": 551
},
{
"epoch": 1.37,
"grad_norm": 0.19126906991004944,
"learning_rate": 0.00014745680002477203,
"loss": 0.9409,
"step": 552
},
{
"epoch": 1.37,
"grad_norm": 0.17537294328212738,
"learning_rate": 0.00014728190953321903,
"loss": 1.0021,
"step": 553
},
{
"epoch": 1.38,
"grad_norm": 0.18963244557380676,
"learning_rate": 0.00014710683258339536,
"loss": 1.0154,
"step": 554
},
{
"epoch": 1.38,
"grad_norm": 0.17940685153007507,
"learning_rate": 0.00014693156986572456,
"loss": 0.9898,
"step": 555
},
{
"epoch": 1.38,
"grad_norm": 0.19598953425884247,
"learning_rate": 0.0001467561220713628,
"loss": 1.0479,
"step": 556
},
{
"epoch": 1.38,
"grad_norm": 0.18346156179904938,
"learning_rate": 0.00014658048989219614,
"loss": 1.0076,
"step": 557
},
{
"epoch": 1.39,
"grad_norm": 0.17553867399692535,
"learning_rate": 0.0001464046740208377,
"loss": 0.9696,
"step": 558
},
{
"epoch": 1.39,
"grad_norm": 0.1788376122713089,
"learning_rate": 0.00014622867515062503,
"loss": 0.9788,
"step": 559
},
{
"epoch": 1.39,
"grad_norm": 0.17731797695159912,
"learning_rate": 0.00014605249397561736,
"loss": 1.003,
"step": 560
},
{
"epoch": 1.39,
"grad_norm": 0.17706608772277832,
"learning_rate": 0.00014587613119059284,
"loss": 1.0055,
"step": 561
},
{
"epoch": 1.4,
"grad_norm": 0.168448805809021,
"learning_rate": 0.00014569958749104575,
"loss": 0.9516,
"step": 562
},
{
"epoch": 1.4,
"grad_norm": 0.18675707280635834,
"learning_rate": 0.0001455228635731839,
"loss": 0.9837,
"step": 563
},
{
"epoch": 1.4,
"grad_norm": 0.17538242042064667,
"learning_rate": 0.00014534596013392575,
"loss": 1.0367,
"step": 564
},
{
"epoch": 1.4,
"grad_norm": 0.17501141130924225,
"learning_rate": 0.00014516887787089774,
"loss": 0.9733,
"step": 565
},
{
"epoch": 1.41,
"grad_norm": 0.1874341070652008,
"learning_rate": 0.00014499161748243147,
"loss": 1.0206,
"step": 566
},
{
"epoch": 1.41,
"grad_norm": 0.1980811208486557,
"learning_rate": 0.00014481417966756102,
"loss": 1.0289,
"step": 567
},
{
"epoch": 1.41,
"grad_norm": 0.18807095289230347,
"learning_rate": 0.0001446365651260201,
"loss": 1.0205,
"step": 568
},
{
"epoch": 1.41,
"grad_norm": 0.1855577528476715,
"learning_rate": 0.00014445877455823946,
"loss": 1.0497,
"step": 569
},
{
"epoch": 1.42,
"grad_norm": 0.18725629150867462,
"learning_rate": 0.00014428080866534396,
"loss": 1.0326,
"step": 570
},
{
"epoch": 1.42,
"grad_norm": 0.19902606308460236,
"learning_rate": 0.0001441026681491498,
"loss": 1.0252,
"step": 571
},
{
"epoch": 1.42,
"grad_norm": 0.19441325962543488,
"learning_rate": 0.00014392435371216185,
"loss": 1.0191,
"step": 572
},
{
"epoch": 1.42,
"grad_norm": 0.18167538940906525,
"learning_rate": 0.00014374586605757095,
"loss": 1.029,
"step": 573
},
{
"epoch": 1.43,
"grad_norm": 0.1809268742799759,
"learning_rate": 0.0001435672058892509,
"loss": 0.975,
"step": 574
},
{
"epoch": 1.43,
"grad_norm": 0.18132343888282776,
"learning_rate": 0.00014338837391175582,
"loss": 0.9688,
"step": 575
},
{
"epoch": 1.43,
"grad_norm": 0.1733206808567047,
"learning_rate": 0.00014320937083031748,
"loss": 0.958,
"step": 576
},
{
"epoch": 1.43,
"grad_norm": 0.1799648404121399,
"learning_rate": 0.00014303019735084226,
"loss": 0.9842,
"step": 577
},
{
"epoch": 1.44,
"grad_norm": 0.1771499365568161,
"learning_rate": 0.0001428508541799086,
"loss": 1.0048,
"step": 578
},
{
"epoch": 1.44,
"grad_norm": 0.1818363070487976,
"learning_rate": 0.00014267134202476417,
"loss": 1.0374,
"step": 579
},
{
"epoch": 1.44,
"grad_norm": 0.1858426034450531,
"learning_rate": 0.0001424916615933229,
"loss": 0.9952,
"step": 580
},
{
"epoch": 1.44,
"grad_norm": 0.19056333601474762,
"learning_rate": 0.00014231181359416247,
"loss": 1.0125,
"step": 581
},
{
"epoch": 1.45,
"grad_norm": 0.179644376039505,
"learning_rate": 0.00014213179873652127,
"loss": 0.9194,
"step": 582
},
{
"epoch": 1.45,
"grad_norm": 0.177077516913414,
"learning_rate": 0.0001419516177302957,
"loss": 0.991,
"step": 583
},
{
"epoch": 1.45,
"grad_norm": 0.18390731513500214,
"learning_rate": 0.00014177127128603745,
"loss": 0.9921,
"step": 584
},
{
"epoch": 1.45,
"grad_norm": 0.1845334768295288,
"learning_rate": 0.00014159076011495061,
"loss": 0.993,
"step": 585
},
{
"epoch": 1.46,
"grad_norm": 0.1941182017326355,
"learning_rate": 0.0001414100849288888,
"loss": 0.9864,
"step": 586
},
{
"epoch": 1.46,
"grad_norm": 0.17679093778133392,
"learning_rate": 0.00014122924644035249,
"loss": 1.0078,
"step": 587
},
{
"epoch": 1.46,
"grad_norm": 0.1847458928823471,
"learning_rate": 0.00014104824536248614,
"loss": 1.0043,
"step": 588
},
{
"epoch": 1.46,
"grad_norm": 0.1811904013156891,
"learning_rate": 0.00014086708240907542,
"loss": 0.9493,
"step": 589
},
{
"epoch": 1.47,
"grad_norm": 0.18393242359161377,
"learning_rate": 0.00014068575829454436,
"loss": 1.0019,
"step": 590
},
{
"epoch": 1.47,
"grad_norm": 0.17711445689201355,
"learning_rate": 0.0001405042737339524,
"loss": 0.9666,
"step": 591
},
{
"epoch": 1.47,
"grad_norm": 0.18920022249221802,
"learning_rate": 0.00014032262944299194,
"loss": 0.9579,
"step": 592
},
{
"epoch": 1.47,
"grad_norm": 0.18185077607631683,
"learning_rate": 0.00014014082613798503,
"loss": 1.0523,
"step": 593
},
{
"epoch": 1.48,
"grad_norm": 0.19337935745716095,
"learning_rate": 0.00013995886453588104,
"loss": 0.9841,
"step": 594
},
{
"epoch": 1.48,
"grad_norm": 0.1859455108642578,
"learning_rate": 0.00013977674535425337,
"loss": 1.0389,
"step": 595
},
{
"epoch": 1.48,
"grad_norm": 0.17890392243862152,
"learning_rate": 0.00013959446931129704,
"loss": 1.0308,
"step": 596
},
{
"epoch": 1.48,
"grad_norm": 0.1741844266653061,
"learning_rate": 0.00013941203712582553,
"loss": 1.0466,
"step": 597
},
{
"epoch": 1.49,
"grad_norm": 0.19279837608337402,
"learning_rate": 0.0001392294495172681,
"loss": 0.9952,
"step": 598
},
{
"epoch": 1.49,
"grad_norm": 0.19602486491203308,
"learning_rate": 0.00013904670720566698,
"loss": 1.0273,
"step": 599
},
{
"epoch": 1.49,
"grad_norm": 0.18000701069831848,
"learning_rate": 0.0001388638109116744,
"loss": 1.0131,
"step": 600
},
{
"epoch": 1.49,
"eval_loss": 1.080866813659668,
"eval_runtime": 81.6407,
"eval_samples_per_second": 31.835,
"eval_steps_per_second": 31.835,
"step": 600
},
{
"epoch": 1.49,
"grad_norm": 0.18183240294456482,
"learning_rate": 0.0001386807613565499,
"loss": 0.9962,
"step": 601
},
{
"epoch": 1.5,
"grad_norm": 0.1762516349554062,
"learning_rate": 0.00013849755926215735,
"loss": 1.0288,
"step": 602
},
{
"epoch": 1.5,
"grad_norm": 0.17683060467243195,
"learning_rate": 0.00013831420535096223,
"loss": 0.9464,
"step": 603
},
{
"epoch": 1.5,
"grad_norm": 0.1796884983778,
"learning_rate": 0.00013813070034602863,
"loss": 1.0294,
"step": 604
},
{
"epoch": 1.5,
"grad_norm": 0.1921350210905075,
"learning_rate": 0.00013794704497101655,
"loss": 1.0216,
"step": 605
},
{
"epoch": 1.51,
"grad_norm": 0.18306772410869598,
"learning_rate": 0.00013776323995017898,
"loss": 1.0552,
"step": 606
},
{
"epoch": 1.51,
"grad_norm": 0.18202297389507294,
"learning_rate": 0.000137579286008359,
"loss": 0.9735,
"step": 607
},
{
"epoch": 1.51,
"grad_norm": 0.18103723227977753,
"learning_rate": 0.00013739518387098705,
"loss": 0.9673,
"step": 608
},
{
"epoch": 1.51,
"grad_norm": 0.17903882265090942,
"learning_rate": 0.0001372109342640779,
"loss": 0.9405,
"step": 609
},
{
"epoch": 1.52,
"grad_norm": 0.18169891834259033,
"learning_rate": 0.0001370265379142279,
"loss": 0.9595,
"step": 610
},
{
"epoch": 1.52,
"grad_norm": 0.18569333851337433,
"learning_rate": 0.00013684199554861207,
"loss": 0.9859,
"step": 611
},
{
"epoch": 1.52,
"grad_norm": 0.18026390671730042,
"learning_rate": 0.0001366573078949813,
"loss": 0.9804,
"step": 612
},
{
"epoch": 1.52,
"grad_norm": 0.18330590426921844,
"learning_rate": 0.00013647247568165938,
"loss": 0.9623,
"step": 613
},
{
"epoch": 1.53,
"grad_norm": 0.18787868320941925,
"learning_rate": 0.00013628749963754026,
"loss": 0.977,
"step": 614
},
{
"epoch": 1.53,
"grad_norm": 0.17502212524414062,
"learning_rate": 0.00013610238049208495,
"loss": 0.9615,
"step": 615
},
{
"epoch": 1.53,
"grad_norm": 0.18354558944702148,
"learning_rate": 0.0001359171189753189,
"loss": 0.9493,
"step": 616
},
{
"epoch": 1.53,
"grad_norm": 0.18860042095184326,
"learning_rate": 0.00013573171581782897,
"loss": 1.0698,
"step": 617
},
{
"epoch": 1.54,
"grad_norm": 0.1900940239429474,
"learning_rate": 0.00013554617175076062,
"loss": 0.961,
"step": 618
},
{
"epoch": 1.54,
"grad_norm": 0.18823568522930145,
"learning_rate": 0.00013536048750581494,
"loss": 0.9106,
"step": 619
},
{
"epoch": 1.54,
"grad_norm": 0.18658524751663208,
"learning_rate": 0.0001351746638152458,
"loss": 0.9161,
"step": 620
},
{
"epoch": 1.54,
"grad_norm": 0.18179596960544586,
"learning_rate": 0.00013498870141185712,
"loss": 0.9394,
"step": 621
},
{
"epoch": 1.55,
"grad_norm": 0.18801775574684143,
"learning_rate": 0.00013480260102899966,
"loss": 0.9827,
"step": 622
},
{
"epoch": 1.55,
"grad_norm": 0.18649117648601532,
"learning_rate": 0.00013461636340056843,
"loss": 0.9565,
"step": 623
},
{
"epoch": 1.55,
"grad_norm": 0.1857774257659912,
"learning_rate": 0.0001344299892609996,
"loss": 1.0292,
"step": 624
},
{
"epoch": 1.55,
"grad_norm": 0.1910741627216339,
"learning_rate": 0.00013424347934526772,
"loss": 1.0411,
"step": 625
},
{
"epoch": 1.56,
"grad_norm": 0.19100044667720795,
"learning_rate": 0.00013405683438888282,
"loss": 1.0071,
"step": 626
},
{
"epoch": 1.56,
"grad_norm": 0.17907825112342834,
"learning_rate": 0.00013387005512788733,
"loss": 1.0374,
"step": 627
},
{
"epoch": 1.56,
"grad_norm": 0.1795564442873001,
"learning_rate": 0.00013368314229885347,
"loss": 1.0094,
"step": 628
},
{
"epoch": 1.56,
"grad_norm": 0.17529642581939697,
"learning_rate": 0.00013349609663888015,
"loss": 0.9316,
"step": 629
},
{
"epoch": 1.57,
"grad_norm": 0.18285749852657318,
"learning_rate": 0.00013330891888559002,
"loss": 0.9878,
"step": 630
},
{
"epoch": 1.57,
"grad_norm": 0.18477262556552887,
"learning_rate": 0.00013312160977712668,
"loss": 1.0027,
"step": 631
},
{
"epoch": 1.57,
"grad_norm": 0.1869228482246399,
"learning_rate": 0.00013293417005215188,
"loss": 1.0269,
"step": 632
},
{
"epoch": 1.57,
"grad_norm": 0.19262288510799408,
"learning_rate": 0.00013274660044984224,
"loss": 1.0839,
"step": 633
},
{
"epoch": 1.58,
"grad_norm": 0.18182508647441864,
"learning_rate": 0.0001325589017098867,
"loss": 0.9953,
"step": 634
},
{
"epoch": 1.58,
"grad_norm": 0.21832676231861115,
"learning_rate": 0.0001323710745724834,
"loss": 1.028,
"step": 635
},
{
"epoch": 1.58,
"grad_norm": 0.18413691222667694,
"learning_rate": 0.00013218311977833687,
"loss": 1.0081,
"step": 636
},
{
"epoch": 1.58,
"grad_norm": 0.182253897190094,
"learning_rate": 0.00013199503806865504,
"loss": 0.9492,
"step": 637
},
{
"epoch": 1.59,
"grad_norm": 0.19804389774799347,
"learning_rate": 0.0001318068301851463,
"loss": 0.9859,
"step": 638
},
{
"epoch": 1.59,
"grad_norm": 0.1846335232257843,
"learning_rate": 0.00013161849687001666,
"loss": 0.9594,
"step": 639
},
{
"epoch": 1.59,
"grad_norm": 0.18544115126132965,
"learning_rate": 0.00013143003886596669,
"loss": 1.0116,
"step": 640
},
{
"epoch": 1.59,
"grad_norm": 0.1846534013748169,
"learning_rate": 0.00013124145691618884,
"loss": 1.0081,
"step": 641
},
{
"epoch": 1.6,
"grad_norm": 0.17868997156620026,
"learning_rate": 0.0001310527517643642,
"loss": 0.9044,
"step": 642
},
{
"epoch": 1.6,
"grad_norm": 0.18729160726070404,
"learning_rate": 0.00013086392415465972,
"loss": 0.9888,
"step": 643
},
{
"epoch": 1.6,
"grad_norm": 0.1919986605644226,
"learning_rate": 0.00013067497483172538,
"loss": 1.0277,
"step": 644
},
{
"epoch": 1.6,
"grad_norm": 0.20795708894729614,
"learning_rate": 0.00013048590454069108,
"loss": 0.8709,
"step": 645
},
{
"epoch": 1.61,
"grad_norm": 0.19611623883247375,
"learning_rate": 0.00013029671402716366,
"loss": 0.984,
"step": 646
},
{
"epoch": 1.61,
"grad_norm": 0.19515739381313324,
"learning_rate": 0.0001301074040372242,
"loss": 0.9985,
"step": 647
},
{
"epoch": 1.61,
"grad_norm": 0.1995517462491989,
"learning_rate": 0.00012991797531742492,
"loss": 1.034,
"step": 648
},
{
"epoch": 1.61,
"grad_norm": 0.18805646896362305,
"learning_rate": 0.00012972842861478618,
"loss": 0.9625,
"step": 649
},
{
"epoch": 1.62,
"grad_norm": 0.19192944467067719,
"learning_rate": 0.00012953876467679373,
"loss": 1.0583,
"step": 650
},
{
"epoch": 1.62,
"grad_norm": 0.19570088386535645,
"learning_rate": 0.0001293489842513955,
"loss": 0.9634,
"step": 651
},
{
"epoch": 1.62,
"grad_norm": 0.19576574862003326,
"learning_rate": 0.00012915908808699893,
"loss": 1.0172,
"step": 652
},
{
"epoch": 1.62,
"grad_norm": 0.17955078184604645,
"learning_rate": 0.0001289690769324678,
"loss": 0.9849,
"step": 653
},
{
"epoch": 1.63,
"grad_norm": 0.18549513816833496,
"learning_rate": 0.00012877895153711935,
"loss": 0.9527,
"step": 654
},
{
"epoch": 1.63,
"grad_norm": 0.19443288445472717,
"learning_rate": 0.0001285887126507214,
"loss": 1.0151,
"step": 655
},
{
"epoch": 1.63,
"grad_norm": 0.17947880923748016,
"learning_rate": 0.00012839836102348926,
"loss": 0.9655,
"step": 656
},
{
"epoch": 1.63,
"grad_norm": 0.18537116050720215,
"learning_rate": 0.00012820789740608293,
"loss": 0.9429,
"step": 657
},
{
"epoch": 1.64,
"grad_norm": 0.19015100598335266,
"learning_rate": 0.00012801732254960388,
"loss": 1.0355,
"step": 658
},
{
"epoch": 1.64,
"grad_norm": 0.18511660397052765,
"learning_rate": 0.00012782663720559246,
"loss": 1.0473,
"step": 659
},
{
"epoch": 1.64,
"grad_norm": 0.18822525441646576,
"learning_rate": 0.00012763584212602453,
"loss": 0.9671,
"step": 660
},
{
"epoch": 1.64,
"grad_norm": 0.18707570433616638,
"learning_rate": 0.0001274449380633089,
"loss": 1.0481,
"step": 661
},
{
"epoch": 1.65,
"grad_norm": 0.1918199360370636,
"learning_rate": 0.00012725392577028402,
"loss": 1.0062,
"step": 662
},
{
"epoch": 1.65,
"grad_norm": 0.19667948782444,
"learning_rate": 0.00012706280600021522,
"loss": 0.9817,
"step": 663
},
{
"epoch": 1.65,
"grad_norm": 0.1822723001241684,
"learning_rate": 0.0001268715795067916,
"loss": 0.9716,
"step": 664
},
{
"epoch": 1.65,
"grad_norm": 0.1914030760526657,
"learning_rate": 0.00012668024704412317,
"loss": 1.0209,
"step": 665
},
{
"epoch": 1.66,
"grad_norm": 0.187057226896286,
"learning_rate": 0.00012648880936673787,
"loss": 1.0381,
"step": 666
},
{
"epoch": 1.66,
"grad_norm": 0.18619103729724884,
"learning_rate": 0.00012629726722957846,
"loss": 1.0432,
"step": 667
},
{
"epoch": 1.66,
"grad_norm": 0.19731828570365906,
"learning_rate": 0.00012610562138799978,
"loss": 1.0611,
"step": 668
},
{
"epoch": 1.66,
"grad_norm": 0.1894959807395935,
"learning_rate": 0.00012591387259776551,
"loss": 0.9914,
"step": 669
},
{
"epoch": 1.67,
"grad_norm": 0.1772470325231552,
"learning_rate": 0.00012572202161504543,
"loss": 0.9843,
"step": 670
},
{
"epoch": 1.67,
"grad_norm": 0.18182332813739777,
"learning_rate": 0.00012553006919641214,
"loss": 0.949,
"step": 671
},
{
"epoch": 1.67,
"grad_norm": 0.1846974790096283,
"learning_rate": 0.00012533801609883842,
"loss": 0.9959,
"step": 672
},
{
"epoch": 1.68,
"grad_norm": 0.18767496943473816,
"learning_rate": 0.0001251458630796941,
"loss": 0.9466,
"step": 673
},
{
"epoch": 1.68,
"grad_norm": 0.18881787359714508,
"learning_rate": 0.00012495361089674285,
"loss": 0.9637,
"step": 674
},
{
"epoch": 1.68,
"grad_norm": 0.1902247816324234,
"learning_rate": 0.00012476126030813963,
"loss": 0.9985,
"step": 675
},
{
"epoch": 1.68,
"grad_norm": 0.18302756547927856,
"learning_rate": 0.00012456881207242732,
"loss": 0.95,
"step": 676
},
{
"epoch": 1.69,
"grad_norm": 0.18244938552379608,
"learning_rate": 0.000124376266948534,
"loss": 0.9918,
"step": 677
},
{
"epoch": 1.69,
"grad_norm": 0.19507256150245667,
"learning_rate": 0.00012418362569576965,
"loss": 1.0055,
"step": 678
},
{
"epoch": 1.69,
"grad_norm": 0.19234226644039154,
"learning_rate": 0.0001239908890738235,
"loss": 1.0511,
"step": 679
},
{
"epoch": 1.69,
"grad_norm": 0.19556111097335815,
"learning_rate": 0.00012379805784276082,
"loss": 0.9981,
"step": 680
},
{
"epoch": 1.7,
"grad_norm": 0.19322308897972107,
"learning_rate": 0.00012360513276301997,
"loss": 0.9603,
"step": 681
},
{
"epoch": 1.7,
"grad_norm": 0.1905602067708969,
"learning_rate": 0.0001234121145954094,
"loss": 0.9937,
"step": 682
},
{
"epoch": 1.7,
"grad_norm": 0.19340857863426208,
"learning_rate": 0.00012321900410110464,
"loss": 0.9996,
"step": 683
},
{
"epoch": 1.7,
"grad_norm": 0.181385800242424,
"learning_rate": 0.00012302580204164541,
"loss": 0.9563,
"step": 684
},
{
"epoch": 1.71,
"grad_norm": 0.19400039315223694,
"learning_rate": 0.00012283250917893244,
"loss": 1.0732,
"step": 685
},
{
"epoch": 1.71,
"grad_norm": 0.1877606064081192,
"learning_rate": 0.0001226391262752245,
"loss": 1.0057,
"step": 686
},
{
"epoch": 1.71,
"grad_norm": 0.18977177143096924,
"learning_rate": 0.00012244565409313547,
"loss": 0.9898,
"step": 687
},
{
"epoch": 1.71,
"grad_norm": 0.19174890220165253,
"learning_rate": 0.00012225209339563145,
"loss": 1.0449,
"step": 688
},
{
"epoch": 1.72,
"grad_norm": 0.18353353440761566,
"learning_rate": 0.0001220584449460274,
"loss": 0.9952,
"step": 689
},
{
"epoch": 1.72,
"grad_norm": 0.18639762699604034,
"learning_rate": 0.00012186470950798445,
"loss": 0.9693,
"step": 690
},
{
"epoch": 1.72,
"grad_norm": 0.1900029480457306,
"learning_rate": 0.00012167088784550673,
"loss": 0.9574,
"step": 691
},
{
"epoch": 1.72,
"grad_norm": 0.18529686331748962,
"learning_rate": 0.00012147698072293842,
"loss": 1.0299,
"step": 692
},
{
"epoch": 1.73,
"grad_norm": 0.1907936930656433,
"learning_rate": 0.00012128298890496072,
"loss": 0.9557,
"step": 693
},
{
"epoch": 1.73,
"grad_norm": 0.1865403652191162,
"learning_rate": 0.00012108891315658879,
"loss": 0.946,
"step": 694
},
{
"epoch": 1.73,
"grad_norm": 0.18556007742881775,
"learning_rate": 0.00012089475424316883,
"loss": 1.0129,
"step": 695
},
{
"epoch": 1.73,
"grad_norm": 0.1845078021287918,
"learning_rate": 0.00012070051293037492,
"loss": 0.9436,
"step": 696
},
{
"epoch": 1.74,
"grad_norm": 0.18208341300487518,
"learning_rate": 0.00012050618998420624,
"loss": 0.9985,
"step": 697
},
{
"epoch": 1.74,
"grad_norm": 0.19252164661884308,
"learning_rate": 0.00012031178617098371,
"loss": 1.0147,
"step": 698
},
{
"epoch": 1.74,
"grad_norm": 0.1972821056842804,
"learning_rate": 0.00012011730225734723,
"loss": 1.0548,
"step": 699
},
{
"epoch": 1.74,
"grad_norm": 0.18477863073349,
"learning_rate": 0.00011992273901025269,
"loss": 0.9847,
"step": 700
},
{
"epoch": 1.74,
"eval_loss": 1.0762046575546265,
"eval_runtime": 81.6492,
"eval_samples_per_second": 31.831,
"eval_steps_per_second": 31.831,
"step": 700
},
{
"epoch": 1.75,
"grad_norm": 0.19482113420963287,
"learning_rate": 0.00011972809719696864,
"loss": 0.9685,
"step": 701
},
{
"epoch": 1.75,
"grad_norm": 0.19040922820568085,
"learning_rate": 0.0001195333775850736,
"loss": 1.0528,
"step": 702
},
{
"epoch": 1.75,
"grad_norm": 0.19116735458374023,
"learning_rate": 0.00011933858094245281,
"loss": 0.983,
"step": 703
},
{
"epoch": 1.75,
"grad_norm": 0.17496508359909058,
"learning_rate": 0.00011914370803729533,
"loss": 0.936,
"step": 704
},
{
"epoch": 1.76,
"grad_norm": 0.1774684637784958,
"learning_rate": 0.00011894875963809098,
"loss": 1.001,
"step": 705
},
{
"epoch": 1.76,
"grad_norm": 0.1926085203886032,
"learning_rate": 0.00011875373651362727,
"loss": 1.0406,
"step": 706
},
{
"epoch": 1.76,
"grad_norm": 0.18313874304294586,
"learning_rate": 0.00011855863943298631,
"loss": 0.9501,
"step": 707
},
{
"epoch": 1.76,
"grad_norm": 0.18082866072654724,
"learning_rate": 0.00011836346916554205,
"loss": 0.9773,
"step": 708
},
{
"epoch": 1.77,
"grad_norm": 0.1892704963684082,
"learning_rate": 0.00011816822648095687,
"loss": 0.9879,
"step": 709
},
{
"epoch": 1.77,
"grad_norm": 0.1928127110004425,
"learning_rate": 0.00011797291214917881,
"loss": 1.0106,
"step": 710
},
{
"epoch": 1.77,
"grad_norm": 0.191785529255867,
"learning_rate": 0.00011777752694043849,
"loss": 0.9633,
"step": 711
},
{
"epoch": 1.77,
"grad_norm": 0.18815581500530243,
"learning_rate": 0.00011758207162524598,
"loss": 1.0087,
"step": 712
},
{
"epoch": 1.78,
"grad_norm": 0.19140002131462097,
"learning_rate": 0.00011738654697438782,
"loss": 1.022,
"step": 713
},
{
"epoch": 1.78,
"grad_norm": 0.18412011861801147,
"learning_rate": 0.00011719095375892396,
"loss": 0.9177,
"step": 714
},
{
"epoch": 1.78,
"grad_norm": 0.19803179800510406,
"learning_rate": 0.00011699529275018484,
"loss": 1.056,
"step": 715
},
{
"epoch": 1.78,
"grad_norm": 0.18873557448387146,
"learning_rate": 0.00011679956471976814,
"loss": 0.9664,
"step": 716
},
{
"epoch": 1.79,
"grad_norm": 0.1954958438873291,
"learning_rate": 0.00011660377043953588,
"loss": 0.9837,
"step": 717
},
{
"epoch": 1.79,
"grad_norm": 0.1911032795906067,
"learning_rate": 0.0001164079106816113,
"loss": 1.0281,
"step": 718
},
{
"epoch": 1.79,
"grad_norm": 0.19415371119976044,
"learning_rate": 0.00011621198621837593,
"loss": 0.9596,
"step": 719
},
{
"epoch": 1.79,
"grad_norm": 0.1977900266647339,
"learning_rate": 0.00011601599782246646,
"loss": 0.9503,
"step": 720
},
{
"epoch": 1.8,
"grad_norm": 0.1874951422214508,
"learning_rate": 0.0001158199462667716,
"loss": 1.0024,
"step": 721
},
{
"epoch": 1.8,
"grad_norm": 0.1944780796766281,
"learning_rate": 0.00011562383232442926,
"loss": 0.9805,
"step": 722
},
{
"epoch": 1.8,
"grad_norm": 0.18960687518119812,
"learning_rate": 0.00011542765676882325,
"loss": 1.0155,
"step": 723
},
{
"epoch": 1.8,
"grad_norm": 0.1834162324666977,
"learning_rate": 0.0001152314203735805,
"loss": 0.9558,
"step": 724
},
{
"epoch": 1.81,
"grad_norm": 0.1892080008983612,
"learning_rate": 0.00011503512391256776,
"loss": 1.0202,
"step": 725
},
{
"epoch": 1.81,
"grad_norm": 0.19285555183887482,
"learning_rate": 0.00011483876815988867,
"loss": 0.986,
"step": 726
},
{
"epoch": 1.81,
"grad_norm": 0.1912676841020584,
"learning_rate": 0.00011464235388988067,
"loss": 1.0215,
"step": 727
},
{
"epoch": 1.81,
"grad_norm": 0.18774007260799408,
"learning_rate": 0.00011444588187711205,
"loss": 0.9133,
"step": 728
},
{
"epoch": 1.82,
"grad_norm": 0.18041113018989563,
"learning_rate": 0.0001142493528963787,
"loss": 0.9651,
"step": 729
},
{
"epoch": 1.82,
"grad_norm": 0.18634317815303802,
"learning_rate": 0.00011405276772270126,
"loss": 1.0167,
"step": 730
},
{
"epoch": 1.82,
"grad_norm": 0.18424159288406372,
"learning_rate": 0.0001138561271313219,
"loss": 0.9602,
"step": 731
},
{
"epoch": 1.82,
"grad_norm": 0.18384714424610138,
"learning_rate": 0.0001136594318977014,
"loss": 0.9298,
"step": 732
},
{
"epoch": 1.83,
"grad_norm": 0.19117358326911926,
"learning_rate": 0.00011346268279751595,
"loss": 0.9123,
"step": 733
},
{
"epoch": 1.83,
"grad_norm": 0.18405017256736755,
"learning_rate": 0.0001132658806066542,
"loss": 0.9986,
"step": 734
},
{
"epoch": 1.83,
"grad_norm": 0.1914985477924347,
"learning_rate": 0.00011306902610121419,
"loss": 0.9518,
"step": 735
},
{
"epoch": 1.83,
"grad_norm": 0.1904747486114502,
"learning_rate": 0.00011287212005750024,
"loss": 0.9891,
"step": 736
},
{
"epoch": 1.84,
"grad_norm": 0.1916552037000656,
"learning_rate": 0.00011267516325201985,
"loss": 0.9616,
"step": 737
},
{
"epoch": 1.84,
"grad_norm": 0.18625429272651672,
"learning_rate": 0.00011247815646148087,
"loss": 0.9592,
"step": 738
},
{
"epoch": 1.84,
"grad_norm": 0.1944790482521057,
"learning_rate": 0.00011228110046278808,
"loss": 0.9469,
"step": 739
},
{
"epoch": 1.84,
"grad_norm": 0.20122645795345306,
"learning_rate": 0.00011208399603304047,
"loss": 0.9849,
"step": 740
},
{
"epoch": 1.85,
"grad_norm": 0.19067947566509247,
"learning_rate": 0.00011188684394952789,
"loss": 1.0099,
"step": 741
},
{
"epoch": 1.85,
"grad_norm": 0.18489985167980194,
"learning_rate": 0.00011168964498972818,
"loss": 0.9669,
"step": 742
},
{
"epoch": 1.85,
"grad_norm": 0.1892281025648117,
"learning_rate": 0.00011149239993130403,
"loss": 0.9674,
"step": 743
},
{
"epoch": 1.85,
"grad_norm": 0.1811356395483017,
"learning_rate": 0.00011129510955209996,
"loss": 1.0119,
"step": 744
},
{
"epoch": 1.86,
"grad_norm": 0.19581769406795502,
"learning_rate": 0.00011109777463013915,
"loss": 0.9978,
"step": 745
},
{
"epoch": 1.86,
"grad_norm": 0.19298292696475983,
"learning_rate": 0.00011090039594362045,
"loss": 0.9971,
"step": 746
},
{
"epoch": 1.86,
"grad_norm": 0.1880626529455185,
"learning_rate": 0.00011070297427091534,
"loss": 1.0108,
"step": 747
},
{
"epoch": 1.86,
"grad_norm": 0.1833215206861496,
"learning_rate": 0.00011050551039056479,
"loss": 0.9353,
"step": 748
},
{
"epoch": 1.87,
"grad_norm": 0.18261606991291046,
"learning_rate": 0.0001103080050812762,
"loss": 0.9607,
"step": 749
},
{
"epoch": 1.87,
"grad_norm": 0.1790233999490738,
"learning_rate": 0.00011011045912192035,
"loss": 0.9579,
"step": 750
},
{
"epoch": 1.87,
"grad_norm": 0.20333704352378845,
"learning_rate": 0.00010991287329152838,
"loss": 1.0136,
"step": 751
},
{
"epoch": 1.87,
"grad_norm": 0.18839126825332642,
"learning_rate": 0.0001097152483692886,
"loss": 0.992,
"step": 752
},
{
"epoch": 1.88,
"grad_norm": 0.1932857632637024,
"learning_rate": 0.00010951758513454351,
"loss": 0.9098,
"step": 753
},
{
"epoch": 1.88,
"grad_norm": 0.19326822459697723,
"learning_rate": 0.00010931988436678666,
"loss": 0.9718,
"step": 754
},
{
"epoch": 1.88,
"grad_norm": 0.19290626049041748,
"learning_rate": 0.00010912214684565967,
"loss": 0.9569,
"step": 755
},
{
"epoch": 1.88,
"grad_norm": 0.1982078105211258,
"learning_rate": 0.00010892437335094912,
"loss": 0.929,
"step": 756
},
{
"epoch": 1.89,
"grad_norm": 0.18881501257419586,
"learning_rate": 0.00010872656466258328,
"loss": 1.0139,
"step": 757
},
{
"epoch": 1.89,
"grad_norm": 0.18985024094581604,
"learning_rate": 0.00010852872156062946,
"loss": 0.9946,
"step": 758
},
{
"epoch": 1.89,
"grad_norm": 0.19749155640602112,
"learning_rate": 0.00010833084482529048,
"loss": 1.0356,
"step": 759
},
{
"epoch": 1.89,
"grad_norm": 0.19211384654045105,
"learning_rate": 0.00010813293523690191,
"loss": 0.9779,
"step": 760
},
{
"epoch": 1.9,
"grad_norm": 0.19262412190437317,
"learning_rate": 0.0001079349935759288,
"loss": 0.9665,
"step": 761
},
{
"epoch": 1.9,
"grad_norm": 0.18871724605560303,
"learning_rate": 0.00010773702062296273,
"loss": 0.9511,
"step": 762
},
{
"epoch": 1.9,
"grad_norm": 0.18119603395462036,
"learning_rate": 0.00010753901715871866,
"loss": 0.9482,
"step": 763
},
{
"epoch": 1.9,
"grad_norm": 0.18349209427833557,
"learning_rate": 0.00010734098396403192,
"loss": 0.9386,
"step": 764
},
{
"epoch": 1.91,
"grad_norm": 0.19208337366580963,
"learning_rate": 0.00010714292181985498,
"loss": 0.9473,
"step": 765
},
{
"epoch": 1.91,
"grad_norm": 0.18588630855083466,
"learning_rate": 0.00010694483150725458,
"loss": 1.0278,
"step": 766
},
{
"epoch": 1.91,
"grad_norm": 0.18634718656539917,
"learning_rate": 0.00010674671380740851,
"loss": 1.0387,
"step": 767
},
{
"epoch": 1.91,
"grad_norm": 0.18514113128185272,
"learning_rate": 0.00010654856950160253,
"loss": 0.9557,
"step": 768
},
{
"epoch": 1.92,
"grad_norm": 0.18085001409053802,
"learning_rate": 0.00010635039937122733,
"loss": 0.9689,
"step": 769
},
{
"epoch": 1.92,
"grad_norm": 0.18852289021015167,
"learning_rate": 0.00010615220419777548,
"loss": 1.0444,
"step": 770
},
{
"epoch": 1.92,
"grad_norm": 0.19260498881340027,
"learning_rate": 0.00010595398476283827,
"loss": 0.9204,
"step": 771
},
{
"epoch": 1.92,
"grad_norm": 0.19677571952342987,
"learning_rate": 0.00010575574184810269,
"loss": 1.0183,
"step": 772
},
{
"epoch": 1.93,
"grad_norm": 0.19709721207618713,
"learning_rate": 0.00010555747623534831,
"loss": 1.011,
"step": 773
},
{
"epoch": 1.93,
"grad_norm": 0.18773804605007172,
"learning_rate": 0.0001053591887064442,
"loss": 0.9834,
"step": 774
},
{
"epoch": 1.93,
"grad_norm": 0.19036594033241272,
"learning_rate": 0.0001051608800433459,
"loss": 0.9657,
"step": 775
},
{
"epoch": 1.93,
"grad_norm": 0.1866806596517563,
"learning_rate": 0.00010496255102809223,
"loss": 0.9609,
"step": 776
},
{
"epoch": 1.94,
"grad_norm": 0.1847977638244629,
"learning_rate": 0.00010476420244280232,
"loss": 0.9814,
"step": 777
},
{
"epoch": 1.94,
"grad_norm": 0.19136272370815277,
"learning_rate": 0.00010456583506967248,
"loss": 1.0256,
"step": 778
},
{
"epoch": 1.94,
"grad_norm": 0.22890682518482208,
"learning_rate": 0.00010436744969097306,
"loss": 0.9979,
"step": 779
},
{
"epoch": 1.94,
"grad_norm": 0.19637508690357208,
"learning_rate": 0.00010416904708904548,
"loss": 0.9841,
"step": 780
},
{
"epoch": 1.95,
"grad_norm": 0.1934499442577362,
"learning_rate": 0.000103970628046299,
"loss": 0.9251,
"step": 781
},
{
"epoch": 1.95,
"grad_norm": 0.1859968602657318,
"learning_rate": 0.00010377219334520783,
"loss": 0.9702,
"step": 782
},
{
"epoch": 1.95,
"grad_norm": 0.18776066601276398,
"learning_rate": 0.00010357374376830775,
"loss": 0.95,
"step": 783
},
{
"epoch": 1.95,
"grad_norm": 0.19182752072811127,
"learning_rate": 0.00010337528009819344,
"loss": 0.9476,
"step": 784
},
{
"epoch": 1.96,
"grad_norm": 0.19188746809959412,
"learning_rate": 0.00010317680311751496,
"loss": 1.0165,
"step": 785
},
{
"epoch": 1.96,
"grad_norm": 0.18225421011447906,
"learning_rate": 0.00010297831360897492,
"loss": 0.9593,
"step": 786
},
{
"epoch": 1.96,
"grad_norm": 0.1944630891084671,
"learning_rate": 0.00010277981235532541,
"loss": 0.9439,
"step": 787
},
{
"epoch": 1.96,
"grad_norm": 0.1944238543510437,
"learning_rate": 0.00010258130013936474,
"loss": 1.0166,
"step": 788
},
{
"epoch": 1.97,
"grad_norm": 0.18848265707492828,
"learning_rate": 0.00010238277774393448,
"loss": 0.9808,
"step": 789
},
{
"epoch": 1.97,
"grad_norm": 0.1884046196937561,
"learning_rate": 0.00010218424595191631,
"loss": 1.0332,
"step": 790
},
{
"epoch": 1.97,
"grad_norm": 0.1906522959470749,
"learning_rate": 0.00010198570554622909,
"loss": 0.9361,
"step": 791
},
{
"epoch": 1.97,
"grad_norm": 0.1847391128540039,
"learning_rate": 0.00010178715730982549,
"loss": 0.9522,
"step": 792
},
{
"epoch": 1.98,
"grad_norm": 0.18664193153381348,
"learning_rate": 0.00010158860202568916,
"loss": 0.9834,
"step": 793
},
{
"epoch": 1.98,
"grad_norm": 0.19117935001850128,
"learning_rate": 0.00010139004047683151,
"loss": 0.9931,
"step": 794
},
{
"epoch": 1.98,
"grad_norm": 0.1847536265850067,
"learning_rate": 0.0001011914734462887,
"loss": 1.0131,
"step": 795
},
{
"epoch": 1.98,
"grad_norm": 0.18716172873973846,
"learning_rate": 0.00010099290171711841,
"loss": 0.948,
"step": 796
}
],
"logging_steps": 1,
"max_steps": 1592,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 398,
"total_flos": 5.93824319923028e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}