{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.05003027134129561,
"eval_steps": 500,
"global_step": 909,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 5.503880235566074e-05,
"grad_norm": 459.8753356933594,
"learning_rate": 1.0000000000000001e-07,
"loss": 3.303,
"step": 1
},
{
"epoch": 0.00011007760471132149,
"grad_norm": 314.2561950683594,
"learning_rate": 2.0000000000000002e-07,
"loss": 2.8226,
"step": 2
},
{
"epoch": 0.0001651164070669822,
"grad_norm": 314.1292419433594,
"learning_rate": 3.0000000000000004e-07,
"loss": 2.8517,
"step": 3
},
{
"epoch": 0.00022015520942264297,
"grad_norm": 312.4049072265625,
"learning_rate": 4.0000000000000003e-07,
"loss": 2.6248,
"step": 4
},
{
"epoch": 0.0002751940117783037,
"grad_norm": 353.7213134765625,
"learning_rate": 5.000000000000001e-07,
"loss": 2.7883,
"step": 5
},
{
"epoch": 0.0003302328141339644,
"grad_norm": 278.41668701171875,
"learning_rate": 6.000000000000001e-07,
"loss": 2.5468,
"step": 6
},
{
"epoch": 0.0003852716164896252,
"grad_norm": 336.14532470703125,
"learning_rate": 7.000000000000001e-07,
"loss": 2.7721,
"step": 7
},
{
"epoch": 0.00044031041884528595,
"grad_norm": 201.19374084472656,
"learning_rate": 8.000000000000001e-07,
"loss": 2.4873,
"step": 8
},
{
"epoch": 0.0004953492212009466,
"grad_norm": 184.7027587890625,
"learning_rate": 9.000000000000001e-07,
"loss": 2.6647,
"step": 9
},
{
"epoch": 0.0005503880235566074,
"grad_norm": 154.597412109375,
"learning_rate": 1.0000000000000002e-06,
"loss": 2.602,
"step": 10
},
{
"epoch": 0.0006054268259122681,
"grad_norm": 40.47785568237305,
"learning_rate": 1.1e-06,
"loss": 2.6716,
"step": 11
},
{
"epoch": 0.0006604656282679288,
"grad_norm": 25.338607788085938,
"learning_rate": 1.2000000000000002e-06,
"loss": 2.2631,
"step": 12
},
{
"epoch": 0.0007155044306235897,
"grad_norm": 24.976919174194336,
"learning_rate": 1.3e-06,
"loss": 2.3564,
"step": 13
},
{
"epoch": 0.0007705432329792504,
"grad_norm": 15.239912033081055,
"learning_rate": 1.4000000000000001e-06,
"loss": 2.3295,
"step": 14
},
{
"epoch": 0.0008255820353349112,
"grad_norm": 14.125042915344238,
"learning_rate": 1.5e-06,
"loss": 2.307,
"step": 15
},
{
"epoch": 0.0008806208376905719,
"grad_norm": 13.163726806640625,
"learning_rate": 1.6000000000000001e-06,
"loss": 2.1493,
"step": 16
},
{
"epoch": 0.0009356596400462326,
"grad_norm": 8.726515769958496,
"learning_rate": 1.7000000000000002e-06,
"loss": 2.0333,
"step": 17
},
{
"epoch": 0.0009906984424018933,
"grad_norm": 9.072502136230469,
"learning_rate": 1.8000000000000001e-06,
"loss": 2.2046,
"step": 18
},
{
"epoch": 0.001045737244757554,
"grad_norm": 9.412588119506836,
"learning_rate": 1.9000000000000002e-06,
"loss": 2.2001,
"step": 19
},
{
"epoch": 0.0011007760471132147,
"grad_norm": 8.67534065246582,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.7679,
"step": 20
},
{
"epoch": 0.0011558148494688755,
"grad_norm": 14.015918731689453,
"learning_rate": 2.1000000000000002e-06,
"loss": 1.9566,
"step": 21
},
{
"epoch": 0.0012108536518245362,
"grad_norm": 7.9474687576293945,
"learning_rate": 2.2e-06,
"loss": 1.9085,
"step": 22
},
{
"epoch": 0.001265892454180197,
"grad_norm": 6.806368350982666,
"learning_rate": 2.3000000000000004e-06,
"loss": 1.7918,
"step": 23
},
{
"epoch": 0.0013209312565358577,
"grad_norm": 5.3452582359313965,
"learning_rate": 2.4000000000000003e-06,
"loss": 1.8321,
"step": 24
},
{
"epoch": 0.0013759700588915184,
"grad_norm": 8.744244575500488,
"learning_rate": 2.5e-06,
"loss": 1.6317,
"step": 25
},
{
"epoch": 0.0014310088612471794,
"grad_norm": 5.304683685302734,
"learning_rate": 2.6e-06,
"loss": 1.6846,
"step": 26
},
{
"epoch": 0.00148604766360284,
"grad_norm": 5.650127410888672,
"learning_rate": 2.7000000000000004e-06,
"loss": 1.7449,
"step": 27
},
{
"epoch": 0.0015410864659585008,
"grad_norm": 5.479269504547119,
"learning_rate": 2.8000000000000003e-06,
"loss": 1.8158,
"step": 28
},
{
"epoch": 0.0015961252683141616,
"grad_norm": 4.873537063598633,
"learning_rate": 2.9e-06,
"loss": 1.8015,
"step": 29
},
{
"epoch": 0.0016511640706698223,
"grad_norm": 4.971101760864258,
"learning_rate": 3e-06,
"loss": 1.9034,
"step": 30
},
{
"epoch": 0.001706202873025483,
"grad_norm": 4.407571315765381,
"learning_rate": 3.1000000000000004e-06,
"loss": 1.9037,
"step": 31
},
{
"epoch": 0.0017612416753811438,
"grad_norm": 4.429073810577393,
"learning_rate": 3.2000000000000003e-06,
"loss": 1.6812,
"step": 32
},
{
"epoch": 0.0018162804777368045,
"grad_norm": 5.16085147857666,
"learning_rate": 3.3000000000000006e-06,
"loss": 1.7627,
"step": 33
},
{
"epoch": 0.0018713192800924653,
"grad_norm": 4.0805768966674805,
"learning_rate": 3.4000000000000005e-06,
"loss": 1.6799,
"step": 34
},
{
"epoch": 0.001926358082448126,
"grad_norm": 4.548702239990234,
"learning_rate": 3.5e-06,
"loss": 1.7799,
"step": 35
},
{
"epoch": 0.0019813968848037865,
"grad_norm": 5.181888580322266,
"learning_rate": 3.6000000000000003e-06,
"loss": 1.8235,
"step": 36
},
{
"epoch": 0.0020364356871594475,
"grad_norm": 3.9876129627227783,
"learning_rate": 3.7e-06,
"loss": 1.5999,
"step": 37
},
{
"epoch": 0.002091474489515108,
"grad_norm": 6.325051307678223,
"learning_rate": 3.8000000000000005e-06,
"loss": 1.7499,
"step": 38
},
{
"epoch": 0.002146513291870769,
"grad_norm": 6.199049949645996,
"learning_rate": 3.900000000000001e-06,
"loss": 1.784,
"step": 39
},
{
"epoch": 0.0022015520942264295,
"grad_norm": 4.83912992477417,
"learning_rate": 4.000000000000001e-06,
"loss": 1.8895,
"step": 40
},
{
"epoch": 0.0022565908965820904,
"grad_norm": 4.515626907348633,
"learning_rate": 4.1e-06,
"loss": 1.4887,
"step": 41
},
{
"epoch": 0.002311629698937751,
"grad_norm": 5.032265663146973,
"learning_rate": 4.2000000000000004e-06,
"loss": 1.7324,
"step": 42
},
{
"epoch": 0.002366668501293412,
"grad_norm": 4.1879048347473145,
"learning_rate": 4.3e-06,
"loss": 1.4912,
"step": 43
},
{
"epoch": 0.0024217073036490724,
"grad_norm": 4.128026485443115,
"learning_rate": 4.4e-06,
"loss": 1.554,
"step": 44
},
{
"epoch": 0.0024767461060047334,
"grad_norm": 4.527958393096924,
"learning_rate": 4.5e-06,
"loss": 1.652,
"step": 45
},
{
"epoch": 0.002531784908360394,
"grad_norm": 4.8388190269470215,
"learning_rate": 4.600000000000001e-06,
"loss": 1.6696,
"step": 46
},
{
"epoch": 0.002586823710716055,
"grad_norm": 4.2088541984558105,
"learning_rate": 4.7e-06,
"loss": 1.568,
"step": 47
},
{
"epoch": 0.0026418625130717154,
"grad_norm": 4.789997577667236,
"learning_rate": 4.800000000000001e-06,
"loss": 1.642,
"step": 48
},
{
"epoch": 0.0026969013154273763,
"grad_norm": 4.408346652984619,
"learning_rate": 4.9000000000000005e-06,
"loss": 1.5181,
"step": 49
},
{
"epoch": 0.002751940117783037,
"grad_norm": 4.572340488433838,
"learning_rate": 5e-06,
"loss": 1.6698,
"step": 50
},
{
"epoch": 0.0028069789201386978,
"grad_norm": 4.728564739227295,
"learning_rate": 5.1e-06,
"loss": 1.5785,
"step": 51
},
{
"epoch": 0.0028620177224943587,
"grad_norm": 4.449855327606201,
"learning_rate": 5.2e-06,
"loss": 1.4624,
"step": 52
},
{
"epoch": 0.0029170565248500193,
"grad_norm": 4.127189636230469,
"learning_rate": 5.300000000000001e-06,
"loss": 1.6061,
"step": 53
},
{
"epoch": 0.00297209532720568,
"grad_norm": 4.244532108306885,
"learning_rate": 5.400000000000001e-06,
"loss": 1.491,
"step": 54
},
{
"epoch": 0.0030271341295613407,
"grad_norm": 3.437682628631592,
"learning_rate": 5.500000000000001e-06,
"loss": 1.1967,
"step": 55
},
{
"epoch": 0.0030821729319170017,
"grad_norm": 3.83516788482666,
"learning_rate": 5.600000000000001e-06,
"loss": 1.4731,
"step": 56
},
{
"epoch": 0.003137211734272662,
"grad_norm": 3.9108972549438477,
"learning_rate": 5.7e-06,
"loss": 1.4393,
"step": 57
},
{
"epoch": 0.003192250536628323,
"grad_norm": 3.5258419513702393,
"learning_rate": 5.8e-06,
"loss": 1.4206,
"step": 58
},
{
"epoch": 0.0032472893389839837,
"grad_norm": 4.124903678894043,
"learning_rate": 5.9e-06,
"loss": 1.4747,
"step": 59
},
{
"epoch": 0.0033023281413396446,
"grad_norm": 4.055769920349121,
"learning_rate": 6e-06,
"loss": 1.4655,
"step": 60
},
{
"epoch": 0.003357366943695305,
"grad_norm": 3.904837131500244,
"learning_rate": 6.1e-06,
"loss": 1.5125,
"step": 61
},
{
"epoch": 0.003412405746050966,
"grad_norm": 3.2904794216156006,
"learning_rate": 6.200000000000001e-06,
"loss": 1.4596,
"step": 62
},
{
"epoch": 0.0034674445484066266,
"grad_norm": 3.24053692817688,
"learning_rate": 6.300000000000001e-06,
"loss": 1.3851,
"step": 63
},
{
"epoch": 0.0035224833507622876,
"grad_norm": 3.457639217376709,
"learning_rate": 6.4000000000000006e-06,
"loss": 1.4019,
"step": 64
},
{
"epoch": 0.003577522153117948,
"grad_norm": 3.073054790496826,
"learning_rate": 6.5000000000000004e-06,
"loss": 1.2872,
"step": 65
},
{
"epoch": 0.003632560955473609,
"grad_norm": 2.6726694107055664,
"learning_rate": 6.600000000000001e-06,
"loss": 1.2361,
"step": 66
},
{
"epoch": 0.0036875997578292696,
"grad_norm": 2.9378459453582764,
"learning_rate": 6.700000000000001e-06,
"loss": 1.4452,
"step": 67
},
{
"epoch": 0.0037426385601849305,
"grad_norm": 2.81107234954834,
"learning_rate": 6.800000000000001e-06,
"loss": 1.4804,
"step": 68
},
{
"epoch": 0.003797677362540591,
"grad_norm": 2.60062313079834,
"learning_rate": 6.9e-06,
"loss": 1.3263,
"step": 69
},
{
"epoch": 0.003852716164896252,
"grad_norm": 2.5642921924591064,
"learning_rate": 7e-06,
"loss": 1.2751,
"step": 70
},
{
"epoch": 0.0039077549672519125,
"grad_norm": 2.3608031272888184,
"learning_rate": 7.100000000000001e-06,
"loss": 1.2614,
"step": 71
},
{
"epoch": 0.003962793769607573,
"grad_norm": 2.7201738357543945,
"learning_rate": 7.2000000000000005e-06,
"loss": 1.5018,
"step": 72
},
{
"epoch": 0.004017832571963234,
"grad_norm": 2.584726095199585,
"learning_rate": 7.3e-06,
"loss": 1.3519,
"step": 73
},
{
"epoch": 0.004072871374318895,
"grad_norm": 1.9693044424057007,
"learning_rate": 7.4e-06,
"loss": 1.0934,
"step": 74
},
{
"epoch": 0.0041279101766745555,
"grad_norm": 2.220736503601074,
"learning_rate": 7.500000000000001e-06,
"loss": 1.4687,
"step": 75
},
{
"epoch": 0.004182948979030216,
"grad_norm": 2.2629456520080566,
"learning_rate": 7.600000000000001e-06,
"loss": 1.3328,
"step": 76
},
{
"epoch": 0.004237987781385877,
"grad_norm": 2.051820993423462,
"learning_rate": 7.7e-06,
"loss": 1.3058,
"step": 77
},
{
"epoch": 0.004293026583741538,
"grad_norm": 2.2451820373535156,
"learning_rate": 7.800000000000002e-06,
"loss": 1.3556,
"step": 78
},
{
"epoch": 0.004348065386097198,
"grad_norm": 3.13584303855896,
"learning_rate": 7.9e-06,
"loss": 1.3262,
"step": 79
},
{
"epoch": 0.004403104188452859,
"grad_norm": 5.024479866027832,
"learning_rate": 8.000000000000001e-06,
"loss": 1.2103,
"step": 80
},
{
"epoch": 0.00445814299080852,
"grad_norm": 2.070889711380005,
"learning_rate": 8.1e-06,
"loss": 1.1994,
"step": 81
},
{
"epoch": 0.004513181793164181,
"grad_norm": 2.797286033630371,
"learning_rate": 8.2e-06,
"loss": 1.3075,
"step": 82
},
{
"epoch": 0.004568220595519841,
"grad_norm": 2.11370849609375,
"learning_rate": 8.3e-06,
"loss": 1.36,
"step": 83
},
{
"epoch": 0.004623259397875502,
"grad_norm": 2.5416152477264404,
"learning_rate": 8.400000000000001e-06,
"loss": 1.3484,
"step": 84
},
{
"epoch": 0.004678298200231163,
"grad_norm": 2.4702343940734863,
"learning_rate": 8.5e-06,
"loss": 1.3677,
"step": 85
},
{
"epoch": 0.004733337002586824,
"grad_norm": 3.670365333557129,
"learning_rate": 8.6e-06,
"loss": 1.2192,
"step": 86
},
{
"epoch": 0.004788375804942484,
"grad_norm": 2.282954692840576,
"learning_rate": 8.700000000000001e-06,
"loss": 1.2982,
"step": 87
},
{
"epoch": 0.004843414607298145,
"grad_norm": 2.3659238815307617,
"learning_rate": 8.8e-06,
"loss": 1.3206,
"step": 88
},
{
"epoch": 0.004898453409653806,
"grad_norm": 4.939981460571289,
"learning_rate": 8.900000000000001e-06,
"loss": 1.4328,
"step": 89
},
{
"epoch": 0.004953492212009467,
"grad_norm": 2.335858106613159,
"learning_rate": 9e-06,
"loss": 1.2603,
"step": 90
},
{
"epoch": 0.005008531014365127,
"grad_norm": 2.2165043354034424,
"learning_rate": 9.100000000000001e-06,
"loss": 1.3141,
"step": 91
},
{
"epoch": 0.005063569816720788,
"grad_norm": 2.7872185707092285,
"learning_rate": 9.200000000000002e-06,
"loss": 1.3314,
"step": 92
},
{
"epoch": 0.005118608619076449,
"grad_norm": 2.6353912353515625,
"learning_rate": 9.3e-06,
"loss": 1.2027,
"step": 93
},
{
"epoch": 0.00517364742143211,
"grad_norm": 3.2509102821350098,
"learning_rate": 9.4e-06,
"loss": 1.2316,
"step": 94
},
{
"epoch": 0.00522868622378777,
"grad_norm": 2.4560611248016357,
"learning_rate": 9.5e-06,
"loss": 1.1848,
"step": 95
},
{
"epoch": 0.005283725026143431,
"grad_norm": 2.338151216506958,
"learning_rate": 9.600000000000001e-06,
"loss": 1.2392,
"step": 96
},
{
"epoch": 0.005338763828499092,
"grad_norm": 2.231065034866333,
"learning_rate": 9.7e-06,
"loss": 1.2089,
"step": 97
},
{
"epoch": 0.005393802630854753,
"grad_norm": 2.278428077697754,
"learning_rate": 9.800000000000001e-06,
"loss": 1.2267,
"step": 98
},
{
"epoch": 0.005448841433210413,
"grad_norm": 2.4422810077667236,
"learning_rate": 9.9e-06,
"loss": 1.2041,
"step": 99
},
{
"epoch": 0.005503880235566074,
"grad_norm": 2.216248035430908,
"learning_rate": 1e-05,
"loss": 1.0798,
"step": 100
},
{
"epoch": 0.005558919037921735,
"grad_norm": 2.3301615715026855,
"learning_rate": 9.99999998121067e-06,
"loss": 1.3069,
"step": 101
},
{
"epoch": 0.0056139578402773956,
"grad_norm": 2.315436363220215,
"learning_rate": 9.999999924842678e-06,
"loss": 1.1589,
"step": 102
},
{
"epoch": 0.005668996642633056,
"grad_norm": 2.3522140979766846,
"learning_rate": 9.999999830896024e-06,
"loss": 1.0978,
"step": 103
},
{
"epoch": 0.0057240354449887175,
"grad_norm": 2.5798308849334717,
"learning_rate": 9.99999969937071e-06,
"loss": 1.0599,
"step": 104
},
{
"epoch": 0.005779074247344378,
"grad_norm": 2.456644058227539,
"learning_rate": 9.999999530266738e-06,
"loss": 1.1682,
"step": 105
},
{
"epoch": 0.0058341130497000385,
"grad_norm": 2.1559031009674072,
"learning_rate": 9.999999323584106e-06,
"loss": 1.0631,
"step": 106
},
{
"epoch": 0.005889151852055699,
"grad_norm": 2.2985048294067383,
"learning_rate": 9.99999907932282e-06,
"loss": 1.1455,
"step": 107
},
{
"epoch": 0.00594419065441136,
"grad_norm": 2.596167802810669,
"learning_rate": 9.999998797482877e-06,
"loss": 1.1686,
"step": 108
},
{
"epoch": 0.005999229456767021,
"grad_norm": 2.378618001937866,
"learning_rate": 9.999998478064283e-06,
"loss": 1.2226,
"step": 109
},
{
"epoch": 0.0060542682591226814,
"grad_norm": 2.228116750717163,
"learning_rate": 9.999998121067038e-06,
"loss": 1.1396,
"step": 110
},
{
"epoch": 0.006109307061478342,
"grad_norm": 2.4419472217559814,
"learning_rate": 9.999997726491146e-06,
"loss": 1.1401,
"step": 111
},
{
"epoch": 0.006164345863834003,
"grad_norm": 2.0695526599884033,
"learning_rate": 9.999997294336608e-06,
"loss": 1.1868,
"step": 112
},
{
"epoch": 0.006219384666189664,
"grad_norm": 2.3170363903045654,
"learning_rate": 9.99999682460343e-06,
"loss": 1.1172,
"step": 113
},
{
"epoch": 0.006274423468545324,
"grad_norm": 2.670466184616089,
"learning_rate": 9.999996317291615e-06,
"loss": 1.2481,
"step": 114
},
{
"epoch": 0.006329462270900985,
"grad_norm": 2.1214540004730225,
"learning_rate": 9.999995772401166e-06,
"loss": 0.9994,
"step": 115
},
{
"epoch": 0.006384501073256646,
"grad_norm": 1.9283969402313232,
"learning_rate": 9.999995189932085e-06,
"loss": 1.0692,
"step": 116
},
{
"epoch": 0.006439539875612307,
"grad_norm": 2.2620882987976074,
"learning_rate": 9.99999456988438e-06,
"loss": 1.0725,
"step": 117
},
{
"epoch": 0.006494578677967967,
"grad_norm": 2.2121341228485107,
"learning_rate": 9.999993912258055e-06,
"loss": 1.1328,
"step": 118
},
{
"epoch": 0.006549617480323628,
"grad_norm": 2.298126220703125,
"learning_rate": 9.999993217053113e-06,
"loss": 1.1272,
"step": 119
},
{
"epoch": 0.006604656282679289,
"grad_norm": 1.81593656539917,
"learning_rate": 9.99999248426956e-06,
"loss": 1.017,
"step": 120
},
{
"epoch": 0.00665969508503495,
"grad_norm": 2.1174378395080566,
"learning_rate": 9.999991713907403e-06,
"loss": 1.0557,
"step": 121
},
{
"epoch": 0.00671473388739061,
"grad_norm": 1.9061017036437988,
"learning_rate": 9.999990905966647e-06,
"loss": 1.0379,
"step": 122
},
{
"epoch": 0.006769772689746271,
"grad_norm": 1.912500023841858,
"learning_rate": 9.999990060447297e-06,
"loss": 1.104,
"step": 123
},
{
"epoch": 0.006824811492101932,
"grad_norm": 1.9249529838562012,
"learning_rate": 9.99998917734936e-06,
"loss": 1.0136,
"step": 124
},
{
"epoch": 0.006879850294457593,
"grad_norm": 1.8504948616027832,
"learning_rate": 9.999988256672843e-06,
"loss": 0.99,
"step": 125
},
{
"epoch": 0.006934889096813253,
"grad_norm": 1.720042109489441,
"learning_rate": 9.999987298417753e-06,
"loss": 1.0666,
"step": 126
},
{
"epoch": 0.006989927899168914,
"grad_norm": 1.778251051902771,
"learning_rate": 9.999986302584097e-06,
"loss": 1.0424,
"step": 127
},
{
"epoch": 0.007044966701524575,
"grad_norm": 1.9485961198806763,
"learning_rate": 9.999985269171881e-06,
"loss": 1.105,
"step": 128
},
{
"epoch": 0.007100005503880236,
"grad_norm": 3.0802104473114014,
"learning_rate": 9.999984198181114e-06,
"loss": 1.1081,
"step": 129
},
{
"epoch": 0.007155044306235896,
"grad_norm": 1.7476954460144043,
"learning_rate": 9.999983089611806e-06,
"loss": 0.9677,
"step": 130
},
{
"epoch": 0.007210083108591557,
"grad_norm": 1.6127299070358276,
"learning_rate": 9.999981943463963e-06,
"loss": 0.9937,
"step": 131
},
{
"epoch": 0.007265121910947218,
"grad_norm": 2.1477208137512207,
"learning_rate": 9.999980759737594e-06,
"loss": 1.0319,
"step": 132
},
{
"epoch": 0.007320160713302879,
"grad_norm": 1.531163215637207,
"learning_rate": 9.999979538432707e-06,
"loss": 0.8696,
"step": 133
},
{
"epoch": 0.007375199515658539,
"grad_norm": 1.8226820230484009,
"learning_rate": 9.999978279549313e-06,
"loss": 1.2061,
"step": 134
},
{
"epoch": 0.0074302383180142,
"grad_norm": 1.481895923614502,
"learning_rate": 9.99997698308742e-06,
"loss": 0.949,
"step": 135
},
{
"epoch": 0.007485277120369861,
"grad_norm": 1.6715927124023438,
"learning_rate": 9.99997564904704e-06,
"loss": 1.1579,
"step": 136
},
{
"epoch": 0.0075403159227255215,
"grad_norm": 1.4235272407531738,
"learning_rate": 9.999974277428179e-06,
"loss": 1.064,
"step": 137
},
{
"epoch": 0.007595354725081182,
"grad_norm": 1.3524872064590454,
"learning_rate": 9.999972868230852e-06,
"loss": 0.9141,
"step": 138
},
{
"epoch": 0.007650393527436843,
"grad_norm": 1.3741765022277832,
"learning_rate": 9.999971421455066e-06,
"loss": 1.0256,
"step": 139
},
{
"epoch": 0.007705432329792504,
"grad_norm": 1.9869598150253296,
"learning_rate": 9.999969937100835e-06,
"loss": 0.9489,
"step": 140
},
{
"epoch": 0.0077604711321481645,
"grad_norm": 1.4785465002059937,
"learning_rate": 9.999968415168166e-06,
"loss": 0.9243,
"step": 141
},
{
"epoch": 0.007815509934503825,
"grad_norm": 1.5476176738739014,
"learning_rate": 9.999966855657074e-06,
"loss": 1.178,
"step": 142
},
{
"epoch": 0.007870548736859486,
"grad_norm": 1.500401258468628,
"learning_rate": 9.99996525856757e-06,
"loss": 0.9837,
"step": 143
},
{
"epoch": 0.007925587539215146,
"grad_norm": 1.3777157068252563,
"learning_rate": 9.999963623899664e-06,
"loss": 1.0732,
"step": 144
},
{
"epoch": 0.007980626341570807,
"grad_norm": 1.4466841220855713,
"learning_rate": 9.99996195165337e-06,
"loss": 0.9779,
"step": 145
},
{
"epoch": 0.008035665143926469,
"grad_norm": 1.5304051637649536,
"learning_rate": 9.9999602418287e-06,
"loss": 1.196,
"step": 146
},
{
"epoch": 0.008090703946282128,
"grad_norm": 1.9012362957000732,
"learning_rate": 9.99995849442567e-06,
"loss": 0.9797,
"step": 147
},
{
"epoch": 0.00814574274863779,
"grad_norm": 1.430679202079773,
"learning_rate": 9.999956709444289e-06,
"loss": 0.9869,
"step": 148
},
{
"epoch": 0.00820078155099345,
"grad_norm": 1.3489817380905151,
"learning_rate": 9.99995488688457e-06,
"loss": 1.0137,
"step": 149
},
{
"epoch": 0.008255820353349111,
"grad_norm": 1.1878125667572021,
"learning_rate": 9.999953026746531e-06,
"loss": 0.9355,
"step": 150
},
{
"epoch": 0.008310859155704772,
"grad_norm": 1.3481942415237427,
"learning_rate": 9.999951129030182e-06,
"loss": 1.1235,
"step": 151
},
{
"epoch": 0.008365897958060432,
"grad_norm": 1.7335314750671387,
"learning_rate": 9.999949193735539e-06,
"loss": 0.9382,
"step": 152
},
{
"epoch": 0.008420936760416093,
"grad_norm": 1.2029480934143066,
"learning_rate": 9.999947220862615e-06,
"loss": 0.9419,
"step": 153
},
{
"epoch": 0.008475975562771755,
"grad_norm": 1.2104203701019287,
"learning_rate": 9.999945210411428e-06,
"loss": 0.9196,
"step": 154
},
{
"epoch": 0.008531014365127414,
"grad_norm": 1.1857126951217651,
"learning_rate": 9.999943162381991e-06,
"loss": 0.9421,
"step": 155
},
{
"epoch": 0.008586053167483076,
"grad_norm": 1.115027904510498,
"learning_rate": 9.999941076774319e-06,
"loss": 0.9634,
"step": 156
},
{
"epoch": 0.008641091969838737,
"grad_norm": 1.4227553606033325,
"learning_rate": 9.999938953588428e-06,
"loss": 1.0036,
"step": 157
},
{
"epoch": 0.008696130772194397,
"grad_norm": 1.2913776636123657,
"learning_rate": 9.999936792824334e-06,
"loss": 0.9232,
"step": 158
},
{
"epoch": 0.008751169574550058,
"grad_norm": 1.2817318439483643,
"learning_rate": 9.999934594482055e-06,
"loss": 0.9691,
"step": 159
},
{
"epoch": 0.008806208376905718,
"grad_norm": 1.5647841691970825,
"learning_rate": 9.999932358561604e-06,
"loss": 1.1842,
"step": 160
},
{
"epoch": 0.00886124717926138,
"grad_norm": 1.368135929107666,
"learning_rate": 9.999930085063002e-06,
"loss": 1.0873,
"step": 161
},
{
"epoch": 0.00891628598161704,
"grad_norm": 1.2297240495681763,
"learning_rate": 9.999927773986262e-06,
"loss": 1.0778,
"step": 162
},
{
"epoch": 0.0089713247839727,
"grad_norm": 1.0658279657363892,
"learning_rate": 9.999925425331405e-06,
"loss": 0.9008,
"step": 163
},
{
"epoch": 0.009026363586328362,
"grad_norm": 1.3484326601028442,
"learning_rate": 9.999923039098445e-06,
"loss": 1.0664,
"step": 164
},
{
"epoch": 0.009081402388684023,
"grad_norm": 1.1839075088500977,
"learning_rate": 9.999920615287401e-06,
"loss": 0.9257,
"step": 165
},
{
"epoch": 0.009136441191039683,
"grad_norm": 1.2757254838943481,
"learning_rate": 9.999918153898295e-06,
"loss": 0.9473,
"step": 166
},
{
"epoch": 0.009191479993395344,
"grad_norm": 1.2414579391479492,
"learning_rate": 9.99991565493114e-06,
"loss": 1.1091,
"step": 167
},
{
"epoch": 0.009246518795751004,
"grad_norm": 1.2802611589431763,
"learning_rate": 9.999913118385959e-06,
"loss": 1.063,
"step": 168
},
{
"epoch": 0.009301557598106665,
"grad_norm": 1.2055327892303467,
"learning_rate": 9.99991054426277e-06,
"loss": 0.8,
"step": 169
},
{
"epoch": 0.009356596400462327,
"grad_norm": 1.0391098260879517,
"learning_rate": 9.99990793256159e-06,
"loss": 0.8672,
"step": 170
},
{
"epoch": 0.009411635202817986,
"grad_norm": 1.131536602973938,
"learning_rate": 9.99990528328244e-06,
"loss": 0.9569,
"step": 171
},
{
"epoch": 0.009466674005173648,
"grad_norm": 1.164307951927185,
"learning_rate": 9.999902596425342e-06,
"loss": 0.9999,
"step": 172
},
{
"epoch": 0.009521712807529309,
"grad_norm": 1.2099504470825195,
"learning_rate": 9.999899871990313e-06,
"loss": 0.9994,
"step": 173
},
{
"epoch": 0.009576751609884969,
"grad_norm": 1.7294539213180542,
"learning_rate": 9.999897109977376e-06,
"loss": 1.0265,
"step": 174
},
{
"epoch": 0.00963179041224063,
"grad_norm": 1.3009883165359497,
"learning_rate": 9.99989431038655e-06,
"loss": 0.9022,
"step": 175
},
{
"epoch": 0.00968682921459629,
"grad_norm": 1.1014611721038818,
"learning_rate": 9.999891473217857e-06,
"loss": 0.8476,
"step": 176
},
{
"epoch": 0.009741868016951951,
"grad_norm": 1.2410900592803955,
"learning_rate": 9.99988859847132e-06,
"loss": 1.0272,
"step": 177
},
{
"epoch": 0.009796906819307612,
"grad_norm": 1.336348295211792,
"learning_rate": 9.999885686146957e-06,
"loss": 0.9456,
"step": 178
},
{
"epoch": 0.009851945621663272,
"grad_norm": 1.2931095361709595,
"learning_rate": 9.99988273624479e-06,
"loss": 0.9554,
"step": 179
},
{
"epoch": 0.009906984424018933,
"grad_norm": 1.2647838592529297,
"learning_rate": 9.999879748764845e-06,
"loss": 1.0394,
"step": 180
},
{
"epoch": 0.009962023226374595,
"grad_norm": 1.3485127687454224,
"learning_rate": 9.99987672370714e-06,
"loss": 1.1016,
"step": 181
},
{
"epoch": 0.010017062028730254,
"grad_norm": 1.110187292098999,
"learning_rate": 9.999873661071702e-06,
"loss": 0.946,
"step": 182
},
{
"epoch": 0.010072100831085916,
"grad_norm": 1.0991623401641846,
"learning_rate": 9.999870560858551e-06,
"loss": 1.0084,
"step": 183
},
{
"epoch": 0.010127139633441576,
"grad_norm": 1.049804449081421,
"learning_rate": 9.999867423067713e-06,
"loss": 0.8264,
"step": 184
},
{
"epoch": 0.010182178435797237,
"grad_norm": 1.0947058200836182,
"learning_rate": 9.999864247699207e-06,
"loss": 0.8884,
"step": 185
},
{
"epoch": 0.010237217238152898,
"grad_norm": 1.1147902011871338,
"learning_rate": 9.999861034753061e-06,
"loss": 0.9657,
"step": 186
},
{
"epoch": 0.010292256040508558,
"grad_norm": 1.260027527809143,
"learning_rate": 9.999857784229298e-06,
"loss": 1.0102,
"step": 187
},
{
"epoch": 0.01034729484286422,
"grad_norm": 1.1275582313537598,
"learning_rate": 9.999854496127942e-06,
"loss": 1.028,
"step": 188
},
{
"epoch": 0.01040233364521988,
"grad_norm": 1.1377174854278564,
"learning_rate": 9.999851170449018e-06,
"loss": 1.032,
"step": 189
},
{
"epoch": 0.01045737244757554,
"grad_norm": 1.1734225749969482,
"learning_rate": 9.999847807192552e-06,
"loss": 1.0009,
"step": 190
},
{
"epoch": 0.010512411249931202,
"grad_norm": 1.1934596300125122,
"learning_rate": 9.999844406358565e-06,
"loss": 1.0432,
"step": 191
},
{
"epoch": 0.010567450052286861,
"grad_norm": 1.0638024806976318,
"learning_rate": 9.99984096794709e-06,
"loss": 0.8651,
"step": 192
},
{
"epoch": 0.010622488854642523,
"grad_norm": 1.2381829023361206,
"learning_rate": 9.999837491958147e-06,
"loss": 1.0088,
"step": 193
},
{
"epoch": 0.010677527656998184,
"grad_norm": 1.030246615409851,
"learning_rate": 9.999833978391763e-06,
"loss": 0.9488,
"step": 194
},
{
"epoch": 0.010732566459353844,
"grad_norm": 1.1640657186508179,
"learning_rate": 9.999830427247965e-06,
"loss": 1.0588,
"step": 195
},
{
"epoch": 0.010787605261709505,
"grad_norm": 1.0431616306304932,
"learning_rate": 9.99982683852678e-06,
"loss": 0.8728,
"step": 196
},
{
"epoch": 0.010842644064065167,
"grad_norm": 1.032263159751892,
"learning_rate": 9.999823212228235e-06,
"loss": 0.9498,
"step": 197
},
{
"epoch": 0.010897682866420826,
"grad_norm": 1.1383745670318604,
"learning_rate": 9.999819548352358e-06,
"loss": 0.9498,
"step": 198
},
{
"epoch": 0.010952721668776488,
"grad_norm": 1.1324639320373535,
"learning_rate": 9.999815846899175e-06,
"loss": 1.0432,
"step": 199
},
{
"epoch": 0.011007760471132147,
"grad_norm": 1.188672661781311,
"learning_rate": 9.999812107868714e-06,
"loss": 0.982,
"step": 200
},
{
"epoch": 0.011062799273487809,
"grad_norm": 1.1011098623275757,
"learning_rate": 9.999808331261005e-06,
"loss": 0.9587,
"step": 201
},
{
"epoch": 0.01111783807584347,
"grad_norm": 1.1782938241958618,
"learning_rate": 9.999804517076073e-06,
"loss": 1.0659,
"step": 202
},
{
"epoch": 0.01117287687819913,
"grad_norm": 1.0520117282867432,
"learning_rate": 9.99980066531395e-06,
"loss": 1.0056,
"step": 203
},
{
"epoch": 0.011227915680554791,
"grad_norm": 1.1584919691085815,
"learning_rate": 9.999796775974663e-06,
"loss": 0.9435,
"step": 204
},
{
"epoch": 0.011282954482910452,
"grad_norm": 1.2201849222183228,
"learning_rate": 9.999792849058242e-06,
"loss": 1.0562,
"step": 205
},
{
"epoch": 0.011337993285266112,
"grad_norm": 1.2985976934432983,
"learning_rate": 9.999788884564715e-06,
"loss": 1.0126,
"step": 206
},
{
"epoch": 0.011393032087621774,
"grad_norm": 0.9926307201385498,
"learning_rate": 9.999784882494115e-06,
"loss": 0.7875,
"step": 207
},
{
"epoch": 0.011448070889977435,
"grad_norm": 1.103365182876587,
"learning_rate": 9.99978084284647e-06,
"loss": 0.9833,
"step": 208
},
{
"epoch": 0.011503109692333095,
"grad_norm": 1.1798462867736816,
"learning_rate": 9.99977676562181e-06,
"loss": 0.8479,
"step": 209
},
{
"epoch": 0.011558148494688756,
"grad_norm": 1.2887194156646729,
"learning_rate": 9.999772650820168e-06,
"loss": 0.9606,
"step": 210
},
{
"epoch": 0.011613187297044416,
"grad_norm": 1.1120634078979492,
"learning_rate": 9.99976849844157e-06,
"loss": 0.9604,
"step": 211
},
{
"epoch": 0.011668226099400077,
"grad_norm": 1.1248979568481445,
"learning_rate": 9.999764308486052e-06,
"loss": 0.9428,
"step": 212
},
{
"epoch": 0.011723264901755738,
"grad_norm": 1.274610161781311,
"learning_rate": 9.999760080953643e-06,
"loss": 0.9044,
"step": 213
},
{
"epoch": 0.011778303704111398,
"grad_norm": 1.1746865510940552,
"learning_rate": 9.999755815844377e-06,
"loss": 0.9114,
"step": 214
},
{
"epoch": 0.01183334250646706,
"grad_norm": 1.2531086206436157,
"learning_rate": 9.999751513158282e-06,
"loss": 1.0785,
"step": 215
},
{
"epoch": 0.01188838130882272,
"grad_norm": 1.0789539813995361,
"learning_rate": 9.999747172895395e-06,
"loss": 0.9794,
"step": 216
},
{
"epoch": 0.01194342011117838,
"grad_norm": 1.1805329322814941,
"learning_rate": 9.999742795055746e-06,
"loss": 0.9602,
"step": 217
},
{
"epoch": 0.011998458913534042,
"grad_norm": 2.309329032897949,
"learning_rate": 9.99973837963937e-06,
"loss": 0.9482,
"step": 218
},
{
"epoch": 0.012053497715889702,
"grad_norm": 1.2379088401794434,
"learning_rate": 9.999733926646296e-06,
"loss": 1.0237,
"step": 219
},
{
"epoch": 0.012108536518245363,
"grad_norm": 1.1581377983093262,
"learning_rate": 9.999729436076562e-06,
"loss": 1.0583,
"step": 220
},
{
"epoch": 0.012163575320601024,
"grad_norm": 1.3006727695465088,
"learning_rate": 9.999724907930199e-06,
"loss": 0.9581,
"step": 221
},
{
"epoch": 0.012218614122956684,
"grad_norm": 1.3215982913970947,
"learning_rate": 9.999720342207243e-06,
"loss": 0.9438,
"step": 222
},
{
"epoch": 0.012273652925312345,
"grad_norm": 1.1107337474822998,
"learning_rate": 9.999715738907727e-06,
"loss": 0.9987,
"step": 223
},
{
"epoch": 0.012328691727668007,
"grad_norm": 1.0745457410812378,
"learning_rate": 9.999711098031685e-06,
"loss": 0.9637,
"step": 224
},
{
"epoch": 0.012383730530023666,
"grad_norm": 1.110861897468567,
"learning_rate": 9.999706419579154e-06,
"loss": 1.0225,
"step": 225
},
{
"epoch": 0.012438769332379328,
"grad_norm": 1.0755527019500732,
"learning_rate": 9.999701703550167e-06,
"loss": 1.0204,
"step": 226
},
{
"epoch": 0.012493808134734987,
"grad_norm": 1.1694976091384888,
"learning_rate": 9.99969694994476e-06,
"loss": 1.0566,
"step": 227
},
{
"epoch": 0.012548846937090649,
"grad_norm": 1.455856442451477,
"learning_rate": 9.99969215876297e-06,
"loss": 0.9397,
"step": 228
},
{
"epoch": 0.01260388573944631,
"grad_norm": 1.0707073211669922,
"learning_rate": 9.99968733000483e-06,
"loss": 0.8286,
"step": 229
},
{
"epoch": 0.01265892454180197,
"grad_norm": 1.189548134803772,
"learning_rate": 9.99968246367038e-06,
"loss": 0.8762,
"step": 230
},
{
"epoch": 0.012713963344157631,
"grad_norm": 1.1439214944839478,
"learning_rate": 9.999677559759655e-06,
"loss": 0.9187,
"step": 231
},
{
"epoch": 0.012769002146513293,
"grad_norm": 1.2329761981964111,
"learning_rate": 9.999672618272691e-06,
"loss": 1.0374,
"step": 232
},
{
"epoch": 0.012824040948868952,
"grad_norm": 1.1545134782791138,
"learning_rate": 9.999667639209527e-06,
"loss": 0.9343,
"step": 233
},
{
"epoch": 0.012879079751224614,
"grad_norm": 1.0946775674819946,
"learning_rate": 9.999662622570198e-06,
"loss": 0.9568,
"step": 234
},
{
"epoch": 0.012934118553580273,
"grad_norm": 1.2099589109420776,
"learning_rate": 9.999657568354743e-06,
"loss": 1.0364,
"step": 235
},
{
"epoch": 0.012989157355935935,
"grad_norm": 1.09062922000885,
"learning_rate": 9.999652476563202e-06,
"loss": 1.0289,
"step": 236
},
{
"epoch": 0.013044196158291596,
"grad_norm": 1.154557228088379,
"learning_rate": 9.999647347195612e-06,
"loss": 0.9925,
"step": 237
},
{
"epoch": 0.013099234960647256,
"grad_norm": 1.025374174118042,
"learning_rate": 9.999642180252008e-06,
"loss": 0.9346,
"step": 238
},
{
"epoch": 0.013154273763002917,
"grad_norm": 1.1473641395568848,
"learning_rate": 9.999636975732433e-06,
"loss": 1.0244,
"step": 239
},
{
"epoch": 0.013209312565358578,
"grad_norm": 1.0421240329742432,
"learning_rate": 9.999631733636923e-06,
"loss": 0.9368,
"step": 240
},
{
"epoch": 0.013264351367714238,
"grad_norm": 1.1076610088348389,
"learning_rate": 9.99962645396552e-06,
"loss": 1.0276,
"step": 241
},
{
"epoch": 0.0133193901700699,
"grad_norm": 1.143559455871582,
"learning_rate": 9.999621136718266e-06,
"loss": 0.9626,
"step": 242
},
{
"epoch": 0.01337442897242556,
"grad_norm": 1.0958378314971924,
"learning_rate": 9.999615781895195e-06,
"loss": 1.0254,
"step": 243
},
{
"epoch": 0.01342946777478122,
"grad_norm": 1.117688536643982,
"learning_rate": 9.99961038949635e-06,
"loss": 0.9685,
"step": 244
},
{
"epoch": 0.013484506577136882,
"grad_norm": 1.1645647287368774,
"learning_rate": 9.999604959521771e-06,
"loss": 1.0666,
"step": 245
},
{
"epoch": 0.013539545379492542,
"grad_norm": 1.1238516569137573,
"learning_rate": 9.999599491971502e-06,
"loss": 1.0252,
"step": 246
},
{
"epoch": 0.013594584181848203,
"grad_norm": 1.0196914672851562,
"learning_rate": 9.999593986845579e-06,
"loss": 0.9389,
"step": 247
},
{
"epoch": 0.013649622984203864,
"grad_norm": 1.0231372117996216,
"learning_rate": 9.999588444144049e-06,
"loss": 0.8786,
"step": 248
},
{
"epoch": 0.013704661786559524,
"grad_norm": 1.2504147291183472,
"learning_rate": 9.999582863866947e-06,
"loss": 1.0969,
"step": 249
},
{
"epoch": 0.013759700588915185,
"grad_norm": 1.1123549938201904,
"learning_rate": 9.99957724601432e-06,
"loss": 0.8833,
"step": 250
},
{
"epoch": 0.013814739391270847,
"grad_norm": 1.1068202257156372,
"learning_rate": 9.999571590586208e-06,
"loss": 0.9709,
"step": 251
},
{
"epoch": 0.013869778193626506,
"grad_norm": 0.9891651272773743,
"learning_rate": 9.999565897582655e-06,
"loss": 0.8598,
"step": 252
},
{
"epoch": 0.013924816995982168,
"grad_norm": 0.9866491556167603,
"learning_rate": 9.999560167003703e-06,
"loss": 0.8101,
"step": 253
},
{
"epoch": 0.013979855798337828,
"grad_norm": 1.0862594842910767,
"learning_rate": 9.999554398849396e-06,
"loss": 0.9411,
"step": 254
},
{
"epoch": 0.014034894600693489,
"grad_norm": 1.1898949146270752,
"learning_rate": 9.999548593119774e-06,
"loss": 0.9548,
"step": 255
},
{
"epoch": 0.01408993340304915,
"grad_norm": 1.2167880535125732,
"learning_rate": 9.999542749814886e-06,
"loss": 1.0302,
"step": 256
},
{
"epoch": 0.01414497220540481,
"grad_norm": 1.0784146785736084,
"learning_rate": 9.999536868934771e-06,
"loss": 0.8875,
"step": 257
},
{
"epoch": 0.014200011007760471,
"grad_norm": 1.1128027439117432,
"learning_rate": 9.999530950479475e-06,
"loss": 0.9498,
"step": 258
},
{
"epoch": 0.014255049810116133,
"grad_norm": 1.1311595439910889,
"learning_rate": 9.999524994449044e-06,
"loss": 0.9035,
"step": 259
},
{
"epoch": 0.014310088612471792,
"grad_norm": 1.225615382194519,
"learning_rate": 9.999519000843521e-06,
"loss": 1.0104,
"step": 260
},
{
"epoch": 0.014365127414827454,
"grad_norm": 1.2347793579101562,
"learning_rate": 9.99951296966295e-06,
"loss": 1.0288,
"step": 261
},
{
"epoch": 0.014420166217183113,
"grad_norm": 1.1837103366851807,
"learning_rate": 9.99950690090738e-06,
"loss": 0.9553,
"step": 262
},
{
"epoch": 0.014475205019538775,
"grad_norm": 1.1985397338867188,
"learning_rate": 9.999500794576852e-06,
"loss": 0.9561,
"step": 263
},
{
"epoch": 0.014530243821894436,
"grad_norm": 1.036928415298462,
"learning_rate": 9.999494650671418e-06,
"loss": 0.8906,
"step": 264
},
{
"epoch": 0.014585282624250096,
"grad_norm": 1.0797842741012573,
"learning_rate": 9.999488469191116e-06,
"loss": 0.8975,
"step": 265
},
{
"epoch": 0.014640321426605757,
"grad_norm": 1.0571156740188599,
"learning_rate": 9.999482250136e-06,
"loss": 0.9334,
"step": 266
},
{
"epoch": 0.014695360228961419,
"grad_norm": 1.2065023183822632,
"learning_rate": 9.999475993506114e-06,
"loss": 0.8986,
"step": 267
},
{
"epoch": 0.014750399031317078,
"grad_norm": 1.201586127281189,
"learning_rate": 9.999469699301502e-06,
"loss": 0.9192,
"step": 268
},
{
"epoch": 0.01480543783367274,
"grad_norm": 1.0470168590545654,
"learning_rate": 9.999463367522216e-06,
"loss": 0.8604,
"step": 269
},
{
"epoch": 0.0148604766360284,
"grad_norm": 1.1142147779464722,
"learning_rate": 9.9994569981683e-06,
"loss": 0.9847,
"step": 270
},
{
"epoch": 0.01491551543838406,
"grad_norm": 1.0352061986923218,
"learning_rate": 9.999450591239805e-06,
"loss": 0.8927,
"step": 271
},
{
"epoch": 0.014970554240739722,
"grad_norm": 1.0353184938430786,
"learning_rate": 9.999444146736779e-06,
"loss": 0.8435,
"step": 272
},
{
"epoch": 0.015025593043095382,
"grad_norm": 1.2091951370239258,
"learning_rate": 9.999437664659267e-06,
"loss": 0.8959,
"step": 273
},
{
"epoch": 0.015080631845451043,
"grad_norm": 1.006361722946167,
"learning_rate": 9.999431145007319e-06,
"loss": 0.8579,
"step": 274
},
{
"epoch": 0.015135670647806704,
"grad_norm": 1.1265509128570557,
"learning_rate": 9.999424587780985e-06,
"loss": 0.8808,
"step": 275
},
{
"epoch": 0.015190709450162364,
"grad_norm": 1.060882568359375,
"learning_rate": 9.999417992980317e-06,
"loss": 1.044,
"step": 276
},
{
"epoch": 0.015245748252518026,
"grad_norm": 1.0216747522354126,
"learning_rate": 9.999411360605358e-06,
"loss": 0.7773,
"step": 277
},
{
"epoch": 0.015300787054873685,
"grad_norm": 1.1382462978363037,
"learning_rate": 9.999404690656163e-06,
"loss": 0.8954,
"step": 278
},
{
"epoch": 0.015355825857229347,
"grad_norm": 1.113815188407898,
"learning_rate": 9.99939798313278e-06,
"loss": 0.8143,
"step": 279
},
{
"epoch": 0.015410864659585008,
"grad_norm": 1.123530387878418,
"learning_rate": 9.99939123803526e-06,
"loss": 0.8872,
"step": 280
},
{
"epoch": 0.015465903461940668,
"grad_norm": 1.0873669385910034,
"learning_rate": 9.999384455363656e-06,
"loss": 1.008,
"step": 281
},
{
"epoch": 0.015520942264296329,
"grad_norm": 1.5956637859344482,
"learning_rate": 9.999377635118014e-06,
"loss": 0.9456,
"step": 282
},
{
"epoch": 0.01557598106665199,
"grad_norm": 1.1471425294876099,
"learning_rate": 9.999370777298389e-06,
"loss": 0.9897,
"step": 283
},
{
"epoch": 0.01563101986900765,
"grad_norm": 0.9960193634033203,
"learning_rate": 9.999363881904831e-06,
"loss": 0.8196,
"step": 284
},
{
"epoch": 0.01568605867136331,
"grad_norm": 1.1033951044082642,
"learning_rate": 9.999356948937393e-06,
"loss": 0.879,
"step": 285
},
{
"epoch": 0.015741097473718973,
"grad_norm": 1.157765507698059,
"learning_rate": 9.999349978396126e-06,
"loss": 1.0116,
"step": 286
},
{
"epoch": 0.015796136276074634,
"grad_norm": 1.0472352504730225,
"learning_rate": 9.999342970281084e-06,
"loss": 0.8657,
"step": 287
},
{
"epoch": 0.015851175078430292,
"grad_norm": 1.1346659660339355,
"learning_rate": 9.999335924592315e-06,
"loss": 0.8482,
"step": 288
},
{
"epoch": 0.015906213880785953,
"grad_norm": 1.1164487600326538,
"learning_rate": 9.999328841329879e-06,
"loss": 1.0542,
"step": 289
},
{
"epoch": 0.015961252683141615,
"grad_norm": 1.1890591382980347,
"learning_rate": 9.999321720493825e-06,
"loss": 0.9598,
"step": 290
},
{
"epoch": 0.016016291485497276,
"grad_norm": 1.0419867038726807,
"learning_rate": 9.999314562084205e-06,
"loss": 0.9548,
"step": 291
},
{
"epoch": 0.016071330287852938,
"grad_norm": 1.0652042627334595,
"learning_rate": 9.999307366101077e-06,
"loss": 0.9359,
"step": 292
},
{
"epoch": 0.016126369090208596,
"grad_norm": 1.0166404247283936,
"learning_rate": 9.999300132544492e-06,
"loss": 0.9276,
"step": 293
},
{
"epoch": 0.016181407892564257,
"grad_norm": 1.1638866662979126,
"learning_rate": 9.999292861414507e-06,
"loss": 0.957,
"step": 294
},
{
"epoch": 0.01623644669491992,
"grad_norm": 1.5505993366241455,
"learning_rate": 9.999285552711173e-06,
"loss": 0.9878,
"step": 295
},
{
"epoch": 0.01629148549727558,
"grad_norm": 1.177262783050537,
"learning_rate": 9.999278206434549e-06,
"loss": 0.8631,
"step": 296
},
{
"epoch": 0.01634652429963124,
"grad_norm": 1.8578168153762817,
"learning_rate": 9.999270822584687e-06,
"loss": 0.9684,
"step": 297
},
{
"epoch": 0.0164015631019869,
"grad_norm": 1.2617360353469849,
"learning_rate": 9.999263401161643e-06,
"loss": 1.014,
"step": 298
},
{
"epoch": 0.01645660190434256,
"grad_norm": 0.9740132689476013,
"learning_rate": 9.999255942165475e-06,
"loss": 0.8606,
"step": 299
},
{
"epoch": 0.016511640706698222,
"grad_norm": 0.9821745753288269,
"learning_rate": 9.999248445596238e-06,
"loss": 0.8241,
"step": 300
},
{
"epoch": 0.016566679509053883,
"grad_norm": 1.0200445652008057,
"learning_rate": 9.999240911453986e-06,
"loss": 0.8256,
"step": 301
},
{
"epoch": 0.016621718311409545,
"grad_norm": 1.4100390672683716,
"learning_rate": 9.999233339738779e-06,
"loss": 0.9057,
"step": 302
},
{
"epoch": 0.016676757113765206,
"grad_norm": 1.056544303894043,
"learning_rate": 9.99922573045067e-06,
"loss": 1.0808,
"step": 303
},
{
"epoch": 0.016731795916120864,
"grad_norm": 0.9271026253700256,
"learning_rate": 9.99921808358972e-06,
"loss": 0.878,
"step": 304
},
{
"epoch": 0.016786834718476525,
"grad_norm": 0.9864157438278198,
"learning_rate": 9.999210399155987e-06,
"loss": 0.9198,
"step": 305
},
{
"epoch": 0.016841873520832187,
"grad_norm": 1.093995451927185,
"learning_rate": 9.999202677149525e-06,
"loss": 0.9794,
"step": 306
},
{
"epoch": 0.016896912323187848,
"grad_norm": 0.9717912077903748,
"learning_rate": 9.999194917570395e-06,
"loss": 0.8764,
"step": 307
},
{
"epoch": 0.01695195112554351,
"grad_norm": 1.0026428699493408,
"learning_rate": 9.999187120418653e-06,
"loss": 0.8526,
"step": 308
},
{
"epoch": 0.017006989927899167,
"grad_norm": 1.122870922088623,
"learning_rate": 9.999179285694359e-06,
"loss": 0.9773,
"step": 309
},
{
"epoch": 0.01706202873025483,
"grad_norm": 1.0522836446762085,
"learning_rate": 9.999171413397572e-06,
"loss": 1.0183,
"step": 310
},
{
"epoch": 0.01711706753261049,
"grad_norm": 0.9303658604621887,
"learning_rate": 9.99916350352835e-06,
"loss": 0.8402,
"step": 311
},
{
"epoch": 0.01717210633496615,
"grad_norm": 0.9606096148490906,
"learning_rate": 9.999155556086755e-06,
"loss": 0.9692,
"step": 312
},
{
"epoch": 0.017227145137321813,
"grad_norm": 1.176992416381836,
"learning_rate": 9.999147571072844e-06,
"loss": 0.8172,
"step": 313
},
{
"epoch": 0.017282183939677474,
"grad_norm": 1.1948801279067993,
"learning_rate": 9.999139548486678e-06,
"loss": 1.0205,
"step": 314
},
{
"epoch": 0.017337222742033132,
"grad_norm": 1.0064897537231445,
"learning_rate": 9.999131488328318e-06,
"loss": 0.9479,
"step": 315
},
{
"epoch": 0.017392261544388794,
"grad_norm": 1.048242449760437,
"learning_rate": 9.999123390597822e-06,
"loss": 0.9862,
"step": 316
},
{
"epoch": 0.017447300346744455,
"grad_norm": 1.12875497341156,
"learning_rate": 9.999115255295256e-06,
"loss": 0.9743,
"step": 317
},
{
"epoch": 0.017502339149100116,
"grad_norm": 1.0607460737228394,
"learning_rate": 9.999107082420674e-06,
"loss": 0.8878,
"step": 318
},
{
"epoch": 0.017557377951455778,
"grad_norm": 1.1480191946029663,
"learning_rate": 9.999098871974144e-06,
"loss": 0.8769,
"step": 319
},
{
"epoch": 0.017612416753811436,
"grad_norm": 1.1150004863739014,
"learning_rate": 9.999090623955724e-06,
"loss": 0.8615,
"step": 320
},
{
"epoch": 0.017667455556167097,
"grad_norm": 1.137839913368225,
"learning_rate": 9.999082338365478e-06,
"loss": 0.9703,
"step": 321
},
{
"epoch": 0.01772249435852276,
"grad_norm": 1.0883489847183228,
"learning_rate": 9.999074015203467e-06,
"loss": 0.9273,
"step": 322
},
{
"epoch": 0.01777753316087842,
"grad_norm": 1.0999557971954346,
"learning_rate": 9.999065654469752e-06,
"loss": 0.9605,
"step": 323
},
{
"epoch": 0.01783257196323408,
"grad_norm": 0.9911689758300781,
"learning_rate": 9.999057256164401e-06,
"loss": 0.9117,
"step": 324
},
{
"epoch": 0.01788761076558974,
"grad_norm": 1.040933609008789,
"learning_rate": 9.999048820287472e-06,
"loss": 0.9229,
"step": 325
},
{
"epoch": 0.0179426495679454,
"grad_norm": 1.4341392517089844,
"learning_rate": 9.999040346839031e-06,
"loss": 1.0718,
"step": 326
},
{
"epoch": 0.017997688370301062,
"grad_norm": 1.0246332883834839,
"learning_rate": 9.99903183581914e-06,
"loss": 0.9617,
"step": 327
},
{
"epoch": 0.018052727172656723,
"grad_norm": 10.162322998046875,
"learning_rate": 9.999023287227863e-06,
"loss": 1.0391,
"step": 328
},
{
"epoch": 0.018107765975012385,
"grad_norm": 1.3370027542114258,
"learning_rate": 9.999014701065266e-06,
"loss": 1.0211,
"step": 329
},
{
"epoch": 0.018162804777368046,
"grad_norm": 1.0146219730377197,
"learning_rate": 9.999006077331413e-06,
"loss": 0.8611,
"step": 330
},
{
"epoch": 0.018217843579723704,
"grad_norm": 1.0899269580841064,
"learning_rate": 9.998997416026368e-06,
"loss": 0.9209,
"step": 331
},
{
"epoch": 0.018272882382079365,
"grad_norm": 1.1343204975128174,
"learning_rate": 9.998988717150198e-06,
"loss": 0.9405,
"step": 332
},
{
"epoch": 0.018327921184435027,
"grad_norm": 1.2308380603790283,
"learning_rate": 9.998979980702965e-06,
"loss": 0.9579,
"step": 333
},
{
"epoch": 0.018382959986790688,
"grad_norm": 1.1433519124984741,
"learning_rate": 9.998971206684737e-06,
"loss": 1.0045,
"step": 334
},
{
"epoch": 0.01843799878914635,
"grad_norm": 1.0585781335830688,
"learning_rate": 9.99896239509558e-06,
"loss": 0.9171,
"step": 335
},
{
"epoch": 0.018493037591502007,
"grad_norm": 1.2735164165496826,
"learning_rate": 9.99895354593556e-06,
"loss": 1.1001,
"step": 336
},
{
"epoch": 0.01854807639385767,
"grad_norm": 1.2905755043029785,
"learning_rate": 9.998944659204744e-06,
"loss": 1.0294,
"step": 337
},
{
"epoch": 0.01860311519621333,
"grad_norm": 1.1442075967788696,
"learning_rate": 9.998935734903198e-06,
"loss": 0.9385,
"step": 338
},
{
"epoch": 0.01865815399856899,
"grad_norm": 1.1005232334136963,
"learning_rate": 9.998926773030987e-06,
"loss": 1.026,
"step": 339
},
{
"epoch": 0.018713192800924653,
"grad_norm": 1.2770785093307495,
"learning_rate": 9.998917773588182e-06,
"loss": 1.0015,
"step": 340
},
{
"epoch": 0.01876823160328031,
"grad_norm": 1.0963070392608643,
"learning_rate": 9.998908736574849e-06,
"loss": 0.9347,
"step": 341
},
{
"epoch": 0.018823270405635972,
"grad_norm": 1.10364830493927,
"learning_rate": 9.998899661991055e-06,
"loss": 0.869,
"step": 342
},
{
"epoch": 0.018878309207991634,
"grad_norm": 1.0364975929260254,
"learning_rate": 9.99889054983687e-06,
"loss": 0.9855,
"step": 343
},
{
"epoch": 0.018933348010347295,
"grad_norm": 1.104702115058899,
"learning_rate": 9.998881400112362e-06,
"loss": 0.9555,
"step": 344
},
{
"epoch": 0.018988386812702956,
"grad_norm": 0.9957441687583923,
"learning_rate": 9.998872212817599e-06,
"loss": 0.9634,
"step": 345
},
{
"epoch": 0.019043425615058618,
"grad_norm": 1.262271523475647,
"learning_rate": 9.998862987952651e-06,
"loss": 1.0133,
"step": 346
},
{
"epoch": 0.019098464417414276,
"grad_norm": 1.2075226306915283,
"learning_rate": 9.998853725517587e-06,
"loss": 1.0588,
"step": 347
},
{
"epoch": 0.019153503219769937,
"grad_norm": 1.0609898567199707,
"learning_rate": 9.998844425512477e-06,
"loss": 0.9952,
"step": 348
},
{
"epoch": 0.0192085420221256,
"grad_norm": 1.1930195093154907,
"learning_rate": 9.998835087937389e-06,
"loss": 0.9617,
"step": 349
},
{
"epoch": 0.01926358082448126,
"grad_norm": 1.2359932661056519,
"learning_rate": 9.998825712792396e-06,
"loss": 0.8768,
"step": 350
},
{
"epoch": 0.01931861962683692,
"grad_norm": 0.9984115362167358,
"learning_rate": 9.998816300077566e-06,
"loss": 0.8205,
"step": 351
},
{
"epoch": 0.01937365842919258,
"grad_norm": 1.6853677034378052,
"learning_rate": 9.998806849792972e-06,
"loss": 0.9066,
"step": 352
},
{
"epoch": 0.01942869723154824,
"grad_norm": 1.2869856357574463,
"learning_rate": 9.998797361938683e-06,
"loss": 1.0054,
"step": 353
},
{
"epoch": 0.019483736033903902,
"grad_norm": 1.2791584730148315,
"learning_rate": 9.99878783651477e-06,
"loss": 0.7627,
"step": 354
},
{
"epoch": 0.019538774836259563,
"grad_norm": 1.0795867443084717,
"learning_rate": 9.998778273521307e-06,
"loss": 0.9343,
"step": 355
},
{
"epoch": 0.019593813638615225,
"grad_norm": 1.0926088094711304,
"learning_rate": 9.998768672958365e-06,
"loss": 0.943,
"step": 356
},
{
"epoch": 0.019648852440970886,
"grad_norm": 1.0530847311019897,
"learning_rate": 9.998759034826015e-06,
"loss": 0.9656,
"step": 357
},
{
"epoch": 0.019703891243326544,
"grad_norm": 1.1793400049209595,
"learning_rate": 9.99874935912433e-06,
"loss": 0.9799,
"step": 358
},
{
"epoch": 0.019758930045682205,
"grad_norm": 1.0726191997528076,
"learning_rate": 9.998739645853383e-06,
"loss": 0.8739,
"step": 359
},
{
"epoch": 0.019813968848037867,
"grad_norm": 1.0488981008529663,
"learning_rate": 9.998729895013246e-06,
"loss": 0.8986,
"step": 360
},
{
"epoch": 0.019869007650393528,
"grad_norm": 1.8267477750778198,
"learning_rate": 9.998720106603993e-06,
"loss": 0.9175,
"step": 361
},
{
"epoch": 0.01992404645274919,
"grad_norm": 0.9868306517601013,
"learning_rate": 9.9987102806257e-06,
"loss": 0.9609,
"step": 362
},
{
"epoch": 0.019979085255104848,
"grad_norm": 1.0171183347702026,
"learning_rate": 9.998700417078438e-06,
"loss": 0.8904,
"step": 363
},
{
"epoch": 0.02003412405746051,
"grad_norm": 0.9800812602043152,
"learning_rate": 9.998690515962282e-06,
"loss": 0.8344,
"step": 364
},
{
"epoch": 0.02008916285981617,
"grad_norm": 1.024707317352295,
"learning_rate": 9.998680577277304e-06,
"loss": 0.9026,
"step": 365
},
{
"epoch": 0.02014420166217183,
"grad_norm": 1.1056619882583618,
"learning_rate": 9.998670601023584e-06,
"loss": 1.017,
"step": 366
},
{
"epoch": 0.020199240464527493,
"grad_norm": 1.0555908679962158,
"learning_rate": 9.998660587201191e-06,
"loss": 0.9627,
"step": 367
},
{
"epoch": 0.02025427926688315,
"grad_norm": 0.9502031803131104,
"learning_rate": 9.998650535810204e-06,
"loss": 0.935,
"step": 368
},
{
"epoch": 0.020309318069238812,
"grad_norm": 1.0355613231658936,
"learning_rate": 9.998640446850699e-06,
"loss": 0.9946,
"step": 369
},
{
"epoch": 0.020364356871594474,
"grad_norm": 0.9906355142593384,
"learning_rate": 9.99863032032275e-06,
"loss": 0.9389,
"step": 370
},
{
"epoch": 0.020419395673950135,
"grad_norm": 0.9483911395072937,
"learning_rate": 9.99862015622643e-06,
"loss": 0.979,
"step": 371
},
{
"epoch": 0.020474434476305797,
"grad_norm": 0.9769986271858215,
"learning_rate": 9.998609954561822e-06,
"loss": 0.8972,
"step": 372
},
{
"epoch": 0.020529473278661458,
"grad_norm": 1.1682699918746948,
"learning_rate": 9.998599715329e-06,
"loss": 0.943,
"step": 373
},
{
"epoch": 0.020584512081017116,
"grad_norm": 1.007912516593933,
"learning_rate": 9.99858943852804e-06,
"loss": 0.8825,
"step": 374
},
{
"epoch": 0.020639550883372777,
"grad_norm": 0.9788785576820374,
"learning_rate": 9.99857912415902e-06,
"loss": 0.9667,
"step": 375
},
{
"epoch": 0.02069458968572844,
"grad_norm": 1.0804275274276733,
"learning_rate": 9.998568772222017e-06,
"loss": 1.0026,
"step": 376
},
{
"epoch": 0.0207496284880841,
"grad_norm": 1.0859237909317017,
"learning_rate": 9.998558382717109e-06,
"loss": 0.9592,
"step": 377
},
{
"epoch": 0.02080466729043976,
"grad_norm": 1.2925337553024292,
"learning_rate": 9.998547955644373e-06,
"loss": 0.9067,
"step": 378
},
{
"epoch": 0.02085970609279542,
"grad_norm": 0.9853373765945435,
"learning_rate": 9.99853749100389e-06,
"loss": 0.9538,
"step": 379
},
{
"epoch": 0.02091474489515108,
"grad_norm": 1.0461076498031616,
"learning_rate": 9.998526988795738e-06,
"loss": 0.9261,
"step": 380
},
{
"epoch": 0.020969783697506742,
"grad_norm": 1.024559497833252,
"learning_rate": 9.998516449019995e-06,
"loss": 0.9117,
"step": 381
},
{
"epoch": 0.021024822499862404,
"grad_norm": 1.1474825143814087,
"learning_rate": 9.998505871676739e-06,
"loss": 1.0177,
"step": 382
},
{
"epoch": 0.021079861302218065,
"grad_norm": 0.9587596654891968,
"learning_rate": 9.998495256766051e-06,
"loss": 0.8809,
"step": 383
},
{
"epoch": 0.021134900104573723,
"grad_norm": 0.9505122303962708,
"learning_rate": 9.998484604288013e-06,
"loss": 0.9266,
"step": 384
},
{
"epoch": 0.021189938906929384,
"grad_norm": 0.9625647664070129,
"learning_rate": 9.9984739142427e-06,
"loss": 0.9073,
"step": 385
},
{
"epoch": 0.021244977709285046,
"grad_norm": 0.9650934338569641,
"learning_rate": 9.998463186630196e-06,
"loss": 0.9042,
"step": 386
},
{
"epoch": 0.021300016511640707,
"grad_norm": 1.0289491415023804,
"learning_rate": 9.99845242145058e-06,
"loss": 0.929,
"step": 387
},
{
"epoch": 0.02135505531399637,
"grad_norm": 0.9543869495391846,
"learning_rate": 9.998441618703935e-06,
"loss": 0.9406,
"step": 388
},
{
"epoch": 0.02141009411635203,
"grad_norm": 0.9276942610740662,
"learning_rate": 9.99843077839034e-06,
"loss": 0.8982,
"step": 389
},
{
"epoch": 0.021465132918707688,
"grad_norm": 0.9264664053916931,
"learning_rate": 9.998419900509877e-06,
"loss": 0.7255,
"step": 390
},
{
"epoch": 0.02152017172106335,
"grad_norm": 0.9961187243461609,
"learning_rate": 9.998408985062628e-06,
"loss": 0.9826,
"step": 391
},
{
"epoch": 0.02157521052341901,
"grad_norm": 0.966596245765686,
"learning_rate": 9.998398032048676e-06,
"loss": 0.8159,
"step": 392
},
{
"epoch": 0.021630249325774672,
"grad_norm": 1.1336095333099365,
"learning_rate": 9.998387041468102e-06,
"loss": 0.9289,
"step": 393
},
{
"epoch": 0.021685288128130333,
"grad_norm": 1.0453619956970215,
"learning_rate": 9.998376013320989e-06,
"loss": 0.8816,
"step": 394
},
{
"epoch": 0.02174032693048599,
"grad_norm": 0.8961821794509888,
"learning_rate": 9.998364947607419e-06,
"loss": 0.871,
"step": 395
},
{
"epoch": 0.021795365732841653,
"grad_norm": 1.3420332670211792,
"learning_rate": 9.998353844327477e-06,
"loss": 0.9338,
"step": 396
},
{
"epoch": 0.021850404535197314,
"grad_norm": 0.9635335206985474,
"learning_rate": 9.998342703481246e-06,
"loss": 0.9592,
"step": 397
},
{
"epoch": 0.021905443337552975,
"grad_norm": 1.3322341442108154,
"learning_rate": 9.998331525068807e-06,
"loss": 1.0974,
"step": 398
},
{
"epoch": 0.021960482139908637,
"grad_norm": 1.017220377922058,
"learning_rate": 9.998320309090247e-06,
"loss": 0.9827,
"step": 399
},
{
"epoch": 0.022015520942264295,
"grad_norm": 1.0080329179763794,
"learning_rate": 9.99830905554565e-06,
"loss": 0.877,
"step": 400
},
{
"epoch": 0.022070559744619956,
"grad_norm": 0.9883211255073547,
"learning_rate": 9.998297764435101e-06,
"loss": 0.9625,
"step": 401
},
{
"epoch": 0.022125598546975617,
"grad_norm": 1.0948412418365479,
"learning_rate": 9.998286435758684e-06,
"loss": 0.9058,
"step": 402
},
{
"epoch": 0.02218063734933128,
"grad_norm": 0.9402000308036804,
"learning_rate": 9.998275069516482e-06,
"loss": 0.8882,
"step": 403
},
{
"epoch": 0.02223567615168694,
"grad_norm": 0.9858806133270264,
"learning_rate": 9.998263665708583e-06,
"loss": 0.9086,
"step": 404
},
{
"epoch": 0.0222907149540426,
"grad_norm": 1.0556131601333618,
"learning_rate": 9.998252224335073e-06,
"loss": 0.9583,
"step": 405
},
{
"epoch": 0.02234575375639826,
"grad_norm": 1.092766284942627,
"learning_rate": 9.998240745396037e-06,
"loss": 0.9124,
"step": 406
},
{
"epoch": 0.02240079255875392,
"grad_norm": 1.1902250051498413,
"learning_rate": 9.998229228891563e-06,
"loss": 1.0566,
"step": 407
},
{
"epoch": 0.022455831361109582,
"grad_norm": 1.067906141281128,
"learning_rate": 9.998217674821734e-06,
"loss": 0.9823,
"step": 408
},
{
"epoch": 0.022510870163465244,
"grad_norm": 1.0051710605621338,
"learning_rate": 9.998206083186638e-06,
"loss": 0.9141,
"step": 409
},
{
"epoch": 0.022565908965820905,
"grad_norm": 1.046412467956543,
"learning_rate": 9.998194453986367e-06,
"loss": 0.9439,
"step": 410
},
{
"epoch": 0.022620947768176563,
"grad_norm": 1.1103553771972656,
"learning_rate": 9.998182787221e-06,
"loss": 0.9494,
"step": 411
},
{
"epoch": 0.022675986570532224,
"grad_norm": 1.0508466958999634,
"learning_rate": 9.998171082890632e-06,
"loss": 0.9202,
"step": 412
},
{
"epoch": 0.022731025372887886,
"grad_norm": 1.1364226341247559,
"learning_rate": 9.998159340995347e-06,
"loss": 0.9859,
"step": 413
},
{
"epoch": 0.022786064175243547,
"grad_norm": 1.2073607444763184,
"learning_rate": 9.998147561535234e-06,
"loss": 0.8883,
"step": 414
},
{
"epoch": 0.02284110297759921,
"grad_norm": 1.0657012462615967,
"learning_rate": 9.998135744510384e-06,
"loss": 0.8321,
"step": 415
},
{
"epoch": 0.02289614177995487,
"grad_norm": 1.0101548433303833,
"learning_rate": 9.998123889920881e-06,
"loss": 0.9374,
"step": 416
},
{
"epoch": 0.022951180582310528,
"grad_norm": 1.057455062866211,
"learning_rate": 9.998111997766817e-06,
"loss": 0.8831,
"step": 417
},
{
"epoch": 0.02300621938466619,
"grad_norm": 1.206092357635498,
"learning_rate": 9.998100068048282e-06,
"loss": 0.8812,
"step": 418
},
{
"epoch": 0.02306125818702185,
"grad_norm": 1.0709773302078247,
"learning_rate": 9.998088100765366e-06,
"loss": 0.9486,
"step": 419
},
{
"epoch": 0.023116296989377512,
"grad_norm": 1.066469669342041,
"learning_rate": 9.998076095918156e-06,
"loss": 1.0229,
"step": 420
},
{
"epoch": 0.023171335791733173,
"grad_norm": 1.0443583726882935,
"learning_rate": 9.998064053506744e-06,
"loss": 0.8615,
"step": 421
},
{
"epoch": 0.02322637459408883,
"grad_norm": 1.103096842765808,
"learning_rate": 9.99805197353122e-06,
"loss": 0.9909,
"step": 422
},
{
"epoch": 0.023281413396444493,
"grad_norm": 0.9804643392562866,
"learning_rate": 9.998039855991677e-06,
"loss": 0.9214,
"step": 423
},
{
"epoch": 0.023336452198800154,
"grad_norm": 0.9880676865577698,
"learning_rate": 9.998027700888202e-06,
"loss": 0.9345,
"step": 424
},
{
"epoch": 0.023391491001155815,
"grad_norm": 0.9633826017379761,
"learning_rate": 9.99801550822089e-06,
"loss": 0.9897,
"step": 425
},
{
"epoch": 0.023446529803511477,
"grad_norm": 1.0159331560134888,
"learning_rate": 9.998003277989831e-06,
"loss": 0.9385,
"step": 426
},
{
"epoch": 0.023501568605867135,
"grad_norm": 1.009667158126831,
"learning_rate": 9.99799101019512e-06,
"loss": 0.9013,
"step": 427
},
{
"epoch": 0.023556607408222796,
"grad_norm": 0.9478578567504883,
"learning_rate": 9.997978704836842e-06,
"loss": 0.8775,
"step": 428
},
{
"epoch": 0.023611646210578457,
"grad_norm": 1.013181447982788,
"learning_rate": 9.997966361915096e-06,
"loss": 0.8797,
"step": 429
},
{
"epoch": 0.02366668501293412,
"grad_norm": 1.0337481498718262,
"learning_rate": 9.997953981429974e-06,
"loss": 1.0047,
"step": 430
},
{
"epoch": 0.02372172381528978,
"grad_norm": 0.9423721432685852,
"learning_rate": 9.997941563381566e-06,
"loss": 0.8639,
"step": 431
},
{
"epoch": 0.02377676261764544,
"grad_norm": 1.100492000579834,
"learning_rate": 9.997929107769968e-06,
"loss": 1.0022,
"step": 432
},
{
"epoch": 0.0238318014200011,
"grad_norm": 1.1232364177703857,
"learning_rate": 9.997916614595272e-06,
"loss": 0.9145,
"step": 433
},
{
"epoch": 0.02388684022235676,
"grad_norm": 0.9466833472251892,
"learning_rate": 9.997904083857572e-06,
"loss": 0.9397,
"step": 434
},
{
"epoch": 0.023941879024712422,
"grad_norm": 0.9514566659927368,
"learning_rate": 9.997891515556963e-06,
"loss": 0.8025,
"step": 435
},
{
"epoch": 0.023996917827068084,
"grad_norm": 0.9292222261428833,
"learning_rate": 9.997878909693539e-06,
"loss": 0.7739,
"step": 436
},
{
"epoch": 0.024051956629423745,
"grad_norm": 1.1049963235855103,
"learning_rate": 9.997866266267397e-06,
"loss": 0.9439,
"step": 437
},
{
"epoch": 0.024106995431779403,
"grad_norm": 1.0938019752502441,
"learning_rate": 9.997853585278627e-06,
"loss": 0.9479,
"step": 438
},
{
"epoch": 0.024162034234135064,
"grad_norm": 1.0423611402511597,
"learning_rate": 9.997840866727331e-06,
"loss": 0.9309,
"step": 439
},
{
"epoch": 0.024217073036490726,
"grad_norm": 1.0584756135940552,
"learning_rate": 9.997828110613598e-06,
"loss": 1.0218,
"step": 440
},
{
"epoch": 0.024272111838846387,
"grad_norm": 0.9986408948898315,
"learning_rate": 9.997815316937527e-06,
"loss": 0.9734,
"step": 441
},
{
"epoch": 0.02432715064120205,
"grad_norm": 0.9680983424186707,
"learning_rate": 9.997802485699215e-06,
"loss": 0.9286,
"step": 442
},
{
"epoch": 0.024382189443557706,
"grad_norm": 1.2231700420379639,
"learning_rate": 9.997789616898757e-06,
"loss": 0.8083,
"step": 443
},
{
"epoch": 0.024437228245913368,
"grad_norm": 1.0064021348953247,
"learning_rate": 9.99777671053625e-06,
"loss": 0.9161,
"step": 444
},
{
"epoch": 0.02449226704826903,
"grad_norm": 0.9658541679382324,
"learning_rate": 9.99776376661179e-06,
"loss": 0.8027,
"step": 445
},
{
"epoch": 0.02454730585062469,
"grad_norm": 0.9440343379974365,
"learning_rate": 9.997750785125477e-06,
"loss": 0.9124,
"step": 446
},
{
"epoch": 0.024602344652980352,
"grad_norm": 0.998792827129364,
"learning_rate": 9.997737766077404e-06,
"loss": 0.8699,
"step": 447
},
{
"epoch": 0.024657383455336013,
"grad_norm": 1.430880069732666,
"learning_rate": 9.997724709467676e-06,
"loss": 0.9158,
"step": 448
},
{
"epoch": 0.02471242225769167,
"grad_norm": 0.9737820029258728,
"learning_rate": 9.997711615296384e-06,
"loss": 0.9496,
"step": 449
},
{
"epoch": 0.024767461060047333,
"grad_norm": 0.9710075855255127,
"learning_rate": 9.997698483563629e-06,
"loss": 0.8714,
"step": 450
},
{
"epoch": 0.024822499862402994,
"grad_norm": 1.5286253690719604,
"learning_rate": 9.997685314269511e-06,
"loss": 0.8421,
"step": 451
},
{
"epoch": 0.024877538664758655,
"grad_norm": 1.0269445180892944,
"learning_rate": 9.99767210741413e-06,
"loss": 1.0131,
"step": 452
},
{
"epoch": 0.024932577467114317,
"grad_norm": 0.9780508279800415,
"learning_rate": 9.99765886299758e-06,
"loss": 0.9897,
"step": 453
},
{
"epoch": 0.024987616269469975,
"grad_norm": 0.998332679271698,
"learning_rate": 9.997645581019965e-06,
"loss": 0.9647,
"step": 454
},
{
"epoch": 0.025042655071825636,
"grad_norm": 1.7062602043151855,
"learning_rate": 9.997632261481383e-06,
"loss": 1.0729,
"step": 455
},
{
"epoch": 0.025097693874181298,
"grad_norm": 0.9793694615364075,
"learning_rate": 9.997618904381936e-06,
"loss": 0.9556,
"step": 456
},
{
"epoch": 0.02515273267653696,
"grad_norm": 1.0183895826339722,
"learning_rate": 9.997605509721721e-06,
"loss": 0.9194,
"step": 457
},
{
"epoch": 0.02520777147889262,
"grad_norm": 1.0288400650024414,
"learning_rate": 9.997592077500844e-06,
"loss": 0.955,
"step": 458
},
{
"epoch": 0.025262810281248282,
"grad_norm": 0.9551253914833069,
"learning_rate": 9.997578607719401e-06,
"loss": 0.8498,
"step": 459
},
{
"epoch": 0.02531784908360394,
"grad_norm": 0.9648008942604065,
"learning_rate": 9.997565100377494e-06,
"loss": 0.9306,
"step": 460
},
{
"epoch": 0.0253728878859596,
"grad_norm": 0.9206677675247192,
"learning_rate": 9.997551555475225e-06,
"loss": 0.7874,
"step": 461
},
{
"epoch": 0.025427926688315262,
"grad_norm": 1.0479545593261719,
"learning_rate": 9.997537973012698e-06,
"loss": 0.9201,
"step": 462
},
{
"epoch": 0.025482965490670924,
"grad_norm": 1.0329946279525757,
"learning_rate": 9.997524352990013e-06,
"loss": 0.9577,
"step": 463
},
{
"epoch": 0.025538004293026585,
"grad_norm": 1.1177828311920166,
"learning_rate": 9.997510695407273e-06,
"loss": 1.0041,
"step": 464
},
{
"epoch": 0.025593043095382243,
"grad_norm": 1.0351577997207642,
"learning_rate": 9.99749700026458e-06,
"loss": 0.9952,
"step": 465
},
{
"epoch": 0.025648081897737905,
"grad_norm": 0.905274510383606,
"learning_rate": 9.997483267562035e-06,
"loss": 0.8185,
"step": 466
},
{
"epoch": 0.025703120700093566,
"grad_norm": 1.0749776363372803,
"learning_rate": 9.997469497299747e-06,
"loss": 1.0611,
"step": 467
},
{
"epoch": 0.025758159502449227,
"grad_norm": 0.8972223401069641,
"learning_rate": 9.997455689477815e-06,
"loss": 0.8994,
"step": 468
},
{
"epoch": 0.02581319830480489,
"grad_norm": 1.0669914484024048,
"learning_rate": 9.997441844096342e-06,
"loss": 1.06,
"step": 469
},
{
"epoch": 0.025868237107160547,
"grad_norm": 1.0431914329528809,
"learning_rate": 9.997427961155435e-06,
"loss": 0.8657,
"step": 470
},
{
"epoch": 0.025923275909516208,
"grad_norm": 0.9609962701797485,
"learning_rate": 9.997414040655198e-06,
"loss": 0.8864,
"step": 471
},
{
"epoch": 0.02597831471187187,
"grad_norm": 1.0829721689224243,
"learning_rate": 9.997400082595735e-06,
"loss": 0.9221,
"step": 472
},
{
"epoch": 0.02603335351422753,
"grad_norm": 0.992082953453064,
"learning_rate": 9.99738608697715e-06,
"loss": 0.8455,
"step": 473
},
{
"epoch": 0.026088392316583192,
"grad_norm": 1.0486301183700562,
"learning_rate": 9.997372053799547e-06,
"loss": 0.8729,
"step": 474
},
{
"epoch": 0.026143431118938854,
"grad_norm": 1.0328491926193237,
"learning_rate": 9.997357983063036e-06,
"loss": 0.8788,
"step": 475
},
{
"epoch": 0.02619846992129451,
"grad_norm": 0.963333249092102,
"learning_rate": 9.997343874767719e-06,
"loss": 0.892,
"step": 476
},
{
"epoch": 0.026253508723650173,
"grad_norm": 1.1606497764587402,
"learning_rate": 9.997329728913704e-06,
"loss": 0.9984,
"step": 477
},
{
"epoch": 0.026308547526005834,
"grad_norm": 1.241650104522705,
"learning_rate": 9.997315545501096e-06,
"loss": 0.946,
"step": 478
},
{
"epoch": 0.026363586328361496,
"grad_norm": 1.008004069328308,
"learning_rate": 9.99730132453e-06,
"loss": 0.849,
"step": 479
},
{
"epoch": 0.026418625130717157,
"grad_norm": 0.9883478879928589,
"learning_rate": 9.997287066000527e-06,
"loss": 0.9478,
"step": 480
},
{
"epoch": 0.026473663933072815,
"grad_norm": 1.0224446058273315,
"learning_rate": 9.997272769912783e-06,
"loss": 1.0318,
"step": 481
},
{
"epoch": 0.026528702735428476,
"grad_norm": 0.9412569403648376,
"learning_rate": 9.997258436266874e-06,
"loss": 0.9119,
"step": 482
},
{
"epoch": 0.026583741537784138,
"grad_norm": 0.9214537739753723,
"learning_rate": 9.997244065062906e-06,
"loss": 0.8785,
"step": 483
},
{
"epoch": 0.0266387803401398,
"grad_norm": 1.0015628337860107,
"learning_rate": 9.997229656300991e-06,
"loss": 0.8869,
"step": 484
},
{
"epoch": 0.02669381914249546,
"grad_norm": 0.8965190052986145,
"learning_rate": 9.997215209981237e-06,
"loss": 0.7009,
"step": 485
},
{
"epoch": 0.02674885794485112,
"grad_norm": 1.1976135969161987,
"learning_rate": 9.997200726103749e-06,
"loss": 0.9795,
"step": 486
},
{
"epoch": 0.02680389674720678,
"grad_norm": 0.864780843257904,
"learning_rate": 9.997186204668639e-06,
"loss": 0.7687,
"step": 487
},
{
"epoch": 0.02685893554956244,
"grad_norm": 0.9946566820144653,
"learning_rate": 9.997171645676013e-06,
"loss": 0.9672,
"step": 488
},
{
"epoch": 0.026913974351918103,
"grad_norm": 1.043835997581482,
"learning_rate": 9.997157049125985e-06,
"loss": 0.862,
"step": 489
},
{
"epoch": 0.026969013154273764,
"grad_norm": 0.9697456955909729,
"learning_rate": 9.99714241501866e-06,
"loss": 0.8368,
"step": 490
},
{
"epoch": 0.027024051956629425,
"grad_norm": 0.9975618124008179,
"learning_rate": 9.997127743354153e-06,
"loss": 0.8739,
"step": 491
},
{
"epoch": 0.027079090758985083,
"grad_norm": 1.0055313110351562,
"learning_rate": 9.99711303413257e-06,
"loss": 0.9227,
"step": 492
},
{
"epoch": 0.027134129561340745,
"grad_norm": 1.0418384075164795,
"learning_rate": 9.997098287354024e-06,
"loss": 0.9978,
"step": 493
},
{
"epoch": 0.027189168363696406,
"grad_norm": 0.8648970723152161,
"learning_rate": 9.997083503018625e-06,
"loss": 0.8363,
"step": 494
},
{
"epoch": 0.027244207166052067,
"grad_norm": 1.13506019115448,
"learning_rate": 9.997068681126483e-06,
"loss": 0.8851,
"step": 495
},
{
"epoch": 0.02729924596840773,
"grad_norm": 0.974400520324707,
"learning_rate": 9.997053821677712e-06,
"loss": 0.8533,
"step": 496
},
{
"epoch": 0.027354284770763387,
"grad_norm": 1.226507544517517,
"learning_rate": 9.997038924672419e-06,
"loss": 0.8586,
"step": 497
},
{
"epoch": 0.027409323573119048,
"grad_norm": 1.004753589630127,
"learning_rate": 9.997023990110721e-06,
"loss": 0.8974,
"step": 498
},
{
"epoch": 0.02746436237547471,
"grad_norm": 1.0492571592330933,
"learning_rate": 9.997009017992729e-06,
"loss": 0.8457,
"step": 499
},
{
"epoch": 0.02751940117783037,
"grad_norm": 1.0068167448043823,
"learning_rate": 9.996994008318554e-06,
"loss": 0.9608,
"step": 500
},
{
"epoch": 0.027574439980186032,
"grad_norm": 0.9686044454574585,
"learning_rate": 9.996978961088311e-06,
"loss": 0.9041,
"step": 501
},
{
"epoch": 0.027629478782541694,
"grad_norm": 1.281728744506836,
"learning_rate": 9.99696387630211e-06,
"loss": 0.9739,
"step": 502
},
{
"epoch": 0.02768451758489735,
"grad_norm": 0.9069758653640747,
"learning_rate": 9.996948753960065e-06,
"loss": 0.8467,
"step": 503
},
{
"epoch": 0.027739556387253013,
"grad_norm": 1.0337222814559937,
"learning_rate": 9.996933594062293e-06,
"loss": 0.9638,
"step": 504
},
{
"epoch": 0.027794595189608674,
"grad_norm": 0.9695359468460083,
"learning_rate": 9.996918396608905e-06,
"loss": 0.8986,
"step": 505
},
{
"epoch": 0.027849633991964336,
"grad_norm": 0.9120615124702454,
"learning_rate": 9.996903161600016e-06,
"loss": 0.9103,
"step": 506
},
{
"epoch": 0.027904672794319997,
"grad_norm": 0.9736546874046326,
"learning_rate": 9.996887889035741e-06,
"loss": 0.9308,
"step": 507
},
{
"epoch": 0.027959711596675655,
"grad_norm": 1.0184897184371948,
"learning_rate": 9.996872578916192e-06,
"loss": 0.8978,
"step": 508
},
{
"epoch": 0.028014750399031316,
"grad_norm": 0.9791838526725769,
"learning_rate": 9.996857231241489e-06,
"loss": 0.8639,
"step": 509
},
{
"epoch": 0.028069789201386978,
"grad_norm": 1.2985681295394897,
"learning_rate": 9.996841846011742e-06,
"loss": 0.9581,
"step": 510
},
{
"epoch": 0.02812482800374264,
"grad_norm": 1.0647368431091309,
"learning_rate": 9.996826423227071e-06,
"loss": 1.0565,
"step": 511
},
{
"epoch": 0.0281798668060983,
"grad_norm": 1.0336421728134155,
"learning_rate": 9.996810962887591e-06,
"loss": 1.008,
"step": 512
},
{
"epoch": 0.02823490560845396,
"grad_norm": 1.1838933229446411,
"learning_rate": 9.996795464993416e-06,
"loss": 0.8359,
"step": 513
},
{
"epoch": 0.02828994441080962,
"grad_norm": 0.9898360371589661,
"learning_rate": 9.996779929544663e-06,
"loss": 0.8501,
"step": 514
},
{
"epoch": 0.02834498321316528,
"grad_norm": 0.9836066365242004,
"learning_rate": 9.99676435654145e-06,
"loss": 0.8795,
"step": 515
},
{
"epoch": 0.028400022015520943,
"grad_norm": 1.0621601343154907,
"learning_rate": 9.996748745983895e-06,
"loss": 0.8746,
"step": 516
},
{
"epoch": 0.028455060817876604,
"grad_norm": 1.0082437992095947,
"learning_rate": 9.996733097872113e-06,
"loss": 0.9278,
"step": 517
},
{
"epoch": 0.028510099620232265,
"grad_norm": 0.9903931617736816,
"learning_rate": 9.996717412206222e-06,
"loss": 0.8264,
"step": 518
},
{
"epoch": 0.028565138422587923,
"grad_norm": 1.0797243118286133,
"learning_rate": 9.996701688986342e-06,
"loss": 1.0077,
"step": 519
},
{
"epoch": 0.028620177224943585,
"grad_norm": 1.147133231163025,
"learning_rate": 9.99668592821259e-06,
"loss": 0.9374,
"step": 520
},
{
"epoch": 0.028675216027299246,
"grad_norm": 0.9993947744369507,
"learning_rate": 9.996670129885082e-06,
"loss": 0.9562,
"step": 521
},
{
"epoch": 0.028730254829654907,
"grad_norm": 0.8580895066261292,
"learning_rate": 9.99665429400394e-06,
"loss": 0.7985,
"step": 522
},
{
"epoch": 0.02878529363201057,
"grad_norm": 0.9251388907432556,
"learning_rate": 9.996638420569281e-06,
"loss": 0.7323,
"step": 523
},
{
"epoch": 0.028840332434366227,
"grad_norm": 1.0010193586349487,
"learning_rate": 9.996622509581227e-06,
"loss": 0.9316,
"step": 524
},
{
"epoch": 0.028895371236721888,
"grad_norm": 0.9822579026222229,
"learning_rate": 9.996606561039894e-06,
"loss": 0.8978,
"step": 525
},
{
"epoch": 0.02895041003907755,
"grad_norm": 1.0760595798492432,
"learning_rate": 9.996590574945403e-06,
"loss": 0.9125,
"step": 526
},
{
"epoch": 0.02900544884143321,
"grad_norm": 1.138869285583496,
"learning_rate": 9.996574551297876e-06,
"loss": 0.8185,
"step": 527
},
{
"epoch": 0.029060487643788872,
"grad_norm": 1.002994179725647,
"learning_rate": 9.996558490097433e-06,
"loss": 0.9404,
"step": 528
},
{
"epoch": 0.02911552644614453,
"grad_norm": 0.9550611972808838,
"learning_rate": 9.996542391344194e-06,
"loss": 0.859,
"step": 529
},
{
"epoch": 0.02917056524850019,
"grad_norm": 0.9236055612564087,
"learning_rate": 9.996526255038277e-06,
"loss": 0.7758,
"step": 530
},
{
"epoch": 0.029225604050855853,
"grad_norm": 1.103966474533081,
"learning_rate": 9.996510081179808e-06,
"loss": 1.0147,
"step": 531
},
{
"epoch": 0.029280642853211514,
"grad_norm": 0.9884665012359619,
"learning_rate": 9.996493869768906e-06,
"loss": 0.8784,
"step": 532
},
{
"epoch": 0.029335681655567176,
"grad_norm": 0.9173223376274109,
"learning_rate": 9.996477620805694e-06,
"loss": 0.8741,
"step": 533
},
{
"epoch": 0.029390720457922837,
"grad_norm": 0.965548574924469,
"learning_rate": 9.996461334290294e-06,
"loss": 0.8989,
"step": 534
},
{
"epoch": 0.029445759260278495,
"grad_norm": 0.9939296245574951,
"learning_rate": 9.996445010222828e-06,
"loss": 0.8552,
"step": 535
},
{
"epoch": 0.029500798062634156,
"grad_norm": 1.0081578493118286,
"learning_rate": 9.996428648603417e-06,
"loss": 0.9138,
"step": 536
},
{
"epoch": 0.029555836864989818,
"grad_norm": 1.0139487981796265,
"learning_rate": 9.996412249432188e-06,
"loss": 0.9452,
"step": 537
},
{
"epoch": 0.02961087566734548,
"grad_norm": 0.9463647603988647,
"learning_rate": 9.996395812709262e-06,
"loss": 0.8721,
"step": 538
},
{
"epoch": 0.02966591446970114,
"grad_norm": 0.9981473684310913,
"learning_rate": 9.99637933843476e-06,
"loss": 0.7791,
"step": 539
},
{
"epoch": 0.0297209532720568,
"grad_norm": 1.1637190580368042,
"learning_rate": 9.996362826608812e-06,
"loss": 0.8798,
"step": 540
},
{
"epoch": 0.02977599207441246,
"grad_norm": 2.2887051105499268,
"learning_rate": 9.996346277231536e-06,
"loss": 0.9303,
"step": 541
},
{
"epoch": 0.02983103087676812,
"grad_norm": 0.9173391461372375,
"learning_rate": 9.99632969030306e-06,
"loss": 0.8627,
"step": 542
},
{
"epoch": 0.029886069679123783,
"grad_norm": 1.033355474472046,
"learning_rate": 9.996313065823506e-06,
"loss": 0.9906,
"step": 543
},
{
"epoch": 0.029941108481479444,
"grad_norm": 0.9286639094352722,
"learning_rate": 9.996296403793002e-06,
"loss": 0.7043,
"step": 544
},
{
"epoch": 0.029996147283835102,
"grad_norm": 0.963238000869751,
"learning_rate": 9.996279704211671e-06,
"loss": 1.0236,
"step": 545
},
{
"epoch": 0.030051186086190763,
"grad_norm": 1.0275089740753174,
"learning_rate": 9.99626296707964e-06,
"loss": 0.976,
"step": 546
},
{
"epoch": 0.030106224888546425,
"grad_norm": 1.0944674015045166,
"learning_rate": 9.996246192397032e-06,
"loss": 0.9209,
"step": 547
},
{
"epoch": 0.030161263690902086,
"grad_norm": 0.9620945453643799,
"learning_rate": 9.996229380163976e-06,
"loss": 0.8973,
"step": 548
},
{
"epoch": 0.030216302493257748,
"grad_norm": 1.032549500465393,
"learning_rate": 9.996212530380597e-06,
"loss": 0.892,
"step": 549
},
{
"epoch": 0.03027134129561341,
"grad_norm": 1.0433719158172607,
"learning_rate": 9.996195643047023e-06,
"loss": 0.8428,
"step": 550
},
{
"epoch": 0.030326380097969067,
"grad_norm": 1.1541085243225098,
"learning_rate": 9.996178718163378e-06,
"loss": 0.9084,
"step": 551
},
{
"epoch": 0.03038141890032473,
"grad_norm": 0.9386873245239258,
"learning_rate": 9.996161755729793e-06,
"loss": 0.9246,
"step": 552
},
{
"epoch": 0.03043645770268039,
"grad_norm": 1.092236042022705,
"learning_rate": 9.996144755746393e-06,
"loss": 0.8419,
"step": 553
},
{
"epoch": 0.03049149650503605,
"grad_norm": 0.9517606496810913,
"learning_rate": 9.996127718213306e-06,
"loss": 0.9002,
"step": 554
},
{
"epoch": 0.030546535307391712,
"grad_norm": 0.965972900390625,
"learning_rate": 9.996110643130661e-06,
"loss": 0.9197,
"step": 555
},
{
"epoch": 0.03060157410974737,
"grad_norm": 0.9396095275878906,
"learning_rate": 9.996093530498586e-06,
"loss": 0.8686,
"step": 556
},
{
"epoch": 0.030656612912103032,
"grad_norm": 1.0154120922088623,
"learning_rate": 9.99607638031721e-06,
"loss": 0.9773,
"step": 557
},
{
"epoch": 0.030711651714458693,
"grad_norm": 1.3572301864624023,
"learning_rate": 9.99605919258666e-06,
"loss": 0.911,
"step": 558
},
{
"epoch": 0.030766690516814355,
"grad_norm": 0.968278169631958,
"learning_rate": 9.996041967307066e-06,
"loss": 0.7704,
"step": 559
},
{
"epoch": 0.030821729319170016,
"grad_norm": 0.9867869019508362,
"learning_rate": 9.99602470447856e-06,
"loss": 0.873,
"step": 560
},
{
"epoch": 0.030876768121525677,
"grad_norm": 1.056450605392456,
"learning_rate": 9.996007404101269e-06,
"loss": 0.941,
"step": 561
},
{
"epoch": 0.030931806923881335,
"grad_norm": 1.0419799089431763,
"learning_rate": 9.995990066175321e-06,
"loss": 0.957,
"step": 562
},
{
"epoch": 0.030986845726236997,
"grad_norm": 0.9789314866065979,
"learning_rate": 9.995972690700852e-06,
"loss": 0.9229,
"step": 563
},
{
"epoch": 0.031041884528592658,
"grad_norm": 0.917783796787262,
"learning_rate": 9.995955277677989e-06,
"loss": 0.8186,
"step": 564
},
{
"epoch": 0.03109692333094832,
"grad_norm": 1.0231432914733887,
"learning_rate": 9.995937827106863e-06,
"loss": 0.8624,
"step": 565
},
{
"epoch": 0.03115196213330398,
"grad_norm": 0.9552083015441895,
"learning_rate": 9.995920338987605e-06,
"loss": 0.7967,
"step": 566
},
{
"epoch": 0.03120700093565964,
"grad_norm": 0.9441083669662476,
"learning_rate": 9.995902813320349e-06,
"loss": 0.8471,
"step": 567
},
{
"epoch": 0.0312620397380153,
"grad_norm": 1.0025299787521362,
"learning_rate": 9.995885250105223e-06,
"loss": 0.8646,
"step": 568
},
{
"epoch": 0.03131707854037096,
"grad_norm": 0.8997280597686768,
"learning_rate": 9.99586764934236e-06,
"loss": 0.8736,
"step": 569
},
{
"epoch": 0.03137211734272662,
"grad_norm": 0.9090663194656372,
"learning_rate": 9.995850011031896e-06,
"loss": 0.8548,
"step": 570
},
{
"epoch": 0.031427156145082284,
"grad_norm": 0.9641294479370117,
"learning_rate": 9.995832335173959e-06,
"loss": 0.8667,
"step": 571
},
{
"epoch": 0.031482194947437946,
"grad_norm": 0.9165804982185364,
"learning_rate": 9.995814621768682e-06,
"loss": 0.803,
"step": 572
},
{
"epoch": 0.03153723374979361,
"grad_norm": 0.9672492742538452,
"learning_rate": 9.995796870816202e-06,
"loss": 0.8335,
"step": 573
},
{
"epoch": 0.03159227255214927,
"grad_norm": 0.9359404444694519,
"learning_rate": 9.995779082316648e-06,
"loss": 0.8294,
"step": 574
},
{
"epoch": 0.03164731135450492,
"grad_norm": 0.926925003528595,
"learning_rate": 9.995761256270157e-06,
"loss": 0.7714,
"step": 575
},
{
"epoch": 0.031702350156860584,
"grad_norm": 1.1848629713058472,
"learning_rate": 9.995743392676862e-06,
"loss": 0.8925,
"step": 576
},
{
"epoch": 0.031757388959216246,
"grad_norm": 0.9624786972999573,
"learning_rate": 9.995725491536897e-06,
"loss": 0.9292,
"step": 577
},
{
"epoch": 0.03181242776157191,
"grad_norm": 0.9479736089706421,
"learning_rate": 9.995707552850396e-06,
"loss": 0.8797,
"step": 578
},
{
"epoch": 0.03186746656392757,
"grad_norm": 0.9551546573638916,
"learning_rate": 9.995689576617494e-06,
"loss": 0.8793,
"step": 579
},
{
"epoch": 0.03192250536628323,
"grad_norm": 0.9210056662559509,
"learning_rate": 9.995671562838325e-06,
"loss": 0.9714,
"step": 580
},
{
"epoch": 0.03197754416863889,
"grad_norm": 1.063117504119873,
"learning_rate": 9.995653511513029e-06,
"loss": 0.9608,
"step": 581
},
{
"epoch": 0.03203258297099455,
"grad_norm": 0.9426459670066833,
"learning_rate": 9.995635422641736e-06,
"loss": 0.9102,
"step": 582
},
{
"epoch": 0.032087621773350214,
"grad_norm": 1.0176693201065063,
"learning_rate": 9.995617296224584e-06,
"loss": 0.9109,
"step": 583
},
{
"epoch": 0.032142660575705875,
"grad_norm": 0.9457042217254639,
"learning_rate": 9.995599132261711e-06,
"loss": 0.9017,
"step": 584
},
{
"epoch": 0.03219769937806154,
"grad_norm": 1.5851638317108154,
"learning_rate": 9.995580930753252e-06,
"loss": 0.967,
"step": 585
},
{
"epoch": 0.03225273818041719,
"grad_norm": 0.9961487054824829,
"learning_rate": 9.995562691699345e-06,
"loss": 0.9396,
"step": 586
},
{
"epoch": 0.03230777698277285,
"grad_norm": 0.9892112016677856,
"learning_rate": 9.995544415100125e-06,
"loss": 0.9058,
"step": 587
},
{
"epoch": 0.032362815785128514,
"grad_norm": 0.9052272439002991,
"learning_rate": 9.99552610095573e-06,
"loss": 0.9194,
"step": 588
},
{
"epoch": 0.032417854587484175,
"grad_norm": 0.8381399512290955,
"learning_rate": 9.995507749266297e-06,
"loss": 0.7465,
"step": 589
},
{
"epoch": 0.03247289338983984,
"grad_norm": 1.018964171409607,
"learning_rate": 9.995489360031969e-06,
"loss": 0.841,
"step": 590
},
{
"epoch": 0.0325279321921955,
"grad_norm": 0.908311128616333,
"learning_rate": 9.995470933252876e-06,
"loss": 0.8592,
"step": 591
},
{
"epoch": 0.03258297099455116,
"grad_norm": 1.2986040115356445,
"learning_rate": 9.995452468929162e-06,
"loss": 0.8341,
"step": 592
},
{
"epoch": 0.03263800979690682,
"grad_norm": 1.6565190553665161,
"learning_rate": 9.995433967060966e-06,
"loss": 0.8681,
"step": 593
},
{
"epoch": 0.03269304859926248,
"grad_norm": 0.9725674390792847,
"learning_rate": 9.995415427648423e-06,
"loss": 0.8449,
"step": 594
},
{
"epoch": 0.032748087401618144,
"grad_norm": 0.8683852553367615,
"learning_rate": 9.995396850691677e-06,
"loss": 0.8478,
"step": 595
},
{
"epoch": 0.0328031262039738,
"grad_norm": 0.9912856817245483,
"learning_rate": 9.995378236190862e-06,
"loss": 0.8912,
"step": 596
},
{
"epoch": 0.03285816500632946,
"grad_norm": 0.9396800398826599,
"learning_rate": 9.995359584146125e-06,
"loss": 0.856,
"step": 597
},
{
"epoch": 0.03291320380868512,
"grad_norm": 1.385006308555603,
"learning_rate": 9.995340894557601e-06,
"loss": 0.9633,
"step": 598
},
{
"epoch": 0.03296824261104078,
"grad_norm": 0.8982875943183899,
"learning_rate": 9.995322167425433e-06,
"loss": 0.9244,
"step": 599
},
{
"epoch": 0.033023281413396444,
"grad_norm": 0.8981022834777832,
"learning_rate": 9.995303402749759e-06,
"loss": 0.8854,
"step": 600
},
{
"epoch": 0.033078320215752105,
"grad_norm": 0.9917197227478027,
"learning_rate": 9.995284600530724e-06,
"loss": 1.0086,
"step": 601
},
{
"epoch": 0.033133359018107766,
"grad_norm": 1.0540626049041748,
"learning_rate": 9.995265760768464e-06,
"loss": 1.0022,
"step": 602
},
{
"epoch": 0.03318839782046343,
"grad_norm": 0.9523479342460632,
"learning_rate": 9.995246883463126e-06,
"loss": 0.9893,
"step": 603
},
{
"epoch": 0.03324343662281909,
"grad_norm": 0.9824770092964172,
"learning_rate": 9.99522796861485e-06,
"loss": 0.8385,
"step": 604
},
{
"epoch": 0.03329847542517475,
"grad_norm": 1.0968893766403198,
"learning_rate": 9.995209016223776e-06,
"loss": 1.0109,
"step": 605
},
{
"epoch": 0.03335351422753041,
"grad_norm": 0.9115625023841858,
"learning_rate": 9.995190026290049e-06,
"loss": 0.8656,
"step": 606
},
{
"epoch": 0.033408553029886066,
"grad_norm": 0.9795814156532288,
"learning_rate": 9.99517099881381e-06,
"loss": 0.8941,
"step": 607
},
{
"epoch": 0.03346359183224173,
"grad_norm": 0.9317291378974915,
"learning_rate": 9.995151933795204e-06,
"loss": 0.7819,
"step": 608
},
{
"epoch": 0.03351863063459739,
"grad_norm": 0.9936283230781555,
"learning_rate": 9.995132831234373e-06,
"loss": 0.8674,
"step": 609
},
{
"epoch": 0.03357366943695305,
"grad_norm": 0.9872812032699585,
"learning_rate": 9.995113691131462e-06,
"loss": 0.9038,
"step": 610
},
{
"epoch": 0.03362870823930871,
"grad_norm": 0.9516895413398743,
"learning_rate": 9.995094513486611e-06,
"loss": 0.9038,
"step": 611
},
{
"epoch": 0.03368374704166437,
"grad_norm": 1.090579867362976,
"learning_rate": 9.995075298299968e-06,
"loss": 0.9587,
"step": 612
},
{
"epoch": 0.033738785844020035,
"grad_norm": 1.021398663520813,
"learning_rate": 9.995056045571677e-06,
"loss": 0.9569,
"step": 613
},
{
"epoch": 0.033793824646375696,
"grad_norm": 1.009657382965088,
"learning_rate": 9.99503675530188e-06,
"loss": 0.8346,
"step": 614
},
{
"epoch": 0.03384886344873136,
"grad_norm": 1.0478712320327759,
"learning_rate": 9.995017427490725e-06,
"loss": 1.0566,
"step": 615
},
{
"epoch": 0.03390390225108702,
"grad_norm": 1.1391830444335938,
"learning_rate": 9.994998062138355e-06,
"loss": 1.0727,
"step": 616
},
{
"epoch": 0.03395894105344268,
"grad_norm": 1.0172302722930908,
"learning_rate": 9.994978659244918e-06,
"loss": 0.7869,
"step": 617
},
{
"epoch": 0.034013979855798335,
"grad_norm": 1.0532630681991577,
"learning_rate": 9.994959218810558e-06,
"loss": 0.8626,
"step": 618
},
{
"epoch": 0.034069018658153996,
"grad_norm": 0.8300478458404541,
"learning_rate": 9.99493974083542e-06,
"loss": 0.8166,
"step": 619
},
{
"epoch": 0.03412405746050966,
"grad_norm": 1.0613664388656616,
"learning_rate": 9.994920225319656e-06,
"loss": 0.8899,
"step": 620
},
{
"epoch": 0.03417909626286532,
"grad_norm": 0.9827042818069458,
"learning_rate": 9.994900672263406e-06,
"loss": 0.8243,
"step": 621
},
{
"epoch": 0.03423413506522098,
"grad_norm": 0.8790082931518555,
"learning_rate": 9.994881081666818e-06,
"loss": 0.8153,
"step": 622
},
{
"epoch": 0.03428917386757664,
"grad_norm": 1.033378005027771,
"learning_rate": 9.994861453530044e-06,
"loss": 0.8916,
"step": 623
},
{
"epoch": 0.0343442126699323,
"grad_norm": 0.9547238349914551,
"learning_rate": 9.994841787853227e-06,
"loss": 0.9141,
"step": 624
},
{
"epoch": 0.034399251472287964,
"grad_norm": 0.9606438279151917,
"learning_rate": 9.994822084636514e-06,
"loss": 0.9435,
"step": 625
},
{
"epoch": 0.034454290274643626,
"grad_norm": 0.8461503982543945,
"learning_rate": 9.994802343880059e-06,
"loss": 0.7914,
"step": 626
},
{
"epoch": 0.03450932907699929,
"grad_norm": 1.144538402557373,
"learning_rate": 9.994782565584004e-06,
"loss": 0.8025,
"step": 627
},
{
"epoch": 0.03456436787935495,
"grad_norm": 1.0099962949752808,
"learning_rate": 9.994762749748502e-06,
"loss": 0.9607,
"step": 628
},
{
"epoch": 0.0346194066817106,
"grad_norm": 0.9822041988372803,
"learning_rate": 9.9947428963737e-06,
"loss": 0.9216,
"step": 629
},
{
"epoch": 0.034674445484066264,
"grad_norm": 0.9056866765022278,
"learning_rate": 9.994723005459746e-06,
"loss": 0.7913,
"step": 630
},
{
"epoch": 0.034729484286421926,
"grad_norm": 1.0099287033081055,
"learning_rate": 9.994703077006792e-06,
"loss": 0.9937,
"step": 631
},
{
"epoch": 0.03478452308877759,
"grad_norm": 0.9559167623519897,
"learning_rate": 9.994683111014984e-06,
"loss": 0.9774,
"step": 632
},
{
"epoch": 0.03483956189113325,
"grad_norm": 1.0359059572219849,
"learning_rate": 9.994663107484478e-06,
"loss": 0.9062,
"step": 633
},
{
"epoch": 0.03489460069348891,
"grad_norm": 0.8803057074546814,
"learning_rate": 9.99464306641542e-06,
"loss": 0.9638,
"step": 634
},
{
"epoch": 0.03494963949584457,
"grad_norm": 1.0926579236984253,
"learning_rate": 9.994622987807962e-06,
"loss": 1.0467,
"step": 635
},
{
"epoch": 0.03500467829820023,
"grad_norm": 1.0051401853561401,
"learning_rate": 9.994602871662253e-06,
"loss": 0.8717,
"step": 636
},
{
"epoch": 0.035059717100555894,
"grad_norm": 1.2007508277893066,
"learning_rate": 9.994582717978448e-06,
"loss": 0.8004,
"step": 637
},
{
"epoch": 0.035114755902911556,
"grad_norm": 0.8826266527175903,
"learning_rate": 9.994562526756695e-06,
"loss": 0.8888,
"step": 638
},
{
"epoch": 0.03516979470526721,
"grad_norm": 0.9953717589378357,
"learning_rate": 9.994542297997147e-06,
"loss": 0.8999,
"step": 639
},
{
"epoch": 0.03522483350762287,
"grad_norm": 1.0203614234924316,
"learning_rate": 9.994522031699958e-06,
"loss": 0.8241,
"step": 640
},
{
"epoch": 0.03527987230997853,
"grad_norm": 0.8760203719139099,
"learning_rate": 9.994501727865276e-06,
"loss": 0.7893,
"step": 641
},
{
"epoch": 0.035334911112334194,
"grad_norm": 1.024888277053833,
"learning_rate": 9.994481386493257e-06,
"loss": 0.9865,
"step": 642
},
{
"epoch": 0.035389949914689856,
"grad_norm": 0.907454788684845,
"learning_rate": 9.994461007584052e-06,
"loss": 0.891,
"step": 643
},
{
"epoch": 0.03544498871704552,
"grad_norm": 1.0400965213775635,
"learning_rate": 9.994440591137816e-06,
"loss": 0.9345,
"step": 644
},
{
"epoch": 0.03550002751940118,
"grad_norm": 0.9816616177558899,
"learning_rate": 9.9944201371547e-06,
"loss": 0.91,
"step": 645
},
{
"epoch": 0.03555506632175684,
"grad_norm": 1.0528117418289185,
"learning_rate": 9.99439964563486e-06,
"loss": 0.952,
"step": 646
},
{
"epoch": 0.0356101051241125,
"grad_norm": 0.9802080988883972,
"learning_rate": 9.99437911657845e-06,
"loss": 0.9392,
"step": 647
},
{
"epoch": 0.03566514392646816,
"grad_norm": 0.9580393433570862,
"learning_rate": 9.994358549985623e-06,
"loss": 0.874,
"step": 648
},
{
"epoch": 0.035720182728823824,
"grad_norm": 0.8935576677322388,
"learning_rate": 9.994337945856533e-06,
"loss": 0.8435,
"step": 649
},
{
"epoch": 0.03577522153117948,
"grad_norm": 1.009699821472168,
"learning_rate": 9.994317304191337e-06,
"loss": 0.9436,
"step": 650
},
{
"epoch": 0.03583026033353514,
"grad_norm": 0.9126121401786804,
"learning_rate": 9.994296624990188e-06,
"loss": 0.8424,
"step": 651
},
{
"epoch": 0.0358852991358908,
"grad_norm": 0.9555553197860718,
"learning_rate": 9.994275908253243e-06,
"loss": 0.93,
"step": 652
},
{
"epoch": 0.03594033793824646,
"grad_norm": 0.8359857797622681,
"learning_rate": 9.994255153980658e-06,
"loss": 0.6326,
"step": 653
},
{
"epoch": 0.035995376740602124,
"grad_norm": 0.8918783664703369,
"learning_rate": 9.994234362172587e-06,
"loss": 0.8287,
"step": 654
},
{
"epoch": 0.036050415542957785,
"grad_norm": 0.9878549575805664,
"learning_rate": 9.994213532829188e-06,
"loss": 0.8841,
"step": 655
},
{
"epoch": 0.03610545434531345,
"grad_norm": 0.9504040479660034,
"learning_rate": 9.994192665950617e-06,
"loss": 1.0182,
"step": 656
},
{
"epoch": 0.03616049314766911,
"grad_norm": 0.9531422257423401,
"learning_rate": 9.99417176153703e-06,
"loss": 0.8504,
"step": 657
},
{
"epoch": 0.03621553195002477,
"grad_norm": 0.9580292105674744,
"learning_rate": 9.994150819588587e-06,
"loss": 0.8048,
"step": 658
},
{
"epoch": 0.03627057075238043,
"grad_norm": 0.9786819815635681,
"learning_rate": 9.99412984010544e-06,
"loss": 0.9124,
"step": 659
},
{
"epoch": 0.03632560955473609,
"grad_norm": 0.9733422994613647,
"learning_rate": 9.994108823087751e-06,
"loss": 0.8868,
"step": 660
},
{
"epoch": 0.03638064835709175,
"grad_norm": 1.093173623085022,
"learning_rate": 9.994087768535679e-06,
"loss": 0.9428,
"step": 661
},
{
"epoch": 0.03643568715944741,
"grad_norm": 0.9067148566246033,
"learning_rate": 9.994066676449378e-06,
"loss": 0.8838,
"step": 662
},
{
"epoch": 0.03649072596180307,
"grad_norm": 0.9509521722793579,
"learning_rate": 9.99404554682901e-06,
"loss": 0.9034,
"step": 663
},
{
"epoch": 0.03654576476415873,
"grad_norm": 0.9523824453353882,
"learning_rate": 9.994024379674731e-06,
"loss": 0.9623,
"step": 664
},
{
"epoch": 0.03660080356651439,
"grad_norm": 0.987276554107666,
"learning_rate": 9.994003174986703e-06,
"loss": 0.8817,
"step": 665
},
{
"epoch": 0.036655842368870054,
"grad_norm": 0.9500744342803955,
"learning_rate": 9.993981932765083e-06,
"loss": 0.9742,
"step": 666
},
{
"epoch": 0.036710881171225715,
"grad_norm": 0.9420705437660217,
"learning_rate": 9.993960653010034e-06,
"loss": 0.9657,
"step": 667
},
{
"epoch": 0.036765919973581376,
"grad_norm": 0.9443248510360718,
"learning_rate": 9.99393933572171e-06,
"loss": 0.8468,
"step": 668
},
{
"epoch": 0.03682095877593704,
"grad_norm": 0.9666558504104614,
"learning_rate": 9.993917980900276e-06,
"loss": 0.9871,
"step": 669
},
{
"epoch": 0.0368759975782927,
"grad_norm": 1.0236201286315918,
"learning_rate": 9.993896588545892e-06,
"loss": 0.9814,
"step": 670
},
{
"epoch": 0.03693103638064836,
"grad_norm": 1.016190528869629,
"learning_rate": 9.993875158658716e-06,
"loss": 1.0156,
"step": 671
},
{
"epoch": 0.036986075183004015,
"grad_norm": 0.9296661019325256,
"learning_rate": 9.993853691238913e-06,
"loss": 0.7956,
"step": 672
},
{
"epoch": 0.037041113985359676,
"grad_norm": 0.9276684522628784,
"learning_rate": 9.993832186286643e-06,
"loss": 0.9253,
"step": 673
},
{
"epoch": 0.03709615278771534,
"grad_norm": 0.8588787913322449,
"learning_rate": 9.993810643802065e-06,
"loss": 0.7878,
"step": 674
},
{
"epoch": 0.037151191590071,
"grad_norm": 0.9955212473869324,
"learning_rate": 9.993789063785344e-06,
"loss": 0.8711,
"step": 675
},
{
"epoch": 0.03720623039242666,
"grad_norm": 0.925578236579895,
"learning_rate": 9.993767446236642e-06,
"loss": 0.9431,
"step": 676
},
{
"epoch": 0.03726126919478232,
"grad_norm": 0.9610552787780762,
"learning_rate": 9.99374579115612e-06,
"loss": 0.887,
"step": 677
},
{
"epoch": 0.03731630799713798,
"grad_norm": 1.0052428245544434,
"learning_rate": 9.99372409854394e-06,
"loss": 0.8751,
"step": 678
},
{
"epoch": 0.037371346799493645,
"grad_norm": 0.9503066539764404,
"learning_rate": 9.99370236840027e-06,
"loss": 0.8556,
"step": 679
},
{
"epoch": 0.037426385601849306,
"grad_norm": 2.426232099533081,
"learning_rate": 9.993680600725266e-06,
"loss": 0.9077,
"step": 680
},
{
"epoch": 0.03748142440420497,
"grad_norm": 0.9119723439216614,
"learning_rate": 9.993658795519096e-06,
"loss": 0.8575,
"step": 681
},
{
"epoch": 0.03753646320656062,
"grad_norm": 0.9688286781311035,
"learning_rate": 9.993636952781923e-06,
"loss": 0.8921,
"step": 682
},
{
"epoch": 0.03759150200891628,
"grad_norm": 1.030013084411621,
"learning_rate": 9.993615072513913e-06,
"loss": 0.8622,
"step": 683
},
{
"epoch": 0.037646540811271945,
"grad_norm": 1.055187463760376,
"learning_rate": 9.993593154715228e-06,
"loss": 0.9251,
"step": 684
},
{
"epoch": 0.037701579613627606,
"grad_norm": 1.0518591403961182,
"learning_rate": 9.993571199386032e-06,
"loss": 0.9575,
"step": 685
},
{
"epoch": 0.03775661841598327,
"grad_norm": 0.9232666492462158,
"learning_rate": 9.993549206526495e-06,
"loss": 0.8522,
"step": 686
},
{
"epoch": 0.03781165721833893,
"grad_norm": 1.0212332010269165,
"learning_rate": 9.993527176136775e-06,
"loss": 0.9358,
"step": 687
},
{
"epoch": 0.03786669602069459,
"grad_norm": 0.9137141108512878,
"learning_rate": 9.993505108217045e-06,
"loss": 0.8561,
"step": 688
},
{
"epoch": 0.03792173482305025,
"grad_norm": 1.0069375038146973,
"learning_rate": 9.993483002767465e-06,
"loss": 0.8274,
"step": 689
},
{
"epoch": 0.03797677362540591,
"grad_norm": 0.9820672869682312,
"learning_rate": 9.993460859788204e-06,
"loss": 0.907,
"step": 690
},
{
"epoch": 0.038031812427761574,
"grad_norm": 1.0042002201080322,
"learning_rate": 9.993438679279428e-06,
"loss": 0.9263,
"step": 691
},
{
"epoch": 0.038086851230117236,
"grad_norm": 0.9733695983886719,
"learning_rate": 9.993416461241304e-06,
"loss": 0.8455,
"step": 692
},
{
"epoch": 0.03814189003247289,
"grad_norm": 0.9106015563011169,
"learning_rate": 9.993394205673996e-06,
"loss": 0.8469,
"step": 693
},
{
"epoch": 0.03819692883482855,
"grad_norm": 0.9802660346031189,
"learning_rate": 9.993371912577677e-06,
"loss": 0.8662,
"step": 694
},
{
"epoch": 0.03825196763718421,
"grad_norm": 0.9183964729309082,
"learning_rate": 9.99334958195251e-06,
"loss": 0.8968,
"step": 695
},
{
"epoch": 0.038307006439539874,
"grad_norm": 0.9572185277938843,
"learning_rate": 9.993327213798663e-06,
"loss": 0.953,
"step": 696
},
{
"epoch": 0.038362045241895536,
"grad_norm": 1.4480071067810059,
"learning_rate": 9.993304808116307e-06,
"loss": 1.1131,
"step": 697
},
{
"epoch": 0.0384170840442512,
"grad_norm": 0.9297361969947815,
"learning_rate": 9.993282364905607e-06,
"loss": 0.884,
"step": 698
},
{
"epoch": 0.03847212284660686,
"grad_norm": 0.9400073885917664,
"learning_rate": 9.993259884166735e-06,
"loss": 0.932,
"step": 699
},
{
"epoch": 0.03852716164896252,
"grad_norm": 0.9231798052787781,
"learning_rate": 9.993237365899858e-06,
"loss": 0.8981,
"step": 700
},
{
"epoch": 0.03858220045131818,
"grad_norm": 0.8233712911605835,
"learning_rate": 9.993214810105144e-06,
"loss": 0.8218,
"step": 701
},
{
"epoch": 0.03863723925367384,
"grad_norm": 1.0997854471206665,
"learning_rate": 9.993192216782768e-06,
"loss": 0.9298,
"step": 702
},
{
"epoch": 0.038692278056029504,
"grad_norm": 0.9570802450180054,
"learning_rate": 9.993169585932893e-06,
"loss": 0.7815,
"step": 703
},
{
"epoch": 0.03874731685838516,
"grad_norm": 0.9913730025291443,
"learning_rate": 9.993146917555692e-06,
"loss": 0.9621,
"step": 704
},
{
"epoch": 0.03880235566074082,
"grad_norm": 1.088767409324646,
"learning_rate": 9.993124211651334e-06,
"loss": 0.9295,
"step": 705
},
{
"epoch": 0.03885739446309648,
"grad_norm": 0.8199124336242676,
"learning_rate": 9.993101468219995e-06,
"loss": 0.7613,
"step": 706
},
{
"epoch": 0.03891243326545214,
"grad_norm": 1.112566351890564,
"learning_rate": 9.99307868726184e-06,
"loss": 0.791,
"step": 707
},
{
"epoch": 0.038967472067807804,
"grad_norm": 0.9372578859329224,
"learning_rate": 9.99305586877704e-06,
"loss": 0.8567,
"step": 708
},
{
"epoch": 0.039022510870163465,
"grad_norm": 1.0167721509933472,
"learning_rate": 9.99303301276577e-06,
"loss": 0.9787,
"step": 709
},
{
"epoch": 0.03907754967251913,
"grad_norm": 1.3526856899261475,
"learning_rate": 9.993010119228202e-06,
"loss": 1.2215,
"step": 710
},
{
"epoch": 0.03913258847487479,
"grad_norm": 0.8819016814231873,
"learning_rate": 9.992987188164505e-06,
"loss": 0.7736,
"step": 711
},
{
"epoch": 0.03918762727723045,
"grad_norm": 1.0033677816390991,
"learning_rate": 9.992964219574852e-06,
"loss": 0.9919,
"step": 712
},
{
"epoch": 0.03924266607958611,
"grad_norm": 0.894926130771637,
"learning_rate": 9.992941213459417e-06,
"loss": 0.9058,
"step": 713
},
{
"epoch": 0.03929770488194177,
"grad_norm": 0.9481377005577087,
"learning_rate": 9.992918169818373e-06,
"loss": 0.8436,
"step": 714
},
{
"epoch": 0.03935274368429743,
"grad_norm": 0.9312933087348938,
"learning_rate": 9.992895088651893e-06,
"loss": 0.8869,
"step": 715
},
{
"epoch": 0.03940778248665309,
"grad_norm": 0.9765705466270447,
"learning_rate": 9.99287196996015e-06,
"loss": 0.9512,
"step": 716
},
{
"epoch": 0.03946282128900875,
"grad_norm": 0.9610235691070557,
"learning_rate": 9.992848813743317e-06,
"loss": 0.8005,
"step": 717
},
{
"epoch": 0.03951786009136441,
"grad_norm": 1.102995753288269,
"learning_rate": 9.99282562000157e-06,
"loss": 0.8017,
"step": 718
},
{
"epoch": 0.03957289889372007,
"grad_norm": 1.023317575454712,
"learning_rate": 9.99280238873508e-06,
"loss": 0.911,
"step": 719
},
{
"epoch": 0.039627937696075734,
"grad_norm": 1.0531049966812134,
"learning_rate": 9.992779119944025e-06,
"loss": 0.8562,
"step": 720
},
{
"epoch": 0.039682976498431395,
"grad_norm": 0.918250322341919,
"learning_rate": 9.992755813628579e-06,
"loss": 0.92,
"step": 721
},
{
"epoch": 0.039738015300787057,
"grad_norm": 0.8508251309394836,
"learning_rate": 9.992732469788915e-06,
"loss": 0.7347,
"step": 722
},
{
"epoch": 0.03979305410314272,
"grad_norm": 0.9184926152229309,
"learning_rate": 9.992709088425211e-06,
"loss": 0.8732,
"step": 723
},
{
"epoch": 0.03984809290549838,
"grad_norm": 1.1613929271697998,
"learning_rate": 9.992685669537643e-06,
"loss": 0.9522,
"step": 724
},
{
"epoch": 0.039903131707854034,
"grad_norm": 1.091513752937317,
"learning_rate": 9.992662213126386e-06,
"loss": 0.9646,
"step": 725
},
{
"epoch": 0.039958170510209695,
"grad_norm": 1.057803750038147,
"learning_rate": 9.992638719191615e-06,
"loss": 0.7032,
"step": 726
},
{
"epoch": 0.040013209312565357,
"grad_norm": 0.8771823644638062,
"learning_rate": 9.992615187733508e-06,
"loss": 0.8577,
"step": 727
},
{
"epoch": 0.04006824811492102,
"grad_norm": 0.9471028447151184,
"learning_rate": 9.992591618752244e-06,
"loss": 0.9057,
"step": 728
},
{
"epoch": 0.04012328691727668,
"grad_norm": 0.9547705054283142,
"learning_rate": 9.992568012247995e-06,
"loss": 0.9549,
"step": 729
},
{
"epoch": 0.04017832571963234,
"grad_norm": 0.8862974047660828,
"learning_rate": 9.992544368220941e-06,
"loss": 0.8593,
"step": 730
},
{
"epoch": 0.040233364521988,
"grad_norm": 0.906334400177002,
"learning_rate": 9.992520686671261e-06,
"loss": 0.8832,
"step": 731
},
{
"epoch": 0.04028840332434366,
"grad_norm": 1.07270085811615,
"learning_rate": 9.992496967599133e-06,
"loss": 0.9409,
"step": 732
},
{
"epoch": 0.040343442126699325,
"grad_norm": 0.9026005268096924,
"learning_rate": 9.992473211004734e-06,
"loss": 0.8326,
"step": 733
},
{
"epoch": 0.040398480929054986,
"grad_norm": 0.9762942790985107,
"learning_rate": 9.992449416888241e-06,
"loss": 0.9048,
"step": 734
},
{
"epoch": 0.04045351973141065,
"grad_norm": 0.9658033847808838,
"learning_rate": 9.992425585249837e-06,
"loss": 0.9219,
"step": 735
},
{
"epoch": 0.0405085585337663,
"grad_norm": 0.8909044861793518,
"learning_rate": 9.992401716089698e-06,
"loss": 0.8564,
"step": 736
},
{
"epoch": 0.04056359733612196,
"grad_norm": 1.0387929677963257,
"learning_rate": 9.992377809408001e-06,
"loss": 0.9533,
"step": 737
},
{
"epoch": 0.040618636138477625,
"grad_norm": 0.9044275879859924,
"learning_rate": 9.99235386520493e-06,
"loss": 0.8508,
"step": 738
},
{
"epoch": 0.040673674940833286,
"grad_norm": 1.019377589225769,
"learning_rate": 9.992329883480667e-06,
"loss": 0.8684,
"step": 739
},
{
"epoch": 0.04072871374318895,
"grad_norm": 0.9394627213478088,
"learning_rate": 9.992305864235385e-06,
"loss": 0.7665,
"step": 740
},
{
"epoch": 0.04078375254554461,
"grad_norm": 0.8652323484420776,
"learning_rate": 9.99228180746927e-06,
"loss": 0.8576,
"step": 741
},
{
"epoch": 0.04083879134790027,
"grad_norm": 0.9347619414329529,
"learning_rate": 9.992257713182502e-06,
"loss": 0.9586,
"step": 742
},
{
"epoch": 0.04089383015025593,
"grad_norm": 0.9510203003883362,
"learning_rate": 9.99223358137526e-06,
"loss": 0.9092,
"step": 743
},
{
"epoch": 0.04094886895261159,
"grad_norm": 0.8242866396903992,
"learning_rate": 9.992209412047729e-06,
"loss": 0.6997,
"step": 744
},
{
"epoch": 0.041003907754967255,
"grad_norm": 0.8842730522155762,
"learning_rate": 9.992185205200087e-06,
"loss": 0.8873,
"step": 745
},
{
"epoch": 0.041058946557322916,
"grad_norm": 1.0813730955123901,
"learning_rate": 9.992160960832518e-06,
"loss": 1.0162,
"step": 746
},
{
"epoch": 0.04111398535967857,
"grad_norm": 1.1276283264160156,
"learning_rate": 9.9921366789452e-06,
"loss": 1.0004,
"step": 747
},
{
"epoch": 0.04116902416203423,
"grad_norm": 0.8810326457023621,
"learning_rate": 9.992112359538323e-06,
"loss": 0.7823,
"step": 748
},
{
"epoch": 0.04122406296438989,
"grad_norm": 0.9939407110214233,
"learning_rate": 9.992088002612066e-06,
"loss": 1.0016,
"step": 749
},
{
"epoch": 0.041279101766745555,
"grad_norm": 1.0963523387908936,
"learning_rate": 9.99206360816661e-06,
"loss": 0.9252,
"step": 750
},
{
"epoch": 0.041334140569101216,
"grad_norm": 1.1346478462219238,
"learning_rate": 9.99203917620214e-06,
"loss": 0.9608,
"step": 751
},
{
"epoch": 0.04138917937145688,
"grad_norm": 1.0108580589294434,
"learning_rate": 9.992014706718841e-06,
"loss": 0.9179,
"step": 752
},
{
"epoch": 0.04144421817381254,
"grad_norm": 0.897293210029602,
"learning_rate": 9.991990199716894e-06,
"loss": 0.9295,
"step": 753
},
{
"epoch": 0.0414992569761682,
"grad_norm": 1.0152363777160645,
"learning_rate": 9.991965655196488e-06,
"loss": 0.8467,
"step": 754
},
{
"epoch": 0.04155429577852386,
"grad_norm": 0.8655388355255127,
"learning_rate": 9.9919410731578e-06,
"loss": 0.796,
"step": 755
},
{
"epoch": 0.04160933458087952,
"grad_norm": 1.0140331983566284,
"learning_rate": 9.991916453601023e-06,
"loss": 0.8444,
"step": 756
},
{
"epoch": 0.041664373383235184,
"grad_norm": 0.9387341141700745,
"learning_rate": 9.991891796526338e-06,
"loss": 0.8669,
"step": 757
},
{
"epoch": 0.04171941218559084,
"grad_norm": 0.9395696520805359,
"learning_rate": 9.991867101933928e-06,
"loss": 0.8376,
"step": 758
},
{
"epoch": 0.0417744509879465,
"grad_norm": 1.0856634378433228,
"learning_rate": 9.991842369823983e-06,
"loss": 0.9271,
"step": 759
},
{
"epoch": 0.04182948979030216,
"grad_norm": 0.8777190446853638,
"learning_rate": 9.991817600196687e-06,
"loss": 0.9197,
"step": 760
},
{
"epoch": 0.04188452859265782,
"grad_norm": 0.9639917016029358,
"learning_rate": 9.991792793052225e-06,
"loss": 0.8835,
"step": 761
},
{
"epoch": 0.041939567395013484,
"grad_norm": 0.9384773969650269,
"learning_rate": 9.991767948390785e-06,
"loss": 0.8403,
"step": 762
},
{
"epoch": 0.041994606197369146,
"grad_norm": 0.8987650275230408,
"learning_rate": 9.991743066212554e-06,
"loss": 0.7948,
"step": 763
},
{
"epoch": 0.04204964499972481,
"grad_norm": 1.0545049905776978,
"learning_rate": 9.991718146517717e-06,
"loss": 0.9359,
"step": 764
},
{
"epoch": 0.04210468380208047,
"grad_norm": 0.9840022325515747,
"learning_rate": 9.991693189306463e-06,
"loss": 0.9188,
"step": 765
},
{
"epoch": 0.04215972260443613,
"grad_norm": 0.8769927620887756,
"learning_rate": 9.991668194578981e-06,
"loss": 0.8647,
"step": 766
},
{
"epoch": 0.04221476140679179,
"grad_norm": 0.9268791675567627,
"learning_rate": 9.991643162335455e-06,
"loss": 0.897,
"step": 767
},
{
"epoch": 0.042269800209147446,
"grad_norm": 0.9316747784614563,
"learning_rate": 9.991618092576075e-06,
"loss": 0.9341,
"step": 768
},
{
"epoch": 0.04232483901150311,
"grad_norm": 0.8348364233970642,
"learning_rate": 9.991592985301031e-06,
"loss": 0.7528,
"step": 769
},
{
"epoch": 0.04237987781385877,
"grad_norm": 0.9139068126678467,
"learning_rate": 9.99156784051051e-06,
"loss": 0.8596,
"step": 770
},
{
"epoch": 0.04243491661621443,
"grad_norm": 0.9403928518295288,
"learning_rate": 9.991542658204701e-06,
"loss": 0.974,
"step": 771
},
{
"epoch": 0.04248995541857009,
"grad_norm": 0.993549108505249,
"learning_rate": 9.991517438383793e-06,
"loss": 0.9479,
"step": 772
},
{
"epoch": 0.04254499422092575,
"grad_norm": 0.8494916558265686,
"learning_rate": 9.991492181047975e-06,
"loss": 0.9149,
"step": 773
},
{
"epoch": 0.042600033023281414,
"grad_norm": 1.0351910591125488,
"learning_rate": 9.991466886197441e-06,
"loss": 0.9552,
"step": 774
},
{
"epoch": 0.042655071825637075,
"grad_norm": 0.916829526424408,
"learning_rate": 9.991441553832375e-06,
"loss": 0.8781,
"step": 775
},
{
"epoch": 0.04271011062799274,
"grad_norm": 1.113476276397705,
"learning_rate": 9.991416183952972e-06,
"loss": 0.8137,
"step": 776
},
{
"epoch": 0.0427651494303484,
"grad_norm": 1.1608171463012695,
"learning_rate": 9.991390776559421e-06,
"loss": 1.0045,
"step": 777
},
{
"epoch": 0.04282018823270406,
"grad_norm": 1.0045493841171265,
"learning_rate": 9.991365331651913e-06,
"loss": 0.8813,
"step": 778
},
{
"epoch": 0.042875227035059714,
"grad_norm": 0.918820858001709,
"learning_rate": 9.991339849230639e-06,
"loss": 0.9198,
"step": 779
},
{
"epoch": 0.042930265837415375,
"grad_norm": 0.9875735640525818,
"learning_rate": 9.991314329295792e-06,
"loss": 0.8665,
"step": 780
},
{
"epoch": 0.04298530463977104,
"grad_norm": 0.873768150806427,
"learning_rate": 9.991288771847561e-06,
"loss": 0.8606,
"step": 781
},
{
"epoch": 0.0430403434421267,
"grad_norm": 0.8892746567726135,
"learning_rate": 9.991263176886139e-06,
"loss": 0.9011,
"step": 782
},
{
"epoch": 0.04309538224448236,
"grad_norm": 1.097734808921814,
"learning_rate": 9.99123754441172e-06,
"loss": 1.009,
"step": 783
},
{
"epoch": 0.04315042104683802,
"grad_norm": 1.0065964460372925,
"learning_rate": 9.991211874424497e-06,
"loss": 0.9492,
"step": 784
},
{
"epoch": 0.04320545984919368,
"grad_norm": 1.0791678428649902,
"learning_rate": 9.99118616692466e-06,
"loss": 1.0142,
"step": 785
},
{
"epoch": 0.043260498651549344,
"grad_norm": 0.9454777836799622,
"learning_rate": 9.991160421912404e-06,
"loss": 0.8058,
"step": 786
},
{
"epoch": 0.043315537453905005,
"grad_norm": 0.9448156952857971,
"learning_rate": 9.991134639387922e-06,
"loss": 0.8184,
"step": 787
},
{
"epoch": 0.043370576256260666,
"grad_norm": 0.9636550545692444,
"learning_rate": 9.99110881935141e-06,
"loss": 0.8606,
"step": 788
},
{
"epoch": 0.04342561505861633,
"grad_norm": 0.9933613538742065,
"learning_rate": 9.991082961803058e-06,
"loss": 0.9449,
"step": 789
},
{
"epoch": 0.04348065386097198,
"grad_norm": 0.8906797170639038,
"learning_rate": 9.991057066743065e-06,
"loss": 0.8053,
"step": 790
},
{
"epoch": 0.043535692663327644,
"grad_norm": 1.0393906831741333,
"learning_rate": 9.991031134171621e-06,
"loss": 0.8487,
"step": 791
},
{
"epoch": 0.043590731465683305,
"grad_norm": 1.0618231296539307,
"learning_rate": 9.991005164088923e-06,
"loss": 0.9847,
"step": 792
},
{
"epoch": 0.043645770268038966,
"grad_norm": 0.9525149464607239,
"learning_rate": 9.990979156495167e-06,
"loss": 0.9318,
"step": 793
},
{
"epoch": 0.04370080907039463,
"grad_norm": 0.9430851936340332,
"learning_rate": 9.990953111390546e-06,
"loss": 0.8483,
"step": 794
},
{
"epoch": 0.04375584787275029,
"grad_norm": 0.9259672164916992,
"learning_rate": 9.99092702877526e-06,
"loss": 0.9365,
"step": 795
},
{
"epoch": 0.04381088667510595,
"grad_norm": 0.942609965801239,
"learning_rate": 9.9909009086495e-06,
"loss": 0.8408,
"step": 796
},
{
"epoch": 0.04386592547746161,
"grad_norm": 0.939255952835083,
"learning_rate": 9.990874751013467e-06,
"loss": 0.8749,
"step": 797
},
{
"epoch": 0.04392096427981727,
"grad_norm": 1.1701711416244507,
"learning_rate": 9.990848555867353e-06,
"loss": 0.9312,
"step": 798
},
{
"epoch": 0.043976003082172935,
"grad_norm": 1.0441124439239502,
"learning_rate": 9.990822323211358e-06,
"loss": 0.8618,
"step": 799
},
{
"epoch": 0.04403104188452859,
"grad_norm": 0.9601489305496216,
"learning_rate": 9.990796053045679e-06,
"loss": 0.9569,
"step": 800
},
{
"epoch": 0.04408608068688425,
"grad_norm": 0.9394032955169678,
"learning_rate": 9.990769745370513e-06,
"loss": 0.846,
"step": 801
},
{
"epoch": 0.04414111948923991,
"grad_norm": 0.9631348252296448,
"learning_rate": 9.990743400186056e-06,
"loss": 0.8754,
"step": 802
},
{
"epoch": 0.04419615829159557,
"grad_norm": 0.9234963059425354,
"learning_rate": 9.990717017492508e-06,
"loss": 0.8613,
"step": 803
},
{
"epoch": 0.044251197093951235,
"grad_norm": 0.9169090390205383,
"learning_rate": 9.990690597290069e-06,
"loss": 0.8867,
"step": 804
},
{
"epoch": 0.044306235896306896,
"grad_norm": 1.0194867849349976,
"learning_rate": 9.990664139578933e-06,
"loss": 0.8675,
"step": 805
},
{
"epoch": 0.04436127469866256,
"grad_norm": 1.3226114511489868,
"learning_rate": 9.990637644359302e-06,
"loss": 0.997,
"step": 806
},
{
"epoch": 0.04441631350101822,
"grad_norm": 0.8904317617416382,
"learning_rate": 9.990611111631374e-06,
"loss": 0.7274,
"step": 807
},
{
"epoch": 0.04447135230337388,
"grad_norm": 0.8909007906913757,
"learning_rate": 9.99058454139535e-06,
"loss": 0.8141,
"step": 808
},
{
"epoch": 0.04452639110572954,
"grad_norm": 1.004015564918518,
"learning_rate": 9.990557933651429e-06,
"loss": 0.9883,
"step": 809
},
{
"epoch": 0.0445814299080852,
"grad_norm": 1.1215732097625732,
"learning_rate": 9.990531288399807e-06,
"loss": 0.9355,
"step": 810
},
{
"epoch": 0.04463646871044086,
"grad_norm": 1.0545012950897217,
"learning_rate": 9.99050460564069e-06,
"loss": 0.9532,
"step": 811
},
{
"epoch": 0.04469150751279652,
"grad_norm": 0.9608867168426514,
"learning_rate": 9.990477885374277e-06,
"loss": 0.9363,
"step": 812
},
{
"epoch": 0.04474654631515218,
"grad_norm": 0.8750461935997009,
"learning_rate": 9.990451127600766e-06,
"loss": 0.7343,
"step": 813
},
{
"epoch": 0.04480158511750784,
"grad_norm": 0.891740620136261,
"learning_rate": 9.99042433232036e-06,
"loss": 0.8541,
"step": 814
},
{
"epoch": 0.0448566239198635,
"grad_norm": 1.1520029306411743,
"learning_rate": 9.990397499533264e-06,
"loss": 0.7696,
"step": 815
},
{
"epoch": 0.044911662722219164,
"grad_norm": 0.9526278972625732,
"learning_rate": 9.990370629239673e-06,
"loss": 0.8953,
"step": 816
},
{
"epoch": 0.044966701524574826,
"grad_norm": 0.9218434691429138,
"learning_rate": 9.990343721439795e-06,
"loss": 0.8198,
"step": 817
},
{
"epoch": 0.04502174032693049,
"grad_norm": 0.8502745628356934,
"learning_rate": 9.990316776133827e-06,
"loss": 0.8035,
"step": 818
},
{
"epoch": 0.04507677912928615,
"grad_norm": 0.8861565589904785,
"learning_rate": 9.990289793321975e-06,
"loss": 0.8626,
"step": 819
},
{
"epoch": 0.04513181793164181,
"grad_norm": 1.1113256216049194,
"learning_rate": 9.99026277300444e-06,
"loss": 0.9363,
"step": 820
},
{
"epoch": 0.04518685673399747,
"grad_norm": 0.9984708428382874,
"learning_rate": 9.990235715181426e-06,
"loss": 1.0376,
"step": 821
},
{
"epoch": 0.045241895536353126,
"grad_norm": 0.9026711583137512,
"learning_rate": 9.990208619853137e-06,
"loss": 0.9079,
"step": 822
},
{
"epoch": 0.04529693433870879,
"grad_norm": 0.8724965453147888,
"learning_rate": 9.990181487019775e-06,
"loss": 0.8665,
"step": 823
},
{
"epoch": 0.04535197314106445,
"grad_norm": 0.8923047780990601,
"learning_rate": 9.990154316681543e-06,
"loss": 0.7779,
"step": 824
},
{
"epoch": 0.04540701194342011,
"grad_norm": 0.9024640321731567,
"learning_rate": 9.99012710883865e-06,
"loss": 0.8859,
"step": 825
},
{
"epoch": 0.04546205074577577,
"grad_norm": 0.9245888590812683,
"learning_rate": 9.990099863491296e-06,
"loss": 0.8501,
"step": 826
},
{
"epoch": 0.04551708954813143,
"grad_norm": 0.9257050156593323,
"learning_rate": 9.990072580639687e-06,
"loss": 0.9561,
"step": 827
},
{
"epoch": 0.045572128350487094,
"grad_norm": 0.995610773563385,
"learning_rate": 9.99004526028403e-06,
"loss": 0.917,
"step": 828
},
{
"epoch": 0.045627167152842756,
"grad_norm": 0.9524009823799133,
"learning_rate": 9.990017902424525e-06,
"loss": 0.9184,
"step": 829
},
{
"epoch": 0.04568220595519842,
"grad_norm": 0.9264503121376038,
"learning_rate": 9.989990507061385e-06,
"loss": 0.8615,
"step": 830
},
{
"epoch": 0.04573724475755408,
"grad_norm": 1.0068570375442505,
"learning_rate": 9.989963074194809e-06,
"loss": 0.8331,
"step": 831
},
{
"epoch": 0.04579228355990974,
"grad_norm": 0.9295952320098877,
"learning_rate": 9.989935603825009e-06,
"loss": 0.8387,
"step": 832
},
{
"epoch": 0.045847322362265394,
"grad_norm": 1.0408827066421509,
"learning_rate": 9.989908095952186e-06,
"loss": 0.9686,
"step": 833
},
{
"epoch": 0.045902361164621056,
"grad_norm": 0.8874136209487915,
"learning_rate": 9.989880550576551e-06,
"loss": 0.815,
"step": 834
},
{
"epoch": 0.04595739996697672,
"grad_norm": 0.9898836016654968,
"learning_rate": 9.989852967698311e-06,
"loss": 0.9458,
"step": 835
},
{
"epoch": 0.04601243876933238,
"grad_norm": 0.9828970432281494,
"learning_rate": 9.989825347317668e-06,
"loss": 0.7922,
"step": 836
},
{
"epoch": 0.04606747757168804,
"grad_norm": 1.025447964668274,
"learning_rate": 9.989797689434836e-06,
"loss": 0.9349,
"step": 837
},
{
"epoch": 0.0461225163740437,
"grad_norm": 0.8623831272125244,
"learning_rate": 9.98976999405002e-06,
"loss": 0.8786,
"step": 838
},
{
"epoch": 0.04617755517639936,
"grad_norm": 0.9614997506141663,
"learning_rate": 9.98974226116343e-06,
"loss": 0.7885,
"step": 839
},
{
"epoch": 0.046232593978755024,
"grad_norm": 1.0207616090774536,
"learning_rate": 9.989714490775269e-06,
"loss": 0.9786,
"step": 840
},
{
"epoch": 0.046287632781110685,
"grad_norm": 0.8509595990180969,
"learning_rate": 9.98968668288575e-06,
"loss": 0.7312,
"step": 841
},
{
"epoch": 0.04634267158346635,
"grad_norm": 0.9822607040405273,
"learning_rate": 9.989658837495084e-06,
"loss": 0.952,
"step": 842
},
{
"epoch": 0.046397710385822,
"grad_norm": 1.0058252811431885,
"learning_rate": 9.989630954603477e-06,
"loss": 0.8811,
"step": 843
},
{
"epoch": 0.04645274918817766,
"grad_norm": 1.0146985054016113,
"learning_rate": 9.989603034211139e-06,
"loss": 0.9051,
"step": 844
},
{
"epoch": 0.046507787990533324,
"grad_norm": 0.8976503610610962,
"learning_rate": 9.98957507631828e-06,
"loss": 0.879,
"step": 845
},
{
"epoch": 0.046562826792888985,
"grad_norm": 0.8791939616203308,
"learning_rate": 9.989547080925111e-06,
"loss": 0.8944,
"step": 846
},
{
"epoch": 0.04661786559524465,
"grad_norm": 0.8530884981155396,
"learning_rate": 9.989519048031842e-06,
"loss": 0.9029,
"step": 847
},
{
"epoch": 0.04667290439760031,
"grad_norm": 0.9621617197990417,
"learning_rate": 9.989490977638683e-06,
"loss": 0.8374,
"step": 848
},
{
"epoch": 0.04672794319995597,
"grad_norm": 0.9629075527191162,
"learning_rate": 9.989462869745845e-06,
"loss": 0.9032,
"step": 849
},
{
"epoch": 0.04678298200231163,
"grad_norm": 1.3256126642227173,
"learning_rate": 9.989434724353541e-06,
"loss": 0.9748,
"step": 850
},
{
"epoch": 0.04683802080466729,
"grad_norm": 1.0230494737625122,
"learning_rate": 9.989406541461979e-06,
"loss": 0.9752,
"step": 851
},
{
"epoch": 0.046893059607022954,
"grad_norm": 0.8454533219337463,
"learning_rate": 9.989378321071375e-06,
"loss": 0.8426,
"step": 852
},
{
"epoch": 0.046948098409378615,
"grad_norm": 0.9995863437652588,
"learning_rate": 9.989350063181939e-06,
"loss": 0.9955,
"step": 853
},
{
"epoch": 0.04700313721173427,
"grad_norm": 0.8956604599952698,
"learning_rate": 9.989321767793883e-06,
"loss": 0.9024,
"step": 854
},
{
"epoch": 0.04705817601408993,
"grad_norm": 1.0123292207717896,
"learning_rate": 9.989293434907419e-06,
"loss": 0.7856,
"step": 855
},
{
"epoch": 0.04711321481644559,
"grad_norm": 0.814577043056488,
"learning_rate": 9.989265064522762e-06,
"loss": 0.8377,
"step": 856
},
{
"epoch": 0.047168253618801254,
"grad_norm": 1.1571552753448486,
"learning_rate": 9.989236656640125e-06,
"loss": 0.8562,
"step": 857
},
{
"epoch": 0.047223292421156915,
"grad_norm": 0.9681577682495117,
"learning_rate": 9.98920821125972e-06,
"loss": 0.8473,
"step": 858
},
{
"epoch": 0.047278331223512576,
"grad_norm": 0.9680121541023254,
"learning_rate": 9.989179728381761e-06,
"loss": 0.9811,
"step": 859
},
{
"epoch": 0.04733337002586824,
"grad_norm": 0.985477089881897,
"learning_rate": 9.989151208006464e-06,
"loss": 0.6994,
"step": 860
},
{
"epoch": 0.0473884088282239,
"grad_norm": 0.8612962365150452,
"learning_rate": 9.98912265013404e-06,
"loss": 0.7667,
"step": 861
},
{
"epoch": 0.04744344763057956,
"grad_norm": 0.8884604573249817,
"learning_rate": 9.989094054764708e-06,
"loss": 0.8382,
"step": 862
},
{
"epoch": 0.04749848643293522,
"grad_norm": 1.036881923675537,
"learning_rate": 9.989065421898681e-06,
"loss": 0.8748,
"step": 863
},
{
"epoch": 0.04755352523529088,
"grad_norm": 0.9954493045806885,
"learning_rate": 9.989036751536171e-06,
"loss": 0.9174,
"step": 864
},
{
"epoch": 0.04760856403764654,
"grad_norm": 0.9984694123268127,
"learning_rate": 9.989008043677399e-06,
"loss": 0.7636,
"step": 865
},
{
"epoch": 0.0476636028400022,
"grad_norm": 1.0412588119506836,
"learning_rate": 9.988979298322576e-06,
"loss": 0.773,
"step": 866
},
{
"epoch": 0.04771864164235786,
"grad_norm": 0.8034874796867371,
"learning_rate": 9.98895051547192e-06,
"loss": 0.7914,
"step": 867
},
{
"epoch": 0.04777368044471352,
"grad_norm": 0.8983979225158691,
"learning_rate": 9.988921695125648e-06,
"loss": 0.7292,
"step": 868
},
{
"epoch": 0.04782871924706918,
"grad_norm": 0.9445077776908875,
"learning_rate": 9.988892837283976e-06,
"loss": 0.8263,
"step": 869
},
{
"epoch": 0.047883758049424845,
"grad_norm": 1.0753306150436401,
"learning_rate": 9.988863941947121e-06,
"loss": 1.1122,
"step": 870
},
{
"epoch": 0.047938796851780506,
"grad_norm": 1.0091484785079956,
"learning_rate": 9.9888350091153e-06,
"loss": 0.9276,
"step": 871
},
{
"epoch": 0.04799383565413617,
"grad_norm": 1.0977306365966797,
"learning_rate": 9.988806038788732e-06,
"loss": 0.854,
"step": 872
},
{
"epoch": 0.04804887445649183,
"grad_norm": 1.0285007953643799,
"learning_rate": 9.988777030967632e-06,
"loss": 0.9441,
"step": 873
},
{
"epoch": 0.04810391325884749,
"grad_norm": 0.8973976373672485,
"learning_rate": 9.988747985652218e-06,
"loss": 0.786,
"step": 874
},
{
"epoch": 0.04815895206120315,
"grad_norm": 0.9809553623199463,
"learning_rate": 9.98871890284271e-06,
"loss": 0.9042,
"step": 875
},
{
"epoch": 0.048213990863558806,
"grad_norm": 0.8514279723167419,
"learning_rate": 9.988689782539326e-06,
"loss": 0.7874,
"step": 876
},
{
"epoch": 0.04826902966591447,
"grad_norm": 0.8299674391746521,
"learning_rate": 9.988660624742286e-06,
"loss": 0.8704,
"step": 877
},
{
"epoch": 0.04832406846827013,
"grad_norm": 0.9862462282180786,
"learning_rate": 9.988631429451809e-06,
"loss": 0.9963,
"step": 878
},
{
"epoch": 0.04837910727062579,
"grad_norm": 0.9041131734848022,
"learning_rate": 9.988602196668111e-06,
"loss": 0.9207,
"step": 879
},
{
"epoch": 0.04843414607298145,
"grad_norm": 0.8597276210784912,
"learning_rate": 9.988572926391416e-06,
"loss": 0.8226,
"step": 880
},
{
"epoch": 0.04848918487533711,
"grad_norm": 0.9494329690933228,
"learning_rate": 9.988543618621941e-06,
"loss": 0.8834,
"step": 881
},
{
"epoch": 0.048544223677692774,
"grad_norm": 0.9129118323326111,
"learning_rate": 9.98851427335991e-06,
"loss": 0.7819,
"step": 882
},
{
"epoch": 0.048599262480048436,
"grad_norm": 0.9145999550819397,
"learning_rate": 9.988484890605539e-06,
"loss": 0.885,
"step": 883
},
{
"epoch": 0.0486543012824041,
"grad_norm": 1.0115307569503784,
"learning_rate": 9.98845547035905e-06,
"loss": 0.8347,
"step": 884
},
{
"epoch": 0.04870934008475976,
"grad_norm": 1.1372706890106201,
"learning_rate": 9.988426012620667e-06,
"loss": 0.944,
"step": 885
},
{
"epoch": 0.04876437888711541,
"grad_norm": 0.9502811431884766,
"learning_rate": 9.98839651739061e-06,
"loss": 0.9054,
"step": 886
},
{
"epoch": 0.048819417689471074,
"grad_norm": 0.9612823128700256,
"learning_rate": 9.988366984669097e-06,
"loss": 0.8796,
"step": 887
},
{
"epoch": 0.048874456491826736,
"grad_norm": 0.9551461935043335,
"learning_rate": 9.988337414456355e-06,
"loss": 0.8769,
"step": 888
},
{
"epoch": 0.0489294952941824,
"grad_norm": 0.8554086089134216,
"learning_rate": 9.988307806752603e-06,
"loss": 0.892,
"step": 889
},
{
"epoch": 0.04898453409653806,
"grad_norm": 0.8418886661529541,
"learning_rate": 9.988278161558067e-06,
"loss": 0.7568,
"step": 890
},
{
"epoch": 0.04903957289889372,
"grad_norm": 1.4780360460281372,
"learning_rate": 9.988248478872967e-06,
"loss": 0.9126,
"step": 891
},
{
"epoch": 0.04909461170124938,
"grad_norm": 0.8236714005470276,
"learning_rate": 9.988218758697526e-06,
"loss": 0.7317,
"step": 892
},
{
"epoch": 0.04914965050360504,
"grad_norm": 0.8777141571044922,
"learning_rate": 9.988189001031968e-06,
"loss": 0.7989,
"step": 893
},
{
"epoch": 0.049204689305960704,
"grad_norm": 1.0235031843185425,
"learning_rate": 9.988159205876516e-06,
"loss": 0.8335,
"step": 894
},
{
"epoch": 0.049259728108316365,
"grad_norm": 0.9340357184410095,
"learning_rate": 9.988129373231395e-06,
"loss": 0.8129,
"step": 895
},
{
"epoch": 0.04931476691067203,
"grad_norm": 1.7686667442321777,
"learning_rate": 9.98809950309683e-06,
"loss": 0.9792,
"step": 896
},
{
"epoch": 0.04936980571302768,
"grad_norm": 0.9252369403839111,
"learning_rate": 9.988069595473044e-06,
"loss": 0.8671,
"step": 897
},
{
"epoch": 0.04942484451538334,
"grad_norm": 0.9989960789680481,
"learning_rate": 9.988039650360262e-06,
"loss": 0.9245,
"step": 898
},
{
"epoch": 0.049479883317739004,
"grad_norm": 1.062912106513977,
"learning_rate": 9.98800966775871e-06,
"loss": 0.9146,
"step": 899
},
{
"epoch": 0.049534922120094665,
"grad_norm": 0.8698169589042664,
"learning_rate": 9.98797964766861e-06,
"loss": 0.8606,
"step": 900
},
{
"epoch": 0.04958996092245033,
"grad_norm": 1.6754224300384521,
"learning_rate": 9.98794959009019e-06,
"loss": 0.9236,
"step": 901
},
{
"epoch": 0.04964499972480599,
"grad_norm": 1.084174394607544,
"learning_rate": 9.98791949502368e-06,
"loss": 0.9252,
"step": 902
},
{
"epoch": 0.04970003852716165,
"grad_norm": 0.9866724610328674,
"learning_rate": 9.987889362469301e-06,
"loss": 0.9096,
"step": 903
},
{
"epoch": 0.04975507732951731,
"grad_norm": 0.8814040422439575,
"learning_rate": 9.987859192427279e-06,
"loss": 0.8475,
"step": 904
},
{
"epoch": 0.04981011613187297,
"grad_norm": 0.8796457052230835,
"learning_rate": 9.987828984897843e-06,
"loss": 0.8478,
"step": 905
},
{
"epoch": 0.049865154934228634,
"grad_norm": 1.0541884899139404,
"learning_rate": 9.98779873988122e-06,
"loss": 0.9799,
"step": 906
},
{
"epoch": 0.049920193736584295,
"grad_norm": 0.91409832239151,
"learning_rate": 9.987768457377636e-06,
"loss": 0.8701,
"step": 907
},
{
"epoch": 0.04997523253893995,
"grad_norm": 1.0120370388031006,
"learning_rate": 9.98773813738732e-06,
"loss": 0.8417,
"step": 908
},
{
"epoch": 0.05003027134129561,
"grad_norm": 1.7744206190109253,
"learning_rate": 9.987707779910499e-06,
"loss": 0.9263,
"step": 909
}
],
"logging_steps": 1,
"max_steps": 36338,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 909,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.682514714121994e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}