{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1079,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0009267840593141798,
"grad_norm": 356.406982421875,
"learning_rate": 0.005,
"loss": 15.9,
"step": 1
},
{
"epoch": 0.0018535681186283596,
"grad_norm": 32.9332389831543,
"learning_rate": 0.0049999894033994794,
"loss": 13.6,
"step": 2
},
{
"epoch": 0.0027803521779425394,
"grad_norm": 10.453313827514648,
"learning_rate": 0.004999957613687751,
"loss": 21.425,
"step": 3
},
{
"epoch": 0.0037071362372567192,
"grad_norm": 3.510478973388672,
"learning_rate": 0.004999904631134301,
"loss": 15.225,
"step": 4
},
{
"epoch": 0.004633920296570899,
"grad_norm": 35.607364654541016,
"learning_rate": 0.004999830456188281,
"loss": 18.325,
"step": 5
},
{
"epoch": 0.005560704355885079,
"grad_norm": 4.46471643447876,
"learning_rate": 0.004999735089478491,
"loss": 19.7,
"step": 6
},
{
"epoch": 0.006487488415199258,
"grad_norm": 1.207599401473999,
"learning_rate": 0.004999618531813382,
"loss": 14.125,
"step": 7
},
{
"epoch": 0.0074142724745134385,
"grad_norm": 46.56653594970703,
"learning_rate": 0.004999480784181046,
"loss": 32.7,
"step": 8
},
{
"epoch": 0.008341056533827619,
"grad_norm": 2.0620079040527344,
"learning_rate": 0.004999321847749208,
"loss": 13.4,
"step": 9
},
{
"epoch": 0.009267840593141797,
"grad_norm": 3.376063823699951,
"learning_rate": 0.0049991417238652155,
"loss": 13.3,
"step": 10
},
{
"epoch": 0.010194624652455977,
"grad_norm": 0.6672539710998535,
"learning_rate": 0.004998940414056032,
"loss": 13.4375,
"step": 11
},
{
"epoch": 0.011121408711770158,
"grad_norm": 0.4186709523200989,
"learning_rate": 0.004998717920028215,
"loss": 12.6375,
"step": 12
},
{
"epoch": 0.012048192771084338,
"grad_norm": 0.4992158114910126,
"learning_rate": 0.00499847424366791,
"loss": 11.6625,
"step": 13
},
{
"epoch": 0.012974976830398516,
"grad_norm": 0.21440155804157257,
"learning_rate": 0.004998209387040828,
"loss": 10.5375,
"step": 14
},
{
"epoch": 0.013901760889712697,
"grad_norm": 2.2223408222198486,
"learning_rate": 0.004997923352392236,
"loss": 11.6,
"step": 15
},
{
"epoch": 0.014828544949026877,
"grad_norm": 1.4461462497711182,
"learning_rate": 0.004997616142146927,
"loss": 12.7125,
"step": 16
},
{
"epoch": 0.015755329008341055,
"grad_norm": 1.9746646881103516,
"learning_rate": 0.004997287758909209,
"loss": 12.2125,
"step": 17
},
{
"epoch": 0.016682113067655237,
"grad_norm": 8.858609199523926,
"learning_rate": 0.004996938205462881,
"loss": 14.0625,
"step": 18
},
{
"epoch": 0.017608897126969416,
"grad_norm": 0.9914843440055847,
"learning_rate": 0.004996567484771203,
"loss": 11.35,
"step": 19
},
{
"epoch": 0.018535681186283594,
"grad_norm": 0.8945605158805847,
"learning_rate": 0.004996175599976878,
"loss": 11.725,
"step": 20
},
{
"epoch": 0.019462465245597776,
"grad_norm": 1.340647578239441,
"learning_rate": 0.004995762554402026,
"loss": 12.8875,
"step": 21
},
{
"epoch": 0.020389249304911955,
"grad_norm": 0.6224690079689026,
"learning_rate": 0.004995328351548148,
"loss": 11.7,
"step": 22
},
{
"epoch": 0.021316033364226137,
"grad_norm": 0.6904886960983276,
"learning_rate": 0.004994872995096104,
"loss": 10.6375,
"step": 23
},
{
"epoch": 0.022242817423540315,
"grad_norm": 0.7552493214607239,
"learning_rate": 0.004994396488906078,
"loss": 13.275,
"step": 24
},
{
"epoch": 0.023169601482854494,
"grad_norm": 0.1830722540616989,
"learning_rate": 0.004993898837017547,
"loss": 10.225,
"step": 25
},
{
"epoch": 0.024096385542168676,
"grad_norm": 0.31753918528556824,
"learning_rate": 0.004993380043649245,
"loss": 10.0875,
"step": 26
},
{
"epoch": 0.025023169601482854,
"grad_norm": 0.17651186883449554,
"learning_rate": 0.00499284011319913,
"loss": 9.675,
"step": 27
},
{
"epoch": 0.025949953660797033,
"grad_norm": 0.1835695058107376,
"learning_rate": 0.004992279050244343,
"loss": 9.625,
"step": 28
},
{
"epoch": 0.026876737720111215,
"grad_norm": 0.15531466901302338,
"learning_rate": 0.004991696859541173,
"loss": 9.525,
"step": 29
},
{
"epoch": 0.027803521779425393,
"grad_norm": 0.1167324110865593,
"learning_rate": 0.004991093546025012,
"loss": 9.3375,
"step": 30
},
{
"epoch": 0.028730305838739572,
"grad_norm": 0.06774014979600906,
"learning_rate": 0.004990469114810318,
"loss": 9.275,
"step": 31
},
{
"epoch": 0.029657089898053754,
"grad_norm": 0.11318591982126236,
"learning_rate": 0.004989823571190571,
"loss": 9.2875,
"step": 32
},
{
"epoch": 0.030583873957367932,
"grad_norm": 0.039967115968465805,
"learning_rate": 0.004989156920638226,
"loss": 9.225,
"step": 33
},
{
"epoch": 0.03151065801668211,
"grad_norm": 0.07919777184724808,
"learning_rate": 0.004988469168804664,
"loss": 9.2375,
"step": 34
},
{
"epoch": 0.03243744207599629,
"grad_norm": 0.04368596524000168,
"learning_rate": 0.0049877603215201525,
"loss": 9.1875,
"step": 35
},
{
"epoch": 0.033364226135310475,
"grad_norm": 0.04921940341591835,
"learning_rate": 0.004987030384793787,
"loss": 9.1875,
"step": 36
},
{
"epoch": 0.03429101019462465,
"grad_norm": 0.040833037346601486,
"learning_rate": 0.0049862793648134465,
"loss": 9.1625,
"step": 37
},
{
"epoch": 0.03521779425393883,
"grad_norm": 0.03423991799354553,
"learning_rate": 0.004985507267945738,
"loss": 9.1125,
"step": 38
},
{
"epoch": 0.03614457831325301,
"grad_norm": 0.04628804698586464,
"learning_rate": 0.004984714100735943,
"loss": 9.1375,
"step": 39
},
{
"epoch": 0.03707136237256719,
"grad_norm": 0.02513456903398037,
"learning_rate": 0.0049838998699079625,
"loss": 9.125,
"step": 40
},
{
"epoch": 0.037998146431881374,
"grad_norm": 0.04390294477343559,
"learning_rate": 0.00498306458236426,
"loss": 9.125,
"step": 41
},
{
"epoch": 0.03892493049119555,
"grad_norm": 0.02223977819085121,
"learning_rate": 0.004982208245185801,
"loss": 9.1125,
"step": 42
},
{
"epoch": 0.03985171455050973,
"grad_norm": 0.03464260324835777,
"learning_rate": 0.004981330865631997,
"loss": 9.1125,
"step": 43
},
{
"epoch": 0.04077849860982391,
"grad_norm": 0.0259235929697752,
"learning_rate": 0.00498043245114064,
"loss": 9.0625,
"step": 44
},
{
"epoch": 0.04170528266913809,
"grad_norm": 0.023725276812911034,
"learning_rate": 0.004979513009327842,
"loss": 9.1,
"step": 45
},
{
"epoch": 0.042632066728452274,
"grad_norm": 0.022491367533802986,
"learning_rate": 0.004978572547987968,
"loss": 9.05,
"step": 46
},
{
"epoch": 0.04355885078776645,
"grad_norm": 0.018162831664085388,
"learning_rate": 0.004977611075093574,
"loss": 9.0875,
"step": 47
},
{
"epoch": 0.04448563484708063,
"grad_norm": 0.033248819410800934,
"learning_rate": 0.004976628598795336,
"loss": 9.025,
"step": 48
},
{
"epoch": 0.04541241890639481,
"grad_norm": 0.015689486637711525,
"learning_rate": 0.0049756251274219775,
"loss": 9.0625,
"step": 49
},
{
"epoch": 0.04633920296570899,
"grad_norm": 0.022721588611602783,
"learning_rate": 0.00497460066948021,
"loss": 9.0375,
"step": 50
},
{
"epoch": 0.047265987025023166,
"grad_norm": 0.020086370408535004,
"learning_rate": 0.00497355523365465,
"loss": 9.0625,
"step": 51
},
{
"epoch": 0.04819277108433735,
"grad_norm": 0.01713702268898487,
"learning_rate": 0.00497248882880775,
"loss": 9.0375,
"step": 52
},
{
"epoch": 0.04911955514365153,
"grad_norm": 0.01819983310997486,
"learning_rate": 0.004971401463979721,
"loss": 9.0375,
"step": 53
},
{
"epoch": 0.05004633920296571,
"grad_norm": 0.01858202926814556,
"learning_rate": 0.004970293148388463,
"loss": 9.0125,
"step": 54
},
{
"epoch": 0.05097312326227989,
"grad_norm": 0.016383878886699677,
"learning_rate": 0.004969163891429476,
"loss": 9.0,
"step": 55
},
{
"epoch": 0.051899907321594066,
"grad_norm": 0.01655055209994316,
"learning_rate": 0.0049680137026757885,
"loss": 9.025,
"step": 56
},
{
"epoch": 0.05282669138090825,
"grad_norm": 0.01438821293413639,
"learning_rate": 0.004966842591877872,
"loss": 9.0,
"step": 57
},
{
"epoch": 0.05375347544022243,
"grad_norm": 0.01816794089972973,
"learning_rate": 0.004965650568963563,
"loss": 9.0,
"step": 58
},
{
"epoch": 0.05468025949953661,
"grad_norm": 0.017415305599570274,
"learning_rate": 0.004964437644037973,
"loss": 8.9625,
"step": 59
},
{
"epoch": 0.05560704355885079,
"grad_norm": 0.017612161114811897,
"learning_rate": 0.004963203827383406,
"loss": 8.975,
"step": 60
},
{
"epoch": 0.056533827618164965,
"grad_norm": 0.014700948260724545,
"learning_rate": 0.0049619491294592725,
"loss": 9.0,
"step": 61
},
{
"epoch": 0.057460611677479144,
"grad_norm": 0.0167540330439806,
"learning_rate": 0.004960673560901999,
"loss": 8.9875,
"step": 62
},
{
"epoch": 0.05838739573679333,
"grad_norm": 0.029445504769682884,
"learning_rate": 0.004959377132524938,
"loss": 8.9625,
"step": 63
},
{
"epoch": 0.05931417979610751,
"grad_norm": 0.013282664120197296,
"learning_rate": 0.004958059855318275,
"loss": 8.9625,
"step": 64
},
{
"epoch": 0.060240963855421686,
"grad_norm": 0.019158177077770233,
"learning_rate": 0.00495672174044894,
"loss": 8.9,
"step": 65
},
{
"epoch": 0.061167747914735865,
"grad_norm": 0.02090335451066494,
"learning_rate": 0.004955362799260506,
"loss": 8.9125,
"step": 66
},
{
"epoch": 0.06209453197405004,
"grad_norm": 0.019786162301898003,
"learning_rate": 0.004953983043273102,
"loss": 8.95,
"step": 67
},
{
"epoch": 0.06302131603336422,
"grad_norm": 0.0192793570458889,
"learning_rate": 0.004952582484183302,
"loss": 8.925,
"step": 68
},
{
"epoch": 0.0639481000926784,
"grad_norm": 0.029085692018270493,
"learning_rate": 0.0049511611338640404,
"loss": 8.9625,
"step": 69
},
{
"epoch": 0.06487488415199258,
"grad_norm": 0.028297357261180878,
"learning_rate": 0.004949719004364503,
"loss": 8.925,
"step": 70
},
{
"epoch": 0.06580166821130677,
"grad_norm": 0.013140903785824776,
"learning_rate": 0.0049482561079100245,
"loss": 8.925,
"step": 71
},
{
"epoch": 0.06672845227062095,
"grad_norm": 0.016508571803569794,
"learning_rate": 0.004946772456901989,
"loss": 8.95,
"step": 72
},
{
"epoch": 0.06765523632993513,
"grad_norm": 0.028362734243273735,
"learning_rate": 0.004945268063917723,
"loss": 8.9375,
"step": 73
},
{
"epoch": 0.0685820203892493,
"grad_norm": 0.028645526617765427,
"learning_rate": 0.004943742941710386,
"loss": 8.9375,
"step": 74
},
{
"epoch": 0.06950880444856349,
"grad_norm": 0.010765830054879189,
"learning_rate": 0.004942197103208867,
"loss": 8.925,
"step": 75
},
{
"epoch": 0.07043558850787766,
"grad_norm": 0.022227909415960312,
"learning_rate": 0.004940630561517674,
"loss": 8.9375,
"step": 76
},
{
"epoch": 0.07136237256719184,
"grad_norm": 0.020959695801138878,
"learning_rate": 0.004939043329916819,
"loss": 8.95,
"step": 77
},
{
"epoch": 0.07228915662650602,
"grad_norm": 0.01679840497672558,
"learning_rate": 0.00493743542186171,
"loss": 8.925,
"step": 78
},
{
"epoch": 0.0732159406858202,
"grad_norm": 0.01441862341016531,
"learning_rate": 0.004935806850983033,
"loss": 8.9125,
"step": 79
},
{
"epoch": 0.07414272474513438,
"grad_norm": 0.014738287776708603,
"learning_rate": 0.004934157631086642,
"loss": 8.9,
"step": 80
},
{
"epoch": 0.07506950880444857,
"grad_norm": 0.013974464498460293,
"learning_rate": 0.004932487776153435,
"loss": 8.875,
"step": 81
},
{
"epoch": 0.07599629286376275,
"grad_norm": 0.014242907054722309,
"learning_rate": 0.004930797300339241,
"loss": 8.8875,
"step": 82
},
{
"epoch": 0.07692307692307693,
"grad_norm": 0.014142482541501522,
"learning_rate": 0.004929086217974697,
"loss": 8.875,
"step": 83
},
{
"epoch": 0.0778498609823911,
"grad_norm": 0.011345421895384789,
"learning_rate": 0.0049273545435651305,
"loss": 8.9,
"step": 84
},
{
"epoch": 0.07877664504170528,
"grad_norm": 0.01937839388847351,
"learning_rate": 0.004925602291790427,
"loss": 8.875,
"step": 85
},
{
"epoch": 0.07970342910101946,
"grad_norm": 0.019322404637932777,
"learning_rate": 0.0049238294775049195,
"loss": 8.875,
"step": 86
},
{
"epoch": 0.08063021316033364,
"grad_norm": 0.02427850104868412,
"learning_rate": 0.004922036115737251,
"loss": 8.875,
"step": 87
},
{
"epoch": 0.08155699721964782,
"grad_norm": 0.02773062139749527,
"learning_rate": 0.0049202222216902505,
"loss": 8.875,
"step": 88
},
{
"epoch": 0.082483781278962,
"grad_norm": 0.022121064364910126,
"learning_rate": 0.0049183878107408084,
"loss": 8.875,
"step": 89
},
{
"epoch": 0.08341056533827618,
"grad_norm": 0.014306942000985146,
"learning_rate": 0.00491653289843974,
"loss": 8.85,
"step": 90
},
{
"epoch": 0.08433734939759036,
"grad_norm": 0.01174082513898611,
"learning_rate": 0.004914657500511657,
"loss": 8.85,
"step": 91
},
{
"epoch": 0.08526413345690455,
"grad_norm": 0.017720786854624748,
"learning_rate": 0.004912761632854833,
"loss": 8.8625,
"step": 92
},
{
"epoch": 0.08619091751621873,
"grad_norm": 0.023863809183239937,
"learning_rate": 0.004910845311541071,
"loss": 8.8625,
"step": 93
},
{
"epoch": 0.0871177015755329,
"grad_norm": 0.034596893936395645,
"learning_rate": 0.004908908552815563,
"loss": 8.8625,
"step": 94
},
{
"epoch": 0.08804448563484708,
"grad_norm": 0.04321544989943504,
"learning_rate": 0.004906951373096757,
"loss": 8.85,
"step": 95
},
{
"epoch": 0.08897126969416126,
"grad_norm": 0.05180607736110687,
"learning_rate": 0.004904973788976213,
"loss": 8.8625,
"step": 96
},
{
"epoch": 0.08989805375347544,
"grad_norm": 0.04927121847867966,
"learning_rate": 0.004902975817218467,
"loss": 8.825,
"step": 97
},
{
"epoch": 0.09082483781278962,
"grad_norm": 0.030304012820124626,
"learning_rate": 0.004900957474760885,
"loss": 8.825,
"step": 98
},
{
"epoch": 0.0917516218721038,
"grad_norm": 0.018640510737895966,
"learning_rate": 0.004898918778713524,
"loss": 8.8,
"step": 99
},
{
"epoch": 0.09267840593141798,
"grad_norm": 0.033853888511657715,
"learning_rate": 0.004896859746358979,
"loss": 8.7875,
"step": 100
},
{
"epoch": 0.09360518999073215,
"grad_norm": 0.04043276980519295,
"learning_rate": 0.004894780395152247,
"loss": 8.775,
"step": 101
},
{
"epoch": 0.09453197405004633,
"grad_norm": 0.0534222349524498,
"learning_rate": 0.004892680742720571,
"loss": 8.7375,
"step": 102
},
{
"epoch": 0.09545875810936053,
"grad_norm": 0.082061268389225,
"learning_rate": 0.004890560806863293,
"loss": 8.8,
"step": 103
},
{
"epoch": 0.0963855421686747,
"grad_norm": 0.05508153885602951,
"learning_rate": 0.004888420605551703,
"loss": 8.775,
"step": 104
},
{
"epoch": 0.09731232622798888,
"grad_norm": 0.04220907762646675,
"learning_rate": 0.004886260156928888,
"loss": 8.7625,
"step": 105
},
{
"epoch": 0.09823911028730306,
"grad_norm": 0.04727254807949066,
"learning_rate": 0.004884079479309578,
"loss": 8.7875,
"step": 106
},
{
"epoch": 0.09916589434661724,
"grad_norm": 0.04981837049126625,
"learning_rate": 0.004881878591179988,
"loss": 8.75,
"step": 107
},
{
"epoch": 0.10009267840593142,
"grad_norm": 0.039716847240924835,
"learning_rate": 0.004879657511197662,
"loss": 8.675,
"step": 108
},
{
"epoch": 0.1010194624652456,
"grad_norm": 0.028658628463745117,
"learning_rate": 0.0048774162581913215,
"loss": 8.675,
"step": 109
},
{
"epoch": 0.10194624652455977,
"grad_norm": 0.03913936764001846,
"learning_rate": 0.0048751548511606945,
"loss": 8.6625,
"step": 110
},
{
"epoch": 0.10287303058387395,
"grad_norm": 0.027623698115348816,
"learning_rate": 0.004872873309276362,
"loss": 8.6625,
"step": 111
},
{
"epoch": 0.10379981464318813,
"grad_norm": 0.0399942547082901,
"learning_rate": 0.004870571651879596,
"loss": 8.6625,
"step": 112
},
{
"epoch": 0.10472659870250231,
"grad_norm": 0.02140922099351883,
"learning_rate": 0.00486824989848219,
"loss": 8.5875,
"step": 113
},
{
"epoch": 0.1056533827618165,
"grad_norm": 0.0371641181409359,
"learning_rate": 0.0048659080687663,
"loss": 8.6,
"step": 114
},
{
"epoch": 0.10658016682113068,
"grad_norm": 0.018301891162991524,
"learning_rate": 0.004863546182584273,
"loss": 8.575,
"step": 115
},
{
"epoch": 0.10750695088044486,
"grad_norm": 0.029274851083755493,
"learning_rate": 0.0048611642599584795,
"loss": 8.55,
"step": 116
},
{
"epoch": 0.10843373493975904,
"grad_norm": 0.025735612958669662,
"learning_rate": 0.004858762321081146,
"loss": 8.525,
"step": 117
},
{
"epoch": 0.10936051899907322,
"grad_norm": 0.036481715738773346,
"learning_rate": 0.004856340386314182,
"loss": 8.4875,
"step": 118
},
{
"epoch": 0.1102873030583874,
"grad_norm": 0.11254877597093582,
"learning_rate": 0.004853898476189007,
"loss": 8.5375,
"step": 119
},
{
"epoch": 0.11121408711770157,
"grad_norm": 0.19445450603961945,
"learning_rate": 0.00485143661140638,
"loss": 8.85,
"step": 120
},
{
"epoch": 0.11214087117701575,
"grad_norm": 0.16596297919750214,
"learning_rate": 0.004848954812836217,
"loss": 8.7625,
"step": 121
},
{
"epoch": 0.11306765523632993,
"grad_norm": 0.044869761914014816,
"learning_rate": 0.004846453101517421,
"loss": 8.5125,
"step": 122
},
{
"epoch": 0.11399443929564411,
"grad_norm": 0.08229261636734009,
"learning_rate": 0.0048439314986577,
"loss": 8.6,
"step": 123
},
{
"epoch": 0.11492122335495829,
"grad_norm": 0.04814854636788368,
"learning_rate": 0.00484139002563339,
"loss": 8.475,
"step": 124
},
{
"epoch": 0.11584800741427248,
"grad_norm": 0.07902152091264725,
"learning_rate": 0.004838828703989269,
"loss": 8.55,
"step": 125
},
{
"epoch": 0.11677479147358666,
"grad_norm": 0.02725468948483467,
"learning_rate": 0.0048362475554383786,
"loss": 8.4,
"step": 126
},
{
"epoch": 0.11770157553290084,
"grad_norm": 0.05269164219498634,
"learning_rate": 0.004833646601861841,
"loss": 8.4375,
"step": 127
},
{
"epoch": 0.11862835959221502,
"grad_norm": 0.03333018347620964,
"learning_rate": 0.004831025865308667,
"loss": 8.3625,
"step": 128
},
{
"epoch": 0.1195551436515292,
"grad_norm": 0.040032755583524704,
"learning_rate": 0.004828385367995575,
"loss": 8.325,
"step": 129
},
{
"epoch": 0.12048192771084337,
"grad_norm": 0.03257158771157265,
"learning_rate": 0.004825725132306803,
"loss": 8.2625,
"step": 130
},
{
"epoch": 0.12140871177015755,
"grad_norm": 0.03259531036019325,
"learning_rate": 0.0048230451807939135,
"loss": 8.225,
"step": 131
},
{
"epoch": 0.12233549582947173,
"grad_norm": 0.03383934497833252,
"learning_rate": 0.004820345536175607,
"loss": 8.2,
"step": 132
},
{
"epoch": 0.12326227988878591,
"grad_norm": 0.02867773361504078,
"learning_rate": 0.004817626221337529,
"loss": 8.15,
"step": 133
},
{
"epoch": 0.12418906394810009,
"grad_norm": 0.03943765163421631,
"learning_rate": 0.004814887259332073,
"loss": 8.125,
"step": 134
},
{
"epoch": 0.12511584800741427,
"grad_norm": 0.034471139311790466,
"learning_rate": 0.004812128673378188,
"loss": 7.9875,
"step": 135
},
{
"epoch": 0.12604263206672844,
"grad_norm": 0.03869534283876419,
"learning_rate": 0.004809350486861181,
"loss": 7.95,
"step": 136
},
{
"epoch": 0.12696941612604262,
"grad_norm": 0.03380202502012253,
"learning_rate": 0.0048065527233325175,
"loss": 7.875,
"step": 137
},
{
"epoch": 0.1278962001853568,
"grad_norm": 0.03459366410970688,
"learning_rate": 0.004803735406509625,
"loss": 7.7812,
"step": 138
},
{
"epoch": 0.12882298424467098,
"grad_norm": 0.0600280836224556,
"learning_rate": 0.0048008985602756874,
"loss": 7.65,
"step": 139
},
{
"epoch": 0.12974976830398516,
"grad_norm": 0.11870339512825012,
"learning_rate": 0.004798042208679445,
"loss": 7.6375,
"step": 140
},
{
"epoch": 0.13067655236329936,
"grad_norm": 0.1849852204322815,
"learning_rate": 0.0047951663759349915,
"loss": 7.7,
"step": 141
},
{
"epoch": 0.13160333642261354,
"grad_norm": 0.15893682837486267,
"learning_rate": 0.0047922710864215685,
"loss": 7.6375,
"step": 142
},
{
"epoch": 0.13253012048192772,
"grad_norm": 0.10825814306735992,
"learning_rate": 0.004789356364683356,
"loss": 7.4437,
"step": 143
},
{
"epoch": 0.1334569045412419,
"grad_norm": 0.12936848402023315,
"learning_rate": 0.004786422235429268,
"loss": 7.3688,
"step": 144
},
{
"epoch": 0.13438368860055608,
"grad_norm": 0.07664606720209122,
"learning_rate": 0.0047834687235327415,
"loss": 7.2625,
"step": 145
},
{
"epoch": 0.13531047265987026,
"grad_norm": 0.1079607829451561,
"learning_rate": 0.0047804958540315235,
"loss": 7.2125,
"step": 146
},
{
"epoch": 0.13623725671918444,
"grad_norm": 0.04593510553240776,
"learning_rate": 0.004777503652127464,
"loss": 7.0687,
"step": 147
},
{
"epoch": 0.1371640407784986,
"grad_norm": 0.06448942422866821,
"learning_rate": 0.004774492143186296,
"loss": 7.075,
"step": 148
},
{
"epoch": 0.1380908248378128,
"grad_norm": 0.04284033551812172,
"learning_rate": 0.004771461352737427,
"loss": 6.9688,
"step": 149
},
{
"epoch": 0.13901760889712697,
"grad_norm": 0.048541247844696045,
"learning_rate": 0.004768411306473717,
"loss": 6.9125,
"step": 150
},
{
"epoch": 0.13994439295644115,
"grad_norm": 0.0369611531496048,
"learning_rate": 0.004765342030251263,
"loss": 6.8875,
"step": 151
},
{
"epoch": 0.14087117701575533,
"grad_norm": 0.07809454202651978,
"learning_rate": 0.004762253550089181,
"loss": 6.8375,
"step": 152
},
{
"epoch": 0.1417979610750695,
"grad_norm": 0.030714238062500954,
"learning_rate": 0.004759145892169382,
"loss": 6.8063,
"step": 153
},
{
"epoch": 0.14272474513438368,
"grad_norm": 0.030746718868613243,
"learning_rate": 0.004756019082836354,
"loss": 6.7875,
"step": 154
},
{
"epoch": 0.14365152919369786,
"grad_norm": 0.026088058948516846,
"learning_rate": 0.004752873148596938,
"loss": 6.7438,
"step": 155
},
{
"epoch": 0.14457831325301204,
"grad_norm": 0.017927952110767365,
"learning_rate": 0.004749708116120099,
"loss": 6.7688,
"step": 156
},
{
"epoch": 0.14550509731232622,
"grad_norm": 0.023661252111196518,
"learning_rate": 0.004746524012236706,
"loss": 6.725,
"step": 157
},
{
"epoch": 0.1464318813716404,
"grad_norm": 0.018965313211083412,
"learning_rate": 0.004743320863939299,
"loss": 6.725,
"step": 158
},
{
"epoch": 0.14735866543095458,
"grad_norm": 0.022316887974739075,
"learning_rate": 0.004740098698381866,
"loss": 6.675,
"step": 159
},
{
"epoch": 0.14828544949026876,
"grad_norm": 0.019958553835749626,
"learning_rate": 0.004736857542879608,
"loss": 6.6875,
"step": 160
},
{
"epoch": 0.14921223354958293,
"grad_norm": 0.016147589311003685,
"learning_rate": 0.004733597424908707,
"loss": 6.6875,
"step": 161
},
{
"epoch": 0.15013901760889714,
"grad_norm": 0.020692575722932816,
"learning_rate": 0.004730318372106099,
"loss": 6.6438,
"step": 162
},
{
"epoch": 0.15106580166821132,
"grad_norm": 0.014802551828324795,
"learning_rate": 0.004727020412269234,
"loss": 6.6312,
"step": 163
},
{
"epoch": 0.1519925857275255,
"grad_norm": 0.01826154999434948,
"learning_rate": 0.004723703573355842,
"loss": 6.6375,
"step": 164
},
{
"epoch": 0.15291936978683968,
"grad_norm": 0.014861056581139565,
"learning_rate": 0.004720367883483697,
"loss": 6.6562,
"step": 165
},
{
"epoch": 0.15384615384615385,
"grad_norm": 0.0160931795835495,
"learning_rate": 0.004717013370930377,
"loss": 6.6,
"step": 166
},
{
"epoch": 0.15477293790546803,
"grad_norm": 0.02078167535364628,
"learning_rate": 0.004713640064133024,
"loss": 6.6063,
"step": 167
},
{
"epoch": 0.1556997219647822,
"grad_norm": 0.01577616296708584,
"learning_rate": 0.004710247991688109,
"loss": 6.5563,
"step": 168
},
{
"epoch": 0.1566265060240964,
"grad_norm": 0.019711369648575783,
"learning_rate": 0.0047068371823511795,
"loss": 6.575,
"step": 169
},
{
"epoch": 0.15755329008341057,
"grad_norm": 0.01820039190351963,
"learning_rate": 0.004703407665036622,
"loss": 6.5813,
"step": 170
},
{
"epoch": 0.15848007414272475,
"grad_norm": 0.015363371931016445,
"learning_rate": 0.004699959468817417,
"loss": 6.5375,
"step": 171
},
{
"epoch": 0.15940685820203893,
"grad_norm": 0.015872852876782417,
"learning_rate": 0.004696492622924892,
"loss": 6.5687,
"step": 172
},
{
"epoch": 0.1603336422613531,
"grad_norm": 0.016906000673770905,
"learning_rate": 0.004693007156748471,
"loss": 6.5125,
"step": 173
},
{
"epoch": 0.16126042632066728,
"grad_norm": 0.016961950808763504,
"learning_rate": 0.0046895030998354275,
"loss": 6.525,
"step": 174
},
{
"epoch": 0.16218721037998146,
"grad_norm": 0.016262684017419815,
"learning_rate": 0.004685980481890634,
"loss": 6.5062,
"step": 175
},
{
"epoch": 0.16311399443929564,
"grad_norm": 0.014922458678483963,
"learning_rate": 0.004682439332776313,
"loss": 6.4688,
"step": 176
},
{
"epoch": 0.16404077849860982,
"grad_norm": 0.022018995136022568,
"learning_rate": 0.004678879682511777,
"loss": 6.5188,
"step": 177
},
{
"epoch": 0.164967562557924,
"grad_norm": 0.014819780364632607,
"learning_rate": 0.004675301561273179,
"loss": 6.4437,
"step": 178
},
{
"epoch": 0.16589434661723818,
"grad_norm": 0.0183818731456995,
"learning_rate": 0.004671704999393256,
"loss": 6.4563,
"step": 179
},
{
"epoch": 0.16682113067655235,
"grad_norm": 0.020285405218601227,
"learning_rate": 0.004668090027361074,
"loss": 6.4563,
"step": 180
},
{
"epoch": 0.16774791473586653,
"grad_norm": 0.0204929132014513,
"learning_rate": 0.004664456675821761,
"loss": 6.4813,
"step": 181
},
{
"epoch": 0.1686746987951807,
"grad_norm": 0.022332845255732536,
"learning_rate": 0.0046608049755762606,
"loss": 6.4563,
"step": 182
},
{
"epoch": 0.1696014828544949,
"grad_norm": 0.014836137183010578,
"learning_rate": 0.004657134957581057,
"loss": 6.4625,
"step": 183
},
{
"epoch": 0.1705282669138091,
"grad_norm": 0.024512965232133865,
"learning_rate": 0.0046534466529479235,
"loss": 6.4563,
"step": 184
},
{
"epoch": 0.17145505097312327,
"grad_norm": 0.025079630315303802,
"learning_rate": 0.004649740092943651,
"loss": 6.4188,
"step": 185
},
{
"epoch": 0.17238183503243745,
"grad_norm": 0.032594986259937286,
"learning_rate": 0.00464601530898979,
"loss": 6.4125,
"step": 186
},
{
"epoch": 0.17330861909175163,
"grad_norm": 0.028524870052933693,
"learning_rate": 0.004642272332662377,
"loss": 6.4125,
"step": 187
},
{
"epoch": 0.1742354031510658,
"grad_norm": 0.02017652988433838,
"learning_rate": 0.0046385111956916735,
"loss": 6.3938,
"step": 188
},
{
"epoch": 0.17516218721038,
"grad_norm": 0.023051844909787178,
"learning_rate": 0.004634731929961891,
"loss": 6.4062,
"step": 189
},
{
"epoch": 0.17608897126969417,
"grad_norm": 0.025438351556658745,
"learning_rate": 0.004630934567510925,
"loss": 6.3812,
"step": 190
},
{
"epoch": 0.17701575532900835,
"grad_norm": 0.037845317274332047,
"learning_rate": 0.004627119140530083,
"loss": 6.4062,
"step": 191
},
{
"epoch": 0.17794253938832252,
"grad_norm": 0.05386321246623993,
"learning_rate": 0.004623285681363807,
"loss": 6.4062,
"step": 192
},
{
"epoch": 0.1788693234476367,
"grad_norm": 0.0913223922252655,
"learning_rate": 0.004619434222509408,
"loss": 6.3875,
"step": 193
},
{
"epoch": 0.17979610750695088,
"grad_norm": 0.1158546730875969,
"learning_rate": 0.00461556479661678,
"loss": 6.4563,
"step": 194
},
{
"epoch": 0.18072289156626506,
"grad_norm": 0.08018877357244492,
"learning_rate": 0.0046116774364881345,
"loss": 6.375,
"step": 195
},
{
"epoch": 0.18164967562557924,
"grad_norm": 0.03276560455560684,
"learning_rate": 0.0046077721750777114,
"loss": 6.3812,
"step": 196
},
{
"epoch": 0.18257645968489342,
"grad_norm": 0.07004847377538681,
"learning_rate": 0.0046038490454915065,
"loss": 6.3875,
"step": 197
},
{
"epoch": 0.1835032437442076,
"grad_norm": 0.03939942270517349,
"learning_rate": 0.004599908080986991,
"loss": 6.325,
"step": 198
},
{
"epoch": 0.18443002780352177,
"grad_norm": 0.0445321649312973,
"learning_rate": 0.004595949314972824,
"loss": 6.3125,
"step": 199
},
{
"epoch": 0.18535681186283595,
"grad_norm": 0.04666861146688461,
"learning_rate": 0.004591972781008576,
"loss": 6.3375,
"step": 200
},
{
"epoch": 0.18628359592215013,
"grad_norm": 0.032554373145103455,
"learning_rate": 0.0045879785128044425,
"loss": 6.3187,
"step": 201
},
{
"epoch": 0.1872103799814643,
"grad_norm": 0.03748049587011337,
"learning_rate": 0.004583966544220952,
"loss": 6.3313,
"step": 202
},
{
"epoch": 0.1881371640407785,
"grad_norm": 0.02630574069917202,
"learning_rate": 0.00457993690926869,
"loss": 6.3563,
"step": 203
},
{
"epoch": 0.18906394810009267,
"grad_norm": 0.04539572447538376,
"learning_rate": 0.004575889642107998,
"loss": 6.3063,
"step": 204
},
{
"epoch": 0.18999073215940684,
"grad_norm": 0.02216522768139839,
"learning_rate": 0.0045718247770487,
"loss": 6.2812,
"step": 205
},
{
"epoch": 0.19091751621872105,
"grad_norm": 0.05376052483916283,
"learning_rate": 0.004567742348549793,
"loss": 6.35,
"step": 206
},
{
"epoch": 0.19184430027803523,
"grad_norm": 0.02676314301788807,
"learning_rate": 0.004563642391219168,
"loss": 6.3,
"step": 207
},
{
"epoch": 0.1927710843373494,
"grad_norm": 0.039810191839933395,
"learning_rate": 0.004559524939813316,
"loss": 6.2875,
"step": 208
},
{
"epoch": 0.1936978683966636,
"grad_norm": 0.03783705458045006,
"learning_rate": 0.0045553900292370254,
"loss": 6.2625,
"step": 209
},
{
"epoch": 0.19462465245597776,
"grad_norm": 0.02999858744442463,
"learning_rate": 0.004551237694543092,
"loss": 6.2438,
"step": 210
},
{
"epoch": 0.19555143651529194,
"grad_norm": 0.0282985121011734,
"learning_rate": 0.004547067970932022,
"loss": 6.2438,
"step": 211
},
{
"epoch": 0.19647822057460612,
"grad_norm": 0.03198060020804405,
"learning_rate": 0.004542880893751732,
"loss": 6.2625,
"step": 212
},
{
"epoch": 0.1974050046339203,
"grad_norm": 0.03950299322605133,
"learning_rate": 0.00453867649849725,
"loss": 6.2188,
"step": 213
},
{
"epoch": 0.19833178869323448,
"grad_norm": 0.026990199461579323,
"learning_rate": 0.004534454820810412,
"loss": 6.2063,
"step": 214
},
{
"epoch": 0.19925857275254866,
"grad_norm": 0.0420188382267952,
"learning_rate": 0.004530215896479564,
"loss": 6.2625,
"step": 215
},
{
"epoch": 0.20018535681186284,
"grad_norm": 0.04251977428793907,
"learning_rate": 0.004525959761439257,
"loss": 6.2063,
"step": 216
},
{
"epoch": 0.20111214087117701,
"grad_norm": 0.06442005932331085,
"learning_rate": 0.0045216864517699405,
"loss": 6.2125,
"step": 217
},
{
"epoch": 0.2020389249304912,
"grad_norm": 0.05594475567340851,
"learning_rate": 0.004517396003697659,
"loss": 6.1562,
"step": 218
},
{
"epoch": 0.20296570898980537,
"grad_norm": 0.038938529789447784,
"learning_rate": 0.004513088453593744,
"loss": 6.1937,
"step": 219
},
{
"epoch": 0.20389249304911955,
"grad_norm": 0.057002611458301544,
"learning_rate": 0.0045087638379745065,
"loss": 6.175,
"step": 220
},
{
"epoch": 0.20481927710843373,
"grad_norm": 0.047009214758872986,
"learning_rate": 0.004504422193500925,
"loss": 6.1688,
"step": 221
},
{
"epoch": 0.2057460611677479,
"grad_norm": 0.05817709118127823,
"learning_rate": 0.004500063556978336,
"loss": 6.1375,
"step": 222
},
{
"epoch": 0.20667284522706209,
"grad_norm": 0.05288264900445938,
"learning_rate": 0.004495687965356126,
"loss": 6.1688,
"step": 223
},
{
"epoch": 0.20759962928637626,
"grad_norm": 0.03736674785614014,
"learning_rate": 0.00449129545572741,
"loss": 6.175,
"step": 224
},
{
"epoch": 0.20852641334569044,
"grad_norm": 0.034431926906108856,
"learning_rate": 0.004486886065328725,
"loss": 6.1125,
"step": 225
},
{
"epoch": 0.20945319740500462,
"grad_norm": 0.03445250913500786,
"learning_rate": 0.004482459831539709,
"loss": 6.1625,
"step": 226
},
{
"epoch": 0.21037998146431883,
"grad_norm": 0.035410068929195404,
"learning_rate": 0.004478016791882787,
"loss": 6.0875,
"step": 227
},
{
"epoch": 0.211306765523633,
"grad_norm": 0.026350026950240135,
"learning_rate": 0.004473556984022854,
"loss": 6.125,
"step": 228
},
{
"epoch": 0.21223354958294718,
"grad_norm": 0.028956936672329903,
"learning_rate": 0.0044690804457669505,
"loss": 6.1063,
"step": 229
},
{
"epoch": 0.21316033364226136,
"grad_norm": 0.03521239385008812,
"learning_rate": 0.004464587215063946,
"loss": 6.0875,
"step": 230
},
{
"epoch": 0.21408711770157554,
"grad_norm": 0.04613986983895302,
"learning_rate": 0.004460077330004218,
"loss": 6.1312,
"step": 231
},
{
"epoch": 0.21501390176088972,
"grad_norm": 0.05228109285235405,
"learning_rate": 0.0044555508288193265,
"loss": 6.1063,
"step": 232
},
{
"epoch": 0.2159406858202039,
"grad_norm": 0.045205965638160706,
"learning_rate": 0.004451007749881691,
"loss": 6.1,
"step": 233
},
{
"epoch": 0.21686746987951808,
"grad_norm": 0.028526296839118004,
"learning_rate": 0.004446448131704267,
"loss": 6.0813,
"step": 234
},
{
"epoch": 0.21779425393883226,
"grad_norm": 0.027809731662273407,
"learning_rate": 0.004441872012940214,
"loss": 6.075,
"step": 235
},
{
"epoch": 0.21872103799814643,
"grad_norm": 0.04913929104804993,
"learning_rate": 0.004437279432382576,
"loss": 6.075,
"step": 236
},
{
"epoch": 0.2196478220574606,
"grad_norm": 0.046848297119140625,
"learning_rate": 0.004432670428963946,
"loss": 6.0938,
"step": 237
},
{
"epoch": 0.2205746061167748,
"grad_norm": 0.0395938940346241,
"learning_rate": 0.004428045041756137,
"loss": 6.075,
"step": 238
},
{
"epoch": 0.22150139017608897,
"grad_norm": 0.0638502761721611,
"learning_rate": 0.004423403309969855,
"loss": 6.025,
"step": 239
},
{
"epoch": 0.22242817423540315,
"grad_norm": 0.06795669347047806,
"learning_rate": 0.004418745272954361,
"loss": 6.0438,
"step": 240
},
{
"epoch": 0.22335495829471733,
"grad_norm": 0.052847135812044144,
"learning_rate": 0.004414070970197141,
"loss": 6.0625,
"step": 241
},
{
"epoch": 0.2242817423540315,
"grad_norm": 0.04967901483178139,
"learning_rate": 0.0044093804413235715,
"loss": 6.0375,
"step": 242
},
{
"epoch": 0.22520852641334568,
"grad_norm": 0.0682300478219986,
"learning_rate": 0.004404673726096578,
"loss": 6.0625,
"step": 243
},
{
"epoch": 0.22613531047265986,
"grad_norm": 0.0553511306643486,
"learning_rate": 0.00439995086441631,
"loss": 5.9813,
"step": 244
},
{
"epoch": 0.22706209453197404,
"grad_norm": 0.028195617720484734,
"learning_rate": 0.004395211896319786,
"loss": 6.025,
"step": 245
},
{
"epoch": 0.22798887859128822,
"grad_norm": 0.04402211681008339,
"learning_rate": 0.00439045686198057,
"loss": 6.0125,
"step": 246
},
{
"epoch": 0.2289156626506024,
"grad_norm": 0.03047800622880459,
"learning_rate": 0.00438568580170842,
"loss": 5.9938,
"step": 247
},
{
"epoch": 0.22984244670991658,
"grad_norm": 0.03843539580702782,
"learning_rate": 0.004380898755948953,
"loss": 5.9813,
"step": 248
},
{
"epoch": 0.23076923076923078,
"grad_norm": 0.0366608090698719,
"learning_rate": 0.004376095765283298,
"loss": 6.0,
"step": 249
},
{
"epoch": 0.23169601482854496,
"grad_norm": 0.06157747656106949,
"learning_rate": 0.004371276870427753,
"loss": 6.025,
"step": 250
},
{
"epoch": 0.23262279888785914,
"grad_norm": 0.055426549166440964,
"learning_rate": 0.004366442112233441,
"loss": 5.975,
"step": 251
},
{
"epoch": 0.23354958294717332,
"grad_norm": 0.03506896272301674,
"learning_rate": 0.004361591531685964,
"loss": 5.9813,
"step": 252
},
{
"epoch": 0.2344763670064875,
"grad_norm": 0.03997468575835228,
"learning_rate": 0.004356725169905052,
"loss": 5.95,
"step": 253
},
{
"epoch": 0.23540315106580167,
"grad_norm": 0.06662409007549286,
"learning_rate": 0.0043518430681442205,
"loss": 5.9625,
"step": 254
},
{
"epoch": 0.23632993512511585,
"grad_norm": 0.0542214997112751,
"learning_rate": 0.004346945267790413,
"loss": 5.9625,
"step": 255
},
{
"epoch": 0.23725671918443003,
"grad_norm": 0.05418306961655617,
"learning_rate": 0.004342031810363658,
"loss": 5.9625,
"step": 256
},
{
"epoch": 0.2381835032437442,
"grad_norm": 0.08298410475254059,
"learning_rate": 0.004337102737516711,
"loss": 5.9563,
"step": 257
},
{
"epoch": 0.2391102873030584,
"grad_norm": 0.051485590636730194,
"learning_rate": 0.004332158091034705,
"loss": 5.9938,
"step": 258
},
{
"epoch": 0.24003707136237257,
"grad_norm": 0.041104063391685486,
"learning_rate": 0.004327197912834795,
"loss": 5.9125,
"step": 259
},
{
"epoch": 0.24096385542168675,
"grad_norm": 0.06750784069299698,
"learning_rate": 0.0043222222449658025,
"loss": 5.9563,
"step": 260
},
{
"epoch": 0.24189063948100092,
"grad_norm": 0.05327602103352547,
"learning_rate": 0.0043172311296078595,
"loss": 5.8812,
"step": 261
},
{
"epoch": 0.2428174235403151,
"grad_norm": 0.05027195066213608,
"learning_rate": 0.00431222460907205,
"loss": 5.9125,
"step": 262
},
{
"epoch": 0.24374420759962928,
"grad_norm": 0.06142845377326012,
"learning_rate": 0.004307202725800052,
"loss": 5.9,
"step": 263
},
{
"epoch": 0.24467099165894346,
"grad_norm": 0.06710369884967804,
"learning_rate": 0.004302165522363779,
"loss": 5.9437,
"step": 264
},
{
"epoch": 0.24559777571825764,
"grad_norm": 0.06705372035503387,
"learning_rate": 0.004297113041465017,
"loss": 5.9062,
"step": 265
},
{
"epoch": 0.24652455977757182,
"grad_norm": 0.06116189435124397,
"learning_rate": 0.004292045325935063,
"loss": 5.9,
"step": 266
},
{
"epoch": 0.247451343836886,
"grad_norm": 0.054194726049900055,
"learning_rate": 0.004286962418734364,
"loss": 5.875,
"step": 267
},
{
"epoch": 0.24837812789620017,
"grad_norm": 0.0627150684595108,
"learning_rate": 0.004281864362952147,
"loss": 5.8875,
"step": 268
},
{
"epoch": 0.24930491195551435,
"grad_norm": 0.0440673902630806,
"learning_rate": 0.004276751201806063,
"loss": 5.8938,
"step": 269
},
{
"epoch": 0.25023169601482853,
"grad_norm": 0.034663740545511246,
"learning_rate": 0.004271622978641812,
"loss": 5.8625,
"step": 270
},
{
"epoch": 0.2511584800741427,
"grad_norm": 0.04779878258705139,
"learning_rate": 0.004266479736932779,
"loss": 5.8563,
"step": 271
},
{
"epoch": 0.2520852641334569,
"grad_norm": 0.060510262846946716,
"learning_rate": 0.004261321520279666,
"loss": 5.8563,
"step": 272
},
{
"epoch": 0.25301204819277107,
"grad_norm": 0.05226600542664528,
"learning_rate": 0.004256148372410125,
"loss": 5.8375,
"step": 273
},
{
"epoch": 0.25393883225208524,
"grad_norm": 0.05810929834842682,
"learning_rate": 0.004250960337178377,
"loss": 5.8625,
"step": 274
},
{
"epoch": 0.2548656163113994,
"grad_norm": 0.07357963919639587,
"learning_rate": 0.004245757458564855,
"loss": 5.8688,
"step": 275
},
{
"epoch": 0.2557924003707136,
"grad_norm": 0.07380347698926926,
"learning_rate": 0.004240539780675817,
"loss": 5.8563,
"step": 276
},
{
"epoch": 0.2567191844300278,
"grad_norm": 0.05101478099822998,
"learning_rate": 0.0042353073477429835,
"loss": 5.825,
"step": 277
},
{
"epoch": 0.25764596848934196,
"grad_norm": 0.03864740952849388,
"learning_rate": 0.004230060204123156,
"loss": 5.8688,
"step": 278
},
{
"epoch": 0.25857275254865614,
"grad_norm": 0.06766132265329361,
"learning_rate": 0.004224798394297841,
"loss": 5.85,
"step": 279
},
{
"epoch": 0.2594995366079703,
"grad_norm": 0.06980055570602417,
"learning_rate": 0.004219521962872876,
"loss": 5.875,
"step": 280
},
{
"epoch": 0.26042632066728455,
"grad_norm": 0.04153401404619217,
"learning_rate": 0.004214230954578051,
"loss": 5.8313,
"step": 281
},
{
"epoch": 0.26135310472659873,
"grad_norm": 0.045340005308389664,
"learning_rate": 0.004208925414266726,
"loss": 5.8125,
"step": 282
},
{
"epoch": 0.2622798887859129,
"grad_norm": 0.04986559599637985,
"learning_rate": 0.004203605386915454,
"loss": 5.825,
"step": 283
},
{
"epoch": 0.2632066728452271,
"grad_norm": 0.04970383271574974,
"learning_rate": 0.004198270917623599,
"loss": 5.7688,
"step": 284
},
{
"epoch": 0.26413345690454126,
"grad_norm": 0.05129897966980934,
"learning_rate": 0.004192922051612953,
"loss": 5.8,
"step": 285
},
{
"epoch": 0.26506024096385544,
"grad_norm": 0.03994636610150337,
"learning_rate": 0.004187558834227354,
"loss": 5.8,
"step": 286
},
{
"epoch": 0.2659870250231696,
"grad_norm": 0.05204310640692711,
"learning_rate": 0.004182181310932297,
"loss": 5.7938,
"step": 287
},
{
"epoch": 0.2669138090824838,
"grad_norm": 0.03257805109024048,
"learning_rate": 0.004176789527314558,
"loss": 5.7562,
"step": 288
},
{
"epoch": 0.267840593141798,
"grad_norm": 0.035661760717630386,
"learning_rate": 0.004171383529081797,
"loss": 5.7812,
"step": 289
},
{
"epoch": 0.26876737720111216,
"grad_norm": 0.04478088766336441,
"learning_rate": 0.004165963362062177,
"loss": 5.7562,
"step": 290
},
{
"epoch": 0.26969416126042633,
"grad_norm": 0.03838647902011871,
"learning_rate": 0.004160529072203974,
"loss": 5.7688,
"step": 291
},
{
"epoch": 0.2706209453197405,
"grad_norm": 0.040849462151527405,
"learning_rate": 0.004155080705575188,
"loss": 5.7438,
"step": 292
},
{
"epoch": 0.2715477293790547,
"grad_norm": 0.051210496574640274,
"learning_rate": 0.004149618308363149,
"loss": 5.7375,
"step": 293
},
{
"epoch": 0.27247451343836887,
"grad_norm": 0.07401825487613678,
"learning_rate": 0.00414414192687413,
"loss": 5.7812,
"step": 294
},
{
"epoch": 0.27340129749768305,
"grad_norm": 0.10748963057994843,
"learning_rate": 0.004138651607532954,
"loss": 5.75,
"step": 295
},
{
"epoch": 0.2743280815569972,
"grad_norm": 0.07754500955343246,
"learning_rate": 0.004133147396882597,
"loss": 5.7562,
"step": 296
},
{
"epoch": 0.2752548656163114,
"grad_norm": 0.04524754732847214,
"learning_rate": 0.004127629341583795,
"loss": 5.7375,
"step": 297
},
{
"epoch": 0.2761816496756256,
"grad_norm": 0.06774584203958511,
"learning_rate": 0.004122097488414652,
"loss": 5.7375,
"step": 298
},
{
"epoch": 0.27710843373493976,
"grad_norm": 0.050472185015678406,
"learning_rate": 0.004116551884270237,
"loss": 5.6937,
"step": 299
},
{
"epoch": 0.27803521779425394,
"grad_norm": 0.040967270731925964,
"learning_rate": 0.0041109925761621926,
"loss": 5.7313,
"step": 300
},
{
"epoch": 0.2789620018535681,
"grad_norm": 0.03739303722977638,
"learning_rate": 0.004105419611218332,
"loss": 5.7188,
"step": 301
},
{
"epoch": 0.2798887859128823,
"grad_norm": 0.04636852815747261,
"learning_rate": 0.004099833036682241,
"loss": 5.725,
"step": 302
},
{
"epoch": 0.2808155699721965,
"grad_norm": 0.08012169599533081,
"learning_rate": 0.00409423289991288,
"loss": 5.7313,
"step": 303
},
{
"epoch": 0.28174235403151066,
"grad_norm": 0.05987093225121498,
"learning_rate": 0.004088619248384178,
"loss": 5.7125,
"step": 304
},
{
"epoch": 0.28266913809082483,
"grad_norm": 0.07735589891672134,
"learning_rate": 0.0040829921296846325,
"loss": 5.7,
"step": 305
},
{
"epoch": 0.283595922150139,
"grad_norm": 0.09283655136823654,
"learning_rate": 0.004077351591516908,
"loss": 5.675,
"step": 306
},
{
"epoch": 0.2845227062094532,
"grad_norm": 0.09337766468524933,
"learning_rate": 0.004071697681697427,
"loss": 5.7375,
"step": 307
},
{
"epoch": 0.28544949026876737,
"grad_norm": 0.06437985599040985,
"learning_rate": 0.00406603044815597,
"loss": 5.6875,
"step": 308
},
{
"epoch": 0.28637627432808155,
"grad_norm": 0.04110102728009224,
"learning_rate": 0.004060349938935264,
"loss": 5.6937,
"step": 309
},
{
"epoch": 0.2873030583873957,
"grad_norm": 0.06071547046303749,
"learning_rate": 0.004054656202190578,
"loss": 5.7375,
"step": 310
},
{
"epoch": 0.2882298424467099,
"grad_norm": 0.05311071500182152,
"learning_rate": 0.004048949286189315,
"loss": 5.65,
"step": 311
},
{
"epoch": 0.2891566265060241,
"grad_norm": 0.031259018927812576,
"learning_rate": 0.004043229239310603,
"loss": 5.6688,
"step": 312
},
{
"epoch": 0.29008341056533826,
"grad_norm": 0.03335728868842125,
"learning_rate": 0.0040374961100448845,
"loss": 5.675,
"step": 313
},
{
"epoch": 0.29101019462465244,
"grad_norm": 0.035077281296253204,
"learning_rate": 0.004031749946993501,
"loss": 5.675,
"step": 314
},
{
"epoch": 0.2919369786839666,
"grad_norm": 0.030766339972615242,
"learning_rate": 0.004025990798868291,
"loss": 5.6688,
"step": 315
},
{
"epoch": 0.2928637627432808,
"grad_norm": 0.03741341829299927,
"learning_rate": 0.004020218714491166,
"loss": 5.6625,
"step": 316
},
{
"epoch": 0.293790546802595,
"grad_norm": 0.044073686003685,
"learning_rate": 0.0040144337427937046,
"loss": 5.6375,
"step": 317
},
{
"epoch": 0.29471733086190915,
"grad_norm": 0.05024448409676552,
"learning_rate": 0.004008635932816734,
"loss": 5.6813,
"step": 318
},
{
"epoch": 0.29564411492122333,
"grad_norm": 0.045678358525037766,
"learning_rate": 0.004002825333709915,
"loss": 5.5938,
"step": 319
},
{
"epoch": 0.2965708989805375,
"grad_norm": 0.05762135609984398,
"learning_rate": 0.003997001994731328,
"loss": 5.6438,
"step": 320
},
{
"epoch": 0.2974976830398517,
"grad_norm": 0.07177098840475082,
"learning_rate": 0.003991165965247046,
"loss": 5.6375,
"step": 321
},
{
"epoch": 0.29842446709916587,
"grad_norm": 0.07682537287473679,
"learning_rate": 0.003985317294730731,
"loss": 5.675,
"step": 322
},
{
"epoch": 0.29935125115848005,
"grad_norm": 0.08128990978002548,
"learning_rate": 0.003979456032763201,
"loss": 5.675,
"step": 323
},
{
"epoch": 0.3002780352177943,
"grad_norm": 0.08135168999433517,
"learning_rate": 0.003973582229032019,
"loss": 5.7125,
"step": 324
},
{
"epoch": 0.30120481927710846,
"grad_norm": 0.10176597535610199,
"learning_rate": 0.003967695933331064,
"loss": 5.6875,
"step": 325
},
{
"epoch": 0.30213160333642264,
"grad_norm": 0.10529598593711853,
"learning_rate": 0.003961797195560118,
"loss": 5.675,
"step": 326
},
{
"epoch": 0.3030583873957368,
"grad_norm": 0.06495360285043716,
"learning_rate": 0.003955886065724433,
"loss": 5.6312,
"step": 327
},
{
"epoch": 0.303985171455051,
"grad_norm": 0.06810038536787033,
"learning_rate": 0.003949962593934316,
"loss": 5.6312,
"step": 328
},
{
"epoch": 0.3049119555143652,
"grad_norm": 0.058491405099630356,
"learning_rate": 0.003944026830404698,
"loss": 5.5813,
"step": 329
},
{
"epoch": 0.30583873957367935,
"grad_norm": 0.05078050121665001,
"learning_rate": 0.003938078825454709,
"loss": 5.575,
"step": 330
},
{
"epoch": 0.30676552363299353,
"grad_norm": 0.06602590531110764,
"learning_rate": 0.003932118629507257,
"loss": 5.5875,
"step": 331
},
{
"epoch": 0.3076923076923077,
"grad_norm": 0.0416589193046093,
"learning_rate": 0.0039261462930885935,
"loss": 5.6,
"step": 332
},
{
"epoch": 0.3086190917516219,
"grad_norm": 0.04823141545057297,
"learning_rate": 0.003920161866827889,
"loss": 5.5813,
"step": 333
},
{
"epoch": 0.30954587581093607,
"grad_norm": 0.03508712351322174,
"learning_rate": 0.003914165401456804,
"loss": 5.5875,
"step": 334
},
{
"epoch": 0.31047265987025024,
"grad_norm": 0.03729189559817314,
"learning_rate": 0.003908156947809056,
"loss": 5.575,
"step": 335
},
{
"epoch": 0.3113994439295644,
"grad_norm": 0.047349270433187485,
"learning_rate": 0.0039021365568199917,
"loss": 5.5625,
"step": 336
},
{
"epoch": 0.3123262279888786,
"grad_norm": 0.04627249017357826,
"learning_rate": 0.0038961042795261536,
"loss": 5.5375,
"step": 337
},
{
"epoch": 0.3132530120481928,
"grad_norm": 0.03604106232523918,
"learning_rate": 0.0038900601670648484,
"loss": 5.575,
"step": 338
},
{
"epoch": 0.31417979610750696,
"grad_norm": 0.040808554738759995,
"learning_rate": 0.0038840042706737112,
"loss": 5.5563,
"step": 339
},
{
"epoch": 0.31510658016682114,
"grad_norm": 0.027617141604423523,
"learning_rate": 0.003877936641690275,
"loss": 5.5813,
"step": 340
},
{
"epoch": 0.3160333642261353,
"grad_norm": 0.03513359650969505,
"learning_rate": 0.0038718573315515317,
"loss": 5.5438,
"step": 341
},
{
"epoch": 0.3169601482854495,
"grad_norm": 0.03978215530514717,
"learning_rate": 0.0038657663917934983,
"loss": 5.575,
"step": 342
},
{
"epoch": 0.3178869323447637,
"grad_norm": 0.025322776287794113,
"learning_rate": 0.0038596638740507785,
"loss": 5.525,
"step": 343
},
{
"epoch": 0.31881371640407785,
"grad_norm": 0.04898100346326828,
"learning_rate": 0.0038535498300561266,
"loss": 5.525,
"step": 344
},
{
"epoch": 0.31974050046339203,
"grad_norm": 0.0469982884824276,
"learning_rate": 0.003847424311640009,
"loss": 5.5438,
"step": 345
},
{
"epoch": 0.3206672845227062,
"grad_norm": 0.03919081762433052,
"learning_rate": 0.0038412873707301615,
"loss": 5.5312,
"step": 346
},
{
"epoch": 0.3215940685820204,
"grad_norm": 0.04740371182560921,
"learning_rate": 0.0038351390593511546,
"loss": 5.5,
"step": 347
},
{
"epoch": 0.32252085264133457,
"grad_norm": 0.05560089647769928,
"learning_rate": 0.003828979429623947,
"loss": 5.5125,
"step": 348
},
{
"epoch": 0.32344763670064874,
"grad_norm": 0.060783710330724716,
"learning_rate": 0.0038228085337654472,
"loss": 5.5312,
"step": 349
},
{
"epoch": 0.3243744207599629,
"grad_norm": 0.0725303441286087,
"learning_rate": 0.00381662642408807,
"loss": 5.5,
"step": 350
},
{
"epoch": 0.3253012048192771,
"grad_norm": 0.07496823370456696,
"learning_rate": 0.003810433152999293,
"loss": 5.5,
"step": 351
},
{
"epoch": 0.3262279888785913,
"grad_norm": 0.06248985975980759,
"learning_rate": 0.0038042287730012114,
"loss": 5.525,
"step": 352
},
{
"epoch": 0.32715477293790546,
"grad_norm": 0.06995397806167603,
"learning_rate": 0.003798013336690095,
"loss": 5.5188,
"step": 353
},
{
"epoch": 0.32808155699721964,
"grad_norm": 0.04727565497159958,
"learning_rate": 0.0037917868967559387,
"loss": 5.525,
"step": 354
},
{
"epoch": 0.3290083410565338,
"grad_norm": 0.05960770696401596,
"learning_rate": 0.0037855495059820215,
"loss": 5.5,
"step": 355
},
{
"epoch": 0.329935125115848,
"grad_norm": 0.049259670078754425,
"learning_rate": 0.0037793012172444534,
"loss": 5.4813,
"step": 356
},
{
"epoch": 0.33086190917516217,
"grad_norm": 0.06020974740386009,
"learning_rate": 0.003773042083511731,
"loss": 5.4625,
"step": 357
},
{
"epoch": 0.33178869323447635,
"grad_norm": 0.0410022996366024,
"learning_rate": 0.003766772157844284,
"loss": 5.4813,
"step": 358
},
{
"epoch": 0.33271547729379053,
"grad_norm": 0.04682173952460289,
"learning_rate": 0.003760491493394032,
"loss": 5.5,
"step": 359
},
{
"epoch": 0.3336422613531047,
"grad_norm": 0.055474553257226944,
"learning_rate": 0.003754200143403929,
"loss": 5.4938,
"step": 360
},
{
"epoch": 0.3345690454124189,
"grad_norm": 0.04533625394105911,
"learning_rate": 0.0037478981612075126,
"loss": 5.4625,
"step": 361
},
{
"epoch": 0.33549582947173306,
"grad_norm": 0.0564807690680027,
"learning_rate": 0.0037415856002284524,
"loss": 5.4188,
"step": 362
},
{
"epoch": 0.33642261353104724,
"grad_norm": 0.056940093636512756,
"learning_rate": 0.003735262513980099,
"loss": 5.4313,
"step": 363
},
{
"epoch": 0.3373493975903614,
"grad_norm": 0.03561275824904442,
"learning_rate": 0.003728928956065027,
"loss": 5.4313,
"step": 364
},
{
"epoch": 0.3382761816496756,
"grad_norm": 0.04059695452451706,
"learning_rate": 0.003722584980174583,
"loss": 5.425,
"step": 365
},
{
"epoch": 0.3392029657089898,
"grad_norm": 0.05738742649555206,
"learning_rate": 0.0037162306400884307,
"loss": 5.45,
"step": 366
},
{
"epoch": 0.340129749768304,
"grad_norm": 0.057356227189302444,
"learning_rate": 0.0037098659896740906,
"loss": 5.45,
"step": 367
},
{
"epoch": 0.3410565338276182,
"grad_norm": 0.049577098339796066,
"learning_rate": 0.0037034910828864904,
"loss": 5.4625,
"step": 368
},
{
"epoch": 0.34198331788693237,
"grad_norm": 0.03639480471611023,
"learning_rate": 0.003697105973767503,
"loss": 5.3875,
"step": 369
},
{
"epoch": 0.34291010194624655,
"grad_norm": 0.0382065586745739,
"learning_rate": 0.003690710716445488,
"loss": 5.4437,
"step": 370
},
{
"epoch": 0.3438368860055607,
"grad_norm": 0.06564627587795258,
"learning_rate": 0.0036843053651348357,
"loss": 5.4062,
"step": 371
},
{
"epoch": 0.3447636700648749,
"grad_norm": 0.08808669447898865,
"learning_rate": 0.003677889974135504,
"loss": 5.4062,
"step": 372
},
{
"epoch": 0.3456904541241891,
"grad_norm": 0.05307735130190849,
"learning_rate": 0.0036714645978325636,
"loss": 5.4,
"step": 373
},
{
"epoch": 0.34661723818350326,
"grad_norm": 0.05861683562397957,
"learning_rate": 0.0036650292906957294,
"loss": 5.4563,
"step": 374
},
{
"epoch": 0.34754402224281744,
"grad_norm": 0.06583855301141739,
"learning_rate": 0.003658584107278905,
"loss": 5.3938,
"step": 375
},
{
"epoch": 0.3484708063021316,
"grad_norm": 0.038819484412670135,
"learning_rate": 0.0036521291022197184,
"loss": 5.3625,
"step": 376
},
{
"epoch": 0.3493975903614458,
"grad_norm": 0.0668378546833992,
"learning_rate": 0.0036456643302390564,
"loss": 5.3688,
"step": 377
},
{
"epoch": 0.35032437442076,
"grad_norm": 0.06500761210918427,
"learning_rate": 0.0036391898461406043,
"loss": 5.3688,
"step": 378
},
{
"epoch": 0.35125115848007415,
"grad_norm": 0.06566040962934494,
"learning_rate": 0.003632705704810379,
"loss": 5.3875,
"step": 379
},
{
"epoch": 0.35217794253938833,
"grad_norm": 0.04046965390443802,
"learning_rate": 0.0036262119612162657,
"loss": 5.3563,
"step": 380
},
{
"epoch": 0.3531047265987025,
"grad_norm": 0.04664246365427971,
"learning_rate": 0.0036197086704075495,
"loss": 5.35,
"step": 381
},
{
"epoch": 0.3540315106580167,
"grad_norm": 0.06433206051588058,
"learning_rate": 0.0036131958875144496,
"loss": 5.3938,
"step": 382
},
{
"epoch": 0.35495829471733087,
"grad_norm": 0.06552179157733917,
"learning_rate": 0.003606673667747653,
"loss": 5.375,
"step": 383
},
{
"epoch": 0.35588507877664505,
"grad_norm": 0.0640706792473793,
"learning_rate": 0.0036001420663978466,
"loss": 5.3938,
"step": 384
},
{
"epoch": 0.3568118628359592,
"grad_norm": 0.0631820559501648,
"learning_rate": 0.003593601138835246,
"loss": 5.3375,
"step": 385
},
{
"epoch": 0.3577386468952734,
"grad_norm": 0.0694313570857048,
"learning_rate": 0.0035870509405091272,
"loss": 5.3812,
"step": 386
},
{
"epoch": 0.3586654309545876,
"grad_norm": 0.05696525424718857,
"learning_rate": 0.0035804915269473598,
"loss": 5.3563,
"step": 387
},
{
"epoch": 0.35959221501390176,
"grad_norm": 0.041316401213407516,
"learning_rate": 0.0035739229537559316,
"loss": 5.3313,
"step": 388
},
{
"epoch": 0.36051899907321594,
"grad_norm": 0.05180737376213074,
"learning_rate": 0.003567345276618479,
"loss": 5.3625,
"step": 389
},
{
"epoch": 0.3614457831325301,
"grad_norm": 0.06132522597908974,
"learning_rate": 0.003560758551295816,
"loss": 5.3375,
"step": 390
},
{
"epoch": 0.3623725671918443,
"grad_norm": 0.0825105607509613,
"learning_rate": 0.00355416283362546,
"loss": 5.3625,
"step": 391
},
{
"epoch": 0.3632993512511585,
"grad_norm": 0.09952400624752045,
"learning_rate": 0.0035475581795211594,
"loss": 5.375,
"step": 392
},
{
"epoch": 0.36422613531047265,
"grad_norm": 0.11159048974514008,
"learning_rate": 0.0035409446449724187,
"loss": 5.3875,
"step": 393
},
{
"epoch": 0.36515291936978683,
"grad_norm": 0.06153342127799988,
"learning_rate": 0.0035343222860440247,
"loss": 5.35,
"step": 394
},
{
"epoch": 0.366079703429101,
"grad_norm": 0.055650901049375534,
"learning_rate": 0.0035276911588755723,
"loss": 5.2938,
"step": 395
},
{
"epoch": 0.3670064874884152,
"grad_norm": 0.05008624121546745,
"learning_rate": 0.003521051319680984,
"loss": 5.3375,
"step": 396
},
{
"epoch": 0.36793327154772937,
"grad_norm": 0.04708503931760788,
"learning_rate": 0.0035144028247480405,
"loss": 5.3438,
"step": 397
},
{
"epoch": 0.36886005560704355,
"grad_norm": 0.041482266038656235,
"learning_rate": 0.0035077457304378964,
"loss": 5.2875,
"step": 398
},
{
"epoch": 0.3697868396663577,
"grad_norm": 0.056157998740673065,
"learning_rate": 0.003501080093184607,
"loss": 5.3,
"step": 399
},
{
"epoch": 0.3707136237256719,
"grad_norm": 0.047049764543771744,
"learning_rate": 0.0034944059694946494,
"loss": 5.3,
"step": 400
},
{
"epoch": 0.3716404077849861,
"grad_norm": 0.0425553135573864,
"learning_rate": 0.0034877234159464412,
"loss": 5.325,
"step": 401
},
{
"epoch": 0.37256719184430026,
"grad_norm": 0.036974068731069565,
"learning_rate": 0.003481032489189862,
"loss": 5.275,
"step": 402
},
{
"epoch": 0.37349397590361444,
"grad_norm": 0.038740385323762894,
"learning_rate": 0.003474333245945775,
"loss": 5.2438,
"step": 403
},
{
"epoch": 0.3744207599629286,
"grad_norm": 0.037295546382665634,
"learning_rate": 0.0034676257430055436,
"loss": 5.2688,
"step": 404
},
{
"epoch": 0.3753475440222428,
"grad_norm": 0.04598161205649376,
"learning_rate": 0.00346091003723055,
"loss": 5.2812,
"step": 405
},
{
"epoch": 0.376274328081557,
"grad_norm": 0.052688293159008026,
"learning_rate": 0.003454186185551717,
"loss": 5.2625,
"step": 406
},
{
"epoch": 0.37720111214087115,
"grad_norm": 0.0431685745716095,
"learning_rate": 0.0034474542449690203,
"loss": 5.2313,
"step": 407
},
{
"epoch": 0.37812789620018533,
"grad_norm": 0.047002580016851425,
"learning_rate": 0.0034407142725510075,
"loss": 5.25,
"step": 408
},
{
"epoch": 0.3790546802594995,
"grad_norm": 0.034174490720033646,
"learning_rate": 0.003433966325434315,
"loss": 5.2438,
"step": 409
},
{
"epoch": 0.3799814643188137,
"grad_norm": 0.037927597761154175,
"learning_rate": 0.0034272104608231825,
"loss": 5.2562,
"step": 410
},
{
"epoch": 0.3809082483781279,
"grad_norm": 0.040478792041540146,
"learning_rate": 0.003420446735988969,
"loss": 5.25,
"step": 411
},
{
"epoch": 0.3818350324374421,
"grad_norm": 0.043072253465652466,
"learning_rate": 0.0034136752082696664,
"loss": 5.1688,
"step": 412
},
{
"epoch": 0.3827618164967563,
"grad_norm": 0.04011726379394531,
"learning_rate": 0.003406895935069414,
"loss": 5.2375,
"step": 413
},
{
"epoch": 0.38368860055607046,
"grad_norm": 0.056565847247838974,
"learning_rate": 0.0034001089738580127,
"loss": 5.2562,
"step": 414
},
{
"epoch": 0.38461538461538464,
"grad_norm": 0.045512937009334564,
"learning_rate": 0.0033933143821704343,
"loss": 5.25,
"step": 415
},
{
"epoch": 0.3855421686746988,
"grad_norm": 0.05256471410393715,
"learning_rate": 0.003386512217606339,
"loss": 5.2375,
"step": 416
},
{
"epoch": 0.386468952734013,
"grad_norm": 0.055981192737817764,
"learning_rate": 0.0033797025378295826,
"loss": 5.2438,
"step": 417
},
{
"epoch": 0.3873957367933272,
"grad_norm": 0.06136908382177353,
"learning_rate": 0.003372885400567731,
"loss": 5.2375,
"step": 418
},
{
"epoch": 0.38832252085264135,
"grad_norm": 0.07198972254991531,
"learning_rate": 0.003366060863611567,
"loss": 5.225,
"step": 419
},
{
"epoch": 0.38924930491195553,
"grad_norm": 0.05037841945886612,
"learning_rate": 0.003359228984814605,
"loss": 5.1937,
"step": 420
},
{
"epoch": 0.3901760889712697,
"grad_norm": 0.0768144503235817,
"learning_rate": 0.0033523898220925974,
"loss": 5.1875,
"step": 421
},
{
"epoch": 0.3911028730305839,
"grad_norm": 0.08858561515808105,
"learning_rate": 0.003345543433423044,
"loss": 5.2625,
"step": 422
},
{
"epoch": 0.39202965708989806,
"grad_norm": 0.10811244696378708,
"learning_rate": 0.0033386898768447016,
"loss": 5.2375,
"step": 423
},
{
"epoch": 0.39295644114921224,
"grad_norm": 0.11364039778709412,
"learning_rate": 0.003331829210457091,
"loss": 5.2812,
"step": 424
},
{
"epoch": 0.3938832252085264,
"grad_norm": 0.08991072326898575,
"learning_rate": 0.0033249614924200054,
"loss": 5.2188,
"step": 425
},
{
"epoch": 0.3948100092678406,
"grad_norm": 0.0634012222290039,
"learning_rate": 0.003318086780953016,
"loss": 5.1813,
"step": 426
},
{
"epoch": 0.3957367933271548,
"grad_norm": 0.07201571762561798,
"learning_rate": 0.003311205134334979,
"loss": 5.2,
"step": 427
},
{
"epoch": 0.39666357738646896,
"grad_norm": 0.0652351826429367,
"learning_rate": 0.0033043166109035446,
"loss": 5.2,
"step": 428
},
{
"epoch": 0.39759036144578314,
"grad_norm": 0.04549067094922066,
"learning_rate": 0.0032974212690546558,
"loss": 5.1875,
"step": 429
},
{
"epoch": 0.3985171455050973,
"grad_norm": 0.06608382612466812,
"learning_rate": 0.0032905191672420596,
"loss": 5.2313,
"step": 430
},
{
"epoch": 0.3994439295644115,
"grad_norm": 0.04941621795296669,
"learning_rate": 0.003283610363976809,
"loss": 5.1375,
"step": 431
},
{
"epoch": 0.40037071362372567,
"grad_norm": 0.05331863835453987,
"learning_rate": 0.0032766949178267657,
"loss": 5.1188,
"step": 432
},
{
"epoch": 0.40129749768303985,
"grad_norm": 0.04874474182724953,
"learning_rate": 0.003269772887416106,
"loss": 5.1562,
"step": 433
},
{
"epoch": 0.40222428174235403,
"grad_norm": 0.05278300493955612,
"learning_rate": 0.0032628443314248233,
"loss": 5.1438,
"step": 434
},
{
"epoch": 0.4031510658016682,
"grad_norm": 0.04638415202498436,
"learning_rate": 0.003255909308588229,
"loss": 5.1438,
"step": 435
},
{
"epoch": 0.4040778498609824,
"grad_norm": 0.06462404876947403,
"learning_rate": 0.003248967877696457,
"loss": 5.1875,
"step": 436
},
{
"epoch": 0.40500463392029656,
"grad_norm": 0.04122454300522804,
"learning_rate": 0.0032420200975939633,
"loss": 5.1375,
"step": 437
},
{
"epoch": 0.40593141797961074,
"grad_norm": 0.05846314877271652,
"learning_rate": 0.003235066027179028,
"loss": 5.15,
"step": 438
},
{
"epoch": 0.4068582020389249,
"grad_norm": 0.06503690779209137,
"learning_rate": 0.0032281057254032563,
"loss": 5.1375,
"step": 439
},
{
"epoch": 0.4077849860982391,
"grad_norm": 0.05073606222867966,
"learning_rate": 0.0032211392512710773,
"loss": 5.0875,
"step": 440
},
{
"epoch": 0.4087117701575533,
"grad_norm": 0.06046286225318909,
"learning_rate": 0.003214166663839247,
"loss": 5.1188,
"step": 441
},
{
"epoch": 0.40963855421686746,
"grad_norm": 0.03978972136974335,
"learning_rate": 0.003207188022216343,
"loss": 5.125,
"step": 442
},
{
"epoch": 0.41056533827618164,
"grad_norm": 0.04392355680465698,
"learning_rate": 0.0032002033855622683,
"loss": 5.125,
"step": 443
},
{
"epoch": 0.4114921223354958,
"grad_norm": 0.039449259638786316,
"learning_rate": 0.003193212813087745,
"loss": 5.125,
"step": 444
},
{
"epoch": 0.41241890639481,
"grad_norm": 0.04521370679140091,
"learning_rate": 0.003186216364053818,
"loss": 5.0813,
"step": 445
},
{
"epoch": 0.41334569045412417,
"grad_norm": 0.06002253293991089,
"learning_rate": 0.003179214097771346,
"loss": 5.0875,
"step": 446
},
{
"epoch": 0.41427247451343835,
"grad_norm": 0.07361883670091629,
"learning_rate": 0.0031722060736005054,
"loss": 5.1312,
"step": 447
},
{
"epoch": 0.4151992585727525,
"grad_norm": 0.06389747560024261,
"learning_rate": 0.0031651923509502817,
"loss": 5.0875,
"step": 448
},
{
"epoch": 0.4161260426320667,
"grad_norm": 0.07580303400754929,
"learning_rate": 0.003158172989277968,
"loss": 5.1438,
"step": 449
},
{
"epoch": 0.4170528266913809,
"grad_norm": 0.06630785763263702,
"learning_rate": 0.0031511480480886623,
"loss": 5.125,
"step": 450
},
{
"epoch": 0.41797961075069506,
"grad_norm": 0.05100114271044731,
"learning_rate": 0.0031441175869347604,
"loss": 5.0563,
"step": 451
},
{
"epoch": 0.41890639481000924,
"grad_norm": 0.044168341904878616,
"learning_rate": 0.003137081665415453,
"loss": 5.1063,
"step": 452
},
{
"epoch": 0.4198331788693234,
"grad_norm": 0.036300163716077805,
"learning_rate": 0.0031300403431762202,
"loss": 5.0938,
"step": 453
},
{
"epoch": 0.42075996292863765,
"grad_norm": 0.03885301947593689,
"learning_rate": 0.003122993679908325,
"loss": 5.075,
"step": 454
},
{
"epoch": 0.42168674698795183,
"grad_norm": 0.047411106526851654,
"learning_rate": 0.0031159417353483075,
"loss": 5.0813,
"step": 455
},
{
"epoch": 0.422613531047266,
"grad_norm": 0.04042837396264076,
"learning_rate": 0.00310888456927748,
"loss": 5.025,
"step": 456
},
{
"epoch": 0.4235403151065802,
"grad_norm": 0.0529557429254055,
"learning_rate": 0.0031018222415214176,
"loss": 5.0938,
"step": 457
},
{
"epoch": 0.42446709916589437,
"grad_norm": 0.03582127019762993,
"learning_rate": 0.003094754811949453,
"loss": 5.05,
"step": 458
},
{
"epoch": 0.42539388322520855,
"grad_norm": 0.04631989449262619,
"learning_rate": 0.0030876823404741693,
"loss": 5.0625,
"step": 459
},
{
"epoch": 0.4263206672845227,
"grad_norm": 0.05943077430129051,
"learning_rate": 0.0030806048870508896,
"loss": 5.0375,
"step": 460
},
{
"epoch": 0.4272474513438369,
"grad_norm": 0.04641159623861313,
"learning_rate": 0.003073522511677171,
"loss": 5.0687,
"step": 461
},
{
"epoch": 0.4281742354031511,
"grad_norm": 0.04967037960886955,
"learning_rate": 0.0030664352743922964,
"loss": 5.05,
"step": 462
},
{
"epoch": 0.42910101946246526,
"grad_norm": 0.05452379956841469,
"learning_rate": 0.0030593432352767637,
"loss": 5.0563,
"step": 463
},
{
"epoch": 0.43002780352177944,
"grad_norm": 0.05275031551718712,
"learning_rate": 0.003052246454451776,
"loss": 5.05,
"step": 464
},
{
"epoch": 0.4309545875810936,
"grad_norm": 0.0582866407930851,
"learning_rate": 0.0030451449920787356,
"loss": 5.0375,
"step": 465
},
{
"epoch": 0.4318813716404078,
"grad_norm": 0.07089794427156448,
"learning_rate": 0.00303803890835873,
"loss": 5.0813,
"step": 466
},
{
"epoch": 0.432808155699722,
"grad_norm": 0.05818159505724907,
"learning_rate": 0.0030309282635320235,
"loss": 5.025,
"step": 467
},
{
"epoch": 0.43373493975903615,
"grad_norm": 0.05577028915286064,
"learning_rate": 0.0030238131178775465,
"loss": 5.0312,
"step": 468
},
{
"epoch": 0.43466172381835033,
"grad_norm": 0.0684211254119873,
"learning_rate": 0.0030166935317123824,
"loss": 5.0,
"step": 469
},
{
"epoch": 0.4355885078776645,
"grad_norm": 0.06801000237464905,
"learning_rate": 0.0030095695653912617,
"loss": 5.0687,
"step": 470
},
{
"epoch": 0.4365152919369787,
"grad_norm": 0.07714419811964035,
"learning_rate": 0.0030024412793060442,
"loss": 5.05,
"step": 471
},
{
"epoch": 0.43744207599629287,
"grad_norm": 0.07117122411727905,
"learning_rate": 0.0029953087338852086,
"loss": 5.0375,
"step": 472
},
{
"epoch": 0.43836886005560705,
"grad_norm": 0.05810219794511795,
"learning_rate": 0.002988171989593344,
"loss": 5.0125,
"step": 473
},
{
"epoch": 0.4392956441149212,
"grad_norm": 0.0630822405219078,
"learning_rate": 0.002981031106930632,
"loss": 4.9938,
"step": 474
},
{
"epoch": 0.4402224281742354,
"grad_norm": 0.09144022315740585,
"learning_rate": 0.002973886146432338,
"loss": 5.05,
"step": 475
},
{
"epoch": 0.4411492122335496,
"grad_norm": 0.07084767520427704,
"learning_rate": 0.002966737168668295,
"loss": 5.0062,
"step": 476
},
{
"epoch": 0.44207599629286376,
"grad_norm": 0.048369865864515305,
"learning_rate": 0.0029595842342423936,
"loss": 4.9313,
"step": 477
},
{
"epoch": 0.44300278035217794,
"grad_norm": 0.05783843249082565,
"learning_rate": 0.002952427403792063,
"loss": 4.9375,
"step": 478
},
{
"epoch": 0.4439295644114921,
"grad_norm": 0.05991849675774574,
"learning_rate": 0.002945266737987763,
"loss": 4.9688,
"step": 479
},
{
"epoch": 0.4448563484708063,
"grad_norm": 0.05597536638379097,
"learning_rate": 0.0029381022975324645,
"loss": 5.0,
"step": 480
},
{
"epoch": 0.4457831325301205,
"grad_norm": 0.0695003792643547,
"learning_rate": 0.0029309341431611397,
"loss": 5.0125,
"step": 481
},
{
"epoch": 0.44670991658943465,
"grad_norm": 0.08234460651874542,
"learning_rate": 0.002923762335640242,
"loss": 5.0125,
"step": 482
},
{
"epoch": 0.44763670064874883,
"grad_norm": 0.07713950425386429,
"learning_rate": 0.002916586935767195,
"loss": 5.0125,
"step": 483
},
{
"epoch": 0.448563484708063,
"grad_norm": 0.07240517437458038,
"learning_rate": 0.002909408004369877,
"loss": 5.0125,
"step": 484
},
{
"epoch": 0.4494902687673772,
"grad_norm": 0.0547131672501564,
"learning_rate": 0.0029022256023061004,
"loss": 4.9625,
"step": 485
},
{
"epoch": 0.45041705282669137,
"grad_norm": 0.045404303818941116,
"learning_rate": 0.0028950397904631033,
"loss": 5.0,
"step": 486
},
{
"epoch": 0.45134383688600554,
"grad_norm": 0.05781068280339241,
"learning_rate": 0.002887850629757026,
"loss": 4.9563,
"step": 487
},
{
"epoch": 0.4522706209453197,
"grad_norm": 0.048498354852199554,
"learning_rate": 0.0028806581811324007,
"loss": 4.925,
"step": 488
},
{
"epoch": 0.4531974050046339,
"grad_norm": 0.039063528180122375,
"learning_rate": 0.002873462505561632,
"loss": 4.9688,
"step": 489
},
{
"epoch": 0.4541241890639481,
"grad_norm": 0.038773953914642334,
"learning_rate": 0.002866263664044479,
"loss": 4.9437,
"step": 490
},
{
"epoch": 0.45505097312326226,
"grad_norm": 0.058951422572135925,
"learning_rate": 0.002859061717607539,
"loss": 4.95,
"step": 491
},
{
"epoch": 0.45597775718257644,
"grad_norm": 0.058964647352695465,
"learning_rate": 0.0028518567273037327,
"loss": 4.9313,
"step": 492
},
{
"epoch": 0.4569045412418906,
"grad_norm": 0.05438453331589699,
"learning_rate": 0.002844648754211783,
"loss": 4.95,
"step": 493
},
{
"epoch": 0.4578313253012048,
"grad_norm": 0.04710723087191582,
"learning_rate": 0.002837437859435698,
"loss": 4.9062,
"step": 494
},
{
"epoch": 0.458758109360519,
"grad_norm": 0.0365031473338604,
"learning_rate": 0.0028302241041042566,
"loss": 4.9688,
"step": 495
},
{
"epoch": 0.45968489341983315,
"grad_norm": 0.03951582312583923,
"learning_rate": 0.0028230075493704838,
"loss": 4.9563,
"step": 496
},
{
"epoch": 0.4606116774791474,
"grad_norm": 0.04623804986476898,
"learning_rate": 0.0028157882564111385,
"loss": 4.9375,
"step": 497
},
{
"epoch": 0.46153846153846156,
"grad_norm": 0.040012940764427185,
"learning_rate": 0.002808566286426191,
"loss": 4.925,
"step": 498
},
{
"epoch": 0.46246524559777574,
"grad_norm": 0.04338626191020012,
"learning_rate": 0.0028013417006383075,
"loss": 4.95,
"step": 499
},
{
"epoch": 0.4633920296570899,
"grad_norm": 0.0410669781267643,
"learning_rate": 0.0027941145602923267,
"loss": 4.9125,
"step": 500
},
{
"epoch": 0.4643188137164041,
"grad_norm": 0.03322385624051094,
"learning_rate": 0.0027868849266547437,
"loss": 4.8875,
"step": 501
},
{
"epoch": 0.4652455977757183,
"grad_norm": 0.036676980555057526,
"learning_rate": 0.00277965286101319,
"loss": 4.95,
"step": 502
},
{
"epoch": 0.46617238183503246,
"grad_norm": 0.044222161173820496,
"learning_rate": 0.0027724184246759147,
"loss": 4.9125,
"step": 503
},
{
"epoch": 0.46709916589434664,
"grad_norm": 0.06456394493579865,
"learning_rate": 0.002765181678971263,
"loss": 4.9062,
"step": 504
},
{
"epoch": 0.4680259499536608,
"grad_norm": 0.0746362954378128,
"learning_rate": 0.0027579426852471574,
"loss": 4.8875,
"step": 505
},
{
"epoch": 0.468952734012975,
"grad_norm": 0.08617927134037018,
"learning_rate": 0.0027507015048705776,
"loss": 4.8938,
"step": 506
},
{
"epoch": 0.46987951807228917,
"grad_norm": 0.07306444644927979,
"learning_rate": 0.00274345819922704,
"loss": 4.9,
"step": 507
},
{
"epoch": 0.47080630213160335,
"grad_norm": 0.04307616129517555,
"learning_rate": 0.0027362128297200783,
"loss": 4.9062,
"step": 508
},
{
"epoch": 0.4717330861909175,
"grad_norm": 0.06619231402873993,
"learning_rate": 0.0027289654577707214,
"loss": 4.8938,
"step": 509
},
{
"epoch": 0.4726598702502317,
"grad_norm": 0.07649318128824234,
"learning_rate": 0.002721716144816973,
"loss": 4.8938,
"step": 510
},
{
"epoch": 0.4735866543095459,
"grad_norm": 0.0643559917807579,
"learning_rate": 0.002714464952313292,
"loss": 4.825,
"step": 511
},
{
"epoch": 0.47451343836886006,
"grad_norm": 0.07730736583471298,
"learning_rate": 0.0027072119417300713,
"loss": 4.8812,
"step": 512
},
{
"epoch": 0.47544022242817424,
"grad_norm": 0.08054769784212112,
"learning_rate": 0.002699957174553115,
"loss": 4.9062,
"step": 513
},
{
"epoch": 0.4763670064874884,
"grad_norm": 0.06001604348421097,
"learning_rate": 0.002692700712283119,
"loss": 4.8938,
"step": 514
},
{
"epoch": 0.4772937905468026,
"grad_norm": 0.04911705106496811,
"learning_rate": 0.0026854426164351483,
"loss": 4.8625,
"step": 515
},
{
"epoch": 0.4782205746061168,
"grad_norm": 0.04762764275074005,
"learning_rate": 0.002678182948538117,
"loss": 4.8375,
"step": 516
},
{
"epoch": 0.47914735866543096,
"grad_norm": 0.045550934970378876,
"learning_rate": 0.002670921770134266,
"loss": 4.8938,
"step": 517
},
{
"epoch": 0.48007414272474513,
"grad_norm": 0.057238396257162094,
"learning_rate": 0.00266365914277864,
"loss": 4.8875,
"step": 518
},
{
"epoch": 0.4810009267840593,
"grad_norm": 0.053200677037239075,
"learning_rate": 0.002656395128038568,
"loss": 4.8438,
"step": 519
},
{
"epoch": 0.4819277108433735,
"grad_norm": 0.047585804015398026,
"learning_rate": 0.00264912978749314,
"loss": 4.8063,
"step": 520
},
{
"epoch": 0.48285449490268767,
"grad_norm": 0.05673938989639282,
"learning_rate": 0.0026418631827326857,
"loss": 4.8875,
"step": 521
},
{
"epoch": 0.48378127896200185,
"grad_norm": 0.05663244426250458,
"learning_rate": 0.0026345953753582497,
"loss": 4.9,
"step": 522
},
{
"epoch": 0.484708063021316,
"grad_norm": 0.04882281646132469,
"learning_rate": 0.0026273264269810743,
"loss": 4.8313,
"step": 523
},
{
"epoch": 0.4856348470806302,
"grad_norm": 0.0483589768409729,
"learning_rate": 0.0026200563992220733,
"loss": 4.8438,
"step": 524
},
{
"epoch": 0.4865616311399444,
"grad_norm": 0.05800378695130348,
"learning_rate": 0.00261278535371131,
"loss": 4.8125,
"step": 525
},
{
"epoch": 0.48748841519925856,
"grad_norm": 0.04723868519067764,
"learning_rate": 0.002605513352087477,
"loss": 4.7812,
"step": 526
},
{
"epoch": 0.48841519925857274,
"grad_norm": 0.051099590957164764,
"learning_rate": 0.0025982404559973704,
"loss": 4.8125,
"step": 527
},
{
"epoch": 0.4893419833178869,
"grad_norm": 0.05315464735031128,
"learning_rate": 0.00259096672709537,
"loss": 4.775,
"step": 528
},
{
"epoch": 0.4902687673772011,
"grad_norm": 0.05382310971617699,
"learning_rate": 0.002583692227042916,
"loss": 4.7812,
"step": 529
},
{
"epoch": 0.4911955514365153,
"grad_norm": 0.05870763957500458,
"learning_rate": 0.002576417017507983,
"loss": 4.8625,
"step": 530
},
{
"epoch": 0.49212233549582945,
"grad_norm": 0.03859548643231392,
"learning_rate": 0.0025691411601645657,
"loss": 4.7938,
"step": 531
},
{
"epoch": 0.49304911955514363,
"grad_norm": 0.05789710581302643,
"learning_rate": 0.002561864716692145,
"loss": 4.8438,
"step": 532
},
{
"epoch": 0.4939759036144578,
"grad_norm": 0.04865971952676773,
"learning_rate": 0.0025545877487751735,
"loss": 4.7812,
"step": 533
},
{
"epoch": 0.494902687673772,
"grad_norm": 0.05406877398490906,
"learning_rate": 0.0025473103181025475,
"loss": 4.8313,
"step": 534
},
{
"epoch": 0.49582947173308617,
"grad_norm": 0.051227353513240814,
"learning_rate": 0.002540032486367089,
"loss": 4.7562,
"step": 535
},
{
"epoch": 0.49675625579240035,
"grad_norm": 0.05123087763786316,
"learning_rate": 0.002532754315265018,
"loss": 4.8187,
"step": 536
},
{
"epoch": 0.4976830398517145,
"grad_norm": 0.04913110285997391,
"learning_rate": 0.0025254758664954306,
"loss": 4.8125,
"step": 537
},
{
"epoch": 0.4986098239110287,
"grad_norm": 0.04741792008280754,
"learning_rate": 0.0025181972017597806,
"loss": 4.7875,
"step": 538
},
{
"epoch": 0.4995366079703429,
"grad_norm": 0.055246248841285706,
"learning_rate": 0.0025109183827613474,
"loss": 4.8063,
"step": 539
},
{
"epoch": 0.5004633920296571,
"grad_norm": 0.037354640662670135,
"learning_rate": 0.002503639471204722,
"loss": 4.75,
"step": 540
},
{
"epoch": 0.5013901760889713,
"grad_norm": 0.04416719824075699,
"learning_rate": 0.002496360528795279,
"loss": 4.7812,
"step": 541
},
{
"epoch": 0.5023169601482854,
"grad_norm": 0.04072472080588341,
"learning_rate": 0.0024890816172386527,
"loss": 4.75,
"step": 542
},
{
"epoch": 0.5032437442075997,
"grad_norm": 0.048542048782110214,
"learning_rate": 0.002481802798240221,
"loss": 4.7688,
"step": 543
},
{
"epoch": 0.5041705282669138,
"grad_norm": 0.05309506133198738,
"learning_rate": 0.0024745241335045695,
"loss": 4.775,
"step": 544
},
{
"epoch": 0.505097312326228,
"grad_norm": 0.037804365158081055,
"learning_rate": 0.0024672456847349834,
"loss": 4.75,
"step": 545
},
{
"epoch": 0.5060240963855421,
"grad_norm": 0.045449260622262955,
"learning_rate": 0.0024599675136329113,
"loss": 4.7625,
"step": 546
},
{
"epoch": 0.5069508804448564,
"grad_norm": 0.046078864485025406,
"learning_rate": 0.002452689681897453,
"loss": 4.7688,
"step": 547
},
{
"epoch": 0.5078776645041705,
"grad_norm": 0.04518760368227959,
"learning_rate": 0.002445412251224827,
"loss": 4.7375,
"step": 548
},
{
"epoch": 0.5088044485634847,
"grad_norm": 0.03942165523767471,
"learning_rate": 0.002438135283307855,
"loss": 4.75,
"step": 549
},
{
"epoch": 0.5097312326227988,
"grad_norm": 0.045819394290447235,
"learning_rate": 0.0024308588398354344,
"loss": 4.7313,
"step": 550
},
{
"epoch": 0.5106580166821131,
"grad_norm": 0.06149514392018318,
"learning_rate": 0.002423582982492017,
"loss": 4.7313,
"step": 551
},
{
"epoch": 0.5115848007414272,
"grad_norm": 0.06028604507446289,
"learning_rate": 0.002416307772957085,
"loss": 4.7438,
"step": 552
},
{
"epoch": 0.5125115848007414,
"grad_norm": 0.043709807097911835,
"learning_rate": 0.002409033272904631,
"loss": 4.7625,
"step": 553
},
{
"epoch": 0.5134383688600556,
"grad_norm": 0.042988523840904236,
"learning_rate": 0.00240175954400263,
"loss": 4.7562,
"step": 554
},
{
"epoch": 0.5143651529193698,
"grad_norm": 0.053336091339588165,
"learning_rate": 0.002394486647912524,
"loss": 4.6875,
"step": 555
},
{
"epoch": 0.5152919369786839,
"grad_norm": 0.061223022639751434,
"learning_rate": 0.00238721464628869,
"loss": 4.725,
"step": 556
},
{
"epoch": 0.5162187210379982,
"grad_norm": 0.0704147219657898,
"learning_rate": 0.0023799436007779277,
"loss": 4.6813,
"step": 557
},
{
"epoch": 0.5171455050973123,
"grad_norm": 0.06097421795129776,
"learning_rate": 0.002372673573018926,
"loss": 4.7625,
"step": 558
},
{
"epoch": 0.5180722891566265,
"grad_norm": 0.04165394976735115,
"learning_rate": 0.0023654046246417513,
"loss": 4.7125,
"step": 559
},
{
"epoch": 0.5189990732159406,
"grad_norm": 0.040571633726358414,
"learning_rate": 0.0023581368172673153,
"loss": 4.7625,
"step": 560
},
{
"epoch": 0.5199258572752549,
"grad_norm": 0.04544011875987053,
"learning_rate": 0.0023508702125068608,
"loss": 4.7625,
"step": 561
},
{
"epoch": 0.5208526413345691,
"grad_norm": 0.04342002421617508,
"learning_rate": 0.0023436048719614323,
"loss": 4.7313,
"step": 562
},
{
"epoch": 0.5217794253938832,
"grad_norm": 0.041976965963840485,
"learning_rate": 0.00233634085722136,
"loss": 4.7313,
"step": 563
},
{
"epoch": 0.5227062094531975,
"grad_norm": 0.0512029230594635,
"learning_rate": 0.0023290782298657346,
"loss": 4.6937,
"step": 564
},
{
"epoch": 0.5236329935125116,
"grad_norm": 0.06346142292022705,
"learning_rate": 0.002321817051461883,
"loss": 4.675,
"step": 565
},
{
"epoch": 0.5245597775718258,
"grad_norm": 0.05272765830159187,
"learning_rate": 0.002314557383564852,
"loss": 4.75,
"step": 566
},
{
"epoch": 0.5254865616311399,
"grad_norm": 0.038122035562992096,
"learning_rate": 0.002307299287716881,
"loss": 4.7125,
"step": 567
},
{
"epoch": 0.5264133456904542,
"grad_norm": 0.042520515620708466,
"learning_rate": 0.0023000428254468853,
"loss": 4.6875,
"step": 568
},
{
"epoch": 0.5273401297497683,
"grad_norm": 0.05327059328556061,
"learning_rate": 0.0022927880582699284,
"loss": 4.7438,
"step": 569
},
{
"epoch": 0.5282669138090825,
"grad_norm": 0.10062926262617111,
"learning_rate": 0.0022855350476867083,
"loss": 5.4125,
"step": 570
},
{
"epoch": 0.5291936978683967,
"grad_norm": 0.19139476120471954,
"learning_rate": 0.002278283855183027,
"loss": 5.9375,
"step": 571
},
{
"epoch": 0.5301204819277109,
"grad_norm": 0.30302053689956665,
"learning_rate": 0.002271034542229279,
"loss": 6.1438,
"step": 572
},
{
"epoch": 0.531047265987025,
"grad_norm": 0.3599642515182495,
"learning_rate": 0.002263787170279922,
"loss": 6.125,
"step": 573
},
{
"epoch": 0.5319740500463392,
"grad_norm": 0.2241661548614502,
"learning_rate": 0.00225654180077296,
"loss": 5.9938,
"step": 574
},
{
"epoch": 0.5329008341056534,
"grad_norm": 0.10801433026790619,
"learning_rate": 0.0022492984951294225,
"loss": 5.7938,
"step": 575
},
{
"epoch": 0.5338276181649676,
"grad_norm": 0.11764154583215714,
"learning_rate": 0.0022420573147528436,
"loss": 5.7812,
"step": 576
},
{
"epoch": 0.5347544022242817,
"grad_norm": 0.08790837973356247,
"learning_rate": 0.002234818321028737,
"loss": 5.7375,
"step": 577
},
{
"epoch": 0.535681186283596,
"grad_norm": 0.06823479384183884,
"learning_rate": 0.002227581575324086,
"loss": 5.6438,
"step": 578
},
{
"epoch": 0.5366079703429101,
"grad_norm": 0.0775035172700882,
"learning_rate": 0.00222034713898681,
"loss": 5.6375,
"step": 579
},
{
"epoch": 0.5375347544022243,
"grad_norm": 0.05802862346172333,
"learning_rate": 0.0022131150733452573,
"loss": 5.5687,
"step": 580
},
{
"epoch": 0.5384615384615384,
"grad_norm": 0.058500614017248154,
"learning_rate": 0.0022058854397076734,
"loss": 5.5438,
"step": 581
},
{
"epoch": 0.5393883225208527,
"grad_norm": 0.055464208126068115,
"learning_rate": 0.0021986582993616926,
"loss": 5.5,
"step": 582
},
{
"epoch": 0.5403151065801668,
"grad_norm": 0.041989766061306,
"learning_rate": 0.0021914337135738086,
"loss": 5.4563,
"step": 583
},
{
"epoch": 0.541241890639481,
"grad_norm": 0.05176004022359848,
"learning_rate": 0.0021842117435888625,
"loss": 5.45,
"step": 584
},
{
"epoch": 0.5421686746987951,
"grad_norm": 0.058837149292230606,
"learning_rate": 0.0021769924506295168,
"loss": 5.4563,
"step": 585
},
{
"epoch": 0.5430954587581094,
"grad_norm": 0.04392680153250694,
"learning_rate": 0.002169775895895745,
"loss": 5.4062,
"step": 586
},
{
"epoch": 0.5440222428174235,
"grad_norm": 0.05528188496828079,
"learning_rate": 0.002162562140564302,
"loss": 5.375,
"step": 587
},
{
"epoch": 0.5449490268767377,
"grad_norm": 0.04781576246023178,
"learning_rate": 0.002155351245788218,
"loss": 5.3938,
"step": 588
},
{
"epoch": 0.5458758109360519,
"grad_norm": 0.0435294434428215,
"learning_rate": 0.002148143272696268,
"loss": 5.3,
"step": 589
},
{
"epoch": 0.5468025949953661,
"grad_norm": 0.04509313404560089,
"learning_rate": 0.002140938282392461,
"loss": 5.35,
"step": 590
},
{
"epoch": 0.5477293790546802,
"grad_norm": 0.03679104149341583,
"learning_rate": 0.002133736335955522,
"loss": 5.2688,
"step": 591
},
{
"epoch": 0.5486561631139945,
"grad_norm": 0.05090980976819992,
"learning_rate": 0.0021265374944383682,
"loss": 5.2812,
"step": 592
},
{
"epoch": 0.5495829471733086,
"grad_norm": 0.03438156098127365,
"learning_rate": 0.0021193418188675994,
"loss": 5.2688,
"step": 593
},
{
"epoch": 0.5505097312326228,
"grad_norm": 0.03302653878927231,
"learning_rate": 0.002112149370242975,
"loss": 5.25,
"step": 594
},
{
"epoch": 0.5514365152919369,
"grad_norm": 0.039244670420885086,
"learning_rate": 0.0021049602095368973,
"loss": 5.2063,
"step": 595
},
{
"epoch": 0.5523632993512512,
"grad_norm": 0.03585642948746681,
"learning_rate": 0.0020977743976939005,
"loss": 5.275,
"step": 596
},
{
"epoch": 0.5532900834105653,
"grad_norm": 0.03510696068406105,
"learning_rate": 0.0020905919956301236,
"loss": 5.2438,
"step": 597
},
{
"epoch": 0.5542168674698795,
"grad_norm": 0.03569590672850609,
"learning_rate": 0.0020834130642328054,
"loss": 5.175,
"step": 598
},
{
"epoch": 0.5551436515291936,
"grad_norm": 0.030981766059994698,
"learning_rate": 0.0020762376643597585,
"loss": 5.2,
"step": 599
},
{
"epoch": 0.5560704355885079,
"grad_norm": 0.04017426446080208,
"learning_rate": 0.0020690658568388613,
"loss": 5.15,
"step": 600
},
{
"epoch": 0.556997219647822,
"grad_norm": 0.039772696793079376,
"learning_rate": 0.0020618977024675356,
"loss": 5.125,
"step": 601
},
{
"epoch": 0.5579240037071362,
"grad_norm": 0.043551571667194366,
"learning_rate": 0.002054733262012238,
"loss": 5.1438,
"step": 602
},
{
"epoch": 0.5588507877664504,
"grad_norm": 0.03988911956548691,
"learning_rate": 0.0020475725962079373,
"loss": 5.1688,
"step": 603
},
{
"epoch": 0.5597775718257646,
"grad_norm": 0.03845544904470444,
"learning_rate": 0.0020404157657576073,
"loss": 5.1375,
"step": 604
},
{
"epoch": 0.5607043558850788,
"grad_norm": 0.048617441207170486,
"learning_rate": 0.002033262831331705,
"loss": 5.15,
"step": 605
},
{
"epoch": 0.561631139944393,
"grad_norm": 0.03950534015893936,
"learning_rate": 0.0020261138535676614,
"loss": 5.1312,
"step": 606
},
{
"epoch": 0.5625579240037072,
"grad_norm": 0.04601586237549782,
"learning_rate": 0.002018968893069368,
"loss": 5.0687,
"step": 607
},
{
"epoch": 0.5634847080630213,
"grad_norm": 0.048377152532339096,
"learning_rate": 0.002011828010406656,
"loss": 5.0625,
"step": 608
},
{
"epoch": 0.5644114921223355,
"grad_norm": 0.04253297671675682,
"learning_rate": 0.0020046912661147915,
"loss": 5.1,
"step": 609
},
{
"epoch": 0.5653382761816497,
"grad_norm": 0.04242146387696266,
"learning_rate": 0.001997558720693956,
"loss": 5.0813,
"step": 610
},
{
"epoch": 0.5662650602409639,
"grad_norm": 0.042660947889089584,
"learning_rate": 0.001990430434608739,
"loss": 5.1188,
"step": 611
},
{
"epoch": 0.567191844300278,
"grad_norm": 0.03864769637584686,
"learning_rate": 0.0019833064682876177,
"loss": 5.0625,
"step": 612
},
{
"epoch": 0.5681186283595923,
"grad_norm": 0.03322991728782654,
"learning_rate": 0.0019761868821224545,
"loss": 5.0375,
"step": 613
},
{
"epoch": 0.5690454124189064,
"grad_norm": 0.032155055552721024,
"learning_rate": 0.001969071736467977,
"loss": 5.0687,
"step": 614
},
{
"epoch": 0.5699721964782206,
"grad_norm": 0.04553236439824104,
"learning_rate": 0.0019619610916412704,
"loss": 5.1,
"step": 615
},
{
"epoch": 0.5708989805375347,
"grad_norm": 0.039135731756687164,
"learning_rate": 0.001954855007921265,
"loss": 5.025,
"step": 616
},
{
"epoch": 0.571825764596849,
"grad_norm": 0.03503022342920303,
"learning_rate": 0.0019477535455482242,
"loss": 5.0312,
"step": 617
},
{
"epoch": 0.5727525486561631,
"grad_norm": 0.02648424543440342,
"learning_rate": 0.0019406567647232366,
"loss": 5.0125,
"step": 618
},
{
"epoch": 0.5736793327154773,
"grad_norm": 0.030889399349689484,
"learning_rate": 0.0019335647256077037,
"loss": 5.0312,
"step": 619
},
{
"epoch": 0.5746061167747915,
"grad_norm": 0.028193505480885506,
"learning_rate": 0.0019264774883228286,
"loss": 5.0563,
"step": 620
},
{
"epoch": 0.5755329008341057,
"grad_norm": 0.039721377193927765,
"learning_rate": 0.0019193951129491112,
"loss": 4.9563,
"step": 621
},
{
"epoch": 0.5764596848934198,
"grad_norm": 0.0343133881688118,
"learning_rate": 0.0019123176595258306,
"loss": 5.0,
"step": 622
},
{
"epoch": 0.577386468952734,
"grad_norm": 0.03925079479813576,
"learning_rate": 0.0019052451880505472,
"loss": 5.05,
"step": 623
},
{
"epoch": 0.5783132530120482,
"grad_norm": 0.061298515647649765,
"learning_rate": 0.0018981777584785823,
"loss": 5.0,
"step": 624
},
{
"epoch": 0.5792400370713624,
"grad_norm": 0.045300450176000595,
"learning_rate": 0.0018911154307225204,
"loss": 4.975,
"step": 625
},
{
"epoch": 0.5801668211306765,
"grad_norm": 0.03320182114839554,
"learning_rate": 0.0018840582646516924,
"loss": 4.9938,
"step": 626
},
{
"epoch": 0.5810936051899908,
"grad_norm": 0.04246627911925316,
"learning_rate": 0.0018770063200916757,
"loss": 4.9625,
"step": 627
},
{
"epoch": 0.5820203892493049,
"grad_norm": 0.04181812331080437,
"learning_rate": 0.0018699596568237799,
"loss": 4.9875,
"step": 628
},
{
"epoch": 0.5829471733086191,
"grad_norm": 0.038650691509246826,
"learning_rate": 0.0018629183345845477,
"loss": 4.9625,
"step": 629
},
{
"epoch": 0.5838739573679332,
"grad_norm": 0.03198286145925522,
"learning_rate": 0.0018558824130652399,
"loss": 4.9125,
"step": 630
},
{
"epoch": 0.5848007414272475,
"grad_norm": 0.030322790145874023,
"learning_rate": 0.0018488519519113387,
"loss": 4.9563,
"step": 631
},
{
"epoch": 0.5857275254865616,
"grad_norm": 0.03637656942009926,
"learning_rate": 0.0018418270107220325,
"loss": 4.9625,
"step": 632
},
{
"epoch": 0.5866543095458758,
"grad_norm": 0.03812320902943611,
"learning_rate": 0.001834807649049719,
"loss": 4.9062,
"step": 633
},
{
"epoch": 0.58758109360519,
"grad_norm": 0.038305167108774185,
"learning_rate": 0.001827793926399495,
"loss": 4.9062,
"step": 634
},
{
"epoch": 0.5885078776645042,
"grad_norm": 0.03868838772177696,
"learning_rate": 0.0018207859022286543,
"loss": 4.95,
"step": 635
},
{
"epoch": 0.5894346617238183,
"grad_norm": 0.05012492835521698,
"learning_rate": 0.0018137836359461822,
"loss": 4.9125,
"step": 636
},
{
"epoch": 0.5903614457831325,
"grad_norm": 0.04664972424507141,
"learning_rate": 0.0018067871869122559,
"loss": 4.9188,
"step": 637
},
{
"epoch": 0.5912882298424467,
"grad_norm": 0.03777710720896721,
"learning_rate": 0.0017997966144377327,
"loss": 4.9,
"step": 638
},
{
"epoch": 0.5922150139017609,
"grad_norm": 0.04331712797284126,
"learning_rate": 0.0017928119777836581,
"loss": 4.9062,
"step": 639
},
{
"epoch": 0.593141797961075,
"grad_norm": 0.04469927027821541,
"learning_rate": 0.0017858333361607537,
"loss": 4.9188,
"step": 640
},
{
"epoch": 0.5940685820203893,
"grad_norm": 0.02936607599258423,
"learning_rate": 0.0017788607487289232,
"loss": 4.9188,
"step": 641
},
{
"epoch": 0.5949953660797034,
"grad_norm": 0.05327693372964859,
"learning_rate": 0.0017718942745967442,
"loss": 4.9437,
"step": 642
},
{
"epoch": 0.5959221501390176,
"grad_norm": 0.04499313235282898,
"learning_rate": 0.0017649339728209726,
"loss": 4.9125,
"step": 643
},
{
"epoch": 0.5968489341983317,
"grad_norm": 0.03915273770689964,
"learning_rate": 0.0017579799024060366,
"loss": 4.9437,
"step": 644
},
{
"epoch": 0.597775718257646,
"grad_norm": 0.04526703059673309,
"learning_rate": 0.0017510321223035436,
"loss": 4.9062,
"step": 645
},
{
"epoch": 0.5987025023169601,
"grad_norm": 0.05192454531788826,
"learning_rate": 0.001744090691411771,
"loss": 4.8438,
"step": 646
},
{
"epoch": 0.5996292863762743,
"grad_norm": 0.06659810990095139,
"learning_rate": 0.0017371556685751776,
"loss": 4.875,
"step": 647
},
{
"epoch": 0.6005560704355886,
"grad_norm": 0.026750769466161728,
"learning_rate": 0.0017302271125838944,
"loss": 4.8688,
"step": 648
},
{
"epoch": 0.6014828544949027,
"grad_norm": 0.05911999195814133,
"learning_rate": 0.0017233050821732344,
"loss": 4.9125,
"step": 649
},
{
"epoch": 0.6024096385542169,
"grad_norm": 0.046929407864809036,
"learning_rate": 0.0017163896360231918,
"loss": 4.8438,
"step": 650
},
{
"epoch": 0.603336422613531,
"grad_norm": 0.0461881086230278,
"learning_rate": 0.00170948083275794,
"loss": 4.8812,
"step": 651
},
{
"epoch": 0.6042632066728453,
"grad_norm": 0.041216105222702026,
"learning_rate": 0.0017025787309453443,
"loss": 4.8625,
"step": 652
},
{
"epoch": 0.6051899907321594,
"grad_norm": 0.047575026750564575,
"learning_rate": 0.001695683389096455,
"loss": 4.8625,
"step": 653
},
{
"epoch": 0.6061167747914736,
"grad_norm": 0.039946090430021286,
"learning_rate": 0.001688794865665021,
"loss": 4.8688,
"step": 654
},
{
"epoch": 0.6070435588507878,
"grad_norm": 0.03767408803105354,
"learning_rate": 0.0016819132190469843,
"loss": 4.8563,
"step": 655
},
{
"epoch": 0.607970342910102,
"grad_norm": 0.046980541199445724,
"learning_rate": 0.0016750385075799952,
"loss": 4.8438,
"step": 656
},
{
"epoch": 0.6088971269694161,
"grad_norm": 0.04574093222618103,
"learning_rate": 0.0016681707895429094,
"loss": 4.8563,
"step": 657
},
{
"epoch": 0.6098239110287303,
"grad_norm": 0.049847353249788284,
"learning_rate": 0.0016613101231552987,
"loss": 4.8563,
"step": 658
},
{
"epoch": 0.6107506950880445,
"grad_norm": 0.04778432473540306,
"learning_rate": 0.0016544565665769558,
"loss": 4.8625,
"step": 659
},
{
"epoch": 0.6116774791473587,
"grad_norm": 0.042641252279281616,
"learning_rate": 0.001647610177907403,
"loss": 4.875,
"step": 660
},
{
"epoch": 0.6126042632066728,
"grad_norm": 0.03789043426513672,
"learning_rate": 0.001640771015185395,
"loss": 4.8563,
"step": 661
},
{
"epoch": 0.6135310472659871,
"grad_norm": 0.0583447702229023,
"learning_rate": 0.0016339391363884334,
"loss": 4.7875,
"step": 662
},
{
"epoch": 0.6144578313253012,
"grad_norm": 0.047329407185316086,
"learning_rate": 0.0016271145994322693,
"loss": 4.8313,
"step": 663
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.051290351897478104,
"learning_rate": 0.0016202974621704175,
"loss": 4.8438,
"step": 664
},
{
"epoch": 0.6163113994439295,
"grad_norm": 0.04638203606009483,
"learning_rate": 0.0016134877823936607,
"loss": 4.8187,
"step": 665
},
{
"epoch": 0.6172381835032438,
"grad_norm": 0.0436415858566761,
"learning_rate": 0.0016066856178295658,
"loss": 4.8063,
"step": 666
},
{
"epoch": 0.6181649675625579,
"grad_norm": 0.05077355355024338,
"learning_rate": 0.0015998910261419874,
"loss": 4.8063,
"step": 667
},
{
"epoch": 0.6190917516218721,
"grad_norm": 0.05078209191560745,
"learning_rate": 0.0015931040649305862,
"loss": 4.8688,
"step": 668
},
{
"epoch": 0.6200185356811863,
"grad_norm": 0.06357160210609436,
"learning_rate": 0.0015863247917303337,
"loss": 4.8313,
"step": 669
},
{
"epoch": 0.6209453197405005,
"grad_norm": 0.03996184095740318,
"learning_rate": 0.0015795532640110316,
"loss": 4.8688,
"step": 670
},
{
"epoch": 0.6218721037998146,
"grad_norm": 0.05953163281083107,
"learning_rate": 0.0015727895391768176,
"loss": 4.7938,
"step": 671
},
{
"epoch": 0.6227988878591288,
"grad_norm": 0.05362982302904129,
"learning_rate": 0.0015660336745656862,
"loss": 4.7875,
"step": 672
},
{
"epoch": 0.623725671918443,
"grad_norm": 0.03395141288638115,
"learning_rate": 0.001559285727448993,
"loss": 4.7875,
"step": 673
},
{
"epoch": 0.6246524559777572,
"grad_norm": 0.06038745865225792,
"learning_rate": 0.0015525457550309802,
"loss": 4.775,
"step": 674
},
{
"epoch": 0.6255792400370713,
"grad_norm": 0.04683006927371025,
"learning_rate": 0.0015458138144482832,
"loss": 4.8625,
"step": 675
},
{
"epoch": 0.6265060240963856,
"grad_norm": 0.04466160014271736,
"learning_rate": 0.0015390899627694505,
"loss": 4.7812,
"step": 676
},
{
"epoch": 0.6274328081556997,
"grad_norm": 0.054469116032123566,
"learning_rate": 0.0015323742569944572,
"loss": 4.775,
"step": 677
},
{
"epoch": 0.6283595922150139,
"grad_norm": 0.05092649534344673,
"learning_rate": 0.001525666754054226,
"loss": 4.775,
"step": 678
},
{
"epoch": 0.629286376274328,
"grad_norm": 0.04114770516753197,
"learning_rate": 0.0015189675108101385,
"loss": 4.8063,
"step": 679
},
{
"epoch": 0.6302131603336423,
"grad_norm": 0.04045185446739197,
"learning_rate": 0.0015122765840535602,
"loss": 4.8063,
"step": 680
},
{
"epoch": 0.6311399443929564,
"grad_norm": 0.04068306088447571,
"learning_rate": 0.0015055940305053511,
"loss": 4.7688,
"step": 681
},
{
"epoch": 0.6320667284522706,
"grad_norm": 0.048991914838552475,
"learning_rate": 0.0014989199068153936,
"loss": 4.7812,
"step": 682
},
{
"epoch": 0.6329935125115848,
"grad_norm": 0.04630220681428909,
"learning_rate": 0.0014922542695621041,
"loss": 4.8313,
"step": 683
},
{
"epoch": 0.633920296570899,
"grad_norm": 0.05090312659740448,
"learning_rate": 0.0014855971752519607,
"loss": 4.75,
"step": 684
},
{
"epoch": 0.6348470806302131,
"grad_norm": 0.03676120191812515,
"learning_rate": 0.001478948680319016,
"loss": 4.775,
"step": 685
},
{
"epoch": 0.6357738646895273,
"grad_norm": 0.04959641024470329,
"learning_rate": 0.001472308841124429,
"loss": 4.8063,
"step": 686
},
{
"epoch": 0.6367006487488415,
"grad_norm": 0.04228943958878517,
"learning_rate": 0.0014656777139559754,
"loss": 4.8125,
"step": 687
},
{
"epoch": 0.6376274328081557,
"grad_norm": 0.04116208478808403,
"learning_rate": 0.001459055355027582,
"loss": 4.7562,
"step": 688
},
{
"epoch": 0.6385542168674698,
"grad_norm": 0.05446736142039299,
"learning_rate": 0.0014524418204788405,
"loss": 4.75,
"step": 689
},
{
"epoch": 0.6394810009267841,
"grad_norm": 0.04483804479241371,
"learning_rate": 0.0014458371663745402,
"loss": 4.7688,
"step": 690
},
{
"epoch": 0.6404077849860983,
"grad_norm": 0.04954027384519577,
"learning_rate": 0.0014392414487041838,
"loss": 4.6937,
"step": 691
},
{
"epoch": 0.6413345690454124,
"grad_norm": 0.043852776288986206,
"learning_rate": 0.00143265472338152,
"loss": 4.7938,
"step": 692
},
{
"epoch": 0.6422613531047267,
"grad_norm": 0.046749938279390335,
"learning_rate": 0.001426077046244068,
"loss": 4.7688,
"step": 693
},
{
"epoch": 0.6431881371640408,
"grad_norm": 0.05037090927362442,
"learning_rate": 0.0014195084730526395,
"loss": 4.7562,
"step": 694
},
{
"epoch": 0.644114921223355,
"grad_norm": 0.0452822744846344,
"learning_rate": 0.0014129490594908729,
"loss": 4.8,
"step": 695
},
{
"epoch": 0.6450417052826691,
"grad_norm": 0.03884583339095116,
"learning_rate": 0.001406398861164754,
"loss": 4.725,
"step": 696
},
{
"epoch": 0.6459684893419834,
"grad_norm": 0.04877614974975586,
"learning_rate": 0.0013998579336021535,
"loss": 4.7063,
"step": 697
},
{
"epoch": 0.6468952734012975,
"grad_norm": 0.043750159442424774,
"learning_rate": 0.0013933263322523466,
"loss": 4.7063,
"step": 698
},
{
"epoch": 0.6478220574606117,
"grad_norm": 0.047424763441085815,
"learning_rate": 0.0013868041124855508,
"loss": 4.7562,
"step": 699
},
{
"epoch": 0.6487488415199258,
"grad_norm": 0.044932421296834946,
"learning_rate": 0.0013802913295924508,
"loss": 4.6875,
"step": 700
},
{
"epoch": 0.6496756255792401,
"grad_norm": 0.03677170351147652,
"learning_rate": 0.0013737880387837348,
"loss": 4.7688,
"step": 701
},
{
"epoch": 0.6506024096385542,
"grad_norm": 0.049118272960186005,
"learning_rate": 0.0013672942951896206,
"loss": 4.7188,
"step": 702
},
{
"epoch": 0.6515291936978684,
"grad_norm": 0.06206013634800911,
"learning_rate": 0.0013608101538593964,
"loss": 4.75,
"step": 703
},
{
"epoch": 0.6524559777571826,
"grad_norm": 0.045777998864650726,
"learning_rate": 0.0013543356697609439,
"loss": 4.8063,
"step": 704
},
{
"epoch": 0.6533827618164968,
"grad_norm": 0.06643692404031754,
"learning_rate": 0.0013478708977802823,
"loss": 4.7438,
"step": 705
},
{
"epoch": 0.6543095458758109,
"grad_norm": 0.05065048485994339,
"learning_rate": 0.0013414158927210946,
"loss": 4.7375,
"step": 706
},
{
"epoch": 0.6552363299351252,
"grad_norm": 0.047690439969301224,
"learning_rate": 0.0013349707093042707,
"loss": 4.75,
"step": 707
},
{
"epoch": 0.6561631139944393,
"grad_norm": 0.05915187671780586,
"learning_rate": 0.0013285354021674361,
"loss": 4.675,
"step": 708
},
{
"epoch": 0.6570898980537535,
"grad_norm": 0.04628239572048187,
"learning_rate": 0.0013221100258644957,
"loss": 4.7375,
"step": 709
},
{
"epoch": 0.6580166821130676,
"grad_norm": 0.04324619472026825,
"learning_rate": 0.0013156946348651644,
"loss": 4.7,
"step": 710
},
{
"epoch": 0.6589434661723819,
"grad_norm": 0.048746492713689804,
"learning_rate": 0.0013092892835545123,
"loss": 4.7438,
"step": 711
},
{
"epoch": 0.659870250231696,
"grad_norm": 0.04211176931858063,
"learning_rate": 0.001302894026232497,
"loss": 4.7188,
"step": 712
},
{
"epoch": 0.6607970342910102,
"grad_norm": 0.04411826655268669,
"learning_rate": 0.0012965089171135097,
"loss": 4.7375,
"step": 713
},
{
"epoch": 0.6617238183503243,
"grad_norm": 0.049165111035108566,
"learning_rate": 0.0012901340103259097,
"loss": 4.7,
"step": 714
},
{
"epoch": 0.6626506024096386,
"grad_norm": 0.04350108280777931,
"learning_rate": 0.0012837693599115707,
"loss": 4.6813,
"step": 715
},
{
"epoch": 0.6635773864689527,
"grad_norm": 0.053538527339696884,
"learning_rate": 0.001277415019825417,
"loss": 4.7375,
"step": 716
},
{
"epoch": 0.6645041705282669,
"grad_norm": 0.03999413177371025,
"learning_rate": 0.0012710710439349739,
"loss": 4.6625,
"step": 717
},
{
"epoch": 0.6654309545875811,
"grad_norm": 0.05112524330615997,
"learning_rate": 0.0012647374860199018,
"loss": 4.7375,
"step": 718
},
{
"epoch": 0.6663577386468953,
"grad_norm": 0.03731364756822586,
"learning_rate": 0.0012584143997715486,
"loss": 4.6625,
"step": 719
},
{
"epoch": 0.6672845227062094,
"grad_norm": 0.036096684634685516,
"learning_rate": 0.0012521018387924884,
"loss": 4.7,
"step": 720
},
{
"epoch": 0.6682113067655236,
"grad_norm": 0.040185850113630295,
"learning_rate": 0.0012457998565960724,
"loss": 4.7,
"step": 721
},
{
"epoch": 0.6691380908248378,
"grad_norm": 0.03686061128973961,
"learning_rate": 0.0012395085066059686,
"loss": 4.7125,
"step": 722
},
{
"epoch": 0.670064874884152,
"grad_norm": 0.04309338331222534,
"learning_rate": 0.0012332278421557175,
"loss": 4.6875,
"step": 723
},
{
"epoch": 0.6709916589434661,
"grad_norm": 0.033990684896707535,
"learning_rate": 0.0012269579164882706,
"loss": 4.7,
"step": 724
},
{
"epoch": 0.6719184430027804,
"grad_norm": 0.06331422179937363,
"learning_rate": 0.0012206987827555469,
"loss": 4.6875,
"step": 725
},
{
"epoch": 0.6728452270620945,
"grad_norm": 0.05111413821578026,
"learning_rate": 0.0012144504940179793,
"loss": 4.6625,
"step": 726
},
{
"epoch": 0.6737720111214087,
"grad_norm": 0.039602335542440414,
"learning_rate": 0.0012082131032440616,
"loss": 4.6562,
"step": 727
},
{
"epoch": 0.6746987951807228,
"grad_norm": 0.0525193028151989,
"learning_rate": 0.0012019866633099052,
"loss": 4.6562,
"step": 728
},
{
"epoch": 0.6756255792400371,
"grad_norm": 0.04521778225898743,
"learning_rate": 0.001195771226998789,
"loss": 4.675,
"step": 729
},
{
"epoch": 0.6765523632993512,
"grad_norm": 0.042900171130895615,
"learning_rate": 0.0011895668470007067,
"loss": 4.675,
"step": 730
},
{
"epoch": 0.6774791473586654,
"grad_norm": 0.046152036637067795,
"learning_rate": 0.0011833735759119303,
"loss": 4.6375,
"step": 731
},
{
"epoch": 0.6784059314179796,
"grad_norm": 0.03777175024151802,
"learning_rate": 0.0011771914662345527,
"loss": 4.7125,
"step": 732
},
{
"epoch": 0.6793327154772938,
"grad_norm": 0.04087323322892189,
"learning_rate": 0.0011710205703760535,
"loss": 4.6875,
"step": 733
},
{
"epoch": 0.680259499536608,
"grad_norm": 0.03955033794045448,
"learning_rate": 0.0011648609406488455,
"loss": 4.6562,
"step": 734
},
{
"epoch": 0.6811862835959221,
"grad_norm": 0.030934706330299377,
"learning_rate": 0.001158712629269838,
"loss": 4.6438,
"step": 735
},
{
"epoch": 0.6821130676552364,
"grad_norm": 0.03988910838961601,
"learning_rate": 0.0011525756883599915,
"loss": 4.6438,
"step": 736
},
{
"epoch": 0.6830398517145505,
"grad_norm": 0.03788105770945549,
"learning_rate": 0.0011464501699438728,
"loss": 4.65,
"step": 737
},
{
"epoch": 0.6839666357738647,
"grad_norm": 0.04469624534249306,
"learning_rate": 0.0011403361259492218,
"loss": 4.6937,
"step": 738
},
{
"epoch": 0.6848934198331789,
"grad_norm": 0.04028180614113808,
"learning_rate": 0.001134233608206502,
"loss": 4.65,
"step": 739
},
{
"epoch": 0.6858202038924931,
"grad_norm": 0.04203322157263756,
"learning_rate": 0.0011281426684484686,
"loss": 4.65,
"step": 740
},
{
"epoch": 0.6867469879518072,
"grad_norm": 0.045880451798439026,
"learning_rate": 0.0011220633583097247,
"loss": 4.65,
"step": 741
},
{
"epoch": 0.6876737720111215,
"grad_norm": 0.0346485935151577,
"learning_rate": 0.0011159957293262886,
"loss": 4.6562,
"step": 742
},
{
"epoch": 0.6886005560704356,
"grad_norm": 0.048363398760557175,
"learning_rate": 0.0011099398329351515,
"loss": 4.6438,
"step": 743
},
{
"epoch": 0.6895273401297498,
"grad_norm": 0.0373103991150856,
"learning_rate": 0.0011038957204738465,
"loss": 4.6813,
"step": 744
},
{
"epoch": 0.6904541241890639,
"grad_norm": 0.043777722865343094,
"learning_rate": 0.001097863443180008,
"loss": 4.6688,
"step": 745
},
{
"epoch": 0.6913809082483782,
"grad_norm": 0.03708568960428238,
"learning_rate": 0.0010918430521909442,
"loss": 4.6688,
"step": 746
},
{
"epoch": 0.6923076923076923,
"grad_norm": 0.04273151233792305,
"learning_rate": 0.0010858345985431956,
"loss": 4.6312,
"step": 747
},
{
"epoch": 0.6932344763670065,
"grad_norm": 0.04535781592130661,
"learning_rate": 0.0010798381331721108,
"loss": 4.675,
"step": 748
},
{
"epoch": 0.6941612604263206,
"grad_norm": 0.03782697021961212,
"learning_rate": 0.0010738537069114062,
"loss": 4.675,
"step": 749
},
{
"epoch": 0.6950880444856349,
"grad_norm": 0.04372243955731392,
"learning_rate": 0.0010678813704927434,
"loss": 4.6625,
"step": 750
},
{
"epoch": 0.696014828544949,
"grad_norm": 0.04960807040333748,
"learning_rate": 0.0010619211745452912,
"loss": 4.6375,
"step": 751
},
{
"epoch": 0.6969416126042632,
"grad_norm": 0.040741242468357086,
"learning_rate": 0.001055973169595303,
"loss": 4.6375,
"step": 752
},
{
"epoch": 0.6978683966635774,
"grad_norm": 0.04263027384877205,
"learning_rate": 0.0010500374060656839,
"loss": 4.5938,
"step": 753
},
{
"epoch": 0.6987951807228916,
"grad_norm": 0.046234361827373505,
"learning_rate": 0.001044113934275567,
"loss": 4.6688,
"step": 754
},
{
"epoch": 0.6997219647822057,
"grad_norm": 0.03574342280626297,
"learning_rate": 0.0010382028044398823,
"loss": 4.6375,
"step": 755
},
{
"epoch": 0.70064874884152,
"grad_norm": 0.044964589178562164,
"learning_rate": 0.0010323040666689366,
"loss": 4.6312,
"step": 756
},
{
"epoch": 0.7015755329008341,
"grad_norm": 0.037156179547309875,
"learning_rate": 0.001026417770967982,
"loss": 4.6188,
"step": 757
},
{
"epoch": 0.7025023169601483,
"grad_norm": 0.046747058629989624,
"learning_rate": 0.0010205439672368,
"loss": 4.5875,
"step": 758
},
{
"epoch": 0.7034291010194624,
"grad_norm": 0.042588070034980774,
"learning_rate": 0.0010146827052692701,
"loss": 4.6125,
"step": 759
},
{
"epoch": 0.7043558850787767,
"grad_norm": 0.036094602197408676,
"learning_rate": 0.0010088340347529552,
"loss": 4.6625,
"step": 760
},
{
"epoch": 0.7052826691380908,
"grad_norm": 0.03903704881668091,
"learning_rate": 0.0010029980052686733,
"loss": 4.5875,
"step": 761
},
{
"epoch": 0.706209453197405,
"grad_norm": 0.045382946729660034,
"learning_rate": 0.0009971746662900851,
"loss": 4.6375,
"step": 762
},
{
"epoch": 0.7071362372567191,
"grad_norm": 0.04216109961271286,
"learning_rate": 0.0009913640671832663,
"loss": 4.6063,
"step": 763
},
{
"epoch": 0.7080630213160334,
"grad_norm": 0.044599149376153946,
"learning_rate": 0.0009855662572062962,
"loss": 4.625,
"step": 764
},
{
"epoch": 0.7089898053753475,
"grad_norm": 0.0511021688580513,
"learning_rate": 0.0009797812855088348,
"loss": 4.5875,
"step": 765
},
{
"epoch": 0.7099165894346617,
"grad_norm": 0.04359891265630722,
"learning_rate": 0.0009740092011317095,
"loss": 4.6688,
"step": 766
},
{
"epoch": 0.7108433734939759,
"grad_norm": 0.047334376722574234,
"learning_rate": 0.0009682500530064992,
"loss": 4.5875,
"step": 767
},
{
"epoch": 0.7117701575532901,
"grad_norm": 0.04199070855975151,
"learning_rate": 0.0009625038899551161,
"loss": 4.625,
"step": 768
},
{
"epoch": 0.7126969416126042,
"grad_norm": 0.057890091091394424,
"learning_rate": 0.0009567707606893971,
"loss": 4.6125,
"step": 769
},
{
"epoch": 0.7136237256719185,
"grad_norm": 0.04788359999656677,
"learning_rate": 0.0009510507138106853,
"loss": 4.5875,
"step": 770
},
{
"epoch": 0.7145505097312326,
"grad_norm": 0.04499724879860878,
"learning_rate": 0.0009453437978094223,
"loss": 4.5938,
"step": 771
},
{
"epoch": 0.7154772937905468,
"grad_norm": 0.04197373613715172,
"learning_rate": 0.0009396500610647368,
"loss": 4.6562,
"step": 772
},
{
"epoch": 0.7164040778498609,
"grad_norm": 0.048124760389328,
"learning_rate": 0.00093396955184403,
"loss": 4.625,
"step": 773
},
{
"epoch": 0.7173308619091752,
"grad_norm": 0.05138612538576126,
"learning_rate": 0.000928302318302573,
"loss": 4.575,
"step": 774
},
{
"epoch": 0.7182576459684893,
"grad_norm": 0.044739775359630585,
"learning_rate": 0.0009226484084830918,
"loss": 4.625,
"step": 775
},
{
"epoch": 0.7191844300278035,
"grad_norm": 0.04016095772385597,
"learning_rate": 0.0009170078703153676,
"loss": 4.6063,
"step": 776
},
{
"epoch": 0.7201112140871178,
"grad_norm": 0.05538894608616829,
"learning_rate": 0.000911380751615822,
"loss": 4.625,
"step": 777
},
{
"epoch": 0.7210379981464319,
"grad_norm": 0.04083118215203285,
"learning_rate": 0.0009057671000871195,
"loss": 4.6063,
"step": 778
},
{
"epoch": 0.7219647822057461,
"grad_norm": 0.05446457862854004,
"learning_rate": 0.0009001669633177587,
"loss": 4.575,
"step": 779
},
{
"epoch": 0.7228915662650602,
"grad_norm": 0.03577585890889168,
"learning_rate": 0.0008945803887816678,
"loss": 4.6,
"step": 780
},
{
"epoch": 0.7238183503243745,
"grad_norm": 0.04933847859501839,
"learning_rate": 0.0008890074238378073,
"loss": 4.5875,
"step": 781
},
{
"epoch": 0.7247451343836886,
"grad_norm": 0.03600107133388519,
"learning_rate": 0.0008834481157297625,
"loss": 4.5875,
"step": 782
},
{
"epoch": 0.7256719184430028,
"grad_norm": 0.05166667327284813,
"learning_rate": 0.0008779025115853482,
"loss": 4.5938,
"step": 783
},
{
"epoch": 0.726598702502317,
"grad_norm": 0.03323368355631828,
"learning_rate": 0.0008723706584162044,
"loss": 4.5563,
"step": 784
},
{
"epoch": 0.7275254865616312,
"grad_norm": 0.04717453941702843,
"learning_rate": 0.0008668526031174034,
"loss": 4.6125,
"step": 785
},
{
"epoch": 0.7284522706209453,
"grad_norm": 0.04695433750748634,
"learning_rate": 0.0008613483924670457,
"loss": 4.5875,
"step": 786
},
{
"epoch": 0.7293790546802595,
"grad_norm": 0.04457440972328186,
"learning_rate": 0.00085585807312587,
"loss": 4.6,
"step": 787
},
{
"epoch": 0.7303058387395737,
"grad_norm": 0.04753506928682327,
"learning_rate": 0.0008503816916368512,
"loss": 4.5687,
"step": 788
},
{
"epoch": 0.7312326227988879,
"grad_norm": 0.04823901131749153,
"learning_rate": 0.0008449192944248127,
"loss": 4.5625,
"step": 789
},
{
"epoch": 0.732159406858202,
"grad_norm": 0.041306272149086,
"learning_rate": 0.0008394709277960255,
"loss": 4.5563,
"step": 790
},
{
"epoch": 0.7330861909175163,
"grad_norm": 0.054446831345558167,
"learning_rate": 0.0008340366379378234,
"loss": 4.55,
"step": 791
},
{
"epoch": 0.7340129749768304,
"grad_norm": 0.03289240226149559,
"learning_rate": 0.0008286164709182031,
"loss": 4.575,
"step": 792
},
{
"epoch": 0.7349397590361446,
"grad_norm": 0.04518633335828781,
"learning_rate": 0.0008232104726854425,
"loss": 4.6,
"step": 793
},
{
"epoch": 0.7358665430954587,
"grad_norm": 0.03345628082752228,
"learning_rate": 0.0008178186890677027,
"loss": 4.55,
"step": 794
},
{
"epoch": 0.736793327154773,
"grad_norm": 0.046789661049842834,
"learning_rate": 0.0008124411657726471,
"loss": 4.575,
"step": 795
},
{
"epoch": 0.7377201112140871,
"grad_norm": 0.03443962708115578,
"learning_rate": 0.0008070779483870469,
"loss": 4.55,
"step": 796
},
{
"epoch": 0.7386468952734013,
"grad_norm": 0.04330628737807274,
"learning_rate": 0.0008017290823764014,
"loss": 4.5563,
"step": 797
},
{
"epoch": 0.7395736793327155,
"grad_norm": 0.032368697226047516,
"learning_rate": 0.0007963946130845462,
"loss": 4.5438,
"step": 798
},
{
"epoch": 0.7405004633920297,
"grad_norm": 0.04270923137664795,
"learning_rate": 0.0007910745857332749,
"loss": 4.6,
"step": 799
},
{
"epoch": 0.7414272474513438,
"grad_norm": 0.03373492881655693,
"learning_rate": 0.0007857690454219494,
"loss": 4.5687,
"step": 800
},
{
"epoch": 0.742354031510658,
"grad_norm": 0.03647404536604881,
"learning_rate": 0.0007804780371271248,
"loss": 4.5125,
"step": 801
},
{
"epoch": 0.7432808155699722,
"grad_norm": 0.037898655980825424,
"learning_rate": 0.0007752016057021596,
"loss": 4.5687,
"step": 802
},
{
"epoch": 0.7442075996292864,
"grad_norm": 0.0339631550014019,
"learning_rate": 0.0007699397958768451,
"loss": 4.575,
"step": 803
},
{
"epoch": 0.7451343836886005,
"grad_norm": 0.03792402520775795,
"learning_rate": 0.0007646926522570166,
"loss": 4.5687,
"step": 804
},
{
"epoch": 0.7460611677479148,
"grad_norm": 0.03865986317396164,
"learning_rate": 0.0007594602193241839,
"loss": 4.5312,
"step": 805
},
{
"epoch": 0.7469879518072289,
"grad_norm": 0.03740232065320015,
"learning_rate": 0.0007542425414351462,
"loss": 4.55,
"step": 806
},
{
"epoch": 0.7479147358665431,
"grad_norm": 0.03663860633969307,
"learning_rate": 0.0007490396628216237,
"loss": 4.55,
"step": 807
},
{
"epoch": 0.7488415199258572,
"grad_norm": 0.0422244630753994,
"learning_rate": 0.0007438516275898762,
"loss": 4.5563,
"step": 808
},
{
"epoch": 0.7497683039851715,
"grad_norm": 0.03552339971065521,
"learning_rate": 0.0007386784797203335,
"loss": 4.5563,
"step": 809
},
{
"epoch": 0.7506950880444856,
"grad_norm": 0.03856317326426506,
"learning_rate": 0.0007335202630672222,
"loss": 4.5188,
"step": 810
},
{
"epoch": 0.7516218721037998,
"grad_norm": 0.03579216077923775,
"learning_rate": 0.0007283770213581889,
"loss": 4.525,
"step": 811
},
{
"epoch": 0.752548656163114,
"grad_norm": 0.04030256345868111,
"learning_rate": 0.0007232487981939371,
"loss": 4.5563,
"step": 812
},
{
"epoch": 0.7534754402224282,
"grad_norm": 0.03762529417872429,
"learning_rate": 0.0007181356370478531,
"loss": 4.55,
"step": 813
},
{
"epoch": 0.7544022242817423,
"grad_norm": 0.03724801167845726,
"learning_rate": 0.0007130375812656365,
"loss": 4.5375,
"step": 814
},
{
"epoch": 0.7553290083410565,
"grad_norm": 0.03805640712380409,
"learning_rate": 0.000707954674064937,
"loss": 4.575,
"step": 815
},
{
"epoch": 0.7562557924003707,
"grad_norm": 0.0410294272005558,
"learning_rate": 0.0007028869585349828,
"loss": 4.5625,
"step": 816
},
{
"epoch": 0.7571825764596849,
"grad_norm": 0.0386902280151844,
"learning_rate": 0.0006978344776362214,
"loss": 4.5188,
"step": 817
},
{
"epoch": 0.758109360518999,
"grad_norm": 0.037720050662755966,
"learning_rate": 0.000692797274199948,
"loss": 4.55,
"step": 818
},
{
"epoch": 0.7590361445783133,
"grad_norm": 0.029812660068273544,
"learning_rate": 0.0006877753909279508,
"loss": 4.475,
"step": 819
},
{
"epoch": 0.7599629286376274,
"grad_norm": 0.04356846958398819,
"learning_rate": 0.0006827688703921406,
"loss": 4.4938,
"step": 820
},
{
"epoch": 0.7608897126969416,
"grad_norm": 0.03893793001770973,
"learning_rate": 0.0006777777550341977,
"loss": 4.5188,
"step": 821
},
{
"epoch": 0.7618164967562558,
"grad_norm": 0.0387520007789135,
"learning_rate": 0.0006728020871652046,
"loss": 4.5188,
"step": 822
},
{
"epoch": 0.76274328081557,
"grad_norm": 0.0450495183467865,
"learning_rate": 0.0006678419089652943,
"loss": 4.5438,
"step": 823
},
{
"epoch": 0.7636700648748842,
"grad_norm": 0.04003477469086647,
"learning_rate": 0.0006628972624832891,
"loss": 4.5813,
"step": 824
},
{
"epoch": 0.7645968489341983,
"grad_norm": 0.05103557929396629,
"learning_rate": 0.0006579681896363418,
"loss": 4.5188,
"step": 825
},
{
"epoch": 0.7655236329935126,
"grad_norm": 0.038706224411726,
"learning_rate": 0.000653054732209587,
"loss": 4.5188,
"step": 826
},
{
"epoch": 0.7664504170528267,
"grad_norm": 0.04914843663573265,
"learning_rate": 0.0006481569318557793,
"loss": 4.525,
"step": 827
},
{
"epoch": 0.7673772011121409,
"grad_norm": 0.03715524449944496,
"learning_rate": 0.0006432748300949476,
"loss": 4.5062,
"step": 828
},
{
"epoch": 0.768303985171455,
"grad_norm": 0.03968851640820503,
"learning_rate": 0.0006384084683140359,
"loss": 4.5563,
"step": 829
},
{
"epoch": 0.7692307692307693,
"grad_norm": 0.042003631591796875,
"learning_rate": 0.000633557887766559,
"loss": 4.5312,
"step": 830
},
{
"epoch": 0.7701575532900834,
"grad_norm": 0.04498601332306862,
"learning_rate": 0.000628723129572247,
"loss": 4.5,
"step": 831
},
{
"epoch": 0.7710843373493976,
"grad_norm": 0.039209991693496704,
"learning_rate": 0.0006239042347167026,
"loss": 4.5375,
"step": 832
},
{
"epoch": 0.7720111214087118,
"grad_norm": 0.03667667508125305,
"learning_rate": 0.0006191012440510469,
"loss": 4.5375,
"step": 833
},
{
"epoch": 0.772937905468026,
"grad_norm": 0.03756443038582802,
"learning_rate": 0.0006143141982915801,
"loss": 4.525,
"step": 834
},
{
"epoch": 0.7738646895273401,
"grad_norm": 0.03308939188718796,
"learning_rate": 0.0006095431380194299,
"loss": 4.55,
"step": 835
},
{
"epoch": 0.7747914735866543,
"grad_norm": 0.03881024196743965,
"learning_rate": 0.0006047881036802141,
"loss": 4.5375,
"step": 836
},
{
"epoch": 0.7757182576459685,
"grad_norm": 0.03667169064283371,
"learning_rate": 0.0006000491355836904,
"loss": 4.5188,
"step": 837
},
{
"epoch": 0.7766450417052827,
"grad_norm": 0.03264870494604111,
"learning_rate": 0.0005953262739034218,
"loss": 4.5188,
"step": 838
},
{
"epoch": 0.7775718257645968,
"grad_norm": 0.0369790680706501,
"learning_rate": 0.0005906195586764294,
"loss": 4.5125,
"step": 839
},
{
"epoch": 0.7784986098239111,
"grad_norm": 0.03252223879098892,
"learning_rate": 0.0005859290298028596,
"loss": 4.4813,
"step": 840
},
{
"epoch": 0.7794253938832252,
"grad_norm": 0.03256712481379509,
"learning_rate": 0.0005812547270456397,
"loss": 4.5062,
"step": 841
},
{
"epoch": 0.7803521779425394,
"grad_norm": 0.031595002859830856,
"learning_rate": 0.0005765966900301462,
"loss": 4.5188,
"step": 842
},
{
"epoch": 0.7812789620018535,
"grad_norm": 0.0356653667986393,
"learning_rate": 0.0005719549582438636,
"loss": 4.5438,
"step": 843
},
{
"epoch": 0.7822057460611678,
"grad_norm": 0.038195762783288956,
"learning_rate": 0.0005673295710360555,
"loss": 4.4875,
"step": 844
},
{
"epoch": 0.7831325301204819,
"grad_norm": 0.02905537374317646,
"learning_rate": 0.0005627205676174244,
"loss": 4.525,
"step": 845
},
{
"epoch": 0.7840593141797961,
"grad_norm": 0.03345280513167381,
"learning_rate": 0.0005581279870597866,
"loss": 4.4938,
"step": 846
},
{
"epoch": 0.7849860982391103,
"grad_norm": 0.034679800271987915,
"learning_rate": 0.0005535518682957341,
"loss": 4.4938,
"step": 847
},
{
"epoch": 0.7859128822984245,
"grad_norm": 0.03583706170320511,
"learning_rate": 0.0005489922501183095,
"loss": 4.5188,
"step": 848
},
{
"epoch": 0.7868396663577386,
"grad_norm": 0.032523263245821,
"learning_rate": 0.000544449171180674,
"loss": 4.4938,
"step": 849
},
{
"epoch": 0.7877664504170528,
"grad_norm": 0.03378100320696831,
"learning_rate": 0.0005399226699957821,
"loss": 4.5062,
"step": 850
},
{
"epoch": 0.788693234476367,
"grad_norm": 0.03234217315912247,
"learning_rate": 0.0005354127849360543,
"loss": 4.45,
"step": 851
},
{
"epoch": 0.7896200185356812,
"grad_norm": 0.03637991473078728,
"learning_rate": 0.0005309195542330497,
"loss": 4.5188,
"step": 852
},
{
"epoch": 0.7905468025949953,
"grad_norm": 0.03120928816497326,
"learning_rate": 0.0005264430159771455,
"loss": 4.5,
"step": 853
},
{
"epoch": 0.7914735866543096,
"grad_norm": 0.03429511934518814,
"learning_rate": 0.0005219832081172124,
"loss": 4.5312,
"step": 854
},
{
"epoch": 0.7924003707136237,
"grad_norm": 0.029146216809749603,
"learning_rate": 0.0005175401684602912,
"loss": 4.4938,
"step": 855
},
{
"epoch": 0.7933271547729379,
"grad_norm": 0.029695888981223106,
"learning_rate": 0.0005131139346712758,
"loss": 4.4875,
"step": 856
},
{
"epoch": 0.794253938832252,
"grad_norm": 0.03263707831501961,
"learning_rate": 0.0005087045442725904,
"loss": 4.5312,
"step": 857
},
{
"epoch": 0.7951807228915663,
"grad_norm": 0.028736894950270653,
"learning_rate": 0.0005043120346438748,
"loss": 4.525,
"step": 858
},
{
"epoch": 0.7961075069508804,
"grad_norm": 0.030789796262979507,
"learning_rate": 0.0004999364430216638,
"loss": 4.5,
"step": 859
},
{
"epoch": 0.7970342910101946,
"grad_norm": 0.04033099114894867,
"learning_rate": 0.0004955778064990757,
"loss": 4.5125,
"step": 860
},
{
"epoch": 0.7979610750695088,
"grad_norm": 0.03556600585579872,
"learning_rate": 0.0004912361620254932,
"loss": 4.4813,
"step": 861
},
{
"epoch": 0.798887859128823,
"grad_norm": 0.031120220199227333,
"learning_rate": 0.00048691154640625566,
"loss": 4.4688,
"step": 862
},
{
"epoch": 0.7998146431881371,
"grad_norm": 0.03250223025679588,
"learning_rate": 0.0004826039963023407,
"loss": 4.4688,
"step": 863
},
{
"epoch": 0.8007414272474513,
"grad_norm": 0.029799439013004303,
"learning_rate": 0.0004783135482300596,
"loss": 4.4875,
"step": 864
},
{
"epoch": 0.8016682113067656,
"grad_norm": 0.030422599986195564,
"learning_rate": 0.0004740402385607431,
"loss": 4.4813,
"step": 865
},
{
"epoch": 0.8025949953660797,
"grad_norm": 0.029015803709626198,
"learning_rate": 0.0004697841035204356,
"loss": 4.4938,
"step": 866
},
{
"epoch": 0.8035217794253939,
"grad_norm": 0.031820014119148254,
"learning_rate": 0.00046554517918958845,
"loss": 4.5062,
"step": 867
},
{
"epoch": 0.8044485634847081,
"grad_norm": 0.03146743401885033,
"learning_rate": 0.00046132350150275005,
"loss": 4.475,
"step": 868
},
{
"epoch": 0.8053753475440223,
"grad_norm": 0.02848106250166893,
"learning_rate": 0.0004571191062482677,
"loss": 4.4875,
"step": 869
},
{
"epoch": 0.8063021316033364,
"grad_norm": 0.031561560928821564,
"learning_rate": 0.00045293202906797754,
"loss": 4.4875,
"step": 870
},
{
"epoch": 0.8072289156626506,
"grad_norm": 0.031885311007499695,
"learning_rate": 0.0004487623054569084,
"loss": 4.5062,
"step": 871
},
{
"epoch": 0.8081556997219648,
"grad_norm": 0.03388173505663872,
"learning_rate": 0.000444609970762975,
"loss": 4.4813,
"step": 872
},
{
"epoch": 0.809082483781279,
"grad_norm": 0.03390287980437279,
"learning_rate": 0.00044047506018668415,
"loss": 4.5,
"step": 873
},
{
"epoch": 0.8100092678405931,
"grad_norm": 0.032265473157167435,
"learning_rate": 0.0004363576087808313,
"loss": 4.4938,
"step": 874
},
{
"epoch": 0.8109360518999074,
"grad_norm": 0.03563728928565979,
"learning_rate": 0.00043225765145020803,
"loss": 4.5188,
"step": 875
},
{
"epoch": 0.8118628359592215,
"grad_norm": 0.03663501888513565,
"learning_rate": 0.0004281752229513006,
"loss": 4.5188,
"step": 876
},
{
"epoch": 0.8127896200185357,
"grad_norm": 0.03167020156979561,
"learning_rate": 0.00042411035789200163,
"loss": 4.4875,
"step": 877
},
{
"epoch": 0.8137164040778498,
"grad_norm": 0.03226330131292343,
"learning_rate": 0.0004200630907313108,
"loss": 4.5062,
"step": 878
},
{
"epoch": 0.8146431881371641,
"grad_norm": 0.029977647587656975,
"learning_rate": 0.00041603345577904824,
"loss": 4.4688,
"step": 879
},
{
"epoch": 0.8155699721964782,
"grad_norm": 0.03339603170752525,
"learning_rate": 0.0004120214871955577,
"loss": 4.5125,
"step": 880
},
{
"epoch": 0.8164967562557924,
"grad_norm": 0.031077727675437927,
"learning_rate": 0.00040802721899142356,
"loss": 4.4938,
"step": 881
},
{
"epoch": 0.8174235403151066,
"grad_norm": 0.02900145947933197,
"learning_rate": 0.0004040506850271761,
"loss": 4.4375,
"step": 882
},
{
"epoch": 0.8183503243744208,
"grad_norm": 0.029496431350708008,
"learning_rate": 0.00040009191901301005,
"loss": 4.4625,
"step": 883
},
{
"epoch": 0.8192771084337349,
"grad_norm": 0.02934381552040577,
"learning_rate": 0.00039615095450849374,
"loss": 4.5062,
"step": 884
},
{
"epoch": 0.8202038924930491,
"grad_norm": 0.030950119718909264,
"learning_rate": 0.00039222782492228937,
"loss": 4.5,
"step": 885
},
{
"epoch": 0.8211306765523633,
"grad_norm": 0.029751867055892944,
"learning_rate": 0.0003883225635118659,
"loss": 4.4625,
"step": 886
},
{
"epoch": 0.8220574606116775,
"grad_norm": 0.026806732639670372,
"learning_rate": 0.0003844352033832199,
"loss": 4.5125,
"step": 887
},
{
"epoch": 0.8229842446709916,
"grad_norm": 0.03083191066980362,
"learning_rate": 0.00038056577749059266,
"loss": 4.4688,
"step": 888
},
{
"epoch": 0.8239110287303059,
"grad_norm": 0.034451741725206375,
"learning_rate": 0.0003767143186361935,
"loss": 4.4563,
"step": 889
},
{
"epoch": 0.82483781278962,
"grad_norm": 0.030912496149539948,
"learning_rate": 0.0003728808594699179,
"loss": 4.475,
"step": 890
},
{
"epoch": 0.8257645968489342,
"grad_norm": 0.03567620739340782,
"learning_rate": 0.00036906543248907495,
"loss": 4.4938,
"step": 891
},
{
"epoch": 0.8266913809082483,
"grad_norm": 0.03392716869711876,
"learning_rate": 0.0003652680700381092,
"loss": 4.45,
"step": 892
},
{
"epoch": 0.8276181649675626,
"grad_norm": 0.032731059938669205,
"learning_rate": 0.0003614888043083264,
"loss": 4.4875,
"step": 893
},
{
"epoch": 0.8285449490268767,
"grad_norm": 0.035781849175691605,
"learning_rate": 0.00035772766733762284,
"loss": 4.4625,
"step": 894
},
{
"epoch": 0.8294717330861909,
"grad_norm": 0.02696853317320347,
"learning_rate": 0.00035398469101020983,
"loss": 4.4688,
"step": 895
},
{
"epoch": 0.830398517145505,
"grad_norm": 0.033876750618219376,
"learning_rate": 0.00035025990705634833,
"loss": 4.5,
"step": 896
},
{
"epoch": 0.8313253012048193,
"grad_norm": 0.03308440372347832,
"learning_rate": 0.0003465533470520768,
"loss": 4.5125,
"step": 897
},
{
"epoch": 0.8322520852641334,
"grad_norm": 0.0284098070114851,
"learning_rate": 0.0003428650424189428,
"loss": 4.5,
"step": 898
},
{
"epoch": 0.8331788693234476,
"grad_norm": 0.0362527072429657,
"learning_rate": 0.0003391950244237396,
"loss": 4.4813,
"step": 899
},
{
"epoch": 0.8341056533827618,
"grad_norm": 0.03239575773477554,
"learning_rate": 0.0003355433241782385,
"loss": 4.4437,
"step": 900
},
{
"epoch": 0.835032437442076,
"grad_norm": 0.028916817158460617,
"learning_rate": 0.00033190997263892683,
"loss": 4.5062,
"step": 901
},
{
"epoch": 0.8359592215013901,
"grad_norm": 0.037763047963380814,
"learning_rate": 0.0003282950006067439,
"loss": 4.475,
"step": 902
},
{
"epoch": 0.8368860055607044,
"grad_norm": 0.03783184662461281,
"learning_rate": 0.000324698438726822,
"loss": 4.4375,
"step": 903
},
{
"epoch": 0.8378127896200185,
"grad_norm": 0.03236427158117294,
"learning_rate": 0.00032112031748822407,
"loss": 4.425,
"step": 904
},
{
"epoch": 0.8387395736793327,
"grad_norm": 0.031087512150406837,
"learning_rate": 0.00031756066722368775,
"loss": 4.4875,
"step": 905
},
{
"epoch": 0.8396663577386468,
"grad_norm": 0.02958965301513672,
"learning_rate": 0.0003140195181093658,
"loss": 4.475,
"step": 906
},
{
"epoch": 0.8405931417979611,
"grad_norm": 0.028066281229257584,
"learning_rate": 0.0003104969001645735,
"loss": 4.4563,
"step": 907
},
{
"epoch": 0.8415199258572753,
"grad_norm": 0.030324235558509827,
"learning_rate": 0.00030699284325152955,
"loss": 4.4437,
"step": 908
},
{
"epoch": 0.8424467099165894,
"grad_norm": 0.03359181433916092,
"learning_rate": 0.00030350737707510764,
"loss": 4.4813,
"step": 909
},
{
"epoch": 0.8433734939759037,
"grad_norm": 0.02781173586845398,
"learning_rate": 0.0003000405311825824,
"loss": 4.4437,
"step": 910
},
{
"epoch": 0.8443002780352178,
"grad_norm": 0.03504948690533638,
"learning_rate": 0.0002965923349633778,
"loss": 4.45,
"step": 911
},
{
"epoch": 0.845227062094532,
"grad_norm": 0.03041827119886875,
"learning_rate": 0.00029316281764882074,
"loss": 4.4563,
"step": 912
},
{
"epoch": 0.8461538461538461,
"grad_norm": 0.03221605718135834,
"learning_rate": 0.00028975200831189067,
"loss": 4.475,
"step": 913
},
{
"epoch": 0.8470806302131604,
"grad_norm": 0.03199669346213341,
"learning_rate": 0.0002863599358669755,
"loss": 4.4313,
"step": 914
},
{
"epoch": 0.8480074142724745,
"grad_norm": 0.030510928481817245,
"learning_rate": 0.0002829866290696234,
"loss": 4.4,
"step": 915
},
{
"epoch": 0.8489341983317887,
"grad_norm": 0.02957424893975258,
"learning_rate": 0.0002796321165163032,
"loss": 4.5062,
"step": 916
},
{
"epoch": 0.8498609823911029,
"grad_norm": 0.0366031751036644,
"learning_rate": 0.0002762964266441578,
"loss": 4.4313,
"step": 917
},
{
"epoch": 0.8507877664504171,
"grad_norm": 0.03369331359863281,
"learning_rate": 0.0002729795877307659,
"loss": 4.4437,
"step": 918
},
{
"epoch": 0.8517145505097312,
"grad_norm": 0.03299278765916824,
"learning_rate": 0.00026968162789390074,
"loss": 4.4313,
"step": 919
},
{
"epoch": 0.8526413345690455,
"grad_norm": 0.03193372115492821,
"learning_rate": 0.0002664025750912932,
"loss": 4.4625,
"step": 920
},
{
"epoch": 0.8535681186283596,
"grad_norm": 0.029631877318024635,
"learning_rate": 0.00026314245712039276,
"loss": 4.4375,
"step": 921
},
{
"epoch": 0.8544949026876738,
"grad_norm": 0.03459390997886658,
"learning_rate": 0.00025990130161813427,
"loss": 4.4688,
"step": 922
},
{
"epoch": 0.8554216867469879,
"grad_norm": 0.0364365316927433,
"learning_rate": 0.00025667913606070095,
"loss": 4.4625,
"step": 923
},
{
"epoch": 0.8563484708063022,
"grad_norm": 0.0323617160320282,
"learning_rate": 0.000253475987763295,
"loss": 4.425,
"step": 924
},
{
"epoch": 0.8572752548656163,
"grad_norm": 0.02805604226887226,
"learning_rate": 0.0002502918838799015,
"loss": 4.4813,
"step": 925
},
{
"epoch": 0.8582020389249305,
"grad_norm": 0.033434659242630005,
"learning_rate": 0.0002471268514030628,
"loss": 4.425,
"step": 926
},
{
"epoch": 0.8591288229842446,
"grad_norm": 0.03157290443778038,
"learning_rate": 0.00024398091716364617,
"loss": 4.4313,
"step": 927
},
{
"epoch": 0.8600556070435589,
"grad_norm": 0.029048243537545204,
"learning_rate": 0.00024085410783061895,
"loss": 4.4625,
"step": 928
},
{
"epoch": 0.860982391102873,
"grad_norm": 0.0280530396848917,
"learning_rate": 0.00023774644991081978,
"loss": 4.4125,
"step": 929
},
{
"epoch": 0.8619091751621872,
"grad_norm": 0.03451543301343918,
"learning_rate": 0.00023465796974873722,
"loss": 4.4875,
"step": 930
},
{
"epoch": 0.8628359592215014,
"grad_norm": 0.030910175293684006,
"learning_rate": 0.00023158869352628286,
"loss": 4.45,
"step": 931
},
{
"epoch": 0.8637627432808156,
"grad_norm": 0.03156379237771034,
"learning_rate": 0.00022853864726257307,
"loss": 4.4125,
"step": 932
},
{
"epoch": 0.8646895273401297,
"grad_norm": 0.03295775502920151,
"learning_rate": 0.00022550785681370368,
"loss": 4.4313,
"step": 933
},
{
"epoch": 0.865616311399444,
"grad_norm": 0.026067038998007774,
"learning_rate": 0.00022249634787253615,
"loss": 4.45,
"step": 934
},
{
"epoch": 0.8665430954587581,
"grad_norm": 0.02678762935101986,
"learning_rate": 0.00021950414596847684,
"loss": 4.4563,
"step": 935
},
{
"epoch": 0.8674698795180723,
"grad_norm": 0.028849739581346512,
"learning_rate": 0.0002165312764672589,
"loss": 4.4437,
"step": 936
},
{
"epoch": 0.8683966635773864,
"grad_norm": 0.03232532739639282,
"learning_rate": 0.0002135777645707318,
"loss": 4.4,
"step": 937
},
{
"epoch": 0.8693234476367007,
"grad_norm": 0.027282997965812683,
"learning_rate": 0.0002106436353166441,
"loss": 4.4625,
"step": 938
},
{
"epoch": 0.8702502316960148,
"grad_norm": 0.026645608246326447,
"learning_rate": 0.0002077289135784316,
"loss": 4.4437,
"step": 939
},
{
"epoch": 0.871177015755329,
"grad_norm": 0.02711557038128376,
"learning_rate": 0.00020483362406500838,
"loss": 4.4313,
"step": 940
},
{
"epoch": 0.8721037998146431,
"grad_norm": 0.030816104263067245,
"learning_rate": 0.0002019577913205553,
"loss": 4.4625,
"step": 941
},
{
"epoch": 0.8730305838739574,
"grad_norm": 0.026929127052426338,
"learning_rate": 0.00019910143972431323,
"loss": 4.4313,
"step": 942
},
{
"epoch": 0.8739573679332715,
"grad_norm": 0.028096897527575493,
"learning_rate": 0.0001962645934903748,
"loss": 4.4875,
"step": 943
},
{
"epoch": 0.8748841519925857,
"grad_norm": 0.029124116525053978,
"learning_rate": 0.00019344727666748218,
"loss": 4.4563,
"step": 944
},
{
"epoch": 0.8758109360518999,
"grad_norm": 0.027243295684456825,
"learning_rate": 0.00019064951313881918,
"loss": 4.4375,
"step": 945
},
{
"epoch": 0.8767377201112141,
"grad_norm": 0.028546737506985664,
"learning_rate": 0.00018787132662181238,
"loss": 4.45,
"step": 946
},
{
"epoch": 0.8776645041705282,
"grad_norm": 0.026934707537293434,
"learning_rate": 0.00018511274066792733,
"loss": 4.425,
"step": 947
},
{
"epoch": 0.8785912882298424,
"grad_norm": 0.03399607166647911,
"learning_rate": 0.00018237377866247157,
"loss": 4.4563,
"step": 948
},
{
"epoch": 0.8795180722891566,
"grad_norm": 0.02882063016295433,
"learning_rate": 0.000179654463824393,
"loss": 4.4688,
"step": 949
},
{
"epoch": 0.8804448563484708,
"grad_norm": 0.026831530034542084,
"learning_rate": 0.00017695481920608713,
"loss": 4.4188,
"step": 950
},
{
"epoch": 0.881371640407785,
"grad_norm": 0.029771380126476288,
"learning_rate": 0.00017427486769319738,
"loss": 4.4062,
"step": 951
},
{
"epoch": 0.8822984244670992,
"grad_norm": 0.025736462324857712,
"learning_rate": 0.00017161463200442484,
"loss": 4.4125,
"step": 952
},
{
"epoch": 0.8832252085264134,
"grad_norm": 0.027890045195817947,
"learning_rate": 0.0001689741346913337,
"loss": 4.4625,
"step": 953
},
{
"epoch": 0.8841519925857275,
"grad_norm": 0.028950916603207588,
"learning_rate": 0.0001663533981381593,
"loss": 4.4375,
"step": 954
},
{
"epoch": 0.8850787766450418,
"grad_norm": 0.029823975637555122,
"learning_rate": 0.00016375244456162119,
"loss": 4.4688,
"step": 955
},
{
"epoch": 0.8860055607043559,
"grad_norm": 0.02855784259736538,
"learning_rate": 0.00016117129601073116,
"loss": 4.4563,
"step": 956
},
{
"epoch": 0.8869323447636701,
"grad_norm": 0.026093894615769386,
"learning_rate": 0.00015860997436661028,
"loss": 4.4875,
"step": 957
},
{
"epoch": 0.8878591288229842,
"grad_norm": 0.02811110019683838,
"learning_rate": 0.00015606850134229966,
"loss": 4.4375,
"step": 958
},
{
"epoch": 0.8887859128822985,
"grad_norm": 0.027288252487778664,
"learning_rate": 0.00015354689848257942,
"loss": 4.4188,
"step": 959
},
{
"epoch": 0.8897126969416126,
"grad_norm": 0.02676665410399437,
"learning_rate": 0.0001510451871637833,
"loss": 4.4188,
"step": 960
},
{
"epoch": 0.8906394810009268,
"grad_norm": 0.03431456908583641,
"learning_rate": 0.00014856338859362052,
"loss": 4.4188,
"step": 961
},
{
"epoch": 0.891566265060241,
"grad_norm": 0.026652604341506958,
"learning_rate": 0.0001461015238109925,
"loss": 4.375,
"step": 962
},
{
"epoch": 0.8924930491195552,
"grad_norm": 0.032444290816783905,
"learning_rate": 0.00014365961368581842,
"loss": 4.4313,
"step": 963
},
{
"epoch": 0.8934198331788693,
"grad_norm": 0.02602170594036579,
"learning_rate": 0.00014123767891885435,
"loss": 4.375,
"step": 964
},
{
"epoch": 0.8943466172381835,
"grad_norm": 0.026148205623030663,
"learning_rate": 0.00013883574004152106,
"loss": 4.425,
"step": 965
},
{
"epoch": 0.8952734012974977,
"grad_norm": 0.028608886525034904,
"learning_rate": 0.0001364538174157273,
"loss": 4.3812,
"step": 966
},
{
"epoch": 0.8962001853568119,
"grad_norm": 0.026529457420110703,
"learning_rate": 0.00013409193123369996,
"loss": 4.3812,
"step": 967
},
{
"epoch": 0.897126969416126,
"grad_norm": 0.029828151687979698,
"learning_rate": 0.00013175010151780965,
"loss": 4.4188,
"step": 968
},
{
"epoch": 0.8980537534754403,
"grad_norm": 0.03368750587105751,
"learning_rate": 0.0001294283481204042,
"loss": 4.4313,
"step": 969
},
{
"epoch": 0.8989805375347544,
"grad_norm": 0.02840586192905903,
"learning_rate": 0.00012712669072363763,
"loss": 4.4375,
"step": 970
},
{
"epoch": 0.8999073215940686,
"grad_norm": 0.030109241604804993,
"learning_rate": 0.0001248451488393057,
"loss": 4.4125,
"step": 971
},
{
"epoch": 0.9008341056533827,
"grad_norm": 0.028758615255355835,
"learning_rate": 0.00012258374180867837,
"loss": 4.45,
"step": 972
},
{
"epoch": 0.901760889712697,
"grad_norm": 0.02661893516778946,
"learning_rate": 0.00012034248880233744,
"loss": 4.4813,
"step": 973
},
{
"epoch": 0.9026876737720111,
"grad_norm": 0.02796340361237526,
"learning_rate": 0.00011812140882001277,
"loss": 4.45,
"step": 974
},
{
"epoch": 0.9036144578313253,
"grad_norm": 0.024077627807855606,
"learning_rate": 0.00011592052069042208,
"loss": 4.4625,
"step": 975
},
{
"epoch": 0.9045412418906394,
"grad_norm": 0.02510063722729683,
"learning_rate": 0.00011373984307111229,
"loss": 4.4188,
"step": 976
},
{
"epoch": 0.9054680259499537,
"grad_norm": 0.02504696324467659,
"learning_rate": 0.00011157939444829762,
"loss": 4.4437,
"step": 977
},
{
"epoch": 0.9063948100092678,
"grad_norm": 0.026624388992786407,
"learning_rate": 0.0001094391931367078,
"loss": 4.4563,
"step": 978
},
{
"epoch": 0.907321594068582,
"grad_norm": 0.02774794027209282,
"learning_rate": 0.00010731925727942932,
"loss": 4.4313,
"step": 979
},
{
"epoch": 0.9082483781278962,
"grad_norm": 0.027720240876078606,
"learning_rate": 0.00010521960484775273,
"loss": 4.425,
"step": 980
},
{
"epoch": 0.9091751621872104,
"grad_norm": 0.0258037019520998,
"learning_rate": 0.00010314025364102087,
"loss": 4.425,
"step": 981
},
{
"epoch": 0.9101019462465245,
"grad_norm": 0.031181413680315018,
"learning_rate": 0.00010108122128647645,
"loss": 4.425,
"step": 982
},
{
"epoch": 0.9110287303058388,
"grad_norm": 0.026958808302879333,
"learning_rate": 9.904252523911473e-05,
"loss": 4.425,
"step": 983
},
{
"epoch": 0.9119555143651529,
"grad_norm": 0.0251258946955204,
"learning_rate": 9.702418278153296e-05,
"loss": 4.3938,
"step": 984
},
{
"epoch": 0.9128822984244671,
"grad_norm": 0.026582978665828705,
"learning_rate": 9.502621102378706e-05,
"loss": 4.4062,
"step": 985
},
{
"epoch": 0.9138090824837812,
"grad_norm": 0.028273189440369606,
"learning_rate": 9.304862690324295e-05,
"loss": 4.4,
"step": 986
},
{
"epoch": 0.9147358665430955,
"grad_norm": 0.02678096853196621,
"learning_rate": 9.109144718443679e-05,
"loss": 4.4125,
"step": 987
},
{
"epoch": 0.9156626506024096,
"grad_norm": 0.024335335940122604,
"learning_rate": 8.915468845892894e-05,
"loss": 4.4125,
"step": 988
},
{
"epoch": 0.9165894346617238,
"grad_norm": 0.02453056164085865,
"learning_rate": 8.72383671451668e-05,
"loss": 4.4062,
"step": 989
},
{
"epoch": 0.917516218721038,
"grad_norm": 0.025096192955970764,
"learning_rate": 8.534249948834311e-05,
"loss": 4.4437,
"step": 990
},
{
"epoch": 0.9184430027803522,
"grad_norm": 0.025366991758346558,
"learning_rate": 8.346710156026033e-05,
"loss": 4.4062,
"step": 991
},
{
"epoch": 0.9193697868396663,
"grad_norm": 0.02832290157675743,
"learning_rate": 8.161218925919172e-05,
"loss": 4.4625,
"step": 992
},
{
"epoch": 0.9202965708989805,
"grad_norm": 0.027890915051102638,
"learning_rate": 7.977777830974947e-05,
"loss": 4.4375,
"step": 993
},
{
"epoch": 0.9212233549582948,
"grad_norm": 0.027829816564917564,
"learning_rate": 7.796388426274947e-05,
"loss": 4.45,
"step": 994
},
{
"epoch": 0.9221501390176089,
"grad_norm": 0.02420070953667164,
"learning_rate": 7.61705224950801e-05,
"loss": 4.4313,
"step": 995
},
{
"epoch": 0.9230769230769231,
"grad_norm": 0.024921340867877007,
"learning_rate": 7.43977082095726e-05,
"loss": 4.4125,
"step": 996
},
{
"epoch": 0.9240037071362373,
"grad_norm": 0.02533474750816822,
"learning_rate": 7.264545643486997e-05,
"loss": 4.4062,
"step": 997
},
{
"epoch": 0.9249304911955515,
"grad_norm": 0.02694832719862461,
"learning_rate": 7.091378202530224e-05,
"loss": 4.375,
"step": 998
},
{
"epoch": 0.9258572752548656,
"grad_norm": 0.024787478148937225,
"learning_rate": 6.920269966075893e-05,
"loss": 4.4125,
"step": 999
},
{
"epoch": 0.9267840593141798,
"grad_norm": 0.02519523911178112,
"learning_rate": 6.751222384656502e-05,
"loss": 4.425,
"step": 1000
},
{
"epoch": 0.927710843373494,
"grad_norm": 0.0249481238424778,
"learning_rate": 6.584236891335804e-05,
"loss": 4.45,
"step": 1001
},
{
"epoch": 0.9286376274328082,
"grad_norm": 0.027095666155219078,
"learning_rate": 6.419314901696671e-05,
"loss": 4.4125,
"step": 1002
},
{
"epoch": 0.9295644114921223,
"grad_norm": 0.026183003559708595,
"learning_rate": 6.256457813828997e-05,
"loss": 4.3938,
"step": 1003
},
{
"epoch": 0.9304911955514366,
"grad_norm": 0.025982800871133804,
"learning_rate": 6.095667008318068e-05,
"loss": 4.4062,
"step": 1004
},
{
"epoch": 0.9314179796107507,
"grad_norm": 0.027629397809505463,
"learning_rate": 5.936943848232568e-05,
"loss": 4.4625,
"step": 1005
},
{
"epoch": 0.9323447636700649,
"grad_norm": 0.02437759004533291,
"learning_rate": 5.78028967911326e-05,
"loss": 4.425,
"step": 1006
},
{
"epoch": 0.933271547729379,
"grad_norm": 0.024311203509569168,
"learning_rate": 5.625705828961436e-05,
"loss": 4.4375,
"step": 1007
},
{
"epoch": 0.9341983317886933,
"grad_norm": 0.024223096668720245,
"learning_rate": 5.473193608227789e-05,
"loss": 4.4062,
"step": 1008
},
{
"epoch": 0.9351251158480074,
"grad_norm": 0.023723123595118523,
"learning_rate": 5.322754309801115e-05,
"loss": 4.45,
"step": 1009
},
{
"epoch": 0.9360518999073216,
"grad_norm": 0.02314998209476471,
"learning_rate": 5.174389208997598e-05,
"loss": 4.4188,
"step": 1010
},
{
"epoch": 0.9369786839666358,
"grad_norm": 0.028589608147740364,
"learning_rate": 5.0280995635497705e-05,
"loss": 4.4375,
"step": 1011
},
{
"epoch": 0.93790546802595,
"grad_norm": 0.023467648774385452,
"learning_rate": 4.883886613595984e-05,
"loss": 4.3938,
"step": 1012
},
{
"epoch": 0.9388322520852641,
"grad_norm": 0.025684082880616188,
"learning_rate": 4.74175158166984e-05,
"loss": 4.4188,
"step": 1013
},
{
"epoch": 0.9397590361445783,
"grad_norm": 0.028895532712340355,
"learning_rate": 4.601695672689921e-05,
"loss": 4.4375,
"step": 1014
},
{
"epoch": 0.9406858202038925,
"grad_norm": 0.02598528377711773,
"learning_rate": 4.463720073949351e-05,
"loss": 4.4375,
"step": 1015
},
{
"epoch": 0.9416126042632067,
"grad_norm": 0.025186927989125252,
"learning_rate": 4.3278259551060015e-05,
"loss": 4.4188,
"step": 1016
},
{
"epoch": 0.9425393883225208,
"grad_norm": 0.02664157934486866,
"learning_rate": 4.194014468172469e-05,
"loss": 4.4313,
"step": 1017
},
{
"epoch": 0.943466172381835,
"grad_norm": 0.02440650388598442,
"learning_rate": 4.062286747506222e-05,
"loss": 4.45,
"step": 1018
},
{
"epoch": 0.9443929564411492,
"grad_norm": 0.022903352975845337,
"learning_rate": 3.932643909800082e-05,
"loss": 4.3875,
"step": 1019
},
{
"epoch": 0.9453197405004634,
"grad_norm": 0.024947639554739,
"learning_rate": 3.805087054072731e-05,
"loss": 4.4375,
"step": 1020
},
{
"epoch": 0.9462465245597775,
"grad_norm": 0.027093123644590378,
"learning_rate": 3.6796172616594126e-05,
"loss": 4.4188,
"step": 1021
},
{
"epoch": 0.9471733086190918,
"grad_norm": 0.023427557200193405,
"learning_rate": 3.5562355962027726e-05,
"loss": 4.4625,
"step": 1022
},
{
"epoch": 0.9481000926784059,
"grad_norm": 0.02435910701751709,
"learning_rate": 3.434943103643728e-05,
"loss": 4.4188,
"step": 1023
},
{
"epoch": 0.9490268767377201,
"grad_norm": 0.025206558406352997,
"learning_rate": 3.315740812212781e-05,
"loss": 4.4062,
"step": 1024
},
{
"epoch": 0.9499536607970342,
"grad_norm": 0.024215737357735634,
"learning_rate": 3.198629732421188e-05,
"loss": 4.4,
"step": 1025
},
{
"epoch": 0.9508804448563485,
"grad_norm": 0.022633830085396767,
"learning_rate": 3.0836108570524154e-05,
"loss": 4.4062,
"step": 1026
},
{
"epoch": 0.9518072289156626,
"grad_norm": 0.024218518286943436,
"learning_rate": 2.9706851611537023e-05,
"loss": 4.4938,
"step": 1027
},
{
"epoch": 0.9527340129749768,
"grad_norm": 0.023550162091851234,
"learning_rate": 2.8598536020278676e-05,
"loss": 4.4,
"step": 1028
},
{
"epoch": 0.953660797034291,
"grad_norm": 0.024799218401312828,
"learning_rate": 2.7511171192250718e-05,
"loss": 4.4375,
"step": 1029
},
{
"epoch": 0.9545875810936052,
"grad_norm": 0.025713039562106133,
"learning_rate": 2.6444766345350425e-05,
"loss": 4.4062,
"step": 1030
},
{
"epoch": 0.9555143651529193,
"grad_norm": 0.024386629462242126,
"learning_rate": 2.539933051978971e-05,
"loss": 4.4188,
"step": 1031
},
{
"epoch": 0.9564411492122336,
"grad_norm": 0.025705767795443535,
"learning_rate": 2.43748725780224e-05,
"loss": 4.375,
"step": 1032
},
{
"epoch": 0.9573679332715477,
"grad_norm": 0.026646282523870468,
"learning_rate": 2.3371401204664577e-05,
"loss": 4.45,
"step": 1033
},
{
"epoch": 0.9582947173308619,
"grad_norm": 0.025327732786536217,
"learning_rate": 2.238892490642547e-05,
"loss": 4.4437,
"step": 1034
},
{
"epoch": 0.959221501390176,
"grad_norm": 0.024950072169303894,
"learning_rate": 2.142745201203139e-05,
"loss": 4.45,
"step": 1035
},
{
"epoch": 0.9601482854494903,
"grad_norm": 0.023224515840411186,
"learning_rate": 2.048699067215831e-05,
"loss": 4.4125,
"step": 1036
},
{
"epoch": 0.9610750695088045,
"grad_norm": 0.024536075070500374,
"learning_rate": 1.9567548859359963e-05,
"loss": 4.45,
"step": 1037
},
{
"epoch": 0.9620018535681186,
"grad_norm": 0.025291137397289276,
"learning_rate": 1.866913436800316e-05,
"loss": 4.4563,
"step": 1038
},
{
"epoch": 0.9629286376274329,
"grad_norm": 0.023913368582725525,
"learning_rate": 1.7791754814199255e-05,
"loss": 4.4563,
"step": 1039
},
{
"epoch": 0.963855421686747,
"grad_norm": 0.02541198581457138,
"learning_rate": 1.693541763574058e-05,
"loss": 4.45,
"step": 1040
},
{
"epoch": 0.9647822057460612,
"grad_norm": 0.02386779710650444,
"learning_rate": 1.6100130092037703e-05,
"loss": 4.3812,
"step": 1041
},
{
"epoch": 0.9657089898053753,
"grad_norm": 0.02432171255350113,
"learning_rate": 1.528589926405727e-05,
"loss": 4.4563,
"step": 1042
},
{
"epoch": 0.9666357738646896,
"grad_norm": 0.026072759181261063,
"learning_rate": 1.4492732054262603e-05,
"loss": 4.4062,
"step": 1043
},
{
"epoch": 0.9675625579240037,
"grad_norm": 0.02468552440404892,
"learning_rate": 1.372063518655403e-05,
"loss": 4.45,
"step": 1044
},
{
"epoch": 0.9684893419833179,
"grad_norm": 0.023878788575530052,
"learning_rate": 1.2969615206213369e-05,
"loss": 4.4188,
"step": 1045
},
{
"epoch": 0.969416126042632,
"grad_norm": 0.0231490395963192,
"learning_rate": 1.223967847984786e-05,
"loss": 4.4188,
"step": 1046
},
{
"epoch": 0.9703429101019463,
"grad_norm": 0.024373695254325867,
"learning_rate": 1.1530831195335767e-05,
"loss": 4.4437,
"step": 1047
},
{
"epoch": 0.9712696941612604,
"grad_norm": 0.02477751113474369,
"learning_rate": 1.08430793617742e-05,
"loss": 4.4188,
"step": 1048
},
{
"epoch": 0.9721964782205746,
"grad_norm": 0.023831041529774666,
"learning_rate": 1.0176428809428318e-05,
"loss": 4.4813,
"step": 1049
},
{
"epoch": 0.9731232622798888,
"grad_norm": 0.02483510971069336,
"learning_rate": 9.530885189681649e-06,
"loss": 4.4125,
"step": 1050
},
{
"epoch": 0.974050046339203,
"grad_norm": 0.023760484531521797,
"learning_rate": 8.906453974988626e-06,
"loss": 4.4062,
"step": 1051
},
{
"epoch": 0.9749768303985171,
"grad_norm": 0.02444753795862198,
"learning_rate": 8.303140458827684e-06,
"loss": 4.4062,
"step": 1052
},
{
"epoch": 0.9759036144578314,
"grad_norm": 0.021337734535336494,
"learning_rate": 7.720949755657125e-06,
"loss": 4.4,
"step": 1053
},
{
"epoch": 0.9768303985171455,
"grad_norm": 0.022071754559874535,
"learning_rate": 7.159886800869875e-06,
"loss": 4.425,
"step": 1054
},
{
"epoch": 0.9777571825764597,
"grad_norm": 0.024915462359786034,
"learning_rate": 6.6199563507549075e-06,
"loss": 4.3938,
"step": 1055
},
{
"epoch": 0.9786839666357738,
"grad_norm": 0.022235747426748276,
"learning_rate": 6.1011629824533895e-06,
"loss": 4.4,
"step": 1056
},
{
"epoch": 0.9796107506950881,
"grad_norm": 0.02508777379989624,
"learning_rate": 5.60351109392232e-06,
"loss": 4.425,
"step": 1057
},
{
"epoch": 0.9805375347544022,
"grad_norm": 0.02421114780008793,
"learning_rate": 5.127004903896504e-06,
"loss": 4.4688,
"step": 1058
},
{
"epoch": 0.9814643188137164,
"grad_norm": 0.023330386728048325,
"learning_rate": 4.6716484518524726e-06,
"loss": 4.3875,
"step": 1059
},
{
"epoch": 0.9823911028730306,
"grad_norm": 0.02507002279162407,
"learning_rate": 4.237445597974343e-06,
"loss": 4.4563,
"step": 1060
},
{
"epoch": 0.9833178869323448,
"grad_norm": 0.023726079612970352,
"learning_rate": 3.824400023121621e-06,
"loss": 4.4688,
"step": 1061
},
{
"epoch": 0.9842446709916589,
"grad_norm": 0.022975319996476173,
"learning_rate": 3.4325152287975615e-06,
"loss": 4.3938,
"step": 1062
},
{
"epoch": 0.9851714550509731,
"grad_norm": 0.02411024458706379,
"learning_rate": 3.061794537119467e-06,
"loss": 4.4563,
"step": 1063
},
{
"epoch": 0.9860982391102873,
"grad_norm": 0.022638075053691864,
"learning_rate": 2.7122410907903794e-06,
"loss": 4.4563,
"step": 1064
},
{
"epoch": 0.9870250231696015,
"grad_norm": 0.023638809099793434,
"learning_rate": 2.383857853073268e-06,
"loss": 4.425,
"step": 1065
},
{
"epoch": 0.9879518072289156,
"grad_norm": 0.02219136245548725,
"learning_rate": 2.0766476077643813e-06,
"loss": 4.4,
"step": 1066
},
{
"epoch": 0.9888785912882299,
"grad_norm": 0.02723466046154499,
"learning_rate": 1.7906129591713227e-06,
"loss": 4.4437,
"step": 1067
},
{
"epoch": 0.989805375347544,
"grad_norm": 0.024723384529352188,
"learning_rate": 1.525756332090289e-06,
"loss": 4.4,
"step": 1068
},
{
"epoch": 0.9907321594068582,
"grad_norm": 0.023885123431682587,
"learning_rate": 1.2820799717849775e-06,
"loss": 4.4,
"step": 1069
},
{
"epoch": 0.9916589434661723,
"grad_norm": 0.022805040702223778,
"learning_rate": 1.059585943967989e-06,
"loss": 4.4437,
"step": 1070
},
{
"epoch": 0.9925857275254866,
"grad_norm": 0.023890964686870575,
"learning_rate": 8.58276134784175e-07,
"loss": 4.3812,
"step": 1071
},
{
"epoch": 0.9935125115848007,
"grad_norm": 0.025231240317225456,
"learning_rate": 6.781522507925964e-07,
"loss": 4.3688,
"step": 1072
},
{
"epoch": 0.9944392956441149,
"grad_norm": 0.021534454077482224,
"learning_rate": 5.192158189543106e-07,
"loss": 4.4938,
"step": 1073
},
{
"epoch": 0.995366079703429,
"grad_norm": 0.023576676845550537,
"learning_rate": 3.8146818661793925e-07,
"loss": 4.4,
"step": 1074
},
{
"epoch": 0.9962928637627433,
"grad_norm": 0.02641914412379265,
"learning_rate": 2.6491052150884323e-07,
"loss": 4.3625,
"step": 1075
},
{
"epoch": 0.9972196478220574,
"grad_norm": 0.02341269887983799,
"learning_rate": 1.6954381171885302e-07,
"loss": 4.3812,
"step": 1076
},
{
"epoch": 0.9981464318813716,
"grad_norm": 0.022809363901615143,
"learning_rate": 9.536886569849746e-08,
"loss": 4.4437,
"step": 1077
},
{
"epoch": 0.9990732159406858,
"grad_norm": 0.023255689069628716,
"learning_rate": 4.23863122495094e-08,
"loss": 4.4437,
"step": 1078
},
{
"epoch": 1.0,
"grad_norm": 0.03218919411301613,
"learning_rate": 1.059660052010747e-08,
"loss": 4.425,
"step": 1079
}
],
"logging_steps": 1,
"max_steps": 1079,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.154917754792837e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}