{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997727789138832,
"eval_steps": 500,
"global_step": 550,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0018177686889343332,
"grad_norm": 0.1331978142261505,
"learning_rate": 3.5714285714285716e-07,
"loss": 1.8898,
"step": 1
},
{
"epoch": 0.0036355373778686664,
"grad_norm": 0.13566601276397705,
"learning_rate": 7.142857142857143e-07,
"loss": 1.8867,
"step": 2
},
{
"epoch": 0.0054533060668029995,
"grad_norm": 0.13576287031173706,
"learning_rate": 1.0714285714285714e-06,
"loss": 1.8848,
"step": 3
},
{
"epoch": 0.007271074755737333,
"grad_norm": 0.1231953352689743,
"learning_rate": 1.4285714285714286e-06,
"loss": 1.888,
"step": 4
},
{
"epoch": 0.009088843444671665,
"grad_norm": 0.09326394647359848,
"learning_rate": 1.7857142857142859e-06,
"loss": 1.8816,
"step": 5
},
{
"epoch": 0.010906612133605999,
"grad_norm": 0.08585168421268463,
"learning_rate": 2.1428571428571427e-06,
"loss": 1.8851,
"step": 6
},
{
"epoch": 0.012724380822540331,
"grad_norm": 0.0567106269299984,
"learning_rate": 2.5e-06,
"loss": 1.8799,
"step": 7
},
{
"epoch": 0.014542149511474665,
"grad_norm": 0.05393998324871063,
"learning_rate": 2.8571428571428573e-06,
"loss": 1.8785,
"step": 8
},
{
"epoch": 0.016359918200409,
"grad_norm": 0.05495736747980118,
"learning_rate": 3.2142857142857147e-06,
"loss": 1.8763,
"step": 9
},
{
"epoch": 0.01817768688934333,
"grad_norm": 0.05345786362886429,
"learning_rate": 3.5714285714285718e-06,
"loss": 1.8676,
"step": 10
},
{
"epoch": 0.019995455578277664,
"grad_norm": 0.047461625188589096,
"learning_rate": 3.928571428571429e-06,
"loss": 1.8634,
"step": 11
},
{
"epoch": 0.021813224267211998,
"grad_norm": 0.061344344168901443,
"learning_rate": 4.2857142857142855e-06,
"loss": 1.864,
"step": 12
},
{
"epoch": 0.02363099295614633,
"grad_norm": 0.06245123967528343,
"learning_rate": 4.642857142857144e-06,
"loss": 1.8647,
"step": 13
},
{
"epoch": 0.025448761645080663,
"grad_norm": 0.053826089948415756,
"learning_rate": 5e-06,
"loss": 1.8605,
"step": 14
},
{
"epoch": 0.027266530334014997,
"grad_norm": 0.04343092441558838,
"learning_rate": 5.357142857142857e-06,
"loss": 1.8648,
"step": 15
},
{
"epoch": 0.02908429902294933,
"grad_norm": 0.04949821159243584,
"learning_rate": 5.7142857142857145e-06,
"loss": 1.8536,
"step": 16
},
{
"epoch": 0.03090206771188366,
"grad_norm": 0.06119069084525108,
"learning_rate": 6.071428571428571e-06,
"loss": 1.8485,
"step": 17
},
{
"epoch": 0.032719836400818,
"grad_norm": 0.05067905783653259,
"learning_rate": 6.4285714285714295e-06,
"loss": 1.8536,
"step": 18
},
{
"epoch": 0.03453760508975233,
"grad_norm": 0.03722887113690376,
"learning_rate": 6.785714285714287e-06,
"loss": 1.8491,
"step": 19
},
{
"epoch": 0.03635537377868666,
"grad_norm": 0.04830312356352806,
"learning_rate": 7.1428571428571436e-06,
"loss": 1.8361,
"step": 20
},
{
"epoch": 0.038173142467621,
"grad_norm": 0.052912868559360504,
"learning_rate": 7.500000000000001e-06,
"loss": 1.8522,
"step": 21
},
{
"epoch": 0.03999091115655533,
"grad_norm": 0.04469645023345947,
"learning_rate": 7.857142857142858e-06,
"loss": 1.8391,
"step": 22
},
{
"epoch": 0.04180867984548966,
"grad_norm": 0.04048198461532593,
"learning_rate": 8.214285714285714e-06,
"loss": 1.8345,
"step": 23
},
{
"epoch": 0.043626448534423996,
"grad_norm": 0.03836997598409653,
"learning_rate": 8.571428571428571e-06,
"loss": 1.8342,
"step": 24
},
{
"epoch": 0.04544421722335833,
"grad_norm": 0.038932956755161285,
"learning_rate": 8.92857142857143e-06,
"loss": 1.8399,
"step": 25
},
{
"epoch": 0.04726198591229266,
"grad_norm": 0.041100382804870605,
"learning_rate": 9.285714285714288e-06,
"loss": 1.833,
"step": 26
},
{
"epoch": 0.049079754601226995,
"grad_norm": 0.03821416571736336,
"learning_rate": 9.642857142857144e-06,
"loss": 1.8342,
"step": 27
},
{
"epoch": 0.050897523290161326,
"grad_norm": 0.037851471453905106,
"learning_rate": 1e-05,
"loss": 1.8313,
"step": 28
},
{
"epoch": 0.05271529197909566,
"grad_norm": 0.03763577714562416,
"learning_rate": 9.999909448127131e-06,
"loss": 1.8291,
"step": 29
},
{
"epoch": 0.054533060668029994,
"grad_norm": 0.03475307673215866,
"learning_rate": 9.999637795788383e-06,
"loss": 1.8185,
"step": 30
},
{
"epoch": 0.056350829356964324,
"grad_norm": 0.03289997950196266,
"learning_rate": 9.999185052823207e-06,
"loss": 1.8261,
"step": 31
},
{
"epoch": 0.05816859804589866,
"grad_norm": 0.03243958577513695,
"learning_rate": 9.99855123563029e-06,
"loss": 1.8237,
"step": 32
},
{
"epoch": 0.05998636673483299,
"grad_norm": 0.033227939158678055,
"learning_rate": 9.997736367166967e-06,
"loss": 1.827,
"step": 33
},
{
"epoch": 0.06180413542376732,
"grad_norm": 0.03226836398243904,
"learning_rate": 9.996740476948386e-06,
"loss": 1.8257,
"step": 34
},
{
"epoch": 0.06362190411270166,
"grad_norm": 0.029187630861997604,
"learning_rate": 9.995563601046434e-06,
"loss": 1.819,
"step": 35
},
{
"epoch": 0.065439672801636,
"grad_norm": 0.026967501267790794,
"learning_rate": 9.994205782088438e-06,
"loss": 1.8136,
"step": 36
},
{
"epoch": 0.06725744149057032,
"grad_norm": 0.031199270859360695,
"learning_rate": 9.99266706925562e-06,
"loss": 1.8206,
"step": 37
},
{
"epoch": 0.06907521017950466,
"grad_norm": 0.030985625460743904,
"learning_rate": 9.990947518281312e-06,
"loss": 1.8281,
"step": 38
},
{
"epoch": 0.070892978868439,
"grad_norm": 0.02339562401175499,
"learning_rate": 9.989047191448934e-06,
"loss": 1.82,
"step": 39
},
{
"epoch": 0.07271074755737332,
"grad_norm": 0.0256453026086092,
"learning_rate": 9.986966157589751e-06,
"loss": 1.8079,
"step": 40
},
{
"epoch": 0.07452851624630766,
"grad_norm": 0.025680653750896454,
"learning_rate": 9.984704492080366e-06,
"loss": 1.8088,
"step": 41
},
{
"epoch": 0.076346284935242,
"grad_norm": 0.026331942528486252,
"learning_rate": 9.982262276840002e-06,
"loss": 1.8153,
"step": 42
},
{
"epoch": 0.07816405362417632,
"grad_norm": 0.026452744379639626,
"learning_rate": 9.979639600327522e-06,
"loss": 1.8082,
"step": 43
},
{
"epoch": 0.07998182231311066,
"grad_norm": 0.020438341423869133,
"learning_rate": 9.976836557538234e-06,
"loss": 1.8087,
"step": 44
},
{
"epoch": 0.081799591002045,
"grad_norm": 0.022149616852402687,
"learning_rate": 9.973853250000449e-06,
"loss": 1.8132,
"step": 45
},
{
"epoch": 0.08361735969097932,
"grad_norm": 0.020680025219917297,
"learning_rate": 9.970689785771798e-06,
"loss": 1.8077,
"step": 46
},
{
"epoch": 0.08543512837991366,
"grad_norm": 0.018105728551745415,
"learning_rate": 9.967346279435328e-06,
"loss": 1.8063,
"step": 47
},
{
"epoch": 0.08725289706884799,
"grad_norm": 0.020593147724866867,
"learning_rate": 9.963822852095344e-06,
"loss": 1.8036,
"step": 48
},
{
"epoch": 0.08907066575778232,
"grad_norm": 0.0193562563508749,
"learning_rate": 9.960119631373023e-06,
"loss": 1.8135,
"step": 49
},
{
"epoch": 0.09088843444671665,
"grad_norm": 0.017045950517058372,
"learning_rate": 9.95623675140179e-06,
"loss": 1.8115,
"step": 50
},
{
"epoch": 0.09270620313565099,
"grad_norm": 0.01905151829123497,
"learning_rate": 9.952174352822474e-06,
"loss": 1.8087,
"step": 51
},
{
"epoch": 0.09452397182458531,
"grad_norm": 0.019179217517375946,
"learning_rate": 9.947932582778188e-06,
"loss": 1.8093,
"step": 52
},
{
"epoch": 0.09634174051351965,
"grad_norm": 0.016135873273015022,
"learning_rate": 9.943511594909024e-06,
"loss": 1.8008,
"step": 53
},
{
"epoch": 0.09815950920245399,
"grad_norm": 0.016653183847665787,
"learning_rate": 9.938911549346473e-06,
"loss": 1.8075,
"step": 54
},
{
"epoch": 0.09997727789138833,
"grad_norm": 0.01784764975309372,
"learning_rate": 9.934132612707631e-06,
"loss": 1.8065,
"step": 55
},
{
"epoch": 0.10179504658032265,
"grad_norm": 0.01742948405444622,
"learning_rate": 9.929174958089167e-06,
"loss": 1.8066,
"step": 56
},
{
"epoch": 0.10361281526925699,
"grad_norm": 0.015608050860464573,
"learning_rate": 9.924038765061042e-06,
"loss": 1.8089,
"step": 57
},
{
"epoch": 0.10543058395819133,
"grad_norm": 0.017180059105157852,
"learning_rate": 9.918724219660013e-06,
"loss": 1.8063,
"step": 58
},
{
"epoch": 0.10724835264712565,
"grad_norm": 0.01681089587509632,
"learning_rate": 9.913231514382902e-06,
"loss": 1.7952,
"step": 59
},
{
"epoch": 0.10906612133605999,
"grad_norm": 0.016128279268741608,
"learning_rate": 9.907560848179607e-06,
"loss": 1.797,
"step": 60
},
{
"epoch": 0.11088389002499432,
"grad_norm": 0.016297221183776855,
"learning_rate": 9.901712426445901e-06,
"loss": 1.7966,
"step": 61
},
{
"epoch": 0.11270165871392865,
"grad_norm": 0.017089389264583588,
"learning_rate": 9.895686461016007e-06,
"loss": 1.8097,
"step": 62
},
{
"epoch": 0.11451942740286299,
"grad_norm": 0.01613052934408188,
"learning_rate": 9.889483170154903e-06,
"loss": 1.7984,
"step": 63
},
{
"epoch": 0.11633719609179732,
"grad_norm": 0.016225503757596016,
"learning_rate": 9.883102778550434e-06,
"loss": 1.8013,
"step": 64
},
{
"epoch": 0.11815496478073165,
"grad_norm": 0.015952223911881447,
"learning_rate": 9.876545517305163e-06,
"loss": 1.7993,
"step": 65
},
{
"epoch": 0.11997273346966598,
"grad_norm": 0.016045618802309036,
"learning_rate": 9.869811623928001e-06,
"loss": 1.7968,
"step": 66
},
{
"epoch": 0.12179050215860032,
"grad_norm": 0.015822941437363625,
"learning_rate": 9.862901342325617e-06,
"loss": 1.7947,
"step": 67
},
{
"epoch": 0.12360827084753465,
"grad_norm": 0.016080934554338455,
"learning_rate": 9.855814922793583e-06,
"loss": 1.8011,
"step": 68
},
{
"epoch": 0.125426039536469,
"grad_norm": 0.01613529957830906,
"learning_rate": 9.848552622007326e-06,
"loss": 1.7956,
"step": 69
},
{
"epoch": 0.12724380822540332,
"grad_norm": 0.01521450374275446,
"learning_rate": 9.841114703012817e-06,
"loss": 1.7961,
"step": 70
},
{
"epoch": 0.12906157691433764,
"grad_norm": 0.01613503508269787,
"learning_rate": 9.83350143521706e-06,
"loss": 1.7981,
"step": 71
},
{
"epoch": 0.130879345603272,
"grad_norm": 0.01576644368469715,
"learning_rate": 9.82571309437831e-06,
"loss": 1.8042,
"step": 72
},
{
"epoch": 0.13269711429220632,
"grad_norm": 0.017247062176465988,
"learning_rate": 9.817749962596115e-06,
"loss": 1.793,
"step": 73
},
{
"epoch": 0.13451488298114064,
"grad_norm": 0.014981955289840698,
"learning_rate": 9.809612328301071e-06,
"loss": 1.8074,
"step": 74
},
{
"epoch": 0.136332651670075,
"grad_norm": 0.0150354178622365,
"learning_rate": 9.801300486244385e-06,
"loss": 1.7973,
"step": 75
},
{
"epoch": 0.13815042035900932,
"grad_norm": 0.015270021744072437,
"learning_rate": 9.792814737487207e-06,
"loss": 1.7973,
"step": 76
},
{
"epoch": 0.13996818904794364,
"grad_norm": 0.016216879710555077,
"learning_rate": 9.784155389389713e-06,
"loss": 1.7986,
"step": 77
},
{
"epoch": 0.141785957736878,
"grad_norm": 0.015781838446855545,
"learning_rate": 9.775322755599979e-06,
"loss": 1.7937,
"step": 78
},
{
"epoch": 0.14360372642581232,
"grad_norm": 0.015398108400404453,
"learning_rate": 9.766317156042615e-06,
"loss": 1.7976,
"step": 79
},
{
"epoch": 0.14542149511474664,
"grad_norm": 0.01513028983026743,
"learning_rate": 9.757138916907184e-06,
"loss": 1.7915,
"step": 80
},
{
"epoch": 0.147239263803681,
"grad_norm": 0.015322140417993069,
"learning_rate": 9.747788370636389e-06,
"loss": 1.8053,
"step": 81
},
{
"epoch": 0.14905703249261532,
"grad_norm": 0.016009092330932617,
"learning_rate": 9.738265855914014e-06,
"loss": 1.7908,
"step": 82
},
{
"epoch": 0.15087480118154964,
"grad_norm": 0.01483672671020031,
"learning_rate": 9.728571717652677e-06,
"loss": 1.7888,
"step": 83
},
{
"epoch": 0.152692569870484,
"grad_norm": 0.014686529524624348,
"learning_rate": 9.718706306981332e-06,
"loss": 1.7911,
"step": 84
},
{
"epoch": 0.15451033855941831,
"grad_norm": 0.01669451966881752,
"learning_rate": 9.708669981232542e-06,
"loss": 1.8017,
"step": 85
},
{
"epoch": 0.15632810724835264,
"grad_norm": 0.014686268754303455,
"learning_rate": 9.698463103929542e-06,
"loss": 1.7979,
"step": 86
},
{
"epoch": 0.158145875937287,
"grad_norm": 0.01508221123367548,
"learning_rate": 9.688086044773079e-06,
"loss": 1.7872,
"step": 87
},
{
"epoch": 0.1599636446262213,
"grad_norm": 0.0154942087829113,
"learning_rate": 9.677539179628005e-06,
"loss": 1.794,
"step": 88
},
{
"epoch": 0.16178141331515564,
"grad_norm": 0.016326844692230225,
"learning_rate": 9.66682289050968e-06,
"loss": 1.7981,
"step": 89
},
{
"epoch": 0.16359918200409,
"grad_norm": 0.015189899131655693,
"learning_rate": 9.655937565570124e-06,
"loss": 1.7943,
"step": 90
},
{
"epoch": 0.1654169506930243,
"grad_norm": 0.014669873751699924,
"learning_rate": 9.644883599083959e-06,
"loss": 1.7873,
"step": 91
},
{
"epoch": 0.16723471938195864,
"grad_norm": 0.015705488622188568,
"learning_rate": 9.63366139143413e-06,
"loss": 1.7959,
"step": 92
},
{
"epoch": 0.169052488070893,
"grad_norm": 0.015006368048489094,
"learning_rate": 9.622271349097413e-06,
"loss": 1.7883,
"step": 93
},
{
"epoch": 0.1708702567598273,
"grad_norm": 0.015823103487491608,
"learning_rate": 9.610713884629667e-06,
"loss": 1.7864,
"step": 94
},
{
"epoch": 0.17268802544876163,
"grad_norm": 0.03225838020443916,
"learning_rate": 9.598989416650915e-06,
"loss": 1.7871,
"step": 95
},
{
"epoch": 0.17450579413769599,
"grad_norm": 0.015597975812852383,
"learning_rate": 9.587098369830171e-06,
"loss": 1.7804,
"step": 96
},
{
"epoch": 0.1763235628266303,
"grad_norm": 0.01537901721894741,
"learning_rate": 9.575041174870062e-06,
"loss": 1.7858,
"step": 97
},
{
"epoch": 0.17814133151556463,
"grad_norm": 0.023264285176992416,
"learning_rate": 9.562818268491216e-06,
"loss": 1.7823,
"step": 98
},
{
"epoch": 0.17995910020449898,
"grad_norm": 0.01551035512238741,
"learning_rate": 9.550430093416465e-06,
"loss": 1.7882,
"step": 99
},
{
"epoch": 0.1817768688934333,
"grad_norm": 0.015448925085365772,
"learning_rate": 9.537877098354787e-06,
"loss": 1.7836,
"step": 100
},
{
"epoch": 0.18359463758236763,
"grad_norm": 0.01610329933464527,
"learning_rate": 9.525159737985066e-06,
"loss": 1.7843,
"step": 101
},
{
"epoch": 0.18541240627130198,
"grad_norm": 0.015887994319200516,
"learning_rate": 9.512278472939627e-06,
"loss": 1.7835,
"step": 102
},
{
"epoch": 0.1872301749602363,
"grad_norm": 0.015717443078756332,
"learning_rate": 9.499233769787534e-06,
"loss": 1.7899,
"step": 103
},
{
"epoch": 0.18904794364917063,
"grad_norm": 0.01613277569413185,
"learning_rate": 9.486026101017711e-06,
"loss": 1.789,
"step": 104
},
{
"epoch": 0.19086571233810498,
"grad_norm": 0.0161016546189785,
"learning_rate": 9.472655945021815e-06,
"loss": 1.7885,
"step": 105
},
{
"epoch": 0.1926834810270393,
"grad_norm": 0.015553218312561512,
"learning_rate": 9.459123786076911e-06,
"loss": 1.7841,
"step": 106
},
{
"epoch": 0.19450124971597363,
"grad_norm": 0.01636493392288685,
"learning_rate": 9.445430114327936e-06,
"loss": 1.7864,
"step": 107
},
{
"epoch": 0.19631901840490798,
"grad_norm": 0.016063738614320755,
"learning_rate": 9.431575425769938e-06,
"loss": 1.7836,
"step": 108
},
{
"epoch": 0.1981367870938423,
"grad_norm": 0.016147315502166748,
"learning_rate": 9.417560222230115e-06,
"loss": 1.7786,
"step": 109
},
{
"epoch": 0.19995455578277666,
"grad_norm": 0.01560090109705925,
"learning_rate": 9.40338501134964e-06,
"loss": 1.7782,
"step": 110
},
{
"epoch": 0.20177232447171098,
"grad_norm": 0.015402060933411121,
"learning_rate": 9.389050306565269e-06,
"loss": 1.7814,
"step": 111
},
{
"epoch": 0.2035900931606453,
"grad_norm": 0.017125973477959633,
"learning_rate": 9.374556627090749e-06,
"loss": 1.7793,
"step": 112
},
{
"epoch": 0.20540786184957965,
"grad_norm": 0.015735799446702003,
"learning_rate": 9.359904497898009e-06,
"loss": 1.7872,
"step": 113
},
{
"epoch": 0.20722563053851398,
"grad_norm": 0.01627574861049652,
"learning_rate": 9.345094449698143e-06,
"loss": 1.7893,
"step": 114
},
{
"epoch": 0.2090433992274483,
"grad_norm": 0.014931687153875828,
"learning_rate": 9.330127018922195e-06,
"loss": 1.7825,
"step": 115
},
{
"epoch": 0.21086116791638265,
"grad_norm": 0.015015835873782635,
"learning_rate": 9.315002747701716e-06,
"loss": 1.77,
"step": 116
},
{
"epoch": 0.21267893660531698,
"grad_norm": 0.01571677438914776,
"learning_rate": 9.299722183849144e-06,
"loss": 1.7843,
"step": 117
},
{
"epoch": 0.2144967052942513,
"grad_norm": 0.014991500414907932,
"learning_rate": 9.284285880837947e-06,
"loss": 1.7824,
"step": 118
},
{
"epoch": 0.21631447398318565,
"grad_norm": 0.016052858904004097,
"learning_rate": 9.268694397782585e-06,
"loss": 1.7805,
"step": 119
},
{
"epoch": 0.21813224267211997,
"grad_norm": 0.015834221616387367,
"learning_rate": 9.252948299418255e-06,
"loss": 1.7855,
"step": 120
},
{
"epoch": 0.2199500113610543,
"grad_norm": 0.01614440232515335,
"learning_rate": 9.237048156080433e-06,
"loss": 1.7885,
"step": 121
},
{
"epoch": 0.22176778004998865,
"grad_norm": 0.01563919708132744,
"learning_rate": 9.220994543684225e-06,
"loss": 1.7799,
"step": 122
},
{
"epoch": 0.22358554873892297,
"grad_norm": 0.015689659863710403,
"learning_rate": 9.2047880437035e-06,
"loss": 1.7808,
"step": 123
},
{
"epoch": 0.2254033174278573,
"grad_norm": 0.015433340333402157,
"learning_rate": 9.188429243149824e-06,
"loss": 1.7769,
"step": 124
},
{
"epoch": 0.22722108611679165,
"grad_norm": 0.01560978963971138,
"learning_rate": 9.171918734551212e-06,
"loss": 1.7791,
"step": 125
},
{
"epoch": 0.22903885480572597,
"grad_norm": 0.016046756878495216,
"learning_rate": 9.155257115930651e-06,
"loss": 1.7778,
"step": 126
},
{
"epoch": 0.2308566234946603,
"grad_norm": 0.01664203219115734,
"learning_rate": 9.138444990784455e-06,
"loss": 1.7811,
"step": 127
},
{
"epoch": 0.23267439218359465,
"grad_norm": 0.015654807910323143,
"learning_rate": 9.121482968060384e-06,
"loss": 1.7841,
"step": 128
},
{
"epoch": 0.23449216087252897,
"grad_norm": 0.016352280974388123,
"learning_rate": 9.104371662135612e-06,
"loss": 1.7839,
"step": 129
},
{
"epoch": 0.2363099295614633,
"grad_norm": 0.016163717955350876,
"learning_rate": 9.08711169279446e-06,
"loss": 1.7847,
"step": 130
},
{
"epoch": 0.23812769825039765,
"grad_norm": 0.016361849382519722,
"learning_rate": 9.069703685205945e-06,
"loss": 1.7804,
"step": 131
},
{
"epoch": 0.23994546693933197,
"grad_norm": 0.01635843515396118,
"learning_rate": 9.052148269901145e-06,
"loss": 1.7811,
"step": 132
},
{
"epoch": 0.2417632356282663,
"grad_norm": 0.016859732568264008,
"learning_rate": 9.034446082750352e-06,
"loss": 1.7863,
"step": 133
},
{
"epoch": 0.24358100431720064,
"grad_norm": 0.016207806766033173,
"learning_rate": 9.01659776494005e-06,
"loss": 1.7739,
"step": 134
},
{
"epoch": 0.24539877300613497,
"grad_norm": 0.016936447471380234,
"learning_rate": 8.998603962949674e-06,
"loss": 1.7818,
"step": 135
},
{
"epoch": 0.2472165416950693,
"grad_norm": 0.015802595764398575,
"learning_rate": 8.98046532852822e-06,
"loss": 1.7836,
"step": 136
},
{
"epoch": 0.24903431038400364,
"grad_norm": 0.016628528013825417,
"learning_rate": 8.96218251867061e-06,
"loss": 1.7822,
"step": 137
},
{
"epoch": 0.250852079072938,
"grad_norm": 0.01642756536602974,
"learning_rate": 8.943756195593916e-06,
"loss": 1.7756,
"step": 138
},
{
"epoch": 0.2526698477618723,
"grad_norm": 0.016094859689474106,
"learning_rate": 8.925187026713363e-06,
"loss": 1.766,
"step": 139
},
{
"epoch": 0.25448761645080664,
"grad_norm": 0.015560369938611984,
"learning_rate": 8.90647568461816e-06,
"loss": 1.783,
"step": 140
},
{
"epoch": 0.256305385139741,
"grad_norm": 0.01574082300066948,
"learning_rate": 8.887622847047131e-06,
"loss": 1.7882,
"step": 141
},
{
"epoch": 0.2581231538286753,
"grad_norm": 0.01694745570421219,
"learning_rate": 8.868629196864182e-06,
"loss": 1.7797,
"step": 142
},
{
"epoch": 0.25994092251760964,
"grad_norm": 0.01562688499689102,
"learning_rate": 8.84949542203355e-06,
"loss": 1.7832,
"step": 143
},
{
"epoch": 0.261758691206544,
"grad_norm": 0.015506752766668797,
"learning_rate": 8.83022221559489e-06,
"loss": 1.7749,
"step": 144
},
{
"epoch": 0.2635764598954783,
"grad_norm": 0.017343781888484955,
"learning_rate": 8.810810275638183e-06,
"loss": 1.7736,
"step": 145
},
{
"epoch": 0.26539422858441264,
"grad_norm": 0.01597374677658081,
"learning_rate": 8.791260305278434e-06,
"loss": 1.7879,
"step": 146
},
{
"epoch": 0.267211997273347,
"grad_norm": 0.015632351860404015,
"learning_rate": 8.771573012630214e-06,
"loss": 1.7804,
"step": 147
},
{
"epoch": 0.2690297659622813,
"grad_norm": 0.01659367047250271,
"learning_rate": 8.751749110782013e-06,
"loss": 1.7827,
"step": 148
},
{
"epoch": 0.27084753465121564,
"grad_norm": 0.01651635952293873,
"learning_rate": 8.731789317770407e-06,
"loss": 1.7781,
"step": 149
},
{
"epoch": 0.27266530334015,
"grad_norm": 0.01517146173864603,
"learning_rate": 8.71169435655405e-06,
"loss": 1.7811,
"step": 150
},
{
"epoch": 0.2744830720290843,
"grad_norm": 0.015295923687517643,
"learning_rate": 8.691464954987494e-06,
"loss": 1.7677,
"step": 151
},
{
"epoch": 0.27630084071801864,
"grad_norm": 0.015585844404995441,
"learning_rate": 8.671101845794816e-06,
"loss": 1.7745,
"step": 152
},
{
"epoch": 0.278118609406953,
"grad_norm": 0.015692081302404404,
"learning_rate": 8.65060576654309e-06,
"loss": 1.7745,
"step": 153
},
{
"epoch": 0.2799363780958873,
"grad_norm": 0.015136554837226868,
"learning_rate": 8.629977459615655e-06,
"loss": 1.7863,
"step": 154
},
{
"epoch": 0.28175414678482164,
"grad_norm": 0.015603788197040558,
"learning_rate": 8.609217672185246e-06,
"loss": 1.7796,
"step": 155
},
{
"epoch": 0.283571915473756,
"grad_norm": 0.016288187354803085,
"learning_rate": 8.588327156186915e-06,
"loss": 1.7785,
"step": 156
},
{
"epoch": 0.2853896841626903,
"grad_norm": 0.016181934624910355,
"learning_rate": 8.567306668290801e-06,
"loss": 1.7597,
"step": 157
},
{
"epoch": 0.28720745285162463,
"grad_norm": 0.0157309602946043,
"learning_rate": 8.546156969874723e-06,
"loss": 1.7827,
"step": 158
},
{
"epoch": 0.289025221540559,
"grad_norm": 0.016916731372475624,
"learning_rate": 8.524878826996602e-06,
"loss": 1.7749,
"step": 159
},
{
"epoch": 0.2908429902294933,
"grad_norm": 0.015968995168805122,
"learning_rate": 8.503473010366713e-06,
"loss": 1.7683,
"step": 160
},
{
"epoch": 0.29266075891842763,
"grad_norm": 0.01594395563006401,
"learning_rate": 8.481940295319772e-06,
"loss": 1.7792,
"step": 161
},
{
"epoch": 0.294478527607362,
"grad_norm": 0.016326317563652992,
"learning_rate": 8.460281461786848e-06,
"loss": 1.7734,
"step": 162
},
{
"epoch": 0.2962962962962963,
"grad_norm": 0.016297809779644012,
"learning_rate": 8.438497294267117e-06,
"loss": 1.769,
"step": 163
},
{
"epoch": 0.29811406498523063,
"grad_norm": 0.017145946621894836,
"learning_rate": 8.416588581799447e-06,
"loss": 1.7767,
"step": 164
},
{
"epoch": 0.299931833674165,
"grad_norm": 0.016356928274035454,
"learning_rate": 8.394556117933816e-06,
"loss": 1.772,
"step": 165
},
{
"epoch": 0.3017496023630993,
"grad_norm": 0.016378790140151978,
"learning_rate": 8.372400700702569e-06,
"loss": 1.7701,
"step": 166
},
{
"epoch": 0.30356737105203363,
"grad_norm": 0.018152521923184395,
"learning_rate": 8.350123132591522e-06,
"loss": 1.7769,
"step": 167
},
{
"epoch": 0.305385139740968,
"grad_norm": 0.017259759828448296,
"learning_rate": 8.327724220510873e-06,
"loss": 1.7742,
"step": 168
},
{
"epoch": 0.3072029084299023,
"grad_norm": 0.016766058281064034,
"learning_rate": 8.305204775766003e-06,
"loss": 1.771,
"step": 169
},
{
"epoch": 0.30902067711883663,
"grad_norm": 0.017410485073924065,
"learning_rate": 8.282565614028068e-06,
"loss": 1.7663,
"step": 170
},
{
"epoch": 0.310838445807771,
"grad_norm": 0.017518077045679092,
"learning_rate": 8.259807555304469e-06,
"loss": 1.769,
"step": 171
},
{
"epoch": 0.3126562144967053,
"grad_norm": 0.017017841339111328,
"learning_rate": 8.23693142390914e-06,
"loss": 1.7733,
"step": 172
},
{
"epoch": 0.3144739831856396,
"grad_norm": 0.017034539952874184,
"learning_rate": 8.213938048432697e-06,
"loss": 1.7715,
"step": 173
},
{
"epoch": 0.316291751874574,
"grad_norm": 0.016053663566708565,
"learning_rate": 8.19082826171243e-06,
"loss": 1.768,
"step": 174
},
{
"epoch": 0.3181095205635083,
"grad_norm": 0.017002522945404053,
"learning_rate": 8.167602900802121e-06,
"loss": 1.7571,
"step": 175
},
{
"epoch": 0.3199272892524426,
"grad_norm": 0.016666986048221588,
"learning_rate": 8.144262806941743e-06,
"loss": 1.776,
"step": 176
},
{
"epoch": 0.321745057941377,
"grad_norm": 0.017756953835487366,
"learning_rate": 8.120808825526983e-06,
"loss": 1.7701,
"step": 177
},
{
"epoch": 0.3235628266303113,
"grad_norm": 0.01685352995991707,
"learning_rate": 8.097241806078616e-06,
"loss": 1.7697,
"step": 178
},
{
"epoch": 0.3253805953192456,
"grad_norm": 0.01626460626721382,
"learning_rate": 8.073562602211743e-06,
"loss": 1.7733,
"step": 179
},
{
"epoch": 0.32719836400818,
"grad_norm": 0.017634931951761246,
"learning_rate": 8.049772071604864e-06,
"loss": 1.7817,
"step": 180
},
{
"epoch": 0.32901613269711427,
"grad_norm": 0.0157694723457098,
"learning_rate": 8.025871075968828e-06,
"loss": 1.7667,
"step": 181
},
{
"epoch": 0.3308339013860486,
"grad_norm": 0.016742341220378876,
"learning_rate": 8.001860481015594e-06,
"loss": 1.7753,
"step": 182
},
{
"epoch": 0.332651670074983,
"grad_norm": 0.015466434881091118,
"learning_rate": 7.977741156426901e-06,
"loss": 1.7706,
"step": 183
},
{
"epoch": 0.33446943876391727,
"grad_norm": 0.017226146534085274,
"learning_rate": 7.953513975822755e-06,
"loss": 1.7665,
"step": 184
},
{
"epoch": 0.3362872074528516,
"grad_norm": 0.01610388606786728,
"learning_rate": 7.92917981672979e-06,
"loss": 1.7723,
"step": 185
},
{
"epoch": 0.338104976141786,
"grad_norm": 0.016837526112794876,
"learning_rate": 7.904739560549475e-06,
"loss": 1.7754,
"step": 186
},
{
"epoch": 0.33992274483072027,
"grad_norm": 0.016696933656930923,
"learning_rate": 7.8801940925262e-06,
"loss": 1.7707,
"step": 187
},
{
"epoch": 0.3417405135196546,
"grad_norm": 0.016263185068964958,
"learning_rate": 7.855544301715203e-06,
"loss": 1.7702,
"step": 188
},
{
"epoch": 0.34355828220858897,
"grad_norm": 0.01645650342106819,
"learning_rate": 7.830791080950373e-06,
"loss": 1.768,
"step": 189
},
{
"epoch": 0.34537605089752327,
"grad_norm": 0.01569991558790207,
"learning_rate": 7.805935326811913e-06,
"loss": 1.767,
"step": 190
},
{
"epoch": 0.3471938195864576,
"grad_norm": 0.015973446890711784,
"learning_rate": 7.780977939593856e-06,
"loss": 1.7713,
"step": 191
},
{
"epoch": 0.34901158827539197,
"grad_norm": 0.01654656231403351,
"learning_rate": 7.755919823271466e-06,
"loss": 1.7577,
"step": 192
},
{
"epoch": 0.35082935696432627,
"grad_norm": 0.015675723552703857,
"learning_rate": 7.730761885468486e-06,
"loss": 1.7732,
"step": 193
},
{
"epoch": 0.3526471256532606,
"grad_norm": 0.018406856805086136,
"learning_rate": 7.70550503742427e-06,
"loss": 1.7668,
"step": 194
},
{
"epoch": 0.35446489434219497,
"grad_norm": 0.016395216807723045,
"learning_rate": 7.68015019396078e-06,
"loss": 1.7672,
"step": 195
},
{
"epoch": 0.35628266303112927,
"grad_norm": 0.016013452783226967,
"learning_rate": 7.654698273449435e-06,
"loss": 1.7646,
"step": 196
},
{
"epoch": 0.3581004317200636,
"grad_norm": 0.01679440774023533,
"learning_rate": 7.629150197777866e-06,
"loss": 1.7612,
"step": 197
},
{
"epoch": 0.35991820040899797,
"grad_norm": 0.01686931401491165,
"learning_rate": 7.603506892316513e-06,
"loss": 1.7597,
"step": 198
},
{
"epoch": 0.36173596909793226,
"grad_norm": 0.017471209168434143,
"learning_rate": 7.57776928588511e-06,
"loss": 1.7756,
"step": 199
},
{
"epoch": 0.3635537377868666,
"grad_norm": 0.017604535445570946,
"learning_rate": 7.551938310719043e-06,
"loss": 1.7706,
"step": 200
},
{
"epoch": 0.36537150647580097,
"grad_norm": 0.016083979979157448,
"learning_rate": 7.526014902435583e-06,
"loss": 1.7689,
"step": 201
},
{
"epoch": 0.36718927516473526,
"grad_norm": 0.017569448798894882,
"learning_rate": 7.500000000000001e-06,
"loss": 1.7716,
"step": 202
},
{
"epoch": 0.3690070438536696,
"grad_norm": 0.018971305340528488,
"learning_rate": 7.4738945456915505e-06,
"loss": 1.7639,
"step": 203
},
{
"epoch": 0.37082481254260397,
"grad_norm": 0.017489226534962654,
"learning_rate": 7.447699485069342e-06,
"loss": 1.7695,
"step": 204
},
{
"epoch": 0.37264258123153826,
"grad_norm": 0.016599513590335846,
"learning_rate": 7.421415766938098e-06,
"loss": 1.758,
"step": 205
},
{
"epoch": 0.3744603499204726,
"grad_norm": 0.017470112070441246,
"learning_rate": 7.395044343313777e-06,
"loss": 1.7635,
"step": 206
},
{
"epoch": 0.37627811860940696,
"grad_norm": 0.01866212487220764,
"learning_rate": 7.3685861693891026e-06,
"loss": 1.7698,
"step": 207
},
{
"epoch": 0.37809588729834126,
"grad_norm": 0.016111081466078758,
"learning_rate": 7.342042203498952e-06,
"loss": 1.763,
"step": 208
},
{
"epoch": 0.3799136559872756,
"grad_norm": 0.01669992506504059,
"learning_rate": 7.315413407085656e-06,
"loss": 1.7614,
"step": 209
},
{
"epoch": 0.38173142467620996,
"grad_norm": 0.01589970290660858,
"learning_rate": 7.288700744664167e-06,
"loss": 1.773,
"step": 210
},
{
"epoch": 0.38354919336514426,
"grad_norm": 0.01591925323009491,
"learning_rate": 7.261905183787136e-06,
"loss": 1.7754,
"step": 211
},
{
"epoch": 0.3853669620540786,
"grad_norm": 0.01747284270823002,
"learning_rate": 7.235027695009846e-06,
"loss": 1.7721,
"step": 212
},
{
"epoch": 0.38718473074301296,
"grad_norm": 0.016405848786234856,
"learning_rate": 7.208069251855078e-06,
"loss": 1.7622,
"step": 213
},
{
"epoch": 0.38900249943194726,
"grad_norm": 0.01654895953834057,
"learning_rate": 7.181030830777838e-06,
"loss": 1.7636,
"step": 214
},
{
"epoch": 0.3908202681208816,
"grad_norm": 0.015662197023630142,
"learning_rate": 7.153913411129993e-06,
"loss": 1.7751,
"step": 215
},
{
"epoch": 0.39263803680981596,
"grad_norm": 0.015878858044743538,
"learning_rate": 7.1267179751248005e-06,
"loss": 1.7708,
"step": 216
},
{
"epoch": 0.3944558054987503,
"grad_norm": 0.016220899298787117,
"learning_rate": 7.099445507801324e-06,
"loss": 1.7679,
"step": 217
},
{
"epoch": 0.3962735741876846,
"grad_norm": 0.015889156609773636,
"learning_rate": 7.0720969969887595e-06,
"loss": 1.7657,
"step": 218
},
{
"epoch": 0.39809134287661896,
"grad_norm": 0.01594599336385727,
"learning_rate": 7.044673433270659e-06,
"loss": 1.7641,
"step": 219
},
{
"epoch": 0.3999091115655533,
"grad_norm": 0.015293586999177933,
"learning_rate": 7.017175809949044e-06,
"loss": 1.7677,
"step": 220
},
{
"epoch": 0.4017268802544876,
"grad_norm": 0.015891166403889656,
"learning_rate": 6.98960512300843e-06,
"loss": 1.7629,
"step": 221
},
{
"epoch": 0.40354464894342196,
"grad_norm": 0.016649074852466583,
"learning_rate": 6.961962371079752e-06,
"loss": 1.7655,
"step": 222
},
{
"epoch": 0.4053624176323563,
"grad_norm": 0.016516495496034622,
"learning_rate": 6.934248555404197e-06,
"loss": 1.7741,
"step": 223
},
{
"epoch": 0.4071801863212906,
"grad_norm": 0.01925363577902317,
"learning_rate": 6.906464679796927e-06,
"loss": 1.7572,
"step": 224
},
{
"epoch": 0.40899795501022496,
"grad_norm": 0.01634056493639946,
"learning_rate": 6.878611750610731e-06,
"loss": 1.759,
"step": 225
},
{
"epoch": 0.4108157236991593,
"grad_norm": 0.016612950712442398,
"learning_rate": 6.850690776699574e-06,
"loss": 1.7562,
"step": 226
},
{
"epoch": 0.4126334923880936,
"grad_norm": 0.01613459922373295,
"learning_rate": 6.822702769382042e-06,
"loss": 1.7697,
"step": 227
},
{
"epoch": 0.41445126107702795,
"grad_norm": 0.016045957803726196,
"learning_rate": 6.79464874240473e-06,
"loss": 1.7623,
"step": 228
},
{
"epoch": 0.4162690297659623,
"grad_norm": 0.016840225085616112,
"learning_rate": 6.766529711905513e-06,
"loss": 1.7742,
"step": 229
},
{
"epoch": 0.4180867984548966,
"grad_norm": 0.015475032851099968,
"learning_rate": 6.7383466963767386e-06,
"loss": 1.7644,
"step": 230
},
{
"epoch": 0.41990456714383095,
"grad_norm": 0.016331806778907776,
"learning_rate": 6.710100716628345e-06,
"loss": 1.7722,
"step": 231
},
{
"epoch": 0.4217223358327653,
"grad_norm": 0.016033973544836044,
"learning_rate": 6.681792795750876e-06,
"loss": 1.7572,
"step": 232
},
{
"epoch": 0.4235401045216996,
"grad_norm": 0.015963230282068253,
"learning_rate": 6.653423959078435e-06,
"loss": 1.7714,
"step": 233
},
{
"epoch": 0.42535787321063395,
"grad_norm": 0.016069794073700905,
"learning_rate": 6.624995234151539e-06,
"loss": 1.7702,
"step": 234
},
{
"epoch": 0.4271756418995683,
"grad_norm": 0.016175484284758568,
"learning_rate": 6.5965076506799e-06,
"loss": 1.7595,
"step": 235
},
{
"epoch": 0.4289934105885026,
"grad_norm": 0.017575398087501526,
"learning_rate": 6.567962240505136e-06,
"loss": 1.7589,
"step": 236
},
{
"epoch": 0.43081117927743695,
"grad_norm": 0.01609048619866371,
"learning_rate": 6.539360037563384e-06,
"loss": 1.7583,
"step": 237
},
{
"epoch": 0.4326289479663713,
"grad_norm": 0.016053223982453346,
"learning_rate": 6.510702077847864e-06,
"loss": 1.7593,
"step": 238
},
{
"epoch": 0.4344467166553056,
"grad_norm": 0.01691989041864872,
"learning_rate": 6.481989399371347e-06,
"loss": 1.7643,
"step": 239
},
{
"epoch": 0.43626448534423995,
"grad_norm": 0.017391884699463844,
"learning_rate": 6.453223042128556e-06,
"loss": 1.7588,
"step": 240
},
{
"epoch": 0.4380822540331743,
"grad_norm": 0.016525816172361374,
"learning_rate": 6.424404048058501e-06,
"loss": 1.7637,
"step": 241
},
{
"epoch": 0.4399000227221086,
"grad_norm": 0.01585998386144638,
"learning_rate": 6.395533461006736e-06,
"loss": 1.7652,
"step": 242
},
{
"epoch": 0.44171779141104295,
"grad_norm": 0.01582312397658825,
"learning_rate": 6.366612326687555e-06,
"loss": 1.7584,
"step": 243
},
{
"epoch": 0.4435355600999773,
"grad_norm": 0.01715337485074997,
"learning_rate": 6.337641692646106e-06,
"loss": 1.7606,
"step": 244
},
{
"epoch": 0.4453533287889116,
"grad_norm": 0.021504878997802734,
"learning_rate": 6.308622608220457e-06,
"loss": 1.762,
"step": 245
},
{
"epoch": 0.44717109747784595,
"grad_norm": 0.015527226962149143,
"learning_rate": 6.2795561245035895e-06,
"loss": 1.757,
"step": 246
},
{
"epoch": 0.4489888661667803,
"grad_norm": 0.017598124220967293,
"learning_rate": 6.250443294305315e-06,
"loss": 1.7547,
"step": 247
},
{
"epoch": 0.4508066348557146,
"grad_norm": 0.016357263550162315,
"learning_rate": 6.221285172114156e-06,
"loss": 1.7585,
"step": 248
},
{
"epoch": 0.45262440354464895,
"grad_norm": 0.01646249182522297,
"learning_rate": 6.192082814059141e-06,
"loss": 1.76,
"step": 249
},
{
"epoch": 0.4544421722335833,
"grad_norm": 0.016435401514172554,
"learning_rate": 6.162837277871553e-06,
"loss": 1.7664,
"step": 250
},
{
"epoch": 0.4562599409225176,
"grad_norm": 0.016678526997566223,
"learning_rate": 6.133549622846625e-06,
"loss": 1.7713,
"step": 251
},
{
"epoch": 0.45807770961145194,
"grad_norm": 0.017534134909510612,
"learning_rate": 6.104220909805162e-06,
"loss": 1.7589,
"step": 252
},
{
"epoch": 0.4598954783003863,
"grad_norm": 0.016283275559544563,
"learning_rate": 6.074852201055121e-06,
"loss": 1.7598,
"step": 253
},
{
"epoch": 0.4617132469893206,
"grad_norm": 0.017745792865753174,
"learning_rate": 6.045444560353136e-06,
"loss": 1.7643,
"step": 254
},
{
"epoch": 0.46353101567825494,
"grad_norm": 0.017753778025507927,
"learning_rate": 6.015999052865982e-06,
"loss": 1.7545,
"step": 255
},
{
"epoch": 0.4653487843671893,
"grad_norm": 0.017292464151978493,
"learning_rate": 5.986516745132e-06,
"loss": 1.7582,
"step": 256
},
{
"epoch": 0.4671665530561236,
"grad_norm": 0.01648300141096115,
"learning_rate": 5.956998705022464e-06,
"loss": 1.7603,
"step": 257
},
{
"epoch": 0.46898432174505794,
"grad_norm": 0.017090782523155212,
"learning_rate": 5.927446001702899e-06,
"loss": 1.7654,
"step": 258
},
{
"epoch": 0.4708020904339923,
"grad_norm": 0.015470580197870731,
"learning_rate": 5.8978597055943585e-06,
"loss": 1.7529,
"step": 259
},
{
"epoch": 0.4726198591229266,
"grad_norm": 0.016197843477129936,
"learning_rate": 5.8682408883346535e-06,
"loss": 1.7551,
"step": 260
},
{
"epoch": 0.47443762781186094,
"grad_norm": 0.018076961860060692,
"learning_rate": 5.8385906227395304e-06,
"loss": 1.7629,
"step": 261
},
{
"epoch": 0.4762553965007953,
"grad_norm": 0.015964508056640625,
"learning_rate": 5.808909982763825e-06,
"loss": 1.7668,
"step": 262
},
{
"epoch": 0.4780731651897296,
"grad_norm": 0.016753260046243668,
"learning_rate": 5.779200043462549e-06,
"loss": 1.753,
"step": 263
},
{
"epoch": 0.47989093387866394,
"grad_norm": 0.01664654165506363,
"learning_rate": 5.749461880951966e-06,
"loss": 1.7654,
"step": 264
},
{
"epoch": 0.4817087025675983,
"grad_norm": 0.01592446304857731,
"learning_rate": 5.719696572370596e-06,
"loss": 1.763,
"step": 265
},
{
"epoch": 0.4835264712565326,
"grad_norm": 0.016646496951580048,
"learning_rate": 5.689905195840216e-06,
"loss": 1.766,
"step": 266
},
{
"epoch": 0.48534423994546694,
"grad_norm": 0.016208553686738014,
"learning_rate": 5.660088830426804e-06,
"loss": 1.7551,
"step": 267
},
{
"epoch": 0.4871620086344013,
"grad_norm": 0.01585574448108673,
"learning_rate": 5.630248556101448e-06,
"loss": 1.7638,
"step": 268
},
{
"epoch": 0.4889797773233356,
"grad_norm": 0.016133490949869156,
"learning_rate": 5.600385453701241e-06,
"loss": 1.7644,
"step": 269
},
{
"epoch": 0.49079754601226994,
"grad_norm": 0.015675894916057587,
"learning_rate": 5.570500604890124e-06,
"loss": 1.7675,
"step": 270
},
{
"epoch": 0.4926153147012043,
"grad_norm": 0.01614633947610855,
"learning_rate": 5.540595092119709e-06,
"loss": 1.7636,
"step": 271
},
{
"epoch": 0.4944330833901386,
"grad_norm": 0.01666291244328022,
"learning_rate": 5.510669998590074e-06,
"loss": 1.7583,
"step": 272
},
{
"epoch": 0.49625085207907293,
"grad_norm": 0.016553543508052826,
"learning_rate": 5.480726408210519e-06,
"loss": 1.7586,
"step": 273
},
{
"epoch": 0.4980686207680073,
"grad_norm": 0.017047051340341568,
"learning_rate": 5.450765405560328e-06,
"loss": 1.7534,
"step": 274
},
{
"epoch": 0.4998863894569416,
"grad_norm": 0.01579987071454525,
"learning_rate": 5.4207880758494545e-06,
"loss": 1.7669,
"step": 275
},
{
"epoch": 0.501704158145876,
"grad_norm": 0.016013607382774353,
"learning_rate": 5.390795504879243e-06,
"loss": 1.7546,
"step": 276
},
{
"epoch": 0.5035219268348102,
"grad_norm": 0.015493376180529594,
"learning_rate": 5.360788779003082e-06,
"loss": 1.7555,
"step": 277
},
{
"epoch": 0.5053396955237446,
"grad_norm": 0.016125505790114403,
"learning_rate": 5.330768985087059e-06,
"loss": 1.7485,
"step": 278
},
{
"epoch": 0.5071574642126789,
"grad_norm": 0.015707215294241905,
"learning_rate": 5.300737210470603e-06,
"loss": 1.7556,
"step": 279
},
{
"epoch": 0.5089752329016133,
"grad_norm": 0.016529636457562447,
"learning_rate": 5.270694542927089e-06,
"loss": 1.7621,
"step": 280
},
{
"epoch": 0.5107930015905476,
"grad_norm": 0.015912501141428947,
"learning_rate": 5.2406420706244376e-06,
"loss": 1.7578,
"step": 281
},
{
"epoch": 0.512610770279482,
"grad_norm": 0.017320740967988968,
"learning_rate": 5.2105808820857126e-06,
"loss": 1.7509,
"step": 282
},
{
"epoch": 0.5144285389684162,
"grad_norm": 0.016190189868211746,
"learning_rate": 5.180512066149682e-06,
"loss": 1.7586,
"step": 283
},
{
"epoch": 0.5162463076573506,
"grad_norm": 0.01586255431175232,
"learning_rate": 5.150436711931387e-06,
"loss": 1.7618,
"step": 284
},
{
"epoch": 0.5180640763462849,
"grad_norm": 0.016613394021987915,
"learning_rate": 5.120355908782686e-06,
"loss": 1.7582,
"step": 285
},
{
"epoch": 0.5198818450352193,
"grad_norm": 0.016856033354997635,
"learning_rate": 5.090270746252803e-06,
"loss": 1.766,
"step": 286
},
{
"epoch": 0.5216996137241536,
"grad_norm": 0.015804223716259003,
"learning_rate": 5.060182314048865e-06,
"loss": 1.7548,
"step": 287
},
{
"epoch": 0.523517382413088,
"grad_norm": 0.01533227227628231,
"learning_rate": 5.030091701996428e-06,
"loss": 1.7508,
"step": 288
},
{
"epoch": 0.5253351511020222,
"grad_norm": 0.017301153391599655,
"learning_rate": 5e-06,
"loss": 1.7508,
"step": 289
},
{
"epoch": 0.5271529197909566,
"grad_norm": 0.016463877633213997,
"learning_rate": 4.9699082980035735e-06,
"loss": 1.7612,
"step": 290
},
{
"epoch": 0.5289706884798909,
"grad_norm": 0.017038939520716667,
"learning_rate": 4.939817685951135e-06,
"loss": 1.7557,
"step": 291
},
{
"epoch": 0.5307884571688253,
"grad_norm": 0.01651296392083168,
"learning_rate": 4.909729253747197e-06,
"loss": 1.7555,
"step": 292
},
{
"epoch": 0.5326062258577596,
"grad_norm": 0.01751718856394291,
"learning_rate": 4.879644091217317e-06,
"loss": 1.7524,
"step": 293
},
{
"epoch": 0.534423994546694,
"grad_norm": 0.016333656385540962,
"learning_rate": 4.8495632880686155e-06,
"loss": 1.7452,
"step": 294
},
{
"epoch": 0.5362417632356282,
"grad_norm": 0.016173357143998146,
"learning_rate": 4.819487933850319e-06,
"loss": 1.7611,
"step": 295
},
{
"epoch": 0.5380595319245626,
"grad_norm": 0.016298582777380943,
"learning_rate": 4.789419117914288e-06,
"loss": 1.752,
"step": 296
},
{
"epoch": 0.5398773006134969,
"grad_norm": 0.017157401889562607,
"learning_rate": 4.759357929375563e-06,
"loss": 1.7518,
"step": 297
},
{
"epoch": 0.5416950693024313,
"grad_norm": 0.01661343313753605,
"learning_rate": 4.729305457072913e-06,
"loss": 1.7637,
"step": 298
},
{
"epoch": 0.5435128379913656,
"grad_norm": 0.016558021306991577,
"learning_rate": 4.699262789529396e-06,
"loss": 1.7511,
"step": 299
},
{
"epoch": 0.5453306066803,
"grad_norm": 0.016143113374710083,
"learning_rate": 4.6692310149129425e-06,
"loss": 1.7562,
"step": 300
},
{
"epoch": 0.5471483753692342,
"grad_norm": 0.01550297997891903,
"learning_rate": 1e-05,
"loss": 1.7592,
"step": 301
},
{
"epoch": 0.5489661440581686,
"grad_norm": 0.016153663396835327,
"learning_rate": 1e-05,
"loss": 1.7495,
"step": 302
},
{
"epoch": 0.5507839127471029,
"grad_norm": 0.017202477902173996,
"learning_rate": 1e-05,
"loss": 1.7564,
"step": 303
},
{
"epoch": 0.5526016814360373,
"grad_norm": 0.01577403023838997,
"learning_rate": 1e-05,
"loss": 1.7635,
"step": 304
},
{
"epoch": 0.5544194501249716,
"grad_norm": 0.016280407086014748,
"learning_rate": 1e-05,
"loss": 1.748,
"step": 305
},
{
"epoch": 0.556237218813906,
"grad_norm": 0.016771433874964714,
"learning_rate": 1e-05,
"loss": 1.7467,
"step": 306
},
{
"epoch": 0.5580549875028402,
"grad_norm": 0.01556472573429346,
"learning_rate": 1e-05,
"loss": 1.751,
"step": 307
},
{
"epoch": 0.5598727561917746,
"grad_norm": 0.01656194217503071,
"learning_rate": 1e-05,
"loss": 1.7605,
"step": 308
},
{
"epoch": 0.5616905248807089,
"grad_norm": 0.017003118991851807,
"learning_rate": 1e-05,
"loss": 1.7516,
"step": 309
},
{
"epoch": 0.5635082935696433,
"grad_norm": 0.016028909012675285,
"learning_rate": 1e-05,
"loss": 1.7557,
"step": 310
},
{
"epoch": 0.5653260622585776,
"grad_norm": 0.016611898317933083,
"learning_rate": 1e-05,
"loss": 1.7548,
"step": 311
},
{
"epoch": 0.567143830947512,
"grad_norm": 0.01619804836809635,
"learning_rate": 1e-05,
"loss": 1.7569,
"step": 312
},
{
"epoch": 0.5689615996364462,
"grad_norm": 0.01763117127120495,
"learning_rate": 1e-05,
"loss": 1.7499,
"step": 313
},
{
"epoch": 0.5707793683253806,
"grad_norm": 0.017052598297595978,
"learning_rate": 1e-05,
"loss": 1.7628,
"step": 314
},
{
"epoch": 0.5725971370143149,
"grad_norm": 0.015606777742505074,
"learning_rate": 1e-05,
"loss": 1.7695,
"step": 315
},
{
"epoch": 0.5744149057032493,
"grad_norm": 0.017086924985051155,
"learning_rate": 1e-05,
"loss": 1.7573,
"step": 316
},
{
"epoch": 0.5762326743921836,
"grad_norm": 0.01597212627530098,
"learning_rate": 1e-05,
"loss": 1.7672,
"step": 317
},
{
"epoch": 0.578050443081118,
"grad_norm": 0.016126353293657303,
"learning_rate": 1e-05,
"loss": 1.7481,
"step": 318
},
{
"epoch": 0.5798682117700522,
"grad_norm": 0.016764555126428604,
"learning_rate": 1e-05,
"loss": 1.7543,
"step": 319
},
{
"epoch": 0.5816859804589866,
"grad_norm": 0.016383804380893707,
"learning_rate": 1e-05,
"loss": 1.7595,
"step": 320
},
{
"epoch": 0.5835037491479209,
"grad_norm": 0.016328634694218636,
"learning_rate": 1e-05,
"loss": 1.7624,
"step": 321
},
{
"epoch": 0.5853215178368553,
"grad_norm": 0.017615774646401405,
"learning_rate": 1e-05,
"loss": 1.7633,
"step": 322
},
{
"epoch": 0.5871392865257896,
"grad_norm": 0.016653137281537056,
"learning_rate": 1e-05,
"loss": 1.753,
"step": 323
},
{
"epoch": 0.588957055214724,
"grad_norm": 0.016418032348155975,
"learning_rate": 1e-05,
"loss": 1.7553,
"step": 324
},
{
"epoch": 0.5907748239036582,
"grad_norm": 0.01667468063533306,
"learning_rate": 1e-05,
"loss": 1.759,
"step": 325
},
{
"epoch": 0.5925925925925926,
"grad_norm": 0.015785276889801025,
"learning_rate": 1e-05,
"loss": 1.7545,
"step": 326
},
{
"epoch": 0.5944103612815269,
"grad_norm": 0.017045632004737854,
"learning_rate": 1e-05,
"loss": 1.7569,
"step": 327
},
{
"epoch": 0.5962281299704613,
"grad_norm": 0.016107341274619102,
"learning_rate": 1e-05,
"loss": 1.7551,
"step": 328
},
{
"epoch": 0.5980458986593956,
"grad_norm": 0.016075948253273964,
"learning_rate": 1e-05,
"loss": 1.7489,
"step": 329
},
{
"epoch": 0.59986366734833,
"grad_norm": 0.015299948863685131,
"learning_rate": 1e-05,
"loss": 1.7584,
"step": 330
},
{
"epoch": 0.6016814360372642,
"grad_norm": 0.01539833564311266,
"learning_rate": 1e-05,
"loss": 1.7484,
"step": 331
},
{
"epoch": 0.6034992047261986,
"grad_norm": 0.016403749585151672,
"learning_rate": 1e-05,
"loss": 1.7549,
"step": 332
},
{
"epoch": 0.6053169734151329,
"grad_norm": 0.017300885170698166,
"learning_rate": 1e-05,
"loss": 1.7503,
"step": 333
},
{
"epoch": 0.6071347421040673,
"grad_norm": 0.01626763306558132,
"learning_rate": 1e-05,
"loss": 1.7613,
"step": 334
},
{
"epoch": 0.6089525107930016,
"grad_norm": 0.01677662320435047,
"learning_rate": 1e-05,
"loss": 1.7539,
"step": 335
},
{
"epoch": 0.610770279481936,
"grad_norm": 0.017275378108024597,
"learning_rate": 1e-05,
"loss": 1.752,
"step": 336
},
{
"epoch": 0.6125880481708702,
"grad_norm": 0.015787243843078613,
"learning_rate": 1e-05,
"loss": 1.753,
"step": 337
},
{
"epoch": 0.6144058168598046,
"grad_norm": 0.016181068494915962,
"learning_rate": 1e-05,
"loss": 1.7574,
"step": 338
},
{
"epoch": 0.6162235855487389,
"grad_norm": 0.01625332608819008,
"learning_rate": 1e-05,
"loss": 1.7552,
"step": 339
},
{
"epoch": 0.6180413542376733,
"grad_norm": 0.01715734228491783,
"learning_rate": 1e-05,
"loss": 1.7538,
"step": 340
},
{
"epoch": 0.6198591229266076,
"grad_norm": 0.018199391663074493,
"learning_rate": 1e-05,
"loss": 1.7589,
"step": 341
},
{
"epoch": 0.621676891615542,
"grad_norm": 0.01592421531677246,
"learning_rate": 1e-05,
"loss": 1.7514,
"step": 342
},
{
"epoch": 0.6234946603044762,
"grad_norm": 0.015030477195978165,
"learning_rate": 1e-05,
"loss": 1.7578,
"step": 343
},
{
"epoch": 0.6253124289934106,
"grad_norm": 0.01609027571976185,
"learning_rate": 1e-05,
"loss": 1.7528,
"step": 344
},
{
"epoch": 0.6271301976823449,
"grad_norm": 0.015512831509113312,
"learning_rate": 1e-05,
"loss": 1.7511,
"step": 345
},
{
"epoch": 0.6289479663712793,
"grad_norm": 0.015017388388514519,
"learning_rate": 1e-05,
"loss": 1.7504,
"step": 346
},
{
"epoch": 0.6307657350602136,
"grad_norm": 0.01578696072101593,
"learning_rate": 1e-05,
"loss": 1.7545,
"step": 347
},
{
"epoch": 0.632583503749148,
"grad_norm": 0.015417453832924366,
"learning_rate": 1e-05,
"loss": 1.7481,
"step": 348
},
{
"epoch": 0.6344012724380823,
"grad_norm": 0.015762289986014366,
"learning_rate": 1e-05,
"loss": 1.7614,
"step": 349
},
{
"epoch": 0.6362190411270165,
"grad_norm": 0.01597565785050392,
"learning_rate": 1e-05,
"loss": 1.7497,
"step": 350
},
{
"epoch": 0.6380368098159509,
"grad_norm": 0.01767154410481453,
"learning_rate": 1e-05,
"loss": 1.7537,
"step": 351
},
{
"epoch": 0.6398545785048853,
"grad_norm": 0.01671607419848442,
"learning_rate": 1e-05,
"loss": 1.7456,
"step": 352
},
{
"epoch": 0.6416723471938196,
"grad_norm": 0.015792865306138992,
"learning_rate": 1e-05,
"loss": 1.7494,
"step": 353
},
{
"epoch": 0.643490115882754,
"grad_norm": 0.017053868621587753,
"learning_rate": 1e-05,
"loss": 1.743,
"step": 354
},
{
"epoch": 0.6453078845716883,
"grad_norm": 0.015672611072659492,
"learning_rate": 1e-05,
"loss": 1.7478,
"step": 355
},
{
"epoch": 0.6471256532606225,
"grad_norm": 0.01585494540631771,
"learning_rate": 1e-05,
"loss": 1.7535,
"step": 356
},
{
"epoch": 0.6489434219495569,
"grad_norm": 0.016009824350476265,
"learning_rate": 1e-05,
"loss": 1.759,
"step": 357
},
{
"epoch": 0.6507611906384912,
"grad_norm": 0.015507341362535954,
"learning_rate": 1e-05,
"loss": 1.755,
"step": 358
},
{
"epoch": 0.6525789593274256,
"grad_norm": 0.01644650474190712,
"learning_rate": 1e-05,
"loss": 1.7597,
"step": 359
},
{
"epoch": 0.65439672801636,
"grad_norm": 0.016472771763801575,
"learning_rate": 1e-05,
"loss": 1.7468,
"step": 360
},
{
"epoch": 0.6562144967052943,
"grad_norm": 0.016300657764077187,
"learning_rate": 1e-05,
"loss": 1.7468,
"step": 361
},
{
"epoch": 0.6580322653942285,
"grad_norm": 0.016034092754125595,
"learning_rate": 1e-05,
"loss": 1.7477,
"step": 362
},
{
"epoch": 0.6598500340831629,
"grad_norm": 0.01675514504313469,
"learning_rate": 1e-05,
"loss": 1.7643,
"step": 363
},
{
"epoch": 0.6616678027720972,
"grad_norm": 0.016840513795614243,
"learning_rate": 1e-05,
"loss": 1.7514,
"step": 364
},
{
"epoch": 0.6634855714610316,
"grad_norm": 0.017041552811861038,
"learning_rate": 1e-05,
"loss": 1.7581,
"step": 365
},
{
"epoch": 0.665303340149966,
"grad_norm": 0.016030827537178993,
"learning_rate": 1e-05,
"loss": 1.7455,
"step": 366
},
{
"epoch": 0.6671211088389003,
"grad_norm": 0.016785001382231712,
"learning_rate": 1e-05,
"loss": 1.7483,
"step": 367
},
{
"epoch": 0.6689388775278345,
"grad_norm": 0.017177637666463852,
"learning_rate": 1e-05,
"loss": 1.7512,
"step": 368
},
{
"epoch": 0.6707566462167689,
"grad_norm": 0.015744341537356377,
"learning_rate": 1e-05,
"loss": 1.7528,
"step": 369
},
{
"epoch": 0.6725744149057032,
"grad_norm": 0.015531038865447044,
"learning_rate": 1e-05,
"loss": 1.7446,
"step": 370
},
{
"epoch": 0.6743921835946376,
"grad_norm": 0.016207581385970116,
"learning_rate": 1e-05,
"loss": 1.7533,
"step": 371
},
{
"epoch": 0.676209952283572,
"grad_norm": 0.016298890113830566,
"learning_rate": 1e-05,
"loss": 1.7512,
"step": 372
},
{
"epoch": 0.6780277209725063,
"grad_norm": 0.016354553401470184,
"learning_rate": 1e-05,
"loss": 1.7533,
"step": 373
},
{
"epoch": 0.6798454896614405,
"grad_norm": 0.01599087379872799,
"learning_rate": 1e-05,
"loss": 1.7468,
"step": 374
},
{
"epoch": 0.6816632583503749,
"grad_norm": 0.015880877152085304,
"learning_rate": 1e-05,
"loss": 1.7514,
"step": 375
},
{
"epoch": 0.6834810270393092,
"grad_norm": 0.016650687903165817,
"learning_rate": 1e-05,
"loss": 1.746,
"step": 376
},
{
"epoch": 0.6852987957282436,
"grad_norm": 0.0163528211414814,
"learning_rate": 1e-05,
"loss": 1.7472,
"step": 377
},
{
"epoch": 0.6871165644171779,
"grad_norm": 0.01636846549808979,
"learning_rate": 1e-05,
"loss": 1.7445,
"step": 378
},
{
"epoch": 0.6889343331061123,
"grad_norm": 0.016309088096022606,
"learning_rate": 1e-05,
"loss": 1.7575,
"step": 379
},
{
"epoch": 0.6907521017950465,
"grad_norm": 0.01691536419093609,
"learning_rate": 1e-05,
"loss": 1.7478,
"step": 380
},
{
"epoch": 0.6925698704839809,
"grad_norm": 0.01824839785695076,
"learning_rate": 1e-05,
"loss": 1.7577,
"step": 381
},
{
"epoch": 0.6943876391729152,
"grad_norm": 0.01665637642145157,
"learning_rate": 1e-05,
"loss": 1.7516,
"step": 382
},
{
"epoch": 0.6962054078618496,
"grad_norm": 0.015938177704811096,
"learning_rate": 1e-05,
"loss": 1.7488,
"step": 383
},
{
"epoch": 0.6980231765507839,
"grad_norm": 0.01706807129085064,
"learning_rate": 1e-05,
"loss": 1.7545,
"step": 384
},
{
"epoch": 0.6998409452397183,
"grad_norm": 0.01841641589999199,
"learning_rate": 1e-05,
"loss": 1.7533,
"step": 385
},
{
"epoch": 0.7016587139286525,
"grad_norm": 0.01596180908381939,
"learning_rate": 1e-05,
"loss": 1.7521,
"step": 386
},
{
"epoch": 0.7034764826175869,
"grad_norm": 0.016269559040665627,
"learning_rate": 1e-05,
"loss": 1.7548,
"step": 387
},
{
"epoch": 0.7052942513065212,
"grad_norm": 0.01708034798502922,
"learning_rate": 1e-05,
"loss": 1.7443,
"step": 388
},
{
"epoch": 0.7071120199954556,
"grad_norm": 0.01742040552198887,
"learning_rate": 1e-05,
"loss": 1.7515,
"step": 389
},
{
"epoch": 0.7089297886843899,
"grad_norm": 0.017336854711174965,
"learning_rate": 1e-05,
"loss": 1.7478,
"step": 390
},
{
"epoch": 0.7107475573733243,
"grad_norm": 0.016049761325120926,
"learning_rate": 1e-05,
"loss": 1.7487,
"step": 391
},
{
"epoch": 0.7125653260622585,
"grad_norm": 0.017974358052015305,
"learning_rate": 1e-05,
"loss": 1.7539,
"step": 392
},
{
"epoch": 0.7143830947511929,
"grad_norm": 0.01644211634993553,
"learning_rate": 1e-05,
"loss": 1.7488,
"step": 393
},
{
"epoch": 0.7162008634401272,
"grad_norm": 0.018557770177721977,
"learning_rate": 1e-05,
"loss": 1.7448,
"step": 394
},
{
"epoch": 0.7180186321290616,
"grad_norm": 0.01734108291566372,
"learning_rate": 1e-05,
"loss": 1.7399,
"step": 395
},
{
"epoch": 0.7198364008179959,
"grad_norm": 0.01636637933552265,
"learning_rate": 1e-05,
"loss": 1.7566,
"step": 396
},
{
"epoch": 0.7216541695069303,
"grad_norm": 0.01724686101078987,
"learning_rate": 1e-05,
"loss": 1.751,
"step": 397
},
{
"epoch": 0.7234719381958645,
"grad_norm": 0.01744897849857807,
"learning_rate": 1e-05,
"loss": 1.7474,
"step": 398
},
{
"epoch": 0.7252897068847989,
"grad_norm": 0.017034457996487617,
"learning_rate": 1e-05,
"loss": 1.7492,
"step": 399
},
{
"epoch": 0.7271074755737332,
"grad_norm": 0.016682956367731094,
"learning_rate": 1e-05,
"loss": 1.7571,
"step": 400
},
{
"epoch": 0.7289252442626676,
"grad_norm": 0.016139404848217964,
"learning_rate": 1e-05,
"loss": 1.7426,
"step": 401
},
{
"epoch": 0.7307430129516019,
"grad_norm": 0.01789063960313797,
"learning_rate": 1e-05,
"loss": 1.7564,
"step": 402
},
{
"epoch": 0.7325607816405363,
"grad_norm": 0.017030801624059677,
"learning_rate": 1e-05,
"loss": 1.7495,
"step": 403
},
{
"epoch": 0.7343785503294705,
"grad_norm": 0.02051538974046707,
"learning_rate": 1e-05,
"loss": 1.7479,
"step": 404
},
{
"epoch": 0.7361963190184049,
"grad_norm": 0.016426604241132736,
"learning_rate": 1e-05,
"loss": 1.7475,
"step": 405
},
{
"epoch": 0.7380140877073392,
"grad_norm": 0.016485676169395447,
"learning_rate": 1e-05,
"loss": 1.7517,
"step": 406
},
{
"epoch": 0.7398318563962736,
"grad_norm": 0.017329517751932144,
"learning_rate": 1e-05,
"loss": 1.7556,
"step": 407
},
{
"epoch": 0.7416496250852079,
"grad_norm": 0.0165878776460886,
"learning_rate": 1e-05,
"loss": 1.7394,
"step": 408
},
{
"epoch": 0.7434673937741423,
"grad_norm": 0.016505807638168335,
"learning_rate": 1e-05,
"loss": 1.7483,
"step": 409
},
{
"epoch": 0.7452851624630765,
"grad_norm": 0.016942374408245087,
"learning_rate": 1e-05,
"loss": 1.7483,
"step": 410
},
{
"epoch": 0.7471029311520109,
"grad_norm": 0.01690479926764965,
"learning_rate": 1e-05,
"loss": 1.7522,
"step": 411
},
{
"epoch": 0.7489206998409452,
"grad_norm": 0.016314556822180748,
"learning_rate": 1e-05,
"loss": 1.7478,
"step": 412
},
{
"epoch": 0.7507384685298796,
"grad_norm": 0.016368621960282326,
"learning_rate": 1e-05,
"loss": 1.7475,
"step": 413
},
{
"epoch": 0.7525562372188139,
"grad_norm": 0.01776360534131527,
"learning_rate": 1e-05,
"loss": 1.7523,
"step": 414
},
{
"epoch": 0.7543740059077483,
"grad_norm": 0.01603596657514572,
"learning_rate": 1e-05,
"loss": 1.7422,
"step": 415
},
{
"epoch": 0.7561917745966825,
"grad_norm": 0.015459864400327206,
"learning_rate": 1e-05,
"loss": 1.7484,
"step": 416
},
{
"epoch": 0.7580095432856169,
"grad_norm": 0.018278229981660843,
"learning_rate": 1e-05,
"loss": 1.7543,
"step": 417
},
{
"epoch": 0.7598273119745512,
"grad_norm": 0.016482891514897346,
"learning_rate": 1e-05,
"loss": 1.7511,
"step": 418
},
{
"epoch": 0.7616450806634856,
"grad_norm": 0.0158072616904974,
"learning_rate": 1e-05,
"loss": 1.747,
"step": 419
},
{
"epoch": 0.7634628493524199,
"grad_norm": 0.01595921255648136,
"learning_rate": 1e-05,
"loss": 1.741,
"step": 420
},
{
"epoch": 0.7652806180413543,
"grad_norm": 0.01587016135454178,
"learning_rate": 1e-05,
"loss": 1.7427,
"step": 421
},
{
"epoch": 0.7670983867302885,
"grad_norm": 0.017007585614919662,
"learning_rate": 1e-05,
"loss": 1.7413,
"step": 422
},
{
"epoch": 0.7689161554192229,
"grad_norm": 0.015775319188833237,
"learning_rate": 1e-05,
"loss": 1.7483,
"step": 423
},
{
"epoch": 0.7707339241081572,
"grad_norm": 0.015736114233732224,
"learning_rate": 1e-05,
"loss": 1.7463,
"step": 424
},
{
"epoch": 0.7725516927970916,
"grad_norm": 0.01561545580625534,
"learning_rate": 1e-05,
"loss": 1.7482,
"step": 425
},
{
"epoch": 0.7743694614860259,
"grad_norm": 0.01614650897681713,
"learning_rate": 1e-05,
"loss": 1.7517,
"step": 426
},
{
"epoch": 0.7761872301749603,
"grad_norm": 0.016477441415190697,
"learning_rate": 1e-05,
"loss": 1.7437,
"step": 427
},
{
"epoch": 0.7780049988638945,
"grad_norm": 0.01549589540809393,
"learning_rate": 1e-05,
"loss": 1.7479,
"step": 428
},
{
"epoch": 0.7798227675528289,
"grad_norm": 0.015598030760884285,
"learning_rate": 1e-05,
"loss": 1.7438,
"step": 429
},
{
"epoch": 0.7816405362417632,
"grad_norm": 0.01621238701045513,
"learning_rate": 1e-05,
"loss": 1.743,
"step": 430
},
{
"epoch": 0.7834583049306976,
"grad_norm": 0.015526995062828064,
"learning_rate": 1e-05,
"loss": 1.7521,
"step": 431
},
{
"epoch": 0.7852760736196319,
"grad_norm": 0.01634833589196205,
"learning_rate": 1e-05,
"loss": 1.7489,
"step": 432
},
{
"epoch": 0.7870938423085663,
"grad_norm": 0.01686246506869793,
"learning_rate": 1e-05,
"loss": 1.7483,
"step": 433
},
{
"epoch": 0.7889116109975006,
"grad_norm": 0.01572590321302414,
"learning_rate": 1e-05,
"loss": 1.7454,
"step": 434
},
{
"epoch": 0.7907293796864349,
"grad_norm": 0.016653846949338913,
"learning_rate": 1e-05,
"loss": 1.7447,
"step": 435
},
{
"epoch": 0.7925471483753692,
"grad_norm": 0.016530562192201614,
"learning_rate": 1e-05,
"loss": 1.7465,
"step": 436
},
{
"epoch": 0.7943649170643036,
"grad_norm": 0.016080396249890327,
"learning_rate": 1e-05,
"loss": 1.7437,
"step": 437
},
{
"epoch": 0.7961826857532379,
"grad_norm": 0.016825426369905472,
"learning_rate": 1e-05,
"loss": 1.7432,
"step": 438
},
{
"epoch": 0.7980004544421723,
"grad_norm": 0.01737258955836296,
"learning_rate": 1e-05,
"loss": 1.7363,
"step": 439
},
{
"epoch": 0.7998182231311066,
"grad_norm": 0.015955086797475815,
"learning_rate": 1e-05,
"loss": 1.7509,
"step": 440
},
{
"epoch": 0.8016359918200409,
"grad_norm": 0.016994798555970192,
"learning_rate": 1e-05,
"loss": 1.7446,
"step": 441
},
{
"epoch": 0.8034537605089752,
"grad_norm": 0.0163293294608593,
"learning_rate": 1e-05,
"loss": 1.7491,
"step": 442
},
{
"epoch": 0.8052715291979096,
"grad_norm": 0.016241351142525673,
"learning_rate": 1e-05,
"loss": 1.7408,
"step": 443
},
{
"epoch": 0.8070892978868439,
"grad_norm": 0.03442993760108948,
"learning_rate": 1e-05,
"loss": 1.7485,
"step": 444
},
{
"epoch": 0.8089070665757783,
"grad_norm": 0.01715024746954441,
"learning_rate": 1e-05,
"loss": 1.7507,
"step": 445
},
{
"epoch": 0.8107248352647126,
"grad_norm": 0.016102071851491928,
"learning_rate": 1e-05,
"loss": 1.7508,
"step": 446
},
{
"epoch": 0.8125426039536469,
"grad_norm": 0.018684349954128265,
"learning_rate": 1e-05,
"loss": 1.745,
"step": 447
},
{
"epoch": 0.8143603726425812,
"grad_norm": 0.01681571640074253,
"learning_rate": 1e-05,
"loss": 1.7564,
"step": 448
},
{
"epoch": 0.8161781413315156,
"grad_norm": 0.01673213019967079,
"learning_rate": 1e-05,
"loss": 1.7491,
"step": 449
},
{
"epoch": 0.8179959100204499,
"grad_norm": 0.01589960604906082,
"learning_rate": 1e-05,
"loss": 1.7534,
"step": 450
},
{
"epoch": 0.8198136787093843,
"grad_norm": 0.018107162788510323,
"learning_rate": 1e-05,
"loss": 1.734,
"step": 451
},
{
"epoch": 0.8216314473983186,
"grad_norm": 0.016370611265301704,
"learning_rate": 1e-05,
"loss": 1.748,
"step": 452
},
{
"epoch": 0.8234492160872529,
"grad_norm": 0.01715346798300743,
"learning_rate": 1e-05,
"loss": 1.7581,
"step": 453
},
{
"epoch": 0.8252669847761872,
"grad_norm": 0.016535120084881783,
"learning_rate": 1e-05,
"loss": 1.7483,
"step": 454
},
{
"epoch": 0.8270847534651216,
"grad_norm": 0.01683277077972889,
"learning_rate": 1e-05,
"loss": 1.753,
"step": 455
},
{
"epoch": 0.8289025221540559,
"grad_norm": 0.016108205541968346,
"learning_rate": 1e-05,
"loss": 1.7509,
"step": 456
},
{
"epoch": 0.8307202908429903,
"grad_norm": 0.01758972927927971,
"learning_rate": 1e-05,
"loss": 1.7421,
"step": 457
},
{
"epoch": 0.8325380595319246,
"grad_norm": 0.016740551218390465,
"learning_rate": 1e-05,
"loss": 1.7531,
"step": 458
},
{
"epoch": 0.8343558282208589,
"grad_norm": 0.017136069014668465,
"learning_rate": 1e-05,
"loss": 1.7453,
"step": 459
},
{
"epoch": 0.8361735969097932,
"grad_norm": 0.018268654122948647,
"learning_rate": 1e-05,
"loss": 1.7468,
"step": 460
},
{
"epoch": 0.8379913655987276,
"grad_norm": 0.01658778078854084,
"learning_rate": 1e-05,
"loss": 1.7496,
"step": 461
},
{
"epoch": 0.8398091342876619,
"grad_norm": 0.016633301973342896,
"learning_rate": 1e-05,
"loss": 1.7485,
"step": 462
},
{
"epoch": 0.8416269029765963,
"grad_norm": 0.016990309581160545,
"learning_rate": 1e-05,
"loss": 1.7405,
"step": 463
},
{
"epoch": 0.8434446716655306,
"grad_norm": 0.01661493442952633,
"learning_rate": 1e-05,
"loss": 1.7464,
"step": 464
},
{
"epoch": 0.8452624403544649,
"grad_norm": 0.01699172891676426,
"learning_rate": 1e-05,
"loss": 1.7564,
"step": 465
},
{
"epoch": 0.8470802090433992,
"grad_norm": 0.016703175380825996,
"learning_rate": 1e-05,
"loss": 1.745,
"step": 466
},
{
"epoch": 0.8488979777323336,
"grad_norm": 0.01694013550877571,
"learning_rate": 1e-05,
"loss": 1.741,
"step": 467
},
{
"epoch": 0.8507157464212679,
"grad_norm": 0.017576703801751137,
"learning_rate": 1e-05,
"loss": 1.7553,
"step": 468
},
{
"epoch": 0.8525335151102023,
"grad_norm": 0.016727445647120476,
"learning_rate": 1e-05,
"loss": 1.734,
"step": 469
},
{
"epoch": 0.8543512837991366,
"grad_norm": 0.015813367441296577,
"learning_rate": 1e-05,
"loss": 1.7443,
"step": 470
},
{
"epoch": 0.8561690524880708,
"grad_norm": 0.01609817147254944,
"learning_rate": 1e-05,
"loss": 1.7496,
"step": 471
},
{
"epoch": 0.8579868211770052,
"grad_norm": 0.01648952253162861,
"learning_rate": 1e-05,
"loss": 1.7444,
"step": 472
},
{
"epoch": 0.8598045898659396,
"grad_norm": 0.016997788101434708,
"learning_rate": 1e-05,
"loss": 1.7436,
"step": 473
},
{
"epoch": 0.8616223585548739,
"grad_norm": 0.016397470608353615,
"learning_rate": 1e-05,
"loss": 1.7488,
"step": 474
},
{
"epoch": 0.8634401272438083,
"grad_norm": 0.01654043421149254,
"learning_rate": 1e-05,
"loss": 1.7406,
"step": 475
},
{
"epoch": 0.8652578959327426,
"grad_norm": 0.016180653125047684,
"learning_rate": 1e-05,
"loss": 1.7463,
"step": 476
},
{
"epoch": 0.8670756646216768,
"grad_norm": 0.016773954033851624,
"learning_rate": 1e-05,
"loss": 1.751,
"step": 477
},
{
"epoch": 0.8688934333106112,
"grad_norm": 0.01736517809331417,
"learning_rate": 1e-05,
"loss": 1.7402,
"step": 478
},
{
"epoch": 0.8707112019995455,
"grad_norm": 0.01888013258576393,
"learning_rate": 1e-05,
"loss": 1.7457,
"step": 479
},
{
"epoch": 0.8725289706884799,
"grad_norm": 0.018337909132242203,
"learning_rate": 1e-05,
"loss": 1.7453,
"step": 480
},
{
"epoch": 0.8743467393774143,
"grad_norm": 0.01563389040529728,
"learning_rate": 1e-05,
"loss": 1.7386,
"step": 481
},
{
"epoch": 0.8761645080663486,
"grad_norm": 0.017023077234625816,
"learning_rate": 1e-05,
"loss": 1.7412,
"step": 482
},
{
"epoch": 0.8779822767552828,
"grad_norm": 0.01671590842306614,
"learning_rate": 1e-05,
"loss": 1.7462,
"step": 483
},
{
"epoch": 0.8798000454442172,
"grad_norm": 0.019904915243387222,
"learning_rate": 1e-05,
"loss": 1.7443,
"step": 484
},
{
"epoch": 0.8816178141331515,
"grad_norm": 0.01728987693786621,
"learning_rate": 1e-05,
"loss": 1.7345,
"step": 485
},
{
"epoch": 0.8834355828220859,
"grad_norm": 0.019658857956528664,
"learning_rate": 1e-05,
"loss": 1.7425,
"step": 486
},
{
"epoch": 0.8852533515110202,
"grad_norm": 0.01688159443438053,
"learning_rate": 1e-05,
"loss": 1.746,
"step": 487
},
{
"epoch": 0.8870711201999546,
"grad_norm": 0.01599729433655739,
"learning_rate": 1e-05,
"loss": 1.7327,
"step": 488
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.016897086054086685,
"learning_rate": 1e-05,
"loss": 1.7385,
"step": 489
},
{
"epoch": 0.8907066575778232,
"grad_norm": 0.016169127076864243,
"learning_rate": 1e-05,
"loss": 1.7405,
"step": 490
},
{
"epoch": 0.8925244262667575,
"grad_norm": 0.01634543016552925,
"learning_rate": 1e-05,
"loss": 1.748,
"step": 491
},
{
"epoch": 0.8943421949556919,
"grad_norm": 0.016616657376289368,
"learning_rate": 1e-05,
"loss": 1.7465,
"step": 492
},
{
"epoch": 0.8961599636446262,
"grad_norm": 0.016464397311210632,
"learning_rate": 1e-05,
"loss": 1.7331,
"step": 493
},
{
"epoch": 0.8979777323335606,
"grad_norm": 0.017165830358862877,
"learning_rate": 1e-05,
"loss": 1.7383,
"step": 494
},
{
"epoch": 0.8997955010224948,
"grad_norm": 0.016248662024736404,
"learning_rate": 1e-05,
"loss": 1.7416,
"step": 495
},
{
"epoch": 0.9016132697114292,
"grad_norm": 0.01670646481215954,
"learning_rate": 1e-05,
"loss": 1.742,
"step": 496
},
{
"epoch": 0.9034310384003635,
"grad_norm": 0.016594985499978065,
"learning_rate": 1e-05,
"loss": 1.7397,
"step": 497
},
{
"epoch": 0.9052488070892979,
"grad_norm": 0.016361333429813385,
"learning_rate": 1e-05,
"loss": 1.7511,
"step": 498
},
{
"epoch": 0.9070665757782322,
"grad_norm": 0.016266893595457077,
"learning_rate": 1e-05,
"loss": 1.7468,
"step": 499
},
{
"epoch": 0.9088843444671666,
"grad_norm": 0.017031649127602577,
"learning_rate": 1e-05,
"loss": 1.7327,
"step": 500
},
{
"epoch": 0.9107021131561008,
"grad_norm": 0.016959581524133682,
"learning_rate": 1e-05,
"loss": 1.7454,
"step": 501
},
{
"epoch": 0.9125198818450352,
"grad_norm": 0.07533946633338928,
"learning_rate": 1e-05,
"loss": 1.7476,
"step": 502
},
{
"epoch": 0.9143376505339695,
"grad_norm": 0.01766197197139263,
"learning_rate": 1e-05,
"loss": 1.7461,
"step": 503
},
{
"epoch": 0.9161554192229039,
"grad_norm": 0.01663908362388611,
"learning_rate": 1e-05,
"loss": 1.7361,
"step": 504
},
{
"epoch": 0.9179731879118382,
"grad_norm": 0.02057843655347824,
"learning_rate": 1e-05,
"loss": 1.7441,
"step": 505
},
{
"epoch": 0.9197909566007726,
"grad_norm": 0.017909778282046318,
"learning_rate": 1e-05,
"loss": 1.742,
"step": 506
},
{
"epoch": 0.9216087252897068,
"grad_norm": 0.017638977617025375,
"learning_rate": 1e-05,
"loss": 1.7391,
"step": 507
},
{
"epoch": 0.9234264939786412,
"grad_norm": 0.018523376435041428,
"learning_rate": 1e-05,
"loss": 1.7405,
"step": 508
},
{
"epoch": 0.9252442626675755,
"grad_norm": 0.01635800302028656,
"learning_rate": 1e-05,
"loss": 1.7458,
"step": 509
},
{
"epoch": 0.9270620313565099,
"grad_norm": 0.01763818971812725,
"learning_rate": 1e-05,
"loss": 1.7351,
"step": 510
},
{
"epoch": 0.9288798000454442,
"grad_norm": 0.017338305711746216,
"learning_rate": 1e-05,
"loss": 1.7397,
"step": 511
},
{
"epoch": 0.9306975687343786,
"grad_norm": 0.01771395467221737,
"learning_rate": 1e-05,
"loss": 1.7471,
"step": 512
},
{
"epoch": 0.9325153374233128,
"grad_norm": 0.017642149701714516,
"learning_rate": 1e-05,
"loss": 1.7454,
"step": 513
},
{
"epoch": 0.9343331061122472,
"grad_norm": 0.017685122787952423,
"learning_rate": 1e-05,
"loss": 1.7375,
"step": 514
},
{
"epoch": 0.9361508748011815,
"grad_norm": 0.017887357622385025,
"learning_rate": 1e-05,
"loss": 1.7394,
"step": 515
},
{
"epoch": 0.9379686434901159,
"grad_norm": 0.01899501495063305,
"learning_rate": 1e-05,
"loss": 1.7452,
"step": 516
},
{
"epoch": 0.9397864121790502,
"grad_norm": 0.017754577100276947,
"learning_rate": 1e-05,
"loss": 1.7441,
"step": 517
},
{
"epoch": 0.9416041808679846,
"grad_norm": 0.01811014860868454,
"learning_rate": 1e-05,
"loss": 1.7417,
"step": 518
},
{
"epoch": 0.9434219495569189,
"grad_norm": 0.01806728169322014,
"learning_rate": 1e-05,
"loss": 1.7428,
"step": 519
},
{
"epoch": 0.9452397182458532,
"grad_norm": 0.018700286746025085,
"learning_rate": 1e-05,
"loss": 1.7345,
"step": 520
},
{
"epoch": 0.9470574869347875,
"grad_norm": 0.01722894422709942,
"learning_rate": 1e-05,
"loss": 1.7362,
"step": 521
},
{
"epoch": 0.9488752556237219,
"grad_norm": 0.016884060576558113,
"learning_rate": 1e-05,
"loss": 1.7355,
"step": 522
},
{
"epoch": 0.9506930243126562,
"grad_norm": 0.017119232565164566,
"learning_rate": 1e-05,
"loss": 1.7468,
"step": 523
},
{
"epoch": 0.9525107930015906,
"grad_norm": 0.017567407339811325,
"learning_rate": 1e-05,
"loss": 1.7422,
"step": 524
},
{
"epoch": 0.9543285616905249,
"grad_norm": 0.017188768833875656,
"learning_rate": 1e-05,
"loss": 1.7393,
"step": 525
},
{
"epoch": 0.9561463303794592,
"grad_norm": 0.016574783250689507,
"learning_rate": 1e-05,
"loss": 1.7341,
"step": 526
},
{
"epoch": 0.9579640990683935,
"grad_norm": 0.020617837086319923,
"learning_rate": 1e-05,
"loss": 1.7428,
"step": 527
},
{
"epoch": 0.9597818677573279,
"grad_norm": 0.018011432141065598,
"learning_rate": 1e-05,
"loss": 1.7496,
"step": 528
},
{
"epoch": 0.9615996364462622,
"grad_norm": 0.018056875094771385,
"learning_rate": 1e-05,
"loss": 1.7413,
"step": 529
},
{
"epoch": 0.9634174051351966,
"grad_norm": 0.018342627212405205,
"learning_rate": 1e-05,
"loss": 1.7395,
"step": 530
},
{
"epoch": 0.9652351738241309,
"grad_norm": 0.022182267159223557,
"learning_rate": 1e-05,
"loss": 1.7342,
"step": 531
},
{
"epoch": 0.9670529425130652,
"grad_norm": 0.01826542802155018,
"learning_rate": 1e-05,
"loss": 1.7384,
"step": 532
},
{
"epoch": 0.9688707112019995,
"grad_norm": 0.01716247759759426,
"learning_rate": 1e-05,
"loss": 1.7425,
"step": 533
},
{
"epoch": 0.9706884798909339,
"grad_norm": 0.017304804176092148,
"learning_rate": 1e-05,
"loss": 1.7521,
"step": 534
},
{
"epoch": 0.9725062485798682,
"grad_norm": 0.01794220507144928,
"learning_rate": 1e-05,
"loss": 1.7455,
"step": 535
},
{
"epoch": 0.9743240172688026,
"grad_norm": 0.017633073031902313,
"learning_rate": 1e-05,
"loss": 1.7509,
"step": 536
},
{
"epoch": 0.9761417859577369,
"grad_norm": 0.016983771696686745,
"learning_rate": 1e-05,
"loss": 1.7392,
"step": 537
},
{
"epoch": 0.9779595546466712,
"grad_norm": 0.01743633858859539,
"learning_rate": 1e-05,
"loss": 1.7341,
"step": 538
},
{
"epoch": 0.9797773233356055,
"grad_norm": 0.017662547528743744,
"learning_rate": 1e-05,
"loss": 1.7367,
"step": 539
},
{
"epoch": 0.9815950920245399,
"grad_norm": 0.01701057143509388,
"learning_rate": 1e-05,
"loss": 1.7423,
"step": 540
},
{
"epoch": 0.9834128607134742,
"grad_norm": 0.017070814967155457,
"learning_rate": 1e-05,
"loss": 1.7429,
"step": 541
},
{
"epoch": 0.9852306294024086,
"grad_norm": 0.01704619824886322,
"learning_rate": 1e-05,
"loss": 1.7348,
"step": 542
},
{
"epoch": 0.9870483980913429,
"grad_norm": 0.017563099041581154,
"learning_rate": 1e-05,
"loss": 1.7382,
"step": 543
},
{
"epoch": 0.9888661667802772,
"grad_norm": 0.01661253347992897,
"learning_rate": 1e-05,
"loss": 1.7412,
"step": 544
},
{
"epoch": 0.9906839354692115,
"grad_norm": 0.016802560538053513,
"learning_rate": 1e-05,
"loss": 1.7287,
"step": 545
},
{
"epoch": 0.9925017041581459,
"grad_norm": 0.01623694598674774,
"learning_rate": 1e-05,
"loss": 1.7345,
"step": 546
},
{
"epoch": 0.9943194728470802,
"grad_norm": 0.01796470768749714,
"learning_rate": 1e-05,
"loss": 1.7282,
"step": 547
},
{
"epoch": 0.9961372415360146,
"grad_norm": 0.016037970781326294,
"learning_rate": 1e-05,
"loss": 1.7358,
"step": 548
},
{
"epoch": 0.9979550102249489,
"grad_norm": 0.016084497794508934,
"learning_rate": 1e-05,
"loss": 1.7371,
"step": 549
},
{
"epoch": 0.9997727789138832,
"grad_norm": 0.016458775848150253,
"learning_rate": 1e-05,
"loss": 1.7397,
"step": 550
},
{
"epoch": 0.9997727789138832,
"step": 550,
"total_flos": 2868807299235840.0,
"train_loss": 0.7944060720096935,
"train_runtime": 47914.0805,
"train_samples_per_second": 2.939,
"train_steps_per_second": 0.011
}
],
"logging_steps": 1,
"max_steps": 550,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2868807299235840.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}