{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997727789138832, "eval_steps": 500, "global_step": 550, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0018177686889343332, "grad_norm": 0.1331978142261505, "learning_rate": 3.5714285714285716e-07, "loss": 1.8898, "step": 1 }, { "epoch": 0.0036355373778686664, "grad_norm": 0.13566601276397705, "learning_rate": 7.142857142857143e-07, "loss": 1.8867, "step": 2 }, { "epoch": 0.0054533060668029995, "grad_norm": 0.13576287031173706, "learning_rate": 1.0714285714285714e-06, "loss": 1.8848, "step": 3 }, { "epoch": 0.007271074755737333, "grad_norm": 0.1231953352689743, "learning_rate": 1.4285714285714286e-06, "loss": 1.888, "step": 4 }, { "epoch": 0.009088843444671665, "grad_norm": 0.09326394647359848, "learning_rate": 1.7857142857142859e-06, "loss": 1.8816, "step": 5 }, { "epoch": 0.010906612133605999, "grad_norm": 0.08585168421268463, "learning_rate": 2.1428571428571427e-06, "loss": 1.8851, "step": 6 }, { "epoch": 0.012724380822540331, "grad_norm": 0.0567106269299984, "learning_rate": 2.5e-06, "loss": 1.8799, "step": 7 }, { "epoch": 0.014542149511474665, "grad_norm": 0.05393998324871063, "learning_rate": 2.8571428571428573e-06, "loss": 1.8785, "step": 8 }, { "epoch": 0.016359918200409, "grad_norm": 0.05495736747980118, "learning_rate": 3.2142857142857147e-06, "loss": 1.8763, "step": 9 }, { "epoch": 0.01817768688934333, "grad_norm": 0.05345786362886429, "learning_rate": 3.5714285714285718e-06, "loss": 1.8676, "step": 10 }, { "epoch": 0.019995455578277664, "grad_norm": 0.047461625188589096, "learning_rate": 3.928571428571429e-06, "loss": 1.8634, "step": 11 }, { "epoch": 0.021813224267211998, "grad_norm": 0.061344344168901443, "learning_rate": 4.2857142857142855e-06, "loss": 1.864, "step": 12 }, { "epoch": 0.02363099295614633, "grad_norm": 0.06245123967528343, "learning_rate": 4.642857142857144e-06, "loss": 1.8647, "step": 13 }, { "epoch": 0.025448761645080663, "grad_norm": 0.053826089948415756, "learning_rate": 5e-06, "loss": 1.8605, "step": 14 }, { "epoch": 0.027266530334014997, "grad_norm": 0.04343092441558838, "learning_rate": 5.357142857142857e-06, "loss": 1.8648, "step": 15 }, { "epoch": 0.02908429902294933, "grad_norm": 0.04949821159243584, "learning_rate": 5.7142857142857145e-06, "loss": 1.8536, "step": 16 }, { "epoch": 0.03090206771188366, "grad_norm": 0.06119069084525108, "learning_rate": 6.071428571428571e-06, "loss": 1.8485, "step": 17 }, { "epoch": 0.032719836400818, "grad_norm": 0.05067905783653259, "learning_rate": 6.4285714285714295e-06, "loss": 1.8536, "step": 18 }, { "epoch": 0.03453760508975233, "grad_norm": 0.03722887113690376, "learning_rate": 6.785714285714287e-06, "loss": 1.8491, "step": 19 }, { "epoch": 0.03635537377868666, "grad_norm": 0.04830312356352806, "learning_rate": 7.1428571428571436e-06, "loss": 1.8361, "step": 20 }, { "epoch": 0.038173142467621, "grad_norm": 0.052912868559360504, "learning_rate": 7.500000000000001e-06, "loss": 1.8522, "step": 21 }, { "epoch": 0.03999091115655533, "grad_norm": 0.04469645023345947, "learning_rate": 7.857142857142858e-06, "loss": 1.8391, "step": 22 }, { "epoch": 0.04180867984548966, "grad_norm": 0.04048198461532593, "learning_rate": 8.214285714285714e-06, "loss": 1.8345, "step": 23 }, { "epoch": 0.043626448534423996, "grad_norm": 0.03836997598409653, "learning_rate": 8.571428571428571e-06, "loss": 1.8342, "step": 24 }, { "epoch": 0.04544421722335833, "grad_norm": 0.038932956755161285, "learning_rate": 8.92857142857143e-06, "loss": 1.8399, "step": 25 }, { "epoch": 0.04726198591229266, "grad_norm": 0.041100382804870605, "learning_rate": 9.285714285714288e-06, "loss": 1.833, "step": 26 }, { "epoch": 0.049079754601226995, "grad_norm": 0.03821416571736336, "learning_rate": 9.642857142857144e-06, "loss": 1.8342, "step": 27 }, { "epoch": 0.050897523290161326, "grad_norm": 0.037851471453905106, "learning_rate": 1e-05, "loss": 1.8313, "step": 28 }, { "epoch": 0.05271529197909566, "grad_norm": 0.03763577714562416, "learning_rate": 9.999909448127131e-06, "loss": 1.8291, "step": 29 }, { "epoch": 0.054533060668029994, "grad_norm": 0.03475307673215866, "learning_rate": 9.999637795788383e-06, "loss": 1.8185, "step": 30 }, { "epoch": 0.056350829356964324, "grad_norm": 0.03289997950196266, "learning_rate": 9.999185052823207e-06, "loss": 1.8261, "step": 31 }, { "epoch": 0.05816859804589866, "grad_norm": 0.03243958577513695, "learning_rate": 9.99855123563029e-06, "loss": 1.8237, "step": 32 }, { "epoch": 0.05998636673483299, "grad_norm": 0.033227939158678055, "learning_rate": 9.997736367166967e-06, "loss": 1.827, "step": 33 }, { "epoch": 0.06180413542376732, "grad_norm": 0.03226836398243904, "learning_rate": 9.996740476948386e-06, "loss": 1.8257, "step": 34 }, { "epoch": 0.06362190411270166, "grad_norm": 0.029187630861997604, "learning_rate": 9.995563601046434e-06, "loss": 1.819, "step": 35 }, { "epoch": 0.065439672801636, "grad_norm": 0.026967501267790794, "learning_rate": 9.994205782088438e-06, "loss": 1.8136, "step": 36 }, { "epoch": 0.06725744149057032, "grad_norm": 0.031199270859360695, "learning_rate": 9.99266706925562e-06, "loss": 1.8206, "step": 37 }, { "epoch": 0.06907521017950466, "grad_norm": 0.030985625460743904, "learning_rate": 9.990947518281312e-06, "loss": 1.8281, "step": 38 }, { "epoch": 0.070892978868439, "grad_norm": 0.02339562401175499, "learning_rate": 9.989047191448934e-06, "loss": 1.82, "step": 39 }, { "epoch": 0.07271074755737332, "grad_norm": 0.0256453026086092, "learning_rate": 9.986966157589751e-06, "loss": 1.8079, "step": 40 }, { "epoch": 0.07452851624630766, "grad_norm": 0.025680653750896454, "learning_rate": 9.984704492080366e-06, "loss": 1.8088, "step": 41 }, { "epoch": 0.076346284935242, "grad_norm": 0.026331942528486252, "learning_rate": 9.982262276840002e-06, "loss": 1.8153, "step": 42 }, { "epoch": 0.07816405362417632, "grad_norm": 0.026452744379639626, "learning_rate": 9.979639600327522e-06, "loss": 1.8082, "step": 43 }, { "epoch": 0.07998182231311066, "grad_norm": 0.020438341423869133, "learning_rate": 9.976836557538234e-06, "loss": 1.8087, "step": 44 }, { "epoch": 0.081799591002045, "grad_norm": 0.022149616852402687, "learning_rate": 9.973853250000449e-06, "loss": 1.8132, "step": 45 }, { "epoch": 0.08361735969097932, "grad_norm": 0.020680025219917297, "learning_rate": 9.970689785771798e-06, "loss": 1.8077, "step": 46 }, { "epoch": 0.08543512837991366, "grad_norm": 0.018105728551745415, "learning_rate": 9.967346279435328e-06, "loss": 1.8063, "step": 47 }, { "epoch": 0.08725289706884799, "grad_norm": 0.020593147724866867, "learning_rate": 9.963822852095344e-06, "loss": 1.8036, "step": 48 }, { "epoch": 0.08907066575778232, "grad_norm": 0.0193562563508749, "learning_rate": 9.960119631373023e-06, "loss": 1.8135, "step": 49 }, { "epoch": 0.09088843444671665, "grad_norm": 0.017045950517058372, "learning_rate": 9.95623675140179e-06, "loss": 1.8115, "step": 50 }, { "epoch": 0.09270620313565099, "grad_norm": 0.01905151829123497, "learning_rate": 9.952174352822474e-06, "loss": 1.8087, "step": 51 }, { "epoch": 0.09452397182458531, "grad_norm": 0.019179217517375946, "learning_rate": 9.947932582778188e-06, "loss": 1.8093, "step": 52 }, { "epoch": 0.09634174051351965, "grad_norm": 0.016135873273015022, "learning_rate": 9.943511594909024e-06, "loss": 1.8008, "step": 53 }, { "epoch": 0.09815950920245399, "grad_norm": 0.016653183847665787, "learning_rate": 9.938911549346473e-06, "loss": 1.8075, "step": 54 }, { "epoch": 0.09997727789138833, "grad_norm": 0.01784764975309372, "learning_rate": 9.934132612707631e-06, "loss": 1.8065, "step": 55 }, { "epoch": 0.10179504658032265, "grad_norm": 0.01742948405444622, "learning_rate": 9.929174958089167e-06, "loss": 1.8066, "step": 56 }, { "epoch": 0.10361281526925699, "grad_norm": 0.015608050860464573, "learning_rate": 9.924038765061042e-06, "loss": 1.8089, "step": 57 }, { "epoch": 0.10543058395819133, "grad_norm": 0.017180059105157852, "learning_rate": 9.918724219660013e-06, "loss": 1.8063, "step": 58 }, { "epoch": 0.10724835264712565, "grad_norm": 0.01681089587509632, "learning_rate": 9.913231514382902e-06, "loss": 1.7952, "step": 59 }, { "epoch": 0.10906612133605999, "grad_norm": 0.016128279268741608, "learning_rate": 9.907560848179607e-06, "loss": 1.797, "step": 60 }, { "epoch": 0.11088389002499432, "grad_norm": 0.016297221183776855, "learning_rate": 9.901712426445901e-06, "loss": 1.7966, "step": 61 }, { "epoch": 0.11270165871392865, "grad_norm": 0.017089389264583588, "learning_rate": 9.895686461016007e-06, "loss": 1.8097, "step": 62 }, { "epoch": 0.11451942740286299, "grad_norm": 0.01613052934408188, "learning_rate": 9.889483170154903e-06, "loss": 1.7984, "step": 63 }, { "epoch": 0.11633719609179732, "grad_norm": 0.016225503757596016, "learning_rate": 9.883102778550434e-06, "loss": 1.8013, "step": 64 }, { "epoch": 0.11815496478073165, "grad_norm": 0.015952223911881447, "learning_rate": 9.876545517305163e-06, "loss": 1.7993, "step": 65 }, { "epoch": 0.11997273346966598, "grad_norm": 0.016045618802309036, "learning_rate": 9.869811623928001e-06, "loss": 1.7968, "step": 66 }, { "epoch": 0.12179050215860032, "grad_norm": 0.015822941437363625, "learning_rate": 9.862901342325617e-06, "loss": 1.7947, "step": 67 }, { "epoch": 0.12360827084753465, "grad_norm": 0.016080934554338455, "learning_rate": 9.855814922793583e-06, "loss": 1.8011, "step": 68 }, { "epoch": 0.125426039536469, "grad_norm": 0.01613529957830906, "learning_rate": 9.848552622007326e-06, "loss": 1.7956, "step": 69 }, { "epoch": 0.12724380822540332, "grad_norm": 0.01521450374275446, "learning_rate": 9.841114703012817e-06, "loss": 1.7961, "step": 70 }, { "epoch": 0.12906157691433764, "grad_norm": 0.01613503508269787, "learning_rate": 9.83350143521706e-06, "loss": 1.7981, "step": 71 }, { "epoch": 0.130879345603272, "grad_norm": 0.01576644368469715, "learning_rate": 9.82571309437831e-06, "loss": 1.8042, "step": 72 }, { "epoch": 0.13269711429220632, "grad_norm": 0.017247062176465988, "learning_rate": 9.817749962596115e-06, "loss": 1.793, "step": 73 }, { "epoch": 0.13451488298114064, "grad_norm": 0.014981955289840698, "learning_rate": 9.809612328301071e-06, "loss": 1.8074, "step": 74 }, { "epoch": 0.136332651670075, "grad_norm": 0.0150354178622365, "learning_rate": 9.801300486244385e-06, "loss": 1.7973, "step": 75 }, { "epoch": 0.13815042035900932, "grad_norm": 0.015270021744072437, "learning_rate": 9.792814737487207e-06, "loss": 1.7973, "step": 76 }, { "epoch": 0.13996818904794364, "grad_norm": 0.016216879710555077, "learning_rate": 9.784155389389713e-06, "loss": 1.7986, "step": 77 }, { "epoch": 0.141785957736878, "grad_norm": 0.015781838446855545, "learning_rate": 9.775322755599979e-06, "loss": 1.7937, "step": 78 }, { "epoch": 0.14360372642581232, "grad_norm": 0.015398108400404453, "learning_rate": 9.766317156042615e-06, "loss": 1.7976, "step": 79 }, { "epoch": 0.14542149511474664, "grad_norm": 0.01513028983026743, "learning_rate": 9.757138916907184e-06, "loss": 1.7915, "step": 80 }, { "epoch": 0.147239263803681, "grad_norm": 0.015322140417993069, "learning_rate": 9.747788370636389e-06, "loss": 1.8053, "step": 81 }, { "epoch": 0.14905703249261532, "grad_norm": 0.016009092330932617, "learning_rate": 9.738265855914014e-06, "loss": 1.7908, "step": 82 }, { "epoch": 0.15087480118154964, "grad_norm": 0.01483672671020031, "learning_rate": 9.728571717652677e-06, "loss": 1.7888, "step": 83 }, { "epoch": 0.152692569870484, "grad_norm": 0.014686529524624348, "learning_rate": 9.718706306981332e-06, "loss": 1.7911, "step": 84 }, { "epoch": 0.15451033855941831, "grad_norm": 0.01669451966881752, "learning_rate": 9.708669981232542e-06, "loss": 1.8017, "step": 85 }, { "epoch": 0.15632810724835264, "grad_norm": 0.014686268754303455, "learning_rate": 9.698463103929542e-06, "loss": 1.7979, "step": 86 }, { "epoch": 0.158145875937287, "grad_norm": 0.01508221123367548, "learning_rate": 9.688086044773079e-06, "loss": 1.7872, "step": 87 }, { "epoch": 0.1599636446262213, "grad_norm": 0.0154942087829113, "learning_rate": 9.677539179628005e-06, "loss": 1.794, "step": 88 }, { "epoch": 0.16178141331515564, "grad_norm": 0.016326844692230225, "learning_rate": 9.66682289050968e-06, "loss": 1.7981, "step": 89 }, { "epoch": 0.16359918200409, "grad_norm": 0.015189899131655693, "learning_rate": 9.655937565570124e-06, "loss": 1.7943, "step": 90 }, { "epoch": 0.1654169506930243, "grad_norm": 0.014669873751699924, "learning_rate": 9.644883599083959e-06, "loss": 1.7873, "step": 91 }, { "epoch": 0.16723471938195864, "grad_norm": 0.015705488622188568, "learning_rate": 9.63366139143413e-06, "loss": 1.7959, "step": 92 }, { "epoch": 0.169052488070893, "grad_norm": 0.015006368048489094, "learning_rate": 9.622271349097413e-06, "loss": 1.7883, "step": 93 }, { "epoch": 0.1708702567598273, "grad_norm": 0.015823103487491608, "learning_rate": 9.610713884629667e-06, "loss": 1.7864, "step": 94 }, { "epoch": 0.17268802544876163, "grad_norm": 0.03225838020443916, "learning_rate": 9.598989416650915e-06, "loss": 1.7871, "step": 95 }, { "epoch": 0.17450579413769599, "grad_norm": 0.015597975812852383, "learning_rate": 9.587098369830171e-06, "loss": 1.7804, "step": 96 }, { "epoch": 0.1763235628266303, "grad_norm": 0.01537901721894741, "learning_rate": 9.575041174870062e-06, "loss": 1.7858, "step": 97 }, { "epoch": 0.17814133151556463, "grad_norm": 0.023264285176992416, "learning_rate": 9.562818268491216e-06, "loss": 1.7823, "step": 98 }, { "epoch": 0.17995910020449898, "grad_norm": 0.01551035512238741, "learning_rate": 9.550430093416465e-06, "loss": 1.7882, "step": 99 }, { "epoch": 0.1817768688934333, "grad_norm": 0.015448925085365772, "learning_rate": 9.537877098354787e-06, "loss": 1.7836, "step": 100 }, { "epoch": 0.18359463758236763, "grad_norm": 0.01610329933464527, "learning_rate": 9.525159737985066e-06, "loss": 1.7843, "step": 101 }, { "epoch": 0.18541240627130198, "grad_norm": 0.015887994319200516, "learning_rate": 9.512278472939627e-06, "loss": 1.7835, "step": 102 }, { "epoch": 0.1872301749602363, "grad_norm": 0.015717443078756332, "learning_rate": 9.499233769787534e-06, "loss": 1.7899, "step": 103 }, { "epoch": 0.18904794364917063, "grad_norm": 0.01613277569413185, "learning_rate": 9.486026101017711e-06, "loss": 1.789, "step": 104 }, { "epoch": 0.19086571233810498, "grad_norm": 0.0161016546189785, "learning_rate": 9.472655945021815e-06, "loss": 1.7885, "step": 105 }, { "epoch": 0.1926834810270393, "grad_norm": 0.015553218312561512, "learning_rate": 9.459123786076911e-06, "loss": 1.7841, "step": 106 }, { "epoch": 0.19450124971597363, "grad_norm": 0.01636493392288685, "learning_rate": 9.445430114327936e-06, "loss": 1.7864, "step": 107 }, { "epoch": 0.19631901840490798, "grad_norm": 0.016063738614320755, "learning_rate": 9.431575425769938e-06, "loss": 1.7836, "step": 108 }, { "epoch": 0.1981367870938423, "grad_norm": 0.016147315502166748, "learning_rate": 9.417560222230115e-06, "loss": 1.7786, "step": 109 }, { "epoch": 0.19995455578277666, "grad_norm": 0.01560090109705925, "learning_rate": 9.40338501134964e-06, "loss": 1.7782, "step": 110 }, { "epoch": 0.20177232447171098, "grad_norm": 0.015402060933411121, "learning_rate": 9.389050306565269e-06, "loss": 1.7814, "step": 111 }, { "epoch": 0.2035900931606453, "grad_norm": 0.017125973477959633, "learning_rate": 9.374556627090749e-06, "loss": 1.7793, "step": 112 }, { "epoch": 0.20540786184957965, "grad_norm": 0.015735799446702003, "learning_rate": 9.359904497898009e-06, "loss": 1.7872, "step": 113 }, { "epoch": 0.20722563053851398, "grad_norm": 0.01627574861049652, "learning_rate": 9.345094449698143e-06, "loss": 1.7893, "step": 114 }, { "epoch": 0.2090433992274483, "grad_norm": 0.014931687153875828, "learning_rate": 9.330127018922195e-06, "loss": 1.7825, "step": 115 }, { "epoch": 0.21086116791638265, "grad_norm": 0.015015835873782635, "learning_rate": 9.315002747701716e-06, "loss": 1.77, "step": 116 }, { "epoch": 0.21267893660531698, "grad_norm": 0.01571677438914776, "learning_rate": 9.299722183849144e-06, "loss": 1.7843, "step": 117 }, { "epoch": 0.2144967052942513, "grad_norm": 0.014991500414907932, "learning_rate": 9.284285880837947e-06, "loss": 1.7824, "step": 118 }, { "epoch": 0.21631447398318565, "grad_norm": 0.016052858904004097, "learning_rate": 9.268694397782585e-06, "loss": 1.7805, "step": 119 }, { "epoch": 0.21813224267211997, "grad_norm": 0.015834221616387367, "learning_rate": 9.252948299418255e-06, "loss": 1.7855, "step": 120 }, { "epoch": 0.2199500113610543, "grad_norm": 0.01614440232515335, "learning_rate": 9.237048156080433e-06, "loss": 1.7885, "step": 121 }, { "epoch": 0.22176778004998865, "grad_norm": 0.01563919708132744, "learning_rate": 9.220994543684225e-06, "loss": 1.7799, "step": 122 }, { "epoch": 0.22358554873892297, "grad_norm": 0.015689659863710403, "learning_rate": 9.2047880437035e-06, "loss": 1.7808, "step": 123 }, { "epoch": 0.2254033174278573, "grad_norm": 0.015433340333402157, "learning_rate": 9.188429243149824e-06, "loss": 1.7769, "step": 124 }, { "epoch": 0.22722108611679165, "grad_norm": 0.01560978963971138, "learning_rate": 9.171918734551212e-06, "loss": 1.7791, "step": 125 }, { "epoch": 0.22903885480572597, "grad_norm": 0.016046756878495216, "learning_rate": 9.155257115930651e-06, "loss": 1.7778, "step": 126 }, { "epoch": 0.2308566234946603, "grad_norm": 0.01664203219115734, "learning_rate": 9.138444990784455e-06, "loss": 1.7811, "step": 127 }, { "epoch": 0.23267439218359465, "grad_norm": 0.015654807910323143, "learning_rate": 9.121482968060384e-06, "loss": 1.7841, "step": 128 }, { "epoch": 0.23449216087252897, "grad_norm": 0.016352280974388123, "learning_rate": 9.104371662135612e-06, "loss": 1.7839, "step": 129 }, { "epoch": 0.2363099295614633, "grad_norm": 0.016163717955350876, "learning_rate": 9.08711169279446e-06, "loss": 1.7847, "step": 130 }, { "epoch": 0.23812769825039765, "grad_norm": 0.016361849382519722, "learning_rate": 9.069703685205945e-06, "loss": 1.7804, "step": 131 }, { "epoch": 0.23994546693933197, "grad_norm": 0.01635843515396118, "learning_rate": 9.052148269901145e-06, "loss": 1.7811, "step": 132 }, { "epoch": 0.2417632356282663, "grad_norm": 0.016859732568264008, "learning_rate": 9.034446082750352e-06, "loss": 1.7863, "step": 133 }, { "epoch": 0.24358100431720064, "grad_norm": 0.016207806766033173, "learning_rate": 9.01659776494005e-06, "loss": 1.7739, "step": 134 }, { "epoch": 0.24539877300613497, "grad_norm": 0.016936447471380234, "learning_rate": 8.998603962949674e-06, "loss": 1.7818, "step": 135 }, { "epoch": 0.2472165416950693, "grad_norm": 0.015802595764398575, "learning_rate": 8.98046532852822e-06, "loss": 1.7836, "step": 136 }, { "epoch": 0.24903431038400364, "grad_norm": 0.016628528013825417, "learning_rate": 8.96218251867061e-06, "loss": 1.7822, "step": 137 }, { "epoch": 0.250852079072938, "grad_norm": 0.01642756536602974, "learning_rate": 8.943756195593916e-06, "loss": 1.7756, "step": 138 }, { "epoch": 0.2526698477618723, "grad_norm": 0.016094859689474106, "learning_rate": 8.925187026713363e-06, "loss": 1.766, "step": 139 }, { "epoch": 0.25448761645080664, "grad_norm": 0.015560369938611984, "learning_rate": 8.90647568461816e-06, "loss": 1.783, "step": 140 }, { "epoch": 0.256305385139741, "grad_norm": 0.01574082300066948, "learning_rate": 8.887622847047131e-06, "loss": 1.7882, "step": 141 }, { "epoch": 0.2581231538286753, "grad_norm": 0.01694745570421219, "learning_rate": 8.868629196864182e-06, "loss": 1.7797, "step": 142 }, { "epoch": 0.25994092251760964, "grad_norm": 0.01562688499689102, "learning_rate": 8.84949542203355e-06, "loss": 1.7832, "step": 143 }, { "epoch": 0.261758691206544, "grad_norm": 0.015506752766668797, "learning_rate": 8.83022221559489e-06, "loss": 1.7749, "step": 144 }, { "epoch": 0.2635764598954783, "grad_norm": 0.017343781888484955, "learning_rate": 8.810810275638183e-06, "loss": 1.7736, "step": 145 }, { "epoch": 0.26539422858441264, "grad_norm": 0.01597374677658081, "learning_rate": 8.791260305278434e-06, "loss": 1.7879, "step": 146 }, { "epoch": 0.267211997273347, "grad_norm": 0.015632351860404015, "learning_rate": 8.771573012630214e-06, "loss": 1.7804, "step": 147 }, { "epoch": 0.2690297659622813, "grad_norm": 0.01659367047250271, "learning_rate": 8.751749110782013e-06, "loss": 1.7827, "step": 148 }, { "epoch": 0.27084753465121564, "grad_norm": 0.01651635952293873, "learning_rate": 8.731789317770407e-06, "loss": 1.7781, "step": 149 }, { "epoch": 0.27266530334015, "grad_norm": 0.01517146173864603, "learning_rate": 8.71169435655405e-06, "loss": 1.7811, "step": 150 }, { "epoch": 0.2744830720290843, "grad_norm": 0.015295923687517643, "learning_rate": 8.691464954987494e-06, "loss": 1.7677, "step": 151 }, { "epoch": 0.27630084071801864, "grad_norm": 0.015585844404995441, "learning_rate": 8.671101845794816e-06, "loss": 1.7745, "step": 152 }, { "epoch": 0.278118609406953, "grad_norm": 0.015692081302404404, "learning_rate": 8.65060576654309e-06, "loss": 1.7745, "step": 153 }, { "epoch": 0.2799363780958873, "grad_norm": 0.015136554837226868, "learning_rate": 8.629977459615655e-06, "loss": 1.7863, "step": 154 }, { "epoch": 0.28175414678482164, "grad_norm": 0.015603788197040558, "learning_rate": 8.609217672185246e-06, "loss": 1.7796, "step": 155 }, { "epoch": 0.283571915473756, "grad_norm": 0.016288187354803085, "learning_rate": 8.588327156186915e-06, "loss": 1.7785, "step": 156 }, { "epoch": 0.2853896841626903, "grad_norm": 0.016181934624910355, "learning_rate": 8.567306668290801e-06, "loss": 1.7597, "step": 157 }, { "epoch": 0.28720745285162463, "grad_norm": 0.0157309602946043, "learning_rate": 8.546156969874723e-06, "loss": 1.7827, "step": 158 }, { "epoch": 0.289025221540559, "grad_norm": 0.016916731372475624, "learning_rate": 8.524878826996602e-06, "loss": 1.7749, "step": 159 }, { "epoch": 0.2908429902294933, "grad_norm": 0.015968995168805122, "learning_rate": 8.503473010366713e-06, "loss": 1.7683, "step": 160 }, { "epoch": 0.29266075891842763, "grad_norm": 0.01594395563006401, "learning_rate": 8.481940295319772e-06, "loss": 1.7792, "step": 161 }, { "epoch": 0.294478527607362, "grad_norm": 0.016326317563652992, "learning_rate": 8.460281461786848e-06, "loss": 1.7734, "step": 162 }, { "epoch": 0.2962962962962963, "grad_norm": 0.016297809779644012, "learning_rate": 8.438497294267117e-06, "loss": 1.769, "step": 163 }, { "epoch": 0.29811406498523063, "grad_norm": 0.017145946621894836, "learning_rate": 8.416588581799447e-06, "loss": 1.7767, "step": 164 }, { "epoch": 0.299931833674165, "grad_norm": 0.016356928274035454, "learning_rate": 8.394556117933816e-06, "loss": 1.772, "step": 165 }, { "epoch": 0.3017496023630993, "grad_norm": 0.016378790140151978, "learning_rate": 8.372400700702569e-06, "loss": 1.7701, "step": 166 }, { "epoch": 0.30356737105203363, "grad_norm": 0.018152521923184395, "learning_rate": 8.350123132591522e-06, "loss": 1.7769, "step": 167 }, { "epoch": 0.305385139740968, "grad_norm": 0.017259759828448296, "learning_rate": 8.327724220510873e-06, "loss": 1.7742, "step": 168 }, { "epoch": 0.3072029084299023, "grad_norm": 0.016766058281064034, "learning_rate": 8.305204775766003e-06, "loss": 1.771, "step": 169 }, { "epoch": 0.30902067711883663, "grad_norm": 0.017410485073924065, "learning_rate": 8.282565614028068e-06, "loss": 1.7663, "step": 170 }, { "epoch": 0.310838445807771, "grad_norm": 0.017518077045679092, "learning_rate": 8.259807555304469e-06, "loss": 1.769, "step": 171 }, { "epoch": 0.3126562144967053, "grad_norm": 0.017017841339111328, "learning_rate": 8.23693142390914e-06, "loss": 1.7733, "step": 172 }, { "epoch": 0.3144739831856396, "grad_norm": 0.017034539952874184, "learning_rate": 8.213938048432697e-06, "loss": 1.7715, "step": 173 }, { "epoch": 0.316291751874574, "grad_norm": 0.016053663566708565, "learning_rate": 8.19082826171243e-06, "loss": 1.768, "step": 174 }, { "epoch": 0.3181095205635083, "grad_norm": 0.017002522945404053, "learning_rate": 8.167602900802121e-06, "loss": 1.7571, "step": 175 }, { "epoch": 0.3199272892524426, "grad_norm": 0.016666986048221588, "learning_rate": 8.144262806941743e-06, "loss": 1.776, "step": 176 }, { "epoch": 0.321745057941377, "grad_norm": 0.017756953835487366, "learning_rate": 8.120808825526983e-06, "loss": 1.7701, "step": 177 }, { "epoch": 0.3235628266303113, "grad_norm": 0.01685352995991707, "learning_rate": 8.097241806078616e-06, "loss": 1.7697, "step": 178 }, { "epoch": 0.3253805953192456, "grad_norm": 0.01626460626721382, "learning_rate": 8.073562602211743e-06, "loss": 1.7733, "step": 179 }, { "epoch": 0.32719836400818, "grad_norm": 0.017634931951761246, "learning_rate": 8.049772071604864e-06, "loss": 1.7817, "step": 180 }, { "epoch": 0.32901613269711427, "grad_norm": 0.0157694723457098, "learning_rate": 8.025871075968828e-06, "loss": 1.7667, "step": 181 }, { "epoch": 0.3308339013860486, "grad_norm": 0.016742341220378876, "learning_rate": 8.001860481015594e-06, "loss": 1.7753, "step": 182 }, { "epoch": 0.332651670074983, "grad_norm": 0.015466434881091118, "learning_rate": 7.977741156426901e-06, "loss": 1.7706, "step": 183 }, { "epoch": 0.33446943876391727, "grad_norm": 0.017226146534085274, "learning_rate": 7.953513975822755e-06, "loss": 1.7665, "step": 184 }, { "epoch": 0.3362872074528516, "grad_norm": 0.01610388606786728, "learning_rate": 7.92917981672979e-06, "loss": 1.7723, "step": 185 }, { "epoch": 0.338104976141786, "grad_norm": 0.016837526112794876, "learning_rate": 7.904739560549475e-06, "loss": 1.7754, "step": 186 }, { "epoch": 0.33992274483072027, "grad_norm": 0.016696933656930923, "learning_rate": 7.8801940925262e-06, "loss": 1.7707, "step": 187 }, { "epoch": 0.3417405135196546, "grad_norm": 0.016263185068964958, "learning_rate": 7.855544301715203e-06, "loss": 1.7702, "step": 188 }, { "epoch": 0.34355828220858897, "grad_norm": 0.01645650342106819, "learning_rate": 7.830791080950373e-06, "loss": 1.768, "step": 189 }, { "epoch": 0.34537605089752327, "grad_norm": 0.01569991558790207, "learning_rate": 7.805935326811913e-06, "loss": 1.767, "step": 190 }, { "epoch": 0.3471938195864576, "grad_norm": 0.015973446890711784, "learning_rate": 7.780977939593856e-06, "loss": 1.7713, "step": 191 }, { "epoch": 0.34901158827539197, "grad_norm": 0.01654656231403351, "learning_rate": 7.755919823271466e-06, "loss": 1.7577, "step": 192 }, { "epoch": 0.35082935696432627, "grad_norm": 0.015675723552703857, "learning_rate": 7.730761885468486e-06, "loss": 1.7732, "step": 193 }, { "epoch": 0.3526471256532606, "grad_norm": 0.018406856805086136, "learning_rate": 7.70550503742427e-06, "loss": 1.7668, "step": 194 }, { "epoch": 0.35446489434219497, "grad_norm": 0.016395216807723045, "learning_rate": 7.68015019396078e-06, "loss": 1.7672, "step": 195 }, { "epoch": 0.35628266303112927, "grad_norm": 0.016013452783226967, "learning_rate": 7.654698273449435e-06, "loss": 1.7646, "step": 196 }, { "epoch": 0.3581004317200636, "grad_norm": 0.01679440774023533, "learning_rate": 7.629150197777866e-06, "loss": 1.7612, "step": 197 }, { "epoch": 0.35991820040899797, "grad_norm": 0.01686931401491165, "learning_rate": 7.603506892316513e-06, "loss": 1.7597, "step": 198 }, { "epoch": 0.36173596909793226, "grad_norm": 0.017471209168434143, "learning_rate": 7.57776928588511e-06, "loss": 1.7756, "step": 199 }, { "epoch": 0.3635537377868666, "grad_norm": 0.017604535445570946, "learning_rate": 7.551938310719043e-06, "loss": 1.7706, "step": 200 }, { "epoch": 0.36537150647580097, "grad_norm": 0.016083979979157448, "learning_rate": 7.526014902435583e-06, "loss": 1.7689, "step": 201 }, { "epoch": 0.36718927516473526, "grad_norm": 0.017569448798894882, "learning_rate": 7.500000000000001e-06, "loss": 1.7716, "step": 202 }, { "epoch": 0.3690070438536696, "grad_norm": 0.018971305340528488, "learning_rate": 7.4738945456915505e-06, "loss": 1.7639, "step": 203 }, { "epoch": 0.37082481254260397, "grad_norm": 0.017489226534962654, "learning_rate": 7.447699485069342e-06, "loss": 1.7695, "step": 204 }, { "epoch": 0.37264258123153826, "grad_norm": 0.016599513590335846, "learning_rate": 7.421415766938098e-06, "loss": 1.758, "step": 205 }, { "epoch": 0.3744603499204726, "grad_norm": 0.017470112070441246, "learning_rate": 7.395044343313777e-06, "loss": 1.7635, "step": 206 }, { "epoch": 0.37627811860940696, "grad_norm": 0.01866212487220764, "learning_rate": 7.3685861693891026e-06, "loss": 1.7698, "step": 207 }, { "epoch": 0.37809588729834126, "grad_norm": 0.016111081466078758, "learning_rate": 7.342042203498952e-06, "loss": 1.763, "step": 208 }, { "epoch": 0.3799136559872756, "grad_norm": 0.01669992506504059, "learning_rate": 7.315413407085656e-06, "loss": 1.7614, "step": 209 }, { "epoch": 0.38173142467620996, "grad_norm": 0.01589970290660858, "learning_rate": 7.288700744664167e-06, "loss": 1.773, "step": 210 }, { "epoch": 0.38354919336514426, "grad_norm": 0.01591925323009491, "learning_rate": 7.261905183787136e-06, "loss": 1.7754, "step": 211 }, { "epoch": 0.3853669620540786, "grad_norm": 0.01747284270823002, "learning_rate": 7.235027695009846e-06, "loss": 1.7721, "step": 212 }, { "epoch": 0.38718473074301296, "grad_norm": 0.016405848786234856, "learning_rate": 7.208069251855078e-06, "loss": 1.7622, "step": 213 }, { "epoch": 0.38900249943194726, "grad_norm": 0.01654895953834057, "learning_rate": 7.181030830777838e-06, "loss": 1.7636, "step": 214 }, { "epoch": 0.3908202681208816, "grad_norm": 0.015662197023630142, "learning_rate": 7.153913411129993e-06, "loss": 1.7751, "step": 215 }, { "epoch": 0.39263803680981596, "grad_norm": 0.015878858044743538, "learning_rate": 7.1267179751248005e-06, "loss": 1.7708, "step": 216 }, { "epoch": 0.3944558054987503, "grad_norm": 0.016220899298787117, "learning_rate": 7.099445507801324e-06, "loss": 1.7679, "step": 217 }, { "epoch": 0.3962735741876846, "grad_norm": 0.015889156609773636, "learning_rate": 7.0720969969887595e-06, "loss": 1.7657, "step": 218 }, { "epoch": 0.39809134287661896, "grad_norm": 0.01594599336385727, "learning_rate": 7.044673433270659e-06, "loss": 1.7641, "step": 219 }, { "epoch": 0.3999091115655533, "grad_norm": 0.015293586999177933, "learning_rate": 7.017175809949044e-06, "loss": 1.7677, "step": 220 }, { "epoch": 0.4017268802544876, "grad_norm": 0.015891166403889656, "learning_rate": 6.98960512300843e-06, "loss": 1.7629, "step": 221 }, { "epoch": 0.40354464894342196, "grad_norm": 0.016649074852466583, "learning_rate": 6.961962371079752e-06, "loss": 1.7655, "step": 222 }, { "epoch": 0.4053624176323563, "grad_norm": 0.016516495496034622, "learning_rate": 6.934248555404197e-06, "loss": 1.7741, "step": 223 }, { "epoch": 0.4071801863212906, "grad_norm": 0.01925363577902317, "learning_rate": 6.906464679796927e-06, "loss": 1.7572, "step": 224 }, { "epoch": 0.40899795501022496, "grad_norm": 0.01634056493639946, "learning_rate": 6.878611750610731e-06, "loss": 1.759, "step": 225 }, { "epoch": 0.4108157236991593, "grad_norm": 0.016612950712442398, "learning_rate": 6.850690776699574e-06, "loss": 1.7562, "step": 226 }, { "epoch": 0.4126334923880936, "grad_norm": 0.01613459922373295, "learning_rate": 6.822702769382042e-06, "loss": 1.7697, "step": 227 }, { "epoch": 0.41445126107702795, "grad_norm": 0.016045957803726196, "learning_rate": 6.79464874240473e-06, "loss": 1.7623, "step": 228 }, { "epoch": 0.4162690297659623, "grad_norm": 0.016840225085616112, "learning_rate": 6.766529711905513e-06, "loss": 1.7742, "step": 229 }, { "epoch": 0.4180867984548966, "grad_norm": 0.015475032851099968, "learning_rate": 6.7383466963767386e-06, "loss": 1.7644, "step": 230 }, { "epoch": 0.41990456714383095, "grad_norm": 0.016331806778907776, "learning_rate": 6.710100716628345e-06, "loss": 1.7722, "step": 231 }, { "epoch": 0.4217223358327653, "grad_norm": 0.016033973544836044, "learning_rate": 6.681792795750876e-06, "loss": 1.7572, "step": 232 }, { "epoch": 0.4235401045216996, "grad_norm": 0.015963230282068253, "learning_rate": 6.653423959078435e-06, "loss": 1.7714, "step": 233 }, { "epoch": 0.42535787321063395, "grad_norm": 0.016069794073700905, "learning_rate": 6.624995234151539e-06, "loss": 1.7702, "step": 234 }, { "epoch": 0.4271756418995683, "grad_norm": 0.016175484284758568, "learning_rate": 6.5965076506799e-06, "loss": 1.7595, "step": 235 }, { "epoch": 0.4289934105885026, "grad_norm": 0.017575398087501526, "learning_rate": 6.567962240505136e-06, "loss": 1.7589, "step": 236 }, { "epoch": 0.43081117927743695, "grad_norm": 0.01609048619866371, "learning_rate": 6.539360037563384e-06, "loss": 1.7583, "step": 237 }, { "epoch": 0.4326289479663713, "grad_norm": 0.016053223982453346, "learning_rate": 6.510702077847864e-06, "loss": 1.7593, "step": 238 }, { "epoch": 0.4344467166553056, "grad_norm": 0.01691989041864872, "learning_rate": 6.481989399371347e-06, "loss": 1.7643, "step": 239 }, { "epoch": 0.43626448534423995, "grad_norm": 0.017391884699463844, "learning_rate": 6.453223042128556e-06, "loss": 1.7588, "step": 240 }, { "epoch": 0.4380822540331743, "grad_norm": 0.016525816172361374, "learning_rate": 6.424404048058501e-06, "loss": 1.7637, "step": 241 }, { "epoch": 0.4399000227221086, "grad_norm": 0.01585998386144638, "learning_rate": 6.395533461006736e-06, "loss": 1.7652, "step": 242 }, { "epoch": 0.44171779141104295, "grad_norm": 0.01582312397658825, "learning_rate": 6.366612326687555e-06, "loss": 1.7584, "step": 243 }, { "epoch": 0.4435355600999773, "grad_norm": 0.01715337485074997, "learning_rate": 6.337641692646106e-06, "loss": 1.7606, "step": 244 }, { "epoch": 0.4453533287889116, "grad_norm": 0.021504878997802734, "learning_rate": 6.308622608220457e-06, "loss": 1.762, "step": 245 }, { "epoch": 0.44717109747784595, "grad_norm": 0.015527226962149143, "learning_rate": 6.2795561245035895e-06, "loss": 1.757, "step": 246 }, { "epoch": 0.4489888661667803, "grad_norm": 0.017598124220967293, "learning_rate": 6.250443294305315e-06, "loss": 1.7547, "step": 247 }, { "epoch": 0.4508066348557146, "grad_norm": 0.016357263550162315, "learning_rate": 6.221285172114156e-06, "loss": 1.7585, "step": 248 }, { "epoch": 0.45262440354464895, "grad_norm": 0.01646249182522297, "learning_rate": 6.192082814059141e-06, "loss": 1.76, "step": 249 }, { "epoch": 0.4544421722335833, "grad_norm": 0.016435401514172554, "learning_rate": 6.162837277871553e-06, "loss": 1.7664, "step": 250 }, { "epoch": 0.4562599409225176, "grad_norm": 0.016678526997566223, "learning_rate": 6.133549622846625e-06, "loss": 1.7713, "step": 251 }, { "epoch": 0.45807770961145194, "grad_norm": 0.017534134909510612, "learning_rate": 6.104220909805162e-06, "loss": 1.7589, "step": 252 }, { "epoch": 0.4598954783003863, "grad_norm": 0.016283275559544563, "learning_rate": 6.074852201055121e-06, "loss": 1.7598, "step": 253 }, { "epoch": 0.4617132469893206, "grad_norm": 0.017745792865753174, "learning_rate": 6.045444560353136e-06, "loss": 1.7643, "step": 254 }, { "epoch": 0.46353101567825494, "grad_norm": 0.017753778025507927, "learning_rate": 6.015999052865982e-06, "loss": 1.7545, "step": 255 }, { "epoch": 0.4653487843671893, "grad_norm": 0.017292464151978493, "learning_rate": 5.986516745132e-06, "loss": 1.7582, "step": 256 }, { "epoch": 0.4671665530561236, "grad_norm": 0.01648300141096115, "learning_rate": 5.956998705022464e-06, "loss": 1.7603, "step": 257 }, { "epoch": 0.46898432174505794, "grad_norm": 0.017090782523155212, "learning_rate": 5.927446001702899e-06, "loss": 1.7654, "step": 258 }, { "epoch": 0.4708020904339923, "grad_norm": 0.015470580197870731, "learning_rate": 5.8978597055943585e-06, "loss": 1.7529, "step": 259 }, { "epoch": 0.4726198591229266, "grad_norm": 0.016197843477129936, "learning_rate": 5.8682408883346535e-06, "loss": 1.7551, "step": 260 }, { "epoch": 0.47443762781186094, "grad_norm": 0.018076961860060692, "learning_rate": 5.8385906227395304e-06, "loss": 1.7629, "step": 261 }, { "epoch": 0.4762553965007953, "grad_norm": 0.015964508056640625, "learning_rate": 5.808909982763825e-06, "loss": 1.7668, "step": 262 }, { "epoch": 0.4780731651897296, "grad_norm": 0.016753260046243668, "learning_rate": 5.779200043462549e-06, "loss": 1.753, "step": 263 }, { "epoch": 0.47989093387866394, "grad_norm": 0.01664654165506363, "learning_rate": 5.749461880951966e-06, "loss": 1.7654, "step": 264 }, { "epoch": 0.4817087025675983, "grad_norm": 0.01592446304857731, "learning_rate": 5.719696572370596e-06, "loss": 1.763, "step": 265 }, { "epoch": 0.4835264712565326, "grad_norm": 0.016646496951580048, "learning_rate": 5.689905195840216e-06, "loss": 1.766, "step": 266 }, { "epoch": 0.48534423994546694, "grad_norm": 0.016208553686738014, "learning_rate": 5.660088830426804e-06, "loss": 1.7551, "step": 267 }, { "epoch": 0.4871620086344013, "grad_norm": 0.01585574448108673, "learning_rate": 5.630248556101448e-06, "loss": 1.7638, "step": 268 }, { "epoch": 0.4889797773233356, "grad_norm": 0.016133490949869156, "learning_rate": 5.600385453701241e-06, "loss": 1.7644, "step": 269 }, { "epoch": 0.49079754601226994, "grad_norm": 0.015675894916057587, "learning_rate": 5.570500604890124e-06, "loss": 1.7675, "step": 270 }, { "epoch": 0.4926153147012043, "grad_norm": 0.01614633947610855, "learning_rate": 5.540595092119709e-06, "loss": 1.7636, "step": 271 }, { "epoch": 0.4944330833901386, "grad_norm": 0.01666291244328022, "learning_rate": 5.510669998590074e-06, "loss": 1.7583, "step": 272 }, { "epoch": 0.49625085207907293, "grad_norm": 0.016553543508052826, "learning_rate": 5.480726408210519e-06, "loss": 1.7586, "step": 273 }, { "epoch": 0.4980686207680073, "grad_norm": 0.017047051340341568, "learning_rate": 5.450765405560328e-06, "loss": 1.7534, "step": 274 }, { "epoch": 0.4998863894569416, "grad_norm": 0.01579987071454525, "learning_rate": 5.4207880758494545e-06, "loss": 1.7669, "step": 275 }, { "epoch": 0.501704158145876, "grad_norm": 0.016013607382774353, "learning_rate": 5.390795504879243e-06, "loss": 1.7546, "step": 276 }, { "epoch": 0.5035219268348102, "grad_norm": 0.015493376180529594, "learning_rate": 5.360788779003082e-06, "loss": 1.7555, "step": 277 }, { "epoch": 0.5053396955237446, "grad_norm": 0.016125505790114403, "learning_rate": 5.330768985087059e-06, "loss": 1.7485, "step": 278 }, { "epoch": 0.5071574642126789, "grad_norm": 0.015707215294241905, "learning_rate": 5.300737210470603e-06, "loss": 1.7556, "step": 279 }, { "epoch": 0.5089752329016133, "grad_norm": 0.016529636457562447, "learning_rate": 5.270694542927089e-06, "loss": 1.7621, "step": 280 }, { "epoch": 0.5107930015905476, "grad_norm": 0.015912501141428947, "learning_rate": 5.2406420706244376e-06, "loss": 1.7578, "step": 281 }, { "epoch": 0.512610770279482, "grad_norm": 0.017320740967988968, "learning_rate": 5.2105808820857126e-06, "loss": 1.7509, "step": 282 }, { "epoch": 0.5144285389684162, "grad_norm": 0.016190189868211746, "learning_rate": 5.180512066149682e-06, "loss": 1.7586, "step": 283 }, { "epoch": 0.5162463076573506, "grad_norm": 0.01586255431175232, "learning_rate": 5.150436711931387e-06, "loss": 1.7618, "step": 284 }, { "epoch": 0.5180640763462849, "grad_norm": 0.016613394021987915, "learning_rate": 5.120355908782686e-06, "loss": 1.7582, "step": 285 }, { "epoch": 0.5198818450352193, "grad_norm": 0.016856033354997635, "learning_rate": 5.090270746252803e-06, "loss": 1.766, "step": 286 }, { "epoch": 0.5216996137241536, "grad_norm": 0.015804223716259003, "learning_rate": 5.060182314048865e-06, "loss": 1.7548, "step": 287 }, { "epoch": 0.523517382413088, "grad_norm": 0.01533227227628231, "learning_rate": 5.030091701996428e-06, "loss": 1.7508, "step": 288 }, { "epoch": 0.5253351511020222, "grad_norm": 0.017301153391599655, "learning_rate": 5e-06, "loss": 1.7508, "step": 289 }, { "epoch": 0.5271529197909566, "grad_norm": 0.016463877633213997, "learning_rate": 4.9699082980035735e-06, "loss": 1.7612, "step": 290 }, { "epoch": 0.5289706884798909, "grad_norm": 0.017038939520716667, "learning_rate": 4.939817685951135e-06, "loss": 1.7557, "step": 291 }, { "epoch": 0.5307884571688253, "grad_norm": 0.01651296392083168, "learning_rate": 4.909729253747197e-06, "loss": 1.7555, "step": 292 }, { "epoch": 0.5326062258577596, "grad_norm": 0.01751718856394291, "learning_rate": 4.879644091217317e-06, "loss": 1.7524, "step": 293 }, { "epoch": 0.534423994546694, "grad_norm": 0.016333656385540962, "learning_rate": 4.8495632880686155e-06, "loss": 1.7452, "step": 294 }, { "epoch": 0.5362417632356282, "grad_norm": 0.016173357143998146, "learning_rate": 4.819487933850319e-06, "loss": 1.7611, "step": 295 }, { "epoch": 0.5380595319245626, "grad_norm": 0.016298582777380943, "learning_rate": 4.789419117914288e-06, "loss": 1.752, "step": 296 }, { "epoch": 0.5398773006134969, "grad_norm": 0.017157401889562607, "learning_rate": 4.759357929375563e-06, "loss": 1.7518, "step": 297 }, { "epoch": 0.5416950693024313, "grad_norm": 0.01661343313753605, "learning_rate": 4.729305457072913e-06, "loss": 1.7637, "step": 298 }, { "epoch": 0.5435128379913656, "grad_norm": 0.016558021306991577, "learning_rate": 4.699262789529396e-06, "loss": 1.7511, "step": 299 }, { "epoch": 0.5453306066803, "grad_norm": 0.016143113374710083, "learning_rate": 4.6692310149129425e-06, "loss": 1.7562, "step": 300 }, { "epoch": 0.5471483753692342, "grad_norm": 0.01550297997891903, "learning_rate": 1e-05, "loss": 1.7592, "step": 301 }, { "epoch": 0.5489661440581686, "grad_norm": 0.016153663396835327, "learning_rate": 1e-05, "loss": 1.7495, "step": 302 }, { "epoch": 0.5507839127471029, "grad_norm": 0.017202477902173996, "learning_rate": 1e-05, "loss": 1.7564, "step": 303 }, { "epoch": 0.5526016814360373, "grad_norm": 0.01577403023838997, "learning_rate": 1e-05, "loss": 1.7635, "step": 304 }, { "epoch": 0.5544194501249716, "grad_norm": 0.016280407086014748, "learning_rate": 1e-05, "loss": 1.748, "step": 305 }, { "epoch": 0.556237218813906, "grad_norm": 0.016771433874964714, "learning_rate": 1e-05, "loss": 1.7467, "step": 306 }, { "epoch": 0.5580549875028402, "grad_norm": 0.01556472573429346, "learning_rate": 1e-05, "loss": 1.751, "step": 307 }, { "epoch": 0.5598727561917746, "grad_norm": 0.01656194217503071, "learning_rate": 1e-05, "loss": 1.7605, "step": 308 }, { "epoch": 0.5616905248807089, "grad_norm": 0.017003118991851807, "learning_rate": 1e-05, "loss": 1.7516, "step": 309 }, { "epoch": 0.5635082935696433, "grad_norm": 0.016028909012675285, "learning_rate": 1e-05, "loss": 1.7557, "step": 310 }, { "epoch": 0.5653260622585776, "grad_norm": 0.016611898317933083, "learning_rate": 1e-05, "loss": 1.7548, "step": 311 }, { "epoch": 0.567143830947512, "grad_norm": 0.01619804836809635, "learning_rate": 1e-05, "loss": 1.7569, "step": 312 }, { "epoch": 0.5689615996364462, "grad_norm": 0.01763117127120495, "learning_rate": 1e-05, "loss": 1.7499, "step": 313 }, { "epoch": 0.5707793683253806, "grad_norm": 0.017052598297595978, "learning_rate": 1e-05, "loss": 1.7628, "step": 314 }, { "epoch": 0.5725971370143149, "grad_norm": 0.015606777742505074, "learning_rate": 1e-05, "loss": 1.7695, "step": 315 }, { "epoch": 0.5744149057032493, "grad_norm": 0.017086924985051155, "learning_rate": 1e-05, "loss": 1.7573, "step": 316 }, { "epoch": 0.5762326743921836, "grad_norm": 0.01597212627530098, "learning_rate": 1e-05, "loss": 1.7672, "step": 317 }, { "epoch": 0.578050443081118, "grad_norm": 0.016126353293657303, "learning_rate": 1e-05, "loss": 1.7481, "step": 318 }, { "epoch": 0.5798682117700522, "grad_norm": 0.016764555126428604, "learning_rate": 1e-05, "loss": 1.7543, "step": 319 }, { "epoch": 0.5816859804589866, "grad_norm": 0.016383804380893707, "learning_rate": 1e-05, "loss": 1.7595, "step": 320 }, { "epoch": 0.5835037491479209, "grad_norm": 0.016328634694218636, "learning_rate": 1e-05, "loss": 1.7624, "step": 321 }, { "epoch": 0.5853215178368553, "grad_norm": 0.017615774646401405, "learning_rate": 1e-05, "loss": 1.7633, "step": 322 }, { "epoch": 0.5871392865257896, "grad_norm": 0.016653137281537056, "learning_rate": 1e-05, "loss": 1.753, "step": 323 }, { "epoch": 0.588957055214724, "grad_norm": 0.016418032348155975, "learning_rate": 1e-05, "loss": 1.7553, "step": 324 }, { "epoch": 0.5907748239036582, "grad_norm": 0.01667468063533306, "learning_rate": 1e-05, "loss": 1.759, "step": 325 }, { "epoch": 0.5925925925925926, "grad_norm": 0.015785276889801025, "learning_rate": 1e-05, "loss": 1.7545, "step": 326 }, { "epoch": 0.5944103612815269, "grad_norm": 0.017045632004737854, "learning_rate": 1e-05, "loss": 1.7569, "step": 327 }, { "epoch": 0.5962281299704613, "grad_norm": 0.016107341274619102, "learning_rate": 1e-05, "loss": 1.7551, "step": 328 }, { "epoch": 0.5980458986593956, "grad_norm": 0.016075948253273964, "learning_rate": 1e-05, "loss": 1.7489, "step": 329 }, { "epoch": 0.59986366734833, "grad_norm": 0.015299948863685131, "learning_rate": 1e-05, "loss": 1.7584, "step": 330 }, { "epoch": 0.6016814360372642, "grad_norm": 0.01539833564311266, "learning_rate": 1e-05, "loss": 1.7484, "step": 331 }, { "epoch": 0.6034992047261986, "grad_norm": 0.016403749585151672, "learning_rate": 1e-05, "loss": 1.7549, "step": 332 }, { "epoch": 0.6053169734151329, "grad_norm": 0.017300885170698166, "learning_rate": 1e-05, "loss": 1.7503, "step": 333 }, { "epoch": 0.6071347421040673, "grad_norm": 0.01626763306558132, "learning_rate": 1e-05, "loss": 1.7613, "step": 334 }, { "epoch": 0.6089525107930016, "grad_norm": 0.01677662320435047, "learning_rate": 1e-05, "loss": 1.7539, "step": 335 }, { "epoch": 0.610770279481936, "grad_norm": 0.017275378108024597, "learning_rate": 1e-05, "loss": 1.752, "step": 336 }, { "epoch": 0.6125880481708702, "grad_norm": 0.015787243843078613, "learning_rate": 1e-05, "loss": 1.753, "step": 337 }, { "epoch": 0.6144058168598046, "grad_norm": 0.016181068494915962, "learning_rate": 1e-05, "loss": 1.7574, "step": 338 }, { "epoch": 0.6162235855487389, "grad_norm": 0.01625332608819008, "learning_rate": 1e-05, "loss": 1.7552, "step": 339 }, { "epoch": 0.6180413542376733, "grad_norm": 0.01715734228491783, "learning_rate": 1e-05, "loss": 1.7538, "step": 340 }, { "epoch": 0.6198591229266076, "grad_norm": 0.018199391663074493, "learning_rate": 1e-05, "loss": 1.7589, "step": 341 }, { "epoch": 0.621676891615542, "grad_norm": 0.01592421531677246, "learning_rate": 1e-05, "loss": 1.7514, "step": 342 }, { "epoch": 0.6234946603044762, "grad_norm": 0.015030477195978165, "learning_rate": 1e-05, "loss": 1.7578, "step": 343 }, { "epoch": 0.6253124289934106, "grad_norm": 0.01609027571976185, "learning_rate": 1e-05, "loss": 1.7528, "step": 344 }, { "epoch": 0.6271301976823449, "grad_norm": 0.015512831509113312, "learning_rate": 1e-05, "loss": 1.7511, "step": 345 }, { "epoch": 0.6289479663712793, "grad_norm": 0.015017388388514519, "learning_rate": 1e-05, "loss": 1.7504, "step": 346 }, { "epoch": 0.6307657350602136, "grad_norm": 0.01578696072101593, "learning_rate": 1e-05, "loss": 1.7545, "step": 347 }, { "epoch": 0.632583503749148, "grad_norm": 0.015417453832924366, "learning_rate": 1e-05, "loss": 1.7481, "step": 348 }, { "epoch": 0.6344012724380823, "grad_norm": 0.015762289986014366, "learning_rate": 1e-05, "loss": 1.7614, "step": 349 }, { "epoch": 0.6362190411270165, "grad_norm": 0.01597565785050392, "learning_rate": 1e-05, "loss": 1.7497, "step": 350 }, { "epoch": 0.6380368098159509, "grad_norm": 0.01767154410481453, "learning_rate": 1e-05, "loss": 1.7537, "step": 351 }, { "epoch": 0.6398545785048853, "grad_norm": 0.01671607419848442, "learning_rate": 1e-05, "loss": 1.7456, "step": 352 }, { "epoch": 0.6416723471938196, "grad_norm": 0.015792865306138992, "learning_rate": 1e-05, "loss": 1.7494, "step": 353 }, { "epoch": 0.643490115882754, "grad_norm": 0.017053868621587753, "learning_rate": 1e-05, "loss": 1.743, "step": 354 }, { "epoch": 0.6453078845716883, "grad_norm": 0.015672611072659492, "learning_rate": 1e-05, "loss": 1.7478, "step": 355 }, { "epoch": 0.6471256532606225, "grad_norm": 0.01585494540631771, "learning_rate": 1e-05, "loss": 1.7535, "step": 356 }, { "epoch": 0.6489434219495569, "grad_norm": 0.016009824350476265, "learning_rate": 1e-05, "loss": 1.759, "step": 357 }, { "epoch": 0.6507611906384912, "grad_norm": 0.015507341362535954, "learning_rate": 1e-05, "loss": 1.755, "step": 358 }, { "epoch": 0.6525789593274256, "grad_norm": 0.01644650474190712, "learning_rate": 1e-05, "loss": 1.7597, "step": 359 }, { "epoch": 0.65439672801636, "grad_norm": 0.016472771763801575, "learning_rate": 1e-05, "loss": 1.7468, "step": 360 }, { "epoch": 0.6562144967052943, "grad_norm": 0.016300657764077187, "learning_rate": 1e-05, "loss": 1.7468, "step": 361 }, { "epoch": 0.6580322653942285, "grad_norm": 0.016034092754125595, "learning_rate": 1e-05, "loss": 1.7477, "step": 362 }, { "epoch": 0.6598500340831629, "grad_norm": 0.01675514504313469, "learning_rate": 1e-05, "loss": 1.7643, "step": 363 }, { "epoch": 0.6616678027720972, "grad_norm": 0.016840513795614243, "learning_rate": 1e-05, "loss": 1.7514, "step": 364 }, { "epoch": 0.6634855714610316, "grad_norm": 0.017041552811861038, "learning_rate": 1e-05, "loss": 1.7581, "step": 365 }, { "epoch": 0.665303340149966, "grad_norm": 0.016030827537178993, "learning_rate": 1e-05, "loss": 1.7455, "step": 366 }, { "epoch": 0.6671211088389003, "grad_norm": 0.016785001382231712, "learning_rate": 1e-05, "loss": 1.7483, "step": 367 }, { "epoch": 0.6689388775278345, "grad_norm": 0.017177637666463852, "learning_rate": 1e-05, "loss": 1.7512, "step": 368 }, { "epoch": 0.6707566462167689, "grad_norm": 0.015744341537356377, "learning_rate": 1e-05, "loss": 1.7528, "step": 369 }, { "epoch": 0.6725744149057032, "grad_norm": 0.015531038865447044, "learning_rate": 1e-05, "loss": 1.7446, "step": 370 }, { "epoch": 0.6743921835946376, "grad_norm": 0.016207581385970116, "learning_rate": 1e-05, "loss": 1.7533, "step": 371 }, { "epoch": 0.676209952283572, "grad_norm": 0.016298890113830566, "learning_rate": 1e-05, "loss": 1.7512, "step": 372 }, { "epoch": 0.6780277209725063, "grad_norm": 0.016354553401470184, "learning_rate": 1e-05, "loss": 1.7533, "step": 373 }, { "epoch": 0.6798454896614405, "grad_norm": 0.01599087379872799, "learning_rate": 1e-05, "loss": 1.7468, "step": 374 }, { "epoch": 0.6816632583503749, "grad_norm": 0.015880877152085304, "learning_rate": 1e-05, "loss": 1.7514, "step": 375 }, { "epoch": 0.6834810270393092, "grad_norm": 0.016650687903165817, "learning_rate": 1e-05, "loss": 1.746, "step": 376 }, { "epoch": 0.6852987957282436, "grad_norm": 0.0163528211414814, "learning_rate": 1e-05, "loss": 1.7472, "step": 377 }, { "epoch": 0.6871165644171779, "grad_norm": 0.01636846549808979, "learning_rate": 1e-05, "loss": 1.7445, "step": 378 }, { "epoch": 0.6889343331061123, "grad_norm": 0.016309088096022606, "learning_rate": 1e-05, "loss": 1.7575, "step": 379 }, { "epoch": 0.6907521017950465, "grad_norm": 0.01691536419093609, "learning_rate": 1e-05, "loss": 1.7478, "step": 380 }, { "epoch": 0.6925698704839809, "grad_norm": 0.01824839785695076, "learning_rate": 1e-05, "loss": 1.7577, "step": 381 }, { "epoch": 0.6943876391729152, "grad_norm": 0.01665637642145157, "learning_rate": 1e-05, "loss": 1.7516, "step": 382 }, { "epoch": 0.6962054078618496, "grad_norm": 0.015938177704811096, "learning_rate": 1e-05, "loss": 1.7488, "step": 383 }, { "epoch": 0.6980231765507839, "grad_norm": 0.01706807129085064, "learning_rate": 1e-05, "loss": 1.7545, "step": 384 }, { "epoch": 0.6998409452397183, "grad_norm": 0.01841641589999199, "learning_rate": 1e-05, "loss": 1.7533, "step": 385 }, { "epoch": 0.7016587139286525, "grad_norm": 0.01596180908381939, "learning_rate": 1e-05, "loss": 1.7521, "step": 386 }, { "epoch": 0.7034764826175869, "grad_norm": 0.016269559040665627, "learning_rate": 1e-05, "loss": 1.7548, "step": 387 }, { "epoch": 0.7052942513065212, "grad_norm": 0.01708034798502922, "learning_rate": 1e-05, "loss": 1.7443, "step": 388 }, { "epoch": 0.7071120199954556, "grad_norm": 0.01742040552198887, "learning_rate": 1e-05, "loss": 1.7515, "step": 389 }, { "epoch": 0.7089297886843899, "grad_norm": 0.017336854711174965, "learning_rate": 1e-05, "loss": 1.7478, "step": 390 }, { "epoch": 0.7107475573733243, "grad_norm": 0.016049761325120926, "learning_rate": 1e-05, "loss": 1.7487, "step": 391 }, { "epoch": 0.7125653260622585, "grad_norm": 0.017974358052015305, "learning_rate": 1e-05, "loss": 1.7539, "step": 392 }, { "epoch": 0.7143830947511929, "grad_norm": 0.01644211634993553, "learning_rate": 1e-05, "loss": 1.7488, "step": 393 }, { "epoch": 0.7162008634401272, "grad_norm": 0.018557770177721977, "learning_rate": 1e-05, "loss": 1.7448, "step": 394 }, { "epoch": 0.7180186321290616, "grad_norm": 0.01734108291566372, "learning_rate": 1e-05, "loss": 1.7399, "step": 395 }, { "epoch": 0.7198364008179959, "grad_norm": 0.01636637933552265, "learning_rate": 1e-05, "loss": 1.7566, "step": 396 }, { "epoch": 0.7216541695069303, "grad_norm": 0.01724686101078987, "learning_rate": 1e-05, "loss": 1.751, "step": 397 }, { "epoch": 0.7234719381958645, "grad_norm": 0.01744897849857807, "learning_rate": 1e-05, "loss": 1.7474, "step": 398 }, { "epoch": 0.7252897068847989, "grad_norm": 0.017034457996487617, "learning_rate": 1e-05, "loss": 1.7492, "step": 399 }, { "epoch": 0.7271074755737332, "grad_norm": 0.016682956367731094, "learning_rate": 1e-05, "loss": 1.7571, "step": 400 }, { "epoch": 0.7289252442626676, "grad_norm": 0.016139404848217964, "learning_rate": 1e-05, "loss": 1.7426, "step": 401 }, { "epoch": 0.7307430129516019, "grad_norm": 0.01789063960313797, "learning_rate": 1e-05, "loss": 1.7564, "step": 402 }, { "epoch": 0.7325607816405363, "grad_norm": 0.017030801624059677, "learning_rate": 1e-05, "loss": 1.7495, "step": 403 }, { "epoch": 0.7343785503294705, "grad_norm": 0.02051538974046707, "learning_rate": 1e-05, "loss": 1.7479, "step": 404 }, { "epoch": 0.7361963190184049, "grad_norm": 0.016426604241132736, "learning_rate": 1e-05, "loss": 1.7475, "step": 405 }, { "epoch": 0.7380140877073392, "grad_norm": 0.016485676169395447, "learning_rate": 1e-05, "loss": 1.7517, "step": 406 }, { "epoch": 0.7398318563962736, "grad_norm": 0.017329517751932144, "learning_rate": 1e-05, "loss": 1.7556, "step": 407 }, { "epoch": 0.7416496250852079, "grad_norm": 0.0165878776460886, "learning_rate": 1e-05, "loss": 1.7394, "step": 408 }, { "epoch": 0.7434673937741423, "grad_norm": 0.016505807638168335, "learning_rate": 1e-05, "loss": 1.7483, "step": 409 }, { "epoch": 0.7452851624630765, "grad_norm": 0.016942374408245087, "learning_rate": 1e-05, "loss": 1.7483, "step": 410 }, { "epoch": 0.7471029311520109, "grad_norm": 0.01690479926764965, "learning_rate": 1e-05, "loss": 1.7522, "step": 411 }, { "epoch": 0.7489206998409452, "grad_norm": 0.016314556822180748, "learning_rate": 1e-05, "loss": 1.7478, "step": 412 }, { "epoch": 0.7507384685298796, "grad_norm": 0.016368621960282326, "learning_rate": 1e-05, "loss": 1.7475, "step": 413 }, { "epoch": 0.7525562372188139, "grad_norm": 0.01776360534131527, "learning_rate": 1e-05, "loss": 1.7523, "step": 414 }, { "epoch": 0.7543740059077483, "grad_norm": 0.01603596657514572, "learning_rate": 1e-05, "loss": 1.7422, "step": 415 }, { "epoch": 0.7561917745966825, "grad_norm": 0.015459864400327206, "learning_rate": 1e-05, "loss": 1.7484, "step": 416 }, { "epoch": 0.7580095432856169, "grad_norm": 0.018278229981660843, "learning_rate": 1e-05, "loss": 1.7543, "step": 417 }, { "epoch": 0.7598273119745512, "grad_norm": 0.016482891514897346, "learning_rate": 1e-05, "loss": 1.7511, "step": 418 }, { "epoch": 0.7616450806634856, "grad_norm": 0.0158072616904974, "learning_rate": 1e-05, "loss": 1.747, "step": 419 }, { "epoch": 0.7634628493524199, "grad_norm": 0.01595921255648136, "learning_rate": 1e-05, "loss": 1.741, "step": 420 }, { "epoch": 0.7652806180413543, "grad_norm": 0.01587016135454178, "learning_rate": 1e-05, "loss": 1.7427, "step": 421 }, { "epoch": 0.7670983867302885, "grad_norm": 0.017007585614919662, "learning_rate": 1e-05, "loss": 1.7413, "step": 422 }, { "epoch": 0.7689161554192229, "grad_norm": 0.015775319188833237, "learning_rate": 1e-05, "loss": 1.7483, "step": 423 }, { "epoch": 0.7707339241081572, "grad_norm": 0.015736114233732224, "learning_rate": 1e-05, "loss": 1.7463, "step": 424 }, { "epoch": 0.7725516927970916, "grad_norm": 0.01561545580625534, "learning_rate": 1e-05, "loss": 1.7482, "step": 425 }, { "epoch": 0.7743694614860259, "grad_norm": 0.01614650897681713, "learning_rate": 1e-05, "loss": 1.7517, "step": 426 }, { "epoch": 0.7761872301749603, "grad_norm": 0.016477441415190697, "learning_rate": 1e-05, "loss": 1.7437, "step": 427 }, { "epoch": 0.7780049988638945, "grad_norm": 0.01549589540809393, "learning_rate": 1e-05, "loss": 1.7479, "step": 428 }, { "epoch": 0.7798227675528289, "grad_norm": 0.015598030760884285, "learning_rate": 1e-05, "loss": 1.7438, "step": 429 }, { "epoch": 0.7816405362417632, "grad_norm": 0.01621238701045513, "learning_rate": 1e-05, "loss": 1.743, "step": 430 }, { "epoch": 0.7834583049306976, "grad_norm": 0.015526995062828064, "learning_rate": 1e-05, "loss": 1.7521, "step": 431 }, { "epoch": 0.7852760736196319, "grad_norm": 0.01634833589196205, "learning_rate": 1e-05, "loss": 1.7489, "step": 432 }, { "epoch": 0.7870938423085663, "grad_norm": 0.01686246506869793, "learning_rate": 1e-05, "loss": 1.7483, "step": 433 }, { "epoch": 0.7889116109975006, "grad_norm": 0.01572590321302414, "learning_rate": 1e-05, "loss": 1.7454, "step": 434 }, { "epoch": 0.7907293796864349, "grad_norm": 0.016653846949338913, "learning_rate": 1e-05, "loss": 1.7447, "step": 435 }, { "epoch": 0.7925471483753692, "grad_norm": 0.016530562192201614, "learning_rate": 1e-05, "loss": 1.7465, "step": 436 }, { "epoch": 0.7943649170643036, "grad_norm": 0.016080396249890327, "learning_rate": 1e-05, "loss": 1.7437, "step": 437 }, { "epoch": 0.7961826857532379, "grad_norm": 0.016825426369905472, "learning_rate": 1e-05, "loss": 1.7432, "step": 438 }, { "epoch": 0.7980004544421723, "grad_norm": 0.01737258955836296, "learning_rate": 1e-05, "loss": 1.7363, "step": 439 }, { "epoch": 0.7998182231311066, "grad_norm": 0.015955086797475815, "learning_rate": 1e-05, "loss": 1.7509, "step": 440 }, { "epoch": 0.8016359918200409, "grad_norm": 0.016994798555970192, "learning_rate": 1e-05, "loss": 1.7446, "step": 441 }, { "epoch": 0.8034537605089752, "grad_norm": 0.0163293294608593, "learning_rate": 1e-05, "loss": 1.7491, "step": 442 }, { "epoch": 0.8052715291979096, "grad_norm": 0.016241351142525673, "learning_rate": 1e-05, "loss": 1.7408, "step": 443 }, { "epoch": 0.8070892978868439, "grad_norm": 0.03442993760108948, "learning_rate": 1e-05, "loss": 1.7485, "step": 444 }, { "epoch": 0.8089070665757783, "grad_norm": 0.01715024746954441, "learning_rate": 1e-05, "loss": 1.7507, "step": 445 }, { "epoch": 0.8107248352647126, "grad_norm": 0.016102071851491928, "learning_rate": 1e-05, "loss": 1.7508, "step": 446 }, { "epoch": 0.8125426039536469, "grad_norm": 0.018684349954128265, "learning_rate": 1e-05, "loss": 1.745, "step": 447 }, { "epoch": 0.8143603726425812, "grad_norm": 0.01681571640074253, "learning_rate": 1e-05, "loss": 1.7564, "step": 448 }, { "epoch": 0.8161781413315156, "grad_norm": 0.01673213019967079, "learning_rate": 1e-05, "loss": 1.7491, "step": 449 }, { "epoch": 0.8179959100204499, "grad_norm": 0.01589960604906082, "learning_rate": 1e-05, "loss": 1.7534, "step": 450 }, { "epoch": 0.8198136787093843, "grad_norm": 0.018107162788510323, "learning_rate": 1e-05, "loss": 1.734, "step": 451 }, { "epoch": 0.8216314473983186, "grad_norm": 0.016370611265301704, "learning_rate": 1e-05, "loss": 1.748, "step": 452 }, { "epoch": 0.8234492160872529, "grad_norm": 0.01715346798300743, "learning_rate": 1e-05, "loss": 1.7581, "step": 453 }, { "epoch": 0.8252669847761872, "grad_norm": 0.016535120084881783, "learning_rate": 1e-05, "loss": 1.7483, "step": 454 }, { "epoch": 0.8270847534651216, "grad_norm": 0.01683277077972889, "learning_rate": 1e-05, "loss": 1.753, "step": 455 }, { "epoch": 0.8289025221540559, "grad_norm": 0.016108205541968346, "learning_rate": 1e-05, "loss": 1.7509, "step": 456 }, { "epoch": 0.8307202908429903, "grad_norm": 0.01758972927927971, "learning_rate": 1e-05, "loss": 1.7421, "step": 457 }, { "epoch": 0.8325380595319246, "grad_norm": 0.016740551218390465, "learning_rate": 1e-05, "loss": 1.7531, "step": 458 }, { "epoch": 0.8343558282208589, "grad_norm": 0.017136069014668465, "learning_rate": 1e-05, "loss": 1.7453, "step": 459 }, { "epoch": 0.8361735969097932, "grad_norm": 0.018268654122948647, "learning_rate": 1e-05, "loss": 1.7468, "step": 460 }, { "epoch": 0.8379913655987276, "grad_norm": 0.01658778078854084, "learning_rate": 1e-05, "loss": 1.7496, "step": 461 }, { "epoch": 0.8398091342876619, "grad_norm": 0.016633301973342896, "learning_rate": 1e-05, "loss": 1.7485, "step": 462 }, { "epoch": 0.8416269029765963, "grad_norm": 0.016990309581160545, "learning_rate": 1e-05, "loss": 1.7405, "step": 463 }, { "epoch": 0.8434446716655306, "grad_norm": 0.01661493442952633, "learning_rate": 1e-05, "loss": 1.7464, "step": 464 }, { "epoch": 0.8452624403544649, "grad_norm": 0.01699172891676426, "learning_rate": 1e-05, "loss": 1.7564, "step": 465 }, { "epoch": 0.8470802090433992, "grad_norm": 0.016703175380825996, "learning_rate": 1e-05, "loss": 1.745, "step": 466 }, { "epoch": 0.8488979777323336, "grad_norm": 0.01694013550877571, "learning_rate": 1e-05, "loss": 1.741, "step": 467 }, { "epoch": 0.8507157464212679, "grad_norm": 0.017576703801751137, "learning_rate": 1e-05, "loss": 1.7553, "step": 468 }, { "epoch": 0.8525335151102023, "grad_norm": 0.016727445647120476, "learning_rate": 1e-05, "loss": 1.734, "step": 469 }, { "epoch": 0.8543512837991366, "grad_norm": 0.015813367441296577, "learning_rate": 1e-05, "loss": 1.7443, "step": 470 }, { "epoch": 0.8561690524880708, "grad_norm": 0.01609817147254944, "learning_rate": 1e-05, "loss": 1.7496, "step": 471 }, { "epoch": 0.8579868211770052, "grad_norm": 0.01648952253162861, "learning_rate": 1e-05, "loss": 1.7444, "step": 472 }, { "epoch": 0.8598045898659396, "grad_norm": 0.016997788101434708, "learning_rate": 1e-05, "loss": 1.7436, "step": 473 }, { "epoch": 0.8616223585548739, "grad_norm": 0.016397470608353615, "learning_rate": 1e-05, "loss": 1.7488, "step": 474 }, { "epoch": 0.8634401272438083, "grad_norm": 0.01654043421149254, "learning_rate": 1e-05, "loss": 1.7406, "step": 475 }, { "epoch": 0.8652578959327426, "grad_norm": 0.016180653125047684, "learning_rate": 1e-05, "loss": 1.7463, "step": 476 }, { "epoch": 0.8670756646216768, "grad_norm": 0.016773954033851624, "learning_rate": 1e-05, "loss": 1.751, "step": 477 }, { "epoch": 0.8688934333106112, "grad_norm": 0.01736517809331417, "learning_rate": 1e-05, "loss": 1.7402, "step": 478 }, { "epoch": 0.8707112019995455, "grad_norm": 0.01888013258576393, "learning_rate": 1e-05, "loss": 1.7457, "step": 479 }, { "epoch": 0.8725289706884799, "grad_norm": 0.018337909132242203, "learning_rate": 1e-05, "loss": 1.7453, "step": 480 }, { "epoch": 0.8743467393774143, "grad_norm": 0.01563389040529728, "learning_rate": 1e-05, "loss": 1.7386, "step": 481 }, { "epoch": 0.8761645080663486, "grad_norm": 0.017023077234625816, "learning_rate": 1e-05, "loss": 1.7412, "step": 482 }, { "epoch": 0.8779822767552828, "grad_norm": 0.01671590842306614, "learning_rate": 1e-05, "loss": 1.7462, "step": 483 }, { "epoch": 0.8798000454442172, "grad_norm": 0.019904915243387222, "learning_rate": 1e-05, "loss": 1.7443, "step": 484 }, { "epoch": 0.8816178141331515, "grad_norm": 0.01728987693786621, "learning_rate": 1e-05, "loss": 1.7345, "step": 485 }, { "epoch": 0.8834355828220859, "grad_norm": 0.019658857956528664, "learning_rate": 1e-05, "loss": 1.7425, "step": 486 }, { "epoch": 0.8852533515110202, "grad_norm": 0.01688159443438053, "learning_rate": 1e-05, "loss": 1.746, "step": 487 }, { "epoch": 0.8870711201999546, "grad_norm": 0.01599729433655739, "learning_rate": 1e-05, "loss": 1.7327, "step": 488 }, { "epoch": 0.8888888888888888, "grad_norm": 0.016897086054086685, "learning_rate": 1e-05, "loss": 1.7385, "step": 489 }, { "epoch": 0.8907066575778232, "grad_norm": 0.016169127076864243, "learning_rate": 1e-05, "loss": 1.7405, "step": 490 }, { "epoch": 0.8925244262667575, "grad_norm": 0.01634543016552925, "learning_rate": 1e-05, "loss": 1.748, "step": 491 }, { "epoch": 0.8943421949556919, "grad_norm": 0.016616657376289368, "learning_rate": 1e-05, "loss": 1.7465, "step": 492 }, { "epoch": 0.8961599636446262, "grad_norm": 0.016464397311210632, "learning_rate": 1e-05, "loss": 1.7331, "step": 493 }, { "epoch": 0.8979777323335606, "grad_norm": 0.017165830358862877, "learning_rate": 1e-05, "loss": 1.7383, "step": 494 }, { "epoch": 0.8997955010224948, "grad_norm": 0.016248662024736404, "learning_rate": 1e-05, "loss": 1.7416, "step": 495 }, { "epoch": 0.9016132697114292, "grad_norm": 0.01670646481215954, "learning_rate": 1e-05, "loss": 1.742, "step": 496 }, { "epoch": 0.9034310384003635, "grad_norm": 0.016594985499978065, "learning_rate": 1e-05, "loss": 1.7397, "step": 497 }, { "epoch": 0.9052488070892979, "grad_norm": 0.016361333429813385, "learning_rate": 1e-05, "loss": 1.7511, "step": 498 }, { "epoch": 0.9070665757782322, "grad_norm": 0.016266893595457077, "learning_rate": 1e-05, "loss": 1.7468, "step": 499 }, { "epoch": 0.9088843444671666, "grad_norm": 0.017031649127602577, "learning_rate": 1e-05, "loss": 1.7327, "step": 500 }, { "epoch": 0.9107021131561008, "grad_norm": 0.016959581524133682, "learning_rate": 1e-05, "loss": 1.7454, "step": 501 }, { "epoch": 0.9125198818450352, "grad_norm": 0.07533946633338928, "learning_rate": 1e-05, "loss": 1.7476, "step": 502 }, { "epoch": 0.9143376505339695, "grad_norm": 0.01766197197139263, "learning_rate": 1e-05, "loss": 1.7461, "step": 503 }, { "epoch": 0.9161554192229039, "grad_norm": 0.01663908362388611, "learning_rate": 1e-05, "loss": 1.7361, "step": 504 }, { "epoch": 0.9179731879118382, "grad_norm": 0.02057843655347824, "learning_rate": 1e-05, "loss": 1.7441, "step": 505 }, { "epoch": 0.9197909566007726, "grad_norm": 0.017909778282046318, "learning_rate": 1e-05, "loss": 1.742, "step": 506 }, { "epoch": 0.9216087252897068, "grad_norm": 0.017638977617025375, "learning_rate": 1e-05, "loss": 1.7391, "step": 507 }, { "epoch": 0.9234264939786412, "grad_norm": 0.018523376435041428, "learning_rate": 1e-05, "loss": 1.7405, "step": 508 }, { "epoch": 0.9252442626675755, "grad_norm": 0.01635800302028656, "learning_rate": 1e-05, "loss": 1.7458, "step": 509 }, { "epoch": 0.9270620313565099, "grad_norm": 0.01763818971812725, "learning_rate": 1e-05, "loss": 1.7351, "step": 510 }, { "epoch": 0.9288798000454442, "grad_norm": 0.017338305711746216, "learning_rate": 1e-05, "loss": 1.7397, "step": 511 }, { "epoch": 0.9306975687343786, "grad_norm": 0.01771395467221737, "learning_rate": 1e-05, "loss": 1.7471, "step": 512 }, { "epoch": 0.9325153374233128, "grad_norm": 0.017642149701714516, "learning_rate": 1e-05, "loss": 1.7454, "step": 513 }, { "epoch": 0.9343331061122472, "grad_norm": 0.017685122787952423, "learning_rate": 1e-05, "loss": 1.7375, "step": 514 }, { "epoch": 0.9361508748011815, "grad_norm": 0.017887357622385025, "learning_rate": 1e-05, "loss": 1.7394, "step": 515 }, { "epoch": 0.9379686434901159, "grad_norm": 0.01899501495063305, "learning_rate": 1e-05, "loss": 1.7452, "step": 516 }, { "epoch": 0.9397864121790502, "grad_norm": 0.017754577100276947, "learning_rate": 1e-05, "loss": 1.7441, "step": 517 }, { "epoch": 0.9416041808679846, "grad_norm": 0.01811014860868454, "learning_rate": 1e-05, "loss": 1.7417, "step": 518 }, { "epoch": 0.9434219495569189, "grad_norm": 0.01806728169322014, "learning_rate": 1e-05, "loss": 1.7428, "step": 519 }, { "epoch": 0.9452397182458532, "grad_norm": 0.018700286746025085, "learning_rate": 1e-05, "loss": 1.7345, "step": 520 }, { "epoch": 0.9470574869347875, "grad_norm": 0.01722894422709942, "learning_rate": 1e-05, "loss": 1.7362, "step": 521 }, { "epoch": 0.9488752556237219, "grad_norm": 0.016884060576558113, "learning_rate": 1e-05, "loss": 1.7355, "step": 522 }, { "epoch": 0.9506930243126562, "grad_norm": 0.017119232565164566, "learning_rate": 1e-05, "loss": 1.7468, "step": 523 }, { "epoch": 0.9525107930015906, "grad_norm": 0.017567407339811325, "learning_rate": 1e-05, "loss": 1.7422, "step": 524 }, { "epoch": 0.9543285616905249, "grad_norm": 0.017188768833875656, "learning_rate": 1e-05, "loss": 1.7393, "step": 525 }, { "epoch": 0.9561463303794592, "grad_norm": 0.016574783250689507, "learning_rate": 1e-05, "loss": 1.7341, "step": 526 }, { "epoch": 0.9579640990683935, "grad_norm": 0.020617837086319923, "learning_rate": 1e-05, "loss": 1.7428, "step": 527 }, { "epoch": 0.9597818677573279, "grad_norm": 0.018011432141065598, "learning_rate": 1e-05, "loss": 1.7496, "step": 528 }, { "epoch": 0.9615996364462622, "grad_norm": 0.018056875094771385, "learning_rate": 1e-05, "loss": 1.7413, "step": 529 }, { "epoch": 0.9634174051351966, "grad_norm": 0.018342627212405205, "learning_rate": 1e-05, "loss": 1.7395, "step": 530 }, { "epoch": 0.9652351738241309, "grad_norm": 0.022182267159223557, "learning_rate": 1e-05, "loss": 1.7342, "step": 531 }, { "epoch": 0.9670529425130652, "grad_norm": 0.01826542802155018, "learning_rate": 1e-05, "loss": 1.7384, "step": 532 }, { "epoch": 0.9688707112019995, "grad_norm": 0.01716247759759426, "learning_rate": 1e-05, "loss": 1.7425, "step": 533 }, { "epoch": 0.9706884798909339, "grad_norm": 0.017304804176092148, "learning_rate": 1e-05, "loss": 1.7521, "step": 534 }, { "epoch": 0.9725062485798682, "grad_norm": 0.01794220507144928, "learning_rate": 1e-05, "loss": 1.7455, "step": 535 }, { "epoch": 0.9743240172688026, "grad_norm": 0.017633073031902313, "learning_rate": 1e-05, "loss": 1.7509, "step": 536 }, { "epoch": 0.9761417859577369, "grad_norm": 0.016983771696686745, "learning_rate": 1e-05, "loss": 1.7392, "step": 537 }, { "epoch": 0.9779595546466712, "grad_norm": 0.01743633858859539, "learning_rate": 1e-05, "loss": 1.7341, "step": 538 }, { "epoch": 0.9797773233356055, "grad_norm": 0.017662547528743744, "learning_rate": 1e-05, "loss": 1.7367, "step": 539 }, { "epoch": 0.9815950920245399, "grad_norm": 0.01701057143509388, "learning_rate": 1e-05, "loss": 1.7423, "step": 540 }, { "epoch": 0.9834128607134742, "grad_norm": 0.017070814967155457, "learning_rate": 1e-05, "loss": 1.7429, "step": 541 }, { "epoch": 0.9852306294024086, "grad_norm": 0.01704619824886322, "learning_rate": 1e-05, "loss": 1.7348, "step": 542 }, { "epoch": 0.9870483980913429, "grad_norm": 0.017563099041581154, "learning_rate": 1e-05, "loss": 1.7382, "step": 543 }, { "epoch": 0.9888661667802772, "grad_norm": 0.01661253347992897, "learning_rate": 1e-05, "loss": 1.7412, "step": 544 }, { "epoch": 0.9906839354692115, "grad_norm": 0.016802560538053513, "learning_rate": 1e-05, "loss": 1.7287, "step": 545 }, { "epoch": 0.9925017041581459, "grad_norm": 0.01623694598674774, "learning_rate": 1e-05, "loss": 1.7345, "step": 546 }, { "epoch": 0.9943194728470802, "grad_norm": 0.01796470768749714, "learning_rate": 1e-05, "loss": 1.7282, "step": 547 }, { "epoch": 0.9961372415360146, "grad_norm": 0.016037970781326294, "learning_rate": 1e-05, "loss": 1.7358, "step": 548 }, { "epoch": 0.9979550102249489, "grad_norm": 0.016084497794508934, "learning_rate": 1e-05, "loss": 1.7371, "step": 549 }, { "epoch": 0.9997727789138832, "grad_norm": 0.016458775848150253, "learning_rate": 1e-05, "loss": 1.7397, "step": 550 }, { "epoch": 0.9997727789138832, "step": 550, "total_flos": 2868807299235840.0, "train_loss": 0.7944060720096935, "train_runtime": 47914.0805, "train_samples_per_second": 2.939, "train_steps_per_second": 0.011 } ], "logging_steps": 1, "max_steps": 550, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2868807299235840.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }