{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 15948, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009405568096313018, "grad_norm": 4.406911849975586, "learning_rate": 1.9938550288437425e-05, "loss": 0.6174, "step": 50 }, { "epoch": 0.018811136192626036, "grad_norm": 6.521998405456543, "learning_rate": 1.987584650112867e-05, "loss": 0.398, "step": 100 }, { "epoch": 0.028216704288939052, "grad_norm": 4.9869771003723145, "learning_rate": 1.9813142713819916e-05, "loss": 0.3699, "step": 150 }, { "epoch": 0.03762227238525207, "grad_norm": 4.50770378112793, "learning_rate": 1.975043892651116e-05, "loss": 0.3522, "step": 200 }, { "epoch": 0.04702784048156509, "grad_norm": 4.725740909576416, "learning_rate": 1.9687735139202408e-05, "loss": 0.3176, "step": 250 }, { "epoch": 0.056433408577878104, "grad_norm": 2.476680040359497, "learning_rate": 1.9625031351893655e-05, "loss": 0.3091, "step": 300 }, { "epoch": 0.06583897667419113, "grad_norm": 5.042876720428467, "learning_rate": 1.9562327564584903e-05, "loss": 0.3506, "step": 350 }, { "epoch": 0.07524454477050414, "grad_norm": 7.000358581542969, "learning_rate": 1.949962377727615e-05, "loss": 0.3493, "step": 400 }, { "epoch": 0.08465011286681716, "grad_norm": 9.695847511291504, "learning_rate": 1.9436919989967394e-05, "loss": 0.2922, "step": 450 }, { "epoch": 0.09405568096313018, "grad_norm": 5.6148552894592285, "learning_rate": 1.9374216202658642e-05, "loss": 0.3225, "step": 500 }, { "epoch": 0.10346124905944319, "grad_norm": 3.7483432292938232, "learning_rate": 1.931151241534989e-05, "loss": 0.3329, "step": 550 }, { "epoch": 0.11286681715575621, "grad_norm": 3.2282767295837402, "learning_rate": 1.9248808628041137e-05, "loss": 0.3107, "step": 600 }, { "epoch": 0.12227238525206922, "grad_norm": 9.40439224243164, "learning_rate": 1.918610484073238e-05, "loss": 0.3059, "step": 650 }, { "epoch": 0.13167795334838225, "grad_norm": 2.8919079303741455, "learning_rate": 1.912340105342363e-05, "loss": 0.2929, "step": 700 }, { "epoch": 0.14108352144469527, "grad_norm": 3.744126558303833, "learning_rate": 1.9060697266114876e-05, "loss": 0.3426, "step": 750 }, { "epoch": 0.1504890895410083, "grad_norm": 2.0327718257904053, "learning_rate": 1.899799347880612e-05, "loss": 0.3259, "step": 800 }, { "epoch": 0.1598946576373213, "grad_norm": 3.749131679534912, "learning_rate": 1.8935289691497367e-05, "loss": 0.2781, "step": 850 }, { "epoch": 0.16930022573363432, "grad_norm": 2.2363057136535645, "learning_rate": 1.8872585904188615e-05, "loss": 0.3797, "step": 900 }, { "epoch": 0.17870579382994733, "grad_norm": 8.247345924377441, "learning_rate": 1.8809882116879862e-05, "loss": 0.3156, "step": 950 }, { "epoch": 0.18811136192626035, "grad_norm": 4.1785454750061035, "learning_rate": 1.8747178329571106e-05, "loss": 0.2728, "step": 1000 }, { "epoch": 0.19751693002257337, "grad_norm": 2.009939670562744, "learning_rate": 1.8684474542262354e-05, "loss": 0.2674, "step": 1050 }, { "epoch": 0.20692249811888638, "grad_norm": 14.008905410766602, "learning_rate": 1.86217707549536e-05, "loss": 0.3228, "step": 1100 }, { "epoch": 0.2163280662151994, "grad_norm": 7.390902042388916, "learning_rate": 1.855906696764485e-05, "loss": 0.2759, "step": 1150 }, { "epoch": 0.22573363431151242, "grad_norm": 5.746609210968018, "learning_rate": 1.8496363180336093e-05, "loss": 0.257, "step": 1200 }, { "epoch": 0.23513920240782543, "grad_norm": 5.413491725921631, "learning_rate": 1.843365939302734e-05, "loss": 0.2821, "step": 1250 }, { "epoch": 0.24454477050413845, "grad_norm": 17.94203758239746, "learning_rate": 1.8370955605718588e-05, "loss": 0.2749, "step": 1300 }, { "epoch": 0.25395033860045146, "grad_norm": 4.912784099578857, "learning_rate": 1.8308251818409832e-05, "loss": 0.2733, "step": 1350 }, { "epoch": 0.2633559066967645, "grad_norm": 3.2884740829467773, "learning_rate": 1.824554803110108e-05, "loss": 0.2873, "step": 1400 }, { "epoch": 0.2727614747930775, "grad_norm": 3.9251766204833984, "learning_rate": 1.8182844243792327e-05, "loss": 0.2802, "step": 1450 }, { "epoch": 0.28216704288939054, "grad_norm": 1.8012003898620605, "learning_rate": 1.8120140456483574e-05, "loss": 0.2765, "step": 1500 }, { "epoch": 0.29157261098570353, "grad_norm": 3.162705183029175, "learning_rate": 1.805743666917482e-05, "loss": 0.2761, "step": 1550 }, { "epoch": 0.3009781790820166, "grad_norm": 2.2068610191345215, "learning_rate": 1.7994732881866066e-05, "loss": 0.263, "step": 1600 }, { "epoch": 0.31038374717832956, "grad_norm": 2.723480224609375, "learning_rate": 1.7932029094557313e-05, "loss": 0.2496, "step": 1650 }, { "epoch": 0.3197893152746426, "grad_norm": 2.9920785427093506, "learning_rate": 1.786932530724856e-05, "loss": 0.2955, "step": 1700 }, { "epoch": 0.3291948833709556, "grad_norm": 4.665702819824219, "learning_rate": 1.7806621519939805e-05, "loss": 0.306, "step": 1750 }, { "epoch": 0.33860045146726864, "grad_norm": 1.996135950088501, "learning_rate": 1.7743917732631052e-05, "loss": 0.329, "step": 1800 }, { "epoch": 0.3480060195635816, "grad_norm": 2.1975622177124023, "learning_rate": 1.76812139453223e-05, "loss": 0.2896, "step": 1850 }, { "epoch": 0.35741158765989467, "grad_norm": 23.333040237426758, "learning_rate": 1.7618510158013547e-05, "loss": 0.2878, "step": 1900 }, { "epoch": 0.36681715575620766, "grad_norm": 1.7410361766815186, "learning_rate": 1.755580637070479e-05, "loss": 0.2731, "step": 1950 }, { "epoch": 0.3762227238525207, "grad_norm": 5.49874210357666, "learning_rate": 1.749310258339604e-05, "loss": 0.3071, "step": 2000 }, { "epoch": 0.3856282919488337, "grad_norm": 2.9172000885009766, "learning_rate": 1.7430398796087283e-05, "loss": 0.2777, "step": 2050 }, { "epoch": 0.39503386004514673, "grad_norm": 2.531278371810913, "learning_rate": 1.736769500877853e-05, "loss": 0.2683, "step": 2100 }, { "epoch": 0.4044394281414597, "grad_norm": 3.2860658168792725, "learning_rate": 1.7304991221469778e-05, "loss": 0.2599, "step": 2150 }, { "epoch": 0.41384499623777277, "grad_norm": 1.781692624092102, "learning_rate": 1.7242287434161025e-05, "loss": 0.2867, "step": 2200 }, { "epoch": 0.42325056433408575, "grad_norm": 2.29233717918396, "learning_rate": 1.7179583646852273e-05, "loss": 0.2744, "step": 2250 }, { "epoch": 0.4326561324303988, "grad_norm": 2.741166591644287, "learning_rate": 1.7116879859543517e-05, "loss": 0.2595, "step": 2300 }, { "epoch": 0.4420617005267118, "grad_norm": 5.684919834136963, "learning_rate": 1.7054176072234764e-05, "loss": 0.3033, "step": 2350 }, { "epoch": 0.45146726862302483, "grad_norm": 2.437774181365967, "learning_rate": 1.699147228492601e-05, "loss": 0.2934, "step": 2400 }, { "epoch": 0.4608728367193379, "grad_norm": 6.011141300201416, "learning_rate": 1.692876849761726e-05, "loss": 0.2793, "step": 2450 }, { "epoch": 0.47027840481565086, "grad_norm": 24.469600677490234, "learning_rate": 1.6866064710308507e-05, "loss": 0.2492, "step": 2500 }, { "epoch": 0.4796839729119639, "grad_norm": 4.883657455444336, "learning_rate": 1.680336092299975e-05, "loss": 0.2216, "step": 2550 }, { "epoch": 0.4890895410082769, "grad_norm": 2.0113911628723145, "learning_rate": 1.6740657135690995e-05, "loss": 0.2999, "step": 2600 }, { "epoch": 0.49849510910458994, "grad_norm": 2.1354928016662598, "learning_rate": 1.6677953348382242e-05, "loss": 0.2635, "step": 2650 }, { "epoch": 0.5079006772009029, "grad_norm": 3.987088918685913, "learning_rate": 1.661524956107349e-05, "loss": 0.2405, "step": 2700 }, { "epoch": 0.5173062452972159, "grad_norm": 4.9606709480285645, "learning_rate": 1.6552545773764737e-05, "loss": 0.239, "step": 2750 }, { "epoch": 0.526711813393529, "grad_norm": 1.6401499509811401, "learning_rate": 1.6489841986455985e-05, "loss": 0.3246, "step": 2800 }, { "epoch": 0.536117381489842, "grad_norm": 5.161315441131592, "learning_rate": 1.642713819914723e-05, "loss": 0.2625, "step": 2850 }, { "epoch": 0.545522949586155, "grad_norm": 1.054700255393982, "learning_rate": 1.6364434411838476e-05, "loss": 0.2741, "step": 2900 }, { "epoch": 0.554928517682468, "grad_norm": 2.2569172382354736, "learning_rate": 1.6301730624529724e-05, "loss": 0.2622, "step": 2950 }, { "epoch": 0.5643340857787811, "grad_norm": 11.479528427124023, "learning_rate": 1.623902683722097e-05, "loss": 0.2235, "step": 3000 }, { "epoch": 0.5737396538750941, "grad_norm": 2.314810276031494, "learning_rate": 1.617632304991222e-05, "loss": 0.284, "step": 3050 }, { "epoch": 0.5831452219714071, "grad_norm": 2.623328924179077, "learning_rate": 1.6113619262603463e-05, "loss": 0.2951, "step": 3100 }, { "epoch": 0.59255079006772, "grad_norm": 6.059717655181885, "learning_rate": 1.6050915475294707e-05, "loss": 0.2613, "step": 3150 }, { "epoch": 0.6019563581640331, "grad_norm": 1.6962251663208008, "learning_rate": 1.5988211687985954e-05, "loss": 0.2676, "step": 3200 }, { "epoch": 0.6113619262603461, "grad_norm": 6.8796586990356445, "learning_rate": 1.59255079006772e-05, "loss": 0.2314, "step": 3250 }, { "epoch": 0.6207674943566591, "grad_norm": 5.26965856552124, "learning_rate": 1.586280411336845e-05, "loss": 0.2655, "step": 3300 }, { "epoch": 0.6301730624529721, "grad_norm": 2.5264058113098145, "learning_rate": 1.5800100326059697e-05, "loss": 0.2399, "step": 3350 }, { "epoch": 0.6395786305492852, "grad_norm": 7.36959171295166, "learning_rate": 1.573739653875094e-05, "loss": 0.2645, "step": 3400 }, { "epoch": 0.6489841986455982, "grad_norm": 6.5851874351501465, "learning_rate": 1.5674692751442188e-05, "loss": 0.2544, "step": 3450 }, { "epoch": 0.6583897667419112, "grad_norm": 1.4328551292419434, "learning_rate": 1.5611988964133436e-05, "loss": 0.2385, "step": 3500 }, { "epoch": 0.6677953348382242, "grad_norm": 1.747718095779419, "learning_rate": 1.5549285176824683e-05, "loss": 0.2371, "step": 3550 }, { "epoch": 0.6772009029345373, "grad_norm": 5.544091701507568, "learning_rate": 1.548658138951593e-05, "loss": 0.2347, "step": 3600 }, { "epoch": 0.6866064710308503, "grad_norm": 2.0724775791168213, "learning_rate": 1.5423877602207175e-05, "loss": 0.2817, "step": 3650 }, { "epoch": 0.6960120391271633, "grad_norm": 3.6699838638305664, "learning_rate": 1.536117381489842e-05, "loss": 0.282, "step": 3700 }, { "epoch": 0.7054176072234764, "grad_norm": 2.081963539123535, "learning_rate": 1.5298470027589666e-05, "loss": 0.3104, "step": 3750 }, { "epoch": 0.7148231753197893, "grad_norm": 2.8969521522521973, "learning_rate": 1.5235766240280914e-05, "loss": 0.2473, "step": 3800 }, { "epoch": 0.7242287434161023, "grad_norm": 2.79297137260437, "learning_rate": 1.5173062452972161e-05, "loss": 0.2199, "step": 3850 }, { "epoch": 0.7336343115124153, "grad_norm": 1.689758062362671, "learning_rate": 1.5110358665663407e-05, "loss": 0.2487, "step": 3900 }, { "epoch": 0.7430398796087284, "grad_norm": 2.0919642448425293, "learning_rate": 1.5047654878354654e-05, "loss": 0.2558, "step": 3950 }, { "epoch": 0.7524454477050414, "grad_norm": 8.588711738586426, "learning_rate": 1.49849510910459e-05, "loss": 0.2798, "step": 4000 }, { "epoch": 0.7618510158013544, "grad_norm": 2.519028425216675, "learning_rate": 1.4922247303737148e-05, "loss": 0.2862, "step": 4050 }, { "epoch": 0.7712565838976674, "grad_norm": 5.890926837921143, "learning_rate": 1.4859543516428393e-05, "loss": 0.2243, "step": 4100 }, { "epoch": 0.7806621519939805, "grad_norm": 2.727581262588501, "learning_rate": 1.479683972911964e-05, "loss": 0.2578, "step": 4150 }, { "epoch": 0.7900677200902935, "grad_norm": 3.8163578510284424, "learning_rate": 1.4734135941810888e-05, "loss": 0.2844, "step": 4200 }, { "epoch": 0.7994732881866065, "grad_norm": 3.923978567123413, "learning_rate": 1.4671432154502132e-05, "loss": 0.2344, "step": 4250 }, { "epoch": 0.8088788562829194, "grad_norm": 1.962684154510498, "learning_rate": 1.4608728367193378e-05, "loss": 0.2366, "step": 4300 }, { "epoch": 0.8182844243792325, "grad_norm": 4.875962734222412, "learning_rate": 1.4546024579884626e-05, "loss": 0.266, "step": 4350 }, { "epoch": 0.8276899924755455, "grad_norm": 3.953382730484009, "learning_rate": 1.4483320792575873e-05, "loss": 0.2949, "step": 4400 }, { "epoch": 0.8370955605718585, "grad_norm": 4.6789870262146, "learning_rate": 1.4420617005267119e-05, "loss": 0.2429, "step": 4450 }, { "epoch": 0.8465011286681715, "grad_norm": 1.2689917087554932, "learning_rate": 1.4357913217958366e-05, "loss": 0.2278, "step": 4500 }, { "epoch": 0.8559066967644846, "grad_norm": 1.1619006395339966, "learning_rate": 1.4295209430649612e-05, "loss": 0.2125, "step": 4550 }, { "epoch": 0.8653122648607976, "grad_norm": 2.665306329727173, "learning_rate": 1.423250564334086e-05, "loss": 0.2627, "step": 4600 }, { "epoch": 0.8747178329571106, "grad_norm": 2.67232084274292, "learning_rate": 1.4169801856032105e-05, "loss": 0.2747, "step": 4650 }, { "epoch": 0.8841234010534236, "grad_norm": 1.467073678970337, "learning_rate": 1.4107098068723353e-05, "loss": 0.2261, "step": 4700 }, { "epoch": 0.8935289691497367, "grad_norm": 7.690640449523926, "learning_rate": 1.40443942814146e-05, "loss": 0.2209, "step": 4750 }, { "epoch": 0.9029345372460497, "grad_norm": 2.4795353412628174, "learning_rate": 1.3981690494105846e-05, "loss": 0.247, "step": 4800 }, { "epoch": 0.9123401053423627, "grad_norm": 2.3668243885040283, "learning_rate": 1.391898670679709e-05, "loss": 0.2378, "step": 4850 }, { "epoch": 0.9217456734386757, "grad_norm": 2.6995575428009033, "learning_rate": 1.3856282919488338e-05, "loss": 0.2156, "step": 4900 }, { "epoch": 0.9311512415349887, "grad_norm": 3.877608299255371, "learning_rate": 1.3793579132179585e-05, "loss": 0.2434, "step": 4950 }, { "epoch": 0.9405568096313017, "grad_norm": 5.548897743225098, "learning_rate": 1.373087534487083e-05, "loss": 0.2571, "step": 5000 }, { "epoch": 0.9499623777276147, "grad_norm": 1.6183255910873413, "learning_rate": 1.3668171557562078e-05, "loss": 0.2118, "step": 5050 }, { "epoch": 0.9593679458239278, "grad_norm": 8.709449768066406, "learning_rate": 1.3605467770253324e-05, "loss": 0.2548, "step": 5100 }, { "epoch": 0.9687735139202408, "grad_norm": 1.0707285404205322, "learning_rate": 1.3542763982944572e-05, "loss": 0.1974, "step": 5150 }, { "epoch": 0.9781790820165538, "grad_norm": 2.0646872520446777, "learning_rate": 1.3480060195635817e-05, "loss": 0.2524, "step": 5200 }, { "epoch": 0.9875846501128668, "grad_norm": 2.3454248905181885, "learning_rate": 1.3417356408327065e-05, "loss": 0.2698, "step": 5250 }, { "epoch": 0.9969902182091799, "grad_norm": 3.7309887409210205, "learning_rate": 1.3354652621018312e-05, "loss": 0.2508, "step": 5300 }, { "epoch": 1.0063957863054929, "grad_norm": 10.15404987335205, "learning_rate": 1.3291948833709558e-05, "loss": 0.2409, "step": 5350 }, { "epoch": 1.0158013544018059, "grad_norm": 2.018286943435669, "learning_rate": 1.3229245046400802e-05, "loss": 0.2634, "step": 5400 }, { "epoch": 1.0252069224981188, "grad_norm": 1.0378094911575317, "learning_rate": 1.316654125909205e-05, "loss": 0.2317, "step": 5450 }, { "epoch": 1.0346124905944318, "grad_norm": 2.633552074432373, "learning_rate": 1.3103837471783295e-05, "loss": 0.2087, "step": 5500 }, { "epoch": 1.0440180586907448, "grad_norm": 2.9494006633758545, "learning_rate": 1.3041133684474543e-05, "loss": 0.2294, "step": 5550 }, { "epoch": 1.053423626787058, "grad_norm": 1.539960265159607, "learning_rate": 1.297842989716579e-05, "loss": 0.1995, "step": 5600 }, { "epoch": 1.062829194883371, "grad_norm": 1.6446877717971802, "learning_rate": 1.2915726109857036e-05, "loss": 0.2609, "step": 5650 }, { "epoch": 1.072234762979684, "grad_norm": 0.5871282815933228, "learning_rate": 1.2853022322548283e-05, "loss": 0.2095, "step": 5700 }, { "epoch": 1.081640331075997, "grad_norm": 3.3624796867370605, "learning_rate": 1.279031853523953e-05, "loss": 0.2342, "step": 5750 }, { "epoch": 1.09104589917231, "grad_norm": 6.334434509277344, "learning_rate": 1.2727614747930777e-05, "loss": 0.2143, "step": 5800 }, { "epoch": 1.100451467268623, "grad_norm": 3.2644360065460205, "learning_rate": 1.2664910960622022e-05, "loss": 0.2094, "step": 5850 }, { "epoch": 1.109857035364936, "grad_norm": 2.850273847579956, "learning_rate": 1.260220717331327e-05, "loss": 0.2012, "step": 5900 }, { "epoch": 1.119262603461249, "grad_norm": 6.344181537628174, "learning_rate": 1.2539503386004517e-05, "loss": 0.1993, "step": 5950 }, { "epoch": 1.1286681715575622, "grad_norm": 3.1634130477905273, "learning_rate": 1.2476799598695761e-05, "loss": 0.2035, "step": 6000 }, { "epoch": 1.1380737396538751, "grad_norm": 1.3129241466522217, "learning_rate": 1.2414095811387007e-05, "loss": 0.2632, "step": 6050 }, { "epoch": 1.1474793077501881, "grad_norm": 1.7623401880264282, "learning_rate": 1.2351392024078255e-05, "loss": 0.1882, "step": 6100 }, { "epoch": 1.1568848758465011, "grad_norm": 1.544403076171875, "learning_rate": 1.2288688236769502e-05, "loss": 0.2814, "step": 6150 }, { "epoch": 1.1662904439428141, "grad_norm": 2.739286184310913, "learning_rate": 1.2225984449460748e-05, "loss": 0.1824, "step": 6200 }, { "epoch": 1.175696012039127, "grad_norm": 6.419041633605957, "learning_rate": 1.2163280662151995e-05, "loss": 0.2174, "step": 6250 }, { "epoch": 1.18510158013544, "grad_norm": 2.975383996963501, "learning_rate": 1.2100576874843241e-05, "loss": 0.2511, "step": 6300 }, { "epoch": 1.1945071482317533, "grad_norm": 2.4400739669799805, "learning_rate": 1.2037873087534489e-05, "loss": 0.2021, "step": 6350 }, { "epoch": 1.2039127163280663, "grad_norm": 3.1182546615600586, "learning_rate": 1.1975169300225734e-05, "loss": 0.2323, "step": 6400 }, { "epoch": 1.2133182844243793, "grad_norm": 1.4824222326278687, "learning_rate": 1.1912465512916982e-05, "loss": 0.2289, "step": 6450 }, { "epoch": 1.2227238525206923, "grad_norm": 5.336580753326416, "learning_rate": 1.184976172560823e-05, "loss": 0.1726, "step": 6500 }, { "epoch": 1.2321294206170053, "grad_norm": 14.752867698669434, "learning_rate": 1.1787057938299473e-05, "loss": 0.253, "step": 6550 }, { "epoch": 1.2415349887133182, "grad_norm": 1.371951699256897, "learning_rate": 1.172435415099072e-05, "loss": 0.1525, "step": 6600 }, { "epoch": 1.2509405568096312, "grad_norm": 2.216179847717285, "learning_rate": 1.1661650363681967e-05, "loss": 0.2001, "step": 6650 }, { "epoch": 1.2603461249059444, "grad_norm": 6.2752299308776855, "learning_rate": 1.1598946576373214e-05, "loss": 0.2261, "step": 6700 }, { "epoch": 1.2697516930022572, "grad_norm": 3.347257137298584, "learning_rate": 1.153624278906446e-05, "loss": 0.2291, "step": 6750 }, { "epoch": 1.2791572610985704, "grad_norm": 8.093568801879883, "learning_rate": 1.1473539001755707e-05, "loss": 0.1976, "step": 6800 }, { "epoch": 1.2885628291948834, "grad_norm": 1.470790147781372, "learning_rate": 1.1410835214446953e-05, "loss": 0.1928, "step": 6850 }, { "epoch": 1.2979683972911964, "grad_norm": 3.1566500663757324, "learning_rate": 1.13481314271382e-05, "loss": 0.2028, "step": 6900 }, { "epoch": 1.3073739653875094, "grad_norm": 9.452258110046387, "learning_rate": 1.1285427639829446e-05, "loss": 0.213, "step": 6950 }, { "epoch": 1.3167795334838224, "grad_norm": 7.935844898223877, "learning_rate": 1.1222723852520694e-05, "loss": 0.193, "step": 7000 }, { "epoch": 1.3261851015801354, "grad_norm": 1.4266091585159302, "learning_rate": 1.1160020065211941e-05, "loss": 0.1707, "step": 7050 }, { "epoch": 1.3355906696764483, "grad_norm": 11.033124923706055, "learning_rate": 1.1097316277903187e-05, "loss": 0.1936, "step": 7100 }, { "epoch": 1.3449962377727616, "grad_norm": 1.1958593130111694, "learning_rate": 1.1034612490594431e-05, "loss": 0.196, "step": 7150 }, { "epoch": 1.3544018058690745, "grad_norm": 1.222621202468872, "learning_rate": 1.0971908703285679e-05, "loss": 0.2003, "step": 7200 }, { "epoch": 1.3638073739653875, "grad_norm": 2.297128200531006, "learning_rate": 1.0909204915976926e-05, "loss": 0.1994, "step": 7250 }, { "epoch": 1.3732129420617005, "grad_norm": 4.549992561340332, "learning_rate": 1.0846501128668172e-05, "loss": 0.1951, "step": 7300 }, { "epoch": 1.3826185101580135, "grad_norm": 2.43581223487854, "learning_rate": 1.078379734135942e-05, "loss": 0.2665, "step": 7350 }, { "epoch": 1.3920240782543265, "grad_norm": 2.75065016746521, "learning_rate": 1.0721093554050665e-05, "loss": 0.2465, "step": 7400 }, { "epoch": 1.4014296463506395, "grad_norm": 5.422140121459961, "learning_rate": 1.0658389766741913e-05, "loss": 0.1747, "step": 7450 }, { "epoch": 1.4108352144469527, "grad_norm": 0.8706988096237183, "learning_rate": 1.0595685979433158e-05, "loss": 0.2454, "step": 7500 }, { "epoch": 1.4202407825432657, "grad_norm": 1.9640963077545166, "learning_rate": 1.0532982192124406e-05, "loss": 0.2293, "step": 7550 }, { "epoch": 1.4296463506395787, "grad_norm": 2.4464077949523926, "learning_rate": 1.0470278404815653e-05, "loss": 0.2183, "step": 7600 }, { "epoch": 1.4390519187358917, "grad_norm": 1.8322765827178955, "learning_rate": 1.0407574617506899e-05, "loss": 0.1933, "step": 7650 }, { "epoch": 1.4484574868322047, "grad_norm": 1.6448564529418945, "learning_rate": 1.0344870830198143e-05, "loss": 0.2198, "step": 7700 }, { "epoch": 1.4578630549285176, "grad_norm": 1.1031991243362427, "learning_rate": 1.028216704288939e-05, "loss": 0.2223, "step": 7750 }, { "epoch": 1.4672686230248306, "grad_norm": 2.653724193572998, "learning_rate": 1.0219463255580638e-05, "loss": 0.1873, "step": 7800 }, { "epoch": 1.4766741911211438, "grad_norm": 9.545223236083984, "learning_rate": 1.0156759468271884e-05, "loss": 0.2573, "step": 7850 }, { "epoch": 1.4860797592174566, "grad_norm": 0.947347104549408, "learning_rate": 1.0094055680963131e-05, "loss": 0.1485, "step": 7900 }, { "epoch": 1.4954853273137698, "grad_norm": 1.778729796409607, "learning_rate": 1.0031351893654377e-05, "loss": 0.2725, "step": 7950 }, { "epoch": 1.5048908954100828, "grad_norm": 4.2415995597839355, "learning_rate": 9.968648106345625e-06, "loss": 0.1629, "step": 8000 }, { "epoch": 1.5142964635063958, "grad_norm": 2.495288133621216, "learning_rate": 9.90594431903687e-06, "loss": 0.1961, "step": 8050 }, { "epoch": 1.5237020316027088, "grad_norm": 15.494341850280762, "learning_rate": 9.843240531728118e-06, "loss": 0.2032, "step": 8100 }, { "epoch": 1.5331075996990218, "grad_norm": 0.8584136962890625, "learning_rate": 9.780536744419364e-06, "loss": 0.2194, "step": 8150 }, { "epoch": 1.542513167795335, "grad_norm": 1.7161898612976074, "learning_rate": 9.71783295711061e-06, "loss": 0.1985, "step": 8200 }, { "epoch": 1.5519187358916477, "grad_norm": 13.85793399810791, "learning_rate": 9.655129169801857e-06, "loss": 0.1902, "step": 8250 }, { "epoch": 1.561324303987961, "grad_norm": 1.6476123332977295, "learning_rate": 9.592425382493104e-06, "loss": 0.2398, "step": 8300 }, { "epoch": 1.5707298720842737, "grad_norm": 2.9998719692230225, "learning_rate": 9.52972159518435e-06, "loss": 0.1788, "step": 8350 }, { "epoch": 1.580135440180587, "grad_norm": 7.067188262939453, "learning_rate": 9.467017807875598e-06, "loss": 0.211, "step": 8400 }, { "epoch": 1.5895410082769, "grad_norm": 4.7561936378479, "learning_rate": 9.404314020566843e-06, "loss": 0.1814, "step": 8450 }, { "epoch": 1.598946576373213, "grad_norm": 7.8336873054504395, "learning_rate": 9.341610233258089e-06, "loss": 0.2003, "step": 8500 }, { "epoch": 1.6083521444695261, "grad_norm": 3.6782350540161133, "learning_rate": 9.278906445949337e-06, "loss": 0.2555, "step": 8550 }, { "epoch": 1.617757712565839, "grad_norm": 1.2770379781723022, "learning_rate": 9.216202658640582e-06, "loss": 0.1839, "step": 8600 }, { "epoch": 1.627163280662152, "grad_norm": 2.8836193084716797, "learning_rate": 9.15349887133183e-06, "loss": 0.2034, "step": 8650 }, { "epoch": 1.6365688487584649, "grad_norm": 3.362605094909668, "learning_rate": 9.090795084023076e-06, "loss": 0.1872, "step": 8700 }, { "epoch": 1.645974416854778, "grad_norm": 3.509291172027588, "learning_rate": 9.028091296714321e-06, "loss": 0.2519, "step": 8750 }, { "epoch": 1.655379984951091, "grad_norm": 12.957924842834473, "learning_rate": 8.965387509405569e-06, "loss": 0.2648, "step": 8800 }, { "epoch": 1.664785553047404, "grad_norm": 3.217221975326538, "learning_rate": 8.902683722096816e-06, "loss": 0.1699, "step": 8850 }, { "epoch": 1.674191121143717, "grad_norm": 2.8752570152282715, "learning_rate": 8.839979934788062e-06, "loss": 0.1924, "step": 8900 }, { "epoch": 1.68359668924003, "grad_norm": 3.4973011016845703, "learning_rate": 8.77727614747931e-06, "loss": 0.183, "step": 8950 }, { "epoch": 1.6930022573363432, "grad_norm": 1.2514209747314453, "learning_rate": 8.714572360170555e-06, "loss": 0.1837, "step": 9000 }, { "epoch": 1.702407825432656, "grad_norm": 6.367992877960205, "learning_rate": 8.651868572861801e-06, "loss": 0.1828, "step": 9050 }, { "epoch": 1.7118133935289692, "grad_norm": 1.3052902221679688, "learning_rate": 8.589164785553048e-06, "loss": 0.1863, "step": 9100 }, { "epoch": 1.7212189616252822, "grad_norm": 1.235916256904602, "learning_rate": 8.526460998244294e-06, "loss": 0.2599, "step": 9150 }, { "epoch": 1.7306245297215952, "grad_norm": 1.0772079229354858, "learning_rate": 8.463757210935542e-06, "loss": 0.1842, "step": 9200 }, { "epoch": 1.7400300978179082, "grad_norm": 8.388031959533691, "learning_rate": 8.401053423626787e-06, "loss": 0.1866, "step": 9250 }, { "epoch": 1.7494356659142212, "grad_norm": 4.407077789306641, "learning_rate": 8.338349636318033e-06, "loss": 0.2991, "step": 9300 }, { "epoch": 1.7588412340105344, "grad_norm": 5.840625762939453, "learning_rate": 8.27564584900928e-06, "loss": 0.19, "step": 9350 }, { "epoch": 1.7682468021068471, "grad_norm": 2.0648770332336426, "learning_rate": 8.212942061700526e-06, "loss": 0.1934, "step": 9400 }, { "epoch": 1.7776523702031604, "grad_norm": 4.299741744995117, "learning_rate": 8.150238274391774e-06, "loss": 0.155, "step": 9450 }, { "epoch": 1.7870579382994731, "grad_norm": 1.6990511417388916, "learning_rate": 8.087534487083021e-06, "loss": 0.237, "step": 9500 }, { "epoch": 1.7964635063957863, "grad_norm": 2.469029664993286, "learning_rate": 8.024830699774267e-06, "loss": 0.2569, "step": 9550 }, { "epoch": 1.8058690744920993, "grad_norm": 0.9023020267486572, "learning_rate": 7.962126912465513e-06, "loss": 0.1873, "step": 9600 }, { "epoch": 1.8152746425884123, "grad_norm": 3.4308788776397705, "learning_rate": 7.89942312515676e-06, "loss": 0.2105, "step": 9650 }, { "epoch": 1.8246802106847255, "grad_norm": 2.518071174621582, "learning_rate": 7.836719337848006e-06, "loss": 0.2872, "step": 9700 }, { "epoch": 1.8340857787810383, "grad_norm": 1.2336055040359497, "learning_rate": 7.774015550539254e-06, "loss": 0.1982, "step": 9750 }, { "epoch": 1.8434913468773515, "grad_norm": 4.147019863128662, "learning_rate": 7.7113117632305e-06, "loss": 0.1778, "step": 9800 }, { "epoch": 1.8528969149736643, "grad_norm": 2.8657143115997314, "learning_rate": 7.648607975921745e-06, "loss": 0.2742, "step": 9850 }, { "epoch": 1.8623024830699775, "grad_norm": 4.490947246551514, "learning_rate": 7.585904188612993e-06, "loss": 0.2063, "step": 9900 }, { "epoch": 1.8717080511662905, "grad_norm": 13.179983139038086, "learning_rate": 7.523200401304239e-06, "loss": 0.2251, "step": 9950 }, { "epoch": 1.8811136192626035, "grad_norm": 2.9998207092285156, "learning_rate": 7.460496613995486e-06, "loss": 0.1819, "step": 10000 }, { "epoch": 1.8905191873589164, "grad_norm": 3.122727394104004, "learning_rate": 7.3977928266867325e-06, "loss": 0.1712, "step": 10050 }, { "epoch": 1.8999247554552294, "grad_norm": 12.002041816711426, "learning_rate": 7.335089039377979e-06, "loss": 0.1908, "step": 10100 }, { "epoch": 1.9093303235515426, "grad_norm": 3.0402774810791016, "learning_rate": 7.272385252069225e-06, "loss": 0.2027, "step": 10150 }, { "epoch": 1.9187358916478554, "grad_norm": 2.971097707748413, "learning_rate": 7.2096814647604716e-06, "loss": 0.1789, "step": 10200 }, { "epoch": 1.9281414597441686, "grad_norm": 1.380375862121582, "learning_rate": 7.146977677451718e-06, "loss": 0.2104, "step": 10250 }, { "epoch": 1.9375470278404816, "grad_norm": 1.4936792850494385, "learning_rate": 7.084273890142966e-06, "loss": 0.171, "step": 10300 }, { "epoch": 1.9469525959367946, "grad_norm": 1.463129997253418, "learning_rate": 7.021570102834212e-06, "loss": 0.1944, "step": 10350 }, { "epoch": 1.9563581640331076, "grad_norm": 2.7813374996185303, "learning_rate": 6.958866315525459e-06, "loss": 0.1901, "step": 10400 }, { "epoch": 1.9657637321294206, "grad_norm": 5.770429611206055, "learning_rate": 6.896162528216705e-06, "loss": 0.1671, "step": 10450 }, { "epoch": 1.9751693002257338, "grad_norm": 12.642657279968262, "learning_rate": 6.833458740907951e-06, "loss": 0.1977, "step": 10500 }, { "epoch": 1.9845748683220465, "grad_norm": 5.965068817138672, "learning_rate": 6.770754953599198e-06, "loss": 0.1535, "step": 10550 }, { "epoch": 1.9939804364183598, "grad_norm": 1.6920294761657715, "learning_rate": 6.7080511662904445e-06, "loss": 0.1837, "step": 10600 }, { "epoch": 2.0033860045146725, "grad_norm": 1.7706254720687866, "learning_rate": 6.645347378981691e-06, "loss": 0.1791, "step": 10650 }, { "epoch": 2.0127915726109857, "grad_norm": 8.003987312316895, "learning_rate": 6.582643591672938e-06, "loss": 0.2644, "step": 10700 }, { "epoch": 2.0221971407072985, "grad_norm": 1.5629470348358154, "learning_rate": 6.5199398043641835e-06, "loss": 0.1612, "step": 10750 }, { "epoch": 2.0316027088036117, "grad_norm": 2.4208626747131348, "learning_rate": 6.45723601705543e-06, "loss": 0.2278, "step": 10800 }, { "epoch": 2.041008276899925, "grad_norm": 1.0424669981002808, "learning_rate": 6.394532229746678e-06, "loss": 0.1516, "step": 10850 }, { "epoch": 2.0504138449962377, "grad_norm": 2.8615269660949707, "learning_rate": 6.331828442437924e-06, "loss": 0.2025, "step": 10900 }, { "epoch": 2.059819413092551, "grad_norm": 13.714409828186035, "learning_rate": 6.269124655129171e-06, "loss": 0.1939, "step": 10950 }, { "epoch": 2.0692249811888637, "grad_norm": 0.9942559003829956, "learning_rate": 6.206420867820417e-06, "loss": 0.1828, "step": 11000 }, { "epoch": 2.078630549285177, "grad_norm": 2.9414799213409424, "learning_rate": 6.143717080511663e-06, "loss": 0.1681, "step": 11050 }, { "epoch": 2.0880361173814896, "grad_norm": 3.001040458679199, "learning_rate": 6.08101329320291e-06, "loss": 0.165, "step": 11100 }, { "epoch": 2.097441685477803, "grad_norm": 4.616268634796143, "learning_rate": 6.0183095058941565e-06, "loss": 0.1328, "step": 11150 }, { "epoch": 2.106847253574116, "grad_norm": 16.67197608947754, "learning_rate": 5.955605718585403e-06, "loss": 0.1496, "step": 11200 }, { "epoch": 2.116252821670429, "grad_norm": 3.5193891525268555, "learning_rate": 5.89290193127665e-06, "loss": 0.1478, "step": 11250 }, { "epoch": 2.125658389766742, "grad_norm": 4.846385478973389, "learning_rate": 5.8301981439678955e-06, "loss": 0.1585, "step": 11300 }, { "epoch": 2.135063957863055, "grad_norm": 2.8305087089538574, "learning_rate": 5.767494356659142e-06, "loss": 0.1743, "step": 11350 }, { "epoch": 2.144469525959368, "grad_norm": 8.07402229309082, "learning_rate": 5.704790569350389e-06, "loss": 0.1978, "step": 11400 }, { "epoch": 2.153875094055681, "grad_norm": 5.102453231811523, "learning_rate": 5.642086782041636e-06, "loss": 0.23, "step": 11450 }, { "epoch": 2.163280662151994, "grad_norm": 5.685837268829346, "learning_rate": 5.579382994732883e-06, "loss": 0.151, "step": 11500 }, { "epoch": 2.172686230248307, "grad_norm": 1.282105803489685, "learning_rate": 5.516679207424129e-06, "loss": 0.2055, "step": 11550 }, { "epoch": 2.18209179834462, "grad_norm": 1.42794930934906, "learning_rate": 5.453975420115375e-06, "loss": 0.2316, "step": 11600 }, { "epoch": 2.191497366440933, "grad_norm": 0.9819298982620239, "learning_rate": 5.391271632806622e-06, "loss": 0.1529, "step": 11650 }, { "epoch": 2.200902934537246, "grad_norm": 6.298890113830566, "learning_rate": 5.3285678454978684e-06, "loss": 0.1741, "step": 11700 }, { "epoch": 2.210308502633559, "grad_norm": 2.593261480331421, "learning_rate": 5.265864058189115e-06, "loss": 0.2045, "step": 11750 }, { "epoch": 2.219714070729872, "grad_norm": 1.2109441757202148, "learning_rate": 5.203160270880362e-06, "loss": 0.1566, "step": 11800 }, { "epoch": 2.229119638826185, "grad_norm": 2.688478469848633, "learning_rate": 5.140456483571608e-06, "loss": 0.1741, "step": 11850 }, { "epoch": 2.238525206922498, "grad_norm": 1.0694407224655151, "learning_rate": 5.077752696262854e-06, "loss": 0.1442, "step": 11900 }, { "epoch": 2.247930775018811, "grad_norm": 3.074937343597412, "learning_rate": 5.015048908954101e-06, "loss": 0.1743, "step": 11950 }, { "epoch": 2.2573363431151243, "grad_norm": 1.3289566040039062, "learning_rate": 4.952345121645348e-06, "loss": 0.1526, "step": 12000 }, { "epoch": 2.266741911211437, "grad_norm": 1.629441261291504, "learning_rate": 4.889641334336595e-06, "loss": 0.2462, "step": 12050 }, { "epoch": 2.2761474793077503, "grad_norm": 5.780508518218994, "learning_rate": 4.8269375470278405e-06, "loss": 0.1472, "step": 12100 }, { "epoch": 2.285553047404063, "grad_norm": 10.06039810180664, "learning_rate": 4.764233759719087e-06, "loss": 0.1585, "step": 12150 }, { "epoch": 2.2949586155003763, "grad_norm": 11.558574676513672, "learning_rate": 4.701529972410334e-06, "loss": 0.1261, "step": 12200 }, { "epoch": 2.3043641835966895, "grad_norm": 1.2220011949539185, "learning_rate": 4.63882618510158e-06, "loss": 0.1866, "step": 12250 }, { "epoch": 2.3137697516930023, "grad_norm": 4.4303460121154785, "learning_rate": 4.576122397792827e-06, "loss": 0.1733, "step": 12300 }, { "epoch": 2.3231753197893155, "grad_norm": 2.550745964050293, "learning_rate": 4.513418610484074e-06, "loss": 0.2603, "step": 12350 }, { "epoch": 2.3325808878856282, "grad_norm": 3.003775119781494, "learning_rate": 4.45071482317532e-06, "loss": 0.141, "step": 12400 }, { "epoch": 2.3419864559819414, "grad_norm": 1.6381702423095703, "learning_rate": 4.388011035866567e-06, "loss": 0.1607, "step": 12450 }, { "epoch": 2.351392024078254, "grad_norm": 6.195992469787598, "learning_rate": 4.3253072485578135e-06, "loss": 0.1694, "step": 12500 }, { "epoch": 2.3607975921745674, "grad_norm": 1.3570517301559448, "learning_rate": 4.262603461249059e-06, "loss": 0.1823, "step": 12550 }, { "epoch": 2.37020316027088, "grad_norm": 1.8480778932571411, "learning_rate": 4.199899673940307e-06, "loss": 0.1476, "step": 12600 }, { "epoch": 2.3796087283671934, "grad_norm": 1.602105736732483, "learning_rate": 4.137195886631553e-06, "loss": 0.145, "step": 12650 }, { "epoch": 2.3890142964635066, "grad_norm": 2.0385992527008057, "learning_rate": 4.074492099322799e-06, "loss": 0.1685, "step": 12700 }, { "epoch": 2.3984198645598194, "grad_norm": 1.7218743562698364, "learning_rate": 4.011788312014046e-06, "loss": 0.1782, "step": 12750 }, { "epoch": 2.4078254326561326, "grad_norm": 5.287595272064209, "learning_rate": 3.949084524705293e-06, "loss": 0.173, "step": 12800 }, { "epoch": 2.4172310007524453, "grad_norm": 7.334980010986328, "learning_rate": 3.886380737396539e-06, "loss": 0.2384, "step": 12850 }, { "epoch": 2.4266365688487586, "grad_norm": 5.126362323760986, "learning_rate": 3.823676950087786e-06, "loss": 0.14, "step": 12900 }, { "epoch": 2.4360421369450713, "grad_norm": 1.547075867652893, "learning_rate": 3.760973162779032e-06, "loss": 0.1706, "step": 12950 }, { "epoch": 2.4454477050413845, "grad_norm": 4.503610134124756, "learning_rate": 3.698269375470279e-06, "loss": 0.2401, "step": 13000 }, { "epoch": 2.4548532731376973, "grad_norm": 2.71994686126709, "learning_rate": 3.6355655881615255e-06, "loss": 0.1496, "step": 13050 }, { "epoch": 2.4642588412340105, "grad_norm": 8.66321849822998, "learning_rate": 3.5728618008527716e-06, "loss": 0.1299, "step": 13100 }, { "epoch": 2.4736644093303237, "grad_norm": 11.310059547424316, "learning_rate": 3.5101580135440183e-06, "loss": 0.1402, "step": 13150 }, { "epoch": 2.4830699774266365, "grad_norm": 5.4790873527526855, "learning_rate": 3.447454226235265e-06, "loss": 0.1768, "step": 13200 }, { "epoch": 2.4924755455229497, "grad_norm": 3.9280734062194824, "learning_rate": 3.384750438926511e-06, "loss": 0.1844, "step": 13250 }, { "epoch": 2.5018811136192625, "grad_norm": 2.2618765830993652, "learning_rate": 3.322046651617758e-06, "loss": 0.1641, "step": 13300 }, { "epoch": 2.5112866817155757, "grad_norm": 1.4370334148406982, "learning_rate": 3.2593428643090047e-06, "loss": 0.1846, "step": 13350 }, { "epoch": 2.520692249811889, "grad_norm": 0.5764915347099304, "learning_rate": 3.196639077000251e-06, "loss": 0.1174, "step": 13400 }, { "epoch": 2.5300978179082017, "grad_norm": 13.661349296569824, "learning_rate": 3.1339352896914976e-06, "loss": 0.1453, "step": 13450 }, { "epoch": 2.5395033860045144, "grad_norm": 1.970798373222351, "learning_rate": 3.071231502382744e-06, "loss": 0.1712, "step": 13500 }, { "epoch": 2.5489089541008276, "grad_norm": 5.565087795257568, "learning_rate": 3.0085277150739904e-06, "loss": 0.1335, "step": 13550 }, { "epoch": 2.558314522197141, "grad_norm": 1.0521272420883179, "learning_rate": 2.9458239277652374e-06, "loss": 0.142, "step": 13600 }, { "epoch": 2.5677200902934536, "grad_norm": 2.4545013904571533, "learning_rate": 2.883120140456484e-06, "loss": 0.1364, "step": 13650 }, { "epoch": 2.577125658389767, "grad_norm": 2.9941937923431396, "learning_rate": 2.8204163531477302e-06, "loss": 0.1781, "step": 13700 }, { "epoch": 2.5865312264860796, "grad_norm": 1.3698679208755493, "learning_rate": 2.757712565838977e-06, "loss": 0.223, "step": 13750 }, { "epoch": 2.595936794582393, "grad_norm": 1.4991743564605713, "learning_rate": 2.695008778530224e-06, "loss": 0.1749, "step": 13800 }, { "epoch": 2.605342362678706, "grad_norm": 1.5825111865997314, "learning_rate": 2.63230499122147e-06, "loss": 0.2057, "step": 13850 }, { "epoch": 2.6147479307750188, "grad_norm": 9.330909729003906, "learning_rate": 2.5696012039127167e-06, "loss": 0.1412, "step": 13900 }, { "epoch": 2.624153498871332, "grad_norm": 4.595930576324463, "learning_rate": 2.5068974166039633e-06, "loss": 0.1576, "step": 13950 }, { "epoch": 2.6335590669676447, "grad_norm": 2.5993683338165283, "learning_rate": 2.4441936292952095e-06, "loss": 0.1585, "step": 14000 }, { "epoch": 2.642964635063958, "grad_norm": 1.360910177230835, "learning_rate": 2.381489841986456e-06, "loss": 0.1438, "step": 14050 }, { "epoch": 2.6523702031602707, "grad_norm": 7.876944541931152, "learning_rate": 2.3187860546777028e-06, "loss": 0.2808, "step": 14100 }, { "epoch": 2.661775771256584, "grad_norm": 7.476833820343018, "learning_rate": 2.2560822673689494e-06, "loss": 0.1367, "step": 14150 }, { "epoch": 2.6711813393528967, "grad_norm": 12.081886291503906, "learning_rate": 2.193378480060196e-06, "loss": 0.1393, "step": 14200 }, { "epoch": 2.68058690744921, "grad_norm": 2.116596221923828, "learning_rate": 2.130674692751442e-06, "loss": 0.1329, "step": 14250 }, { "epoch": 2.689992475545523, "grad_norm": 9.647782325744629, "learning_rate": 2.067970905442689e-06, "loss": 0.1475, "step": 14300 }, { "epoch": 2.699398043641836, "grad_norm": 5.909144878387451, "learning_rate": 2.0052671181339354e-06, "loss": 0.2392, "step": 14350 }, { "epoch": 2.708803611738149, "grad_norm": 3.9596285820007324, "learning_rate": 1.942563330825182e-06, "loss": 0.225, "step": 14400 }, { "epoch": 2.718209179834462, "grad_norm": 0.5444441437721252, "learning_rate": 1.8798595435164285e-06, "loss": 0.1488, "step": 14450 }, { "epoch": 2.727614747930775, "grad_norm": 1.5310776233673096, "learning_rate": 1.817155756207675e-06, "loss": 0.1905, "step": 14500 }, { "epoch": 2.7370203160270883, "grad_norm": 9.348706245422363, "learning_rate": 1.7544519688989217e-06, "loss": 0.1475, "step": 14550 }, { "epoch": 2.746425884123401, "grad_norm": 3.9080257415771484, "learning_rate": 1.691748181590168e-06, "loss": 0.1715, "step": 14600 }, { "epoch": 2.755831452219714, "grad_norm": 1.6289005279541016, "learning_rate": 1.6290443942814147e-06, "loss": 0.1468, "step": 14650 }, { "epoch": 2.765237020316027, "grad_norm": 8.900264739990234, "learning_rate": 1.5663406069726613e-06, "loss": 0.175, "step": 14700 }, { "epoch": 2.7746425884123402, "grad_norm": 7.342721462249756, "learning_rate": 1.5036368196639077e-06, "loss": 0.1888, "step": 14750 }, { "epoch": 2.784048156508653, "grad_norm": 9.584351539611816, "learning_rate": 1.4409330323551544e-06, "loss": 0.1692, "step": 14800 }, { "epoch": 2.793453724604966, "grad_norm": 1.2614892721176147, "learning_rate": 1.378229245046401e-06, "loss": 0.1522, "step": 14850 }, { "epoch": 2.802859292701279, "grad_norm": 4.842260360717773, "learning_rate": 1.3155254577376476e-06, "loss": 0.1783, "step": 14900 }, { "epoch": 2.812264860797592, "grad_norm": 1.9376300573349, "learning_rate": 1.252821670428894e-06, "loss": 0.1997, "step": 14950 }, { "epoch": 2.8216704288939054, "grad_norm": 1.777771234512329, "learning_rate": 1.1901178831201406e-06, "loss": 0.1452, "step": 15000 }, { "epoch": 2.831075996990218, "grad_norm": 8.967829704284668, "learning_rate": 1.1274140958113872e-06, "loss": 0.1644, "step": 15050 }, { "epoch": 2.8404815650865314, "grad_norm": 7.596806526184082, "learning_rate": 1.0647103085026337e-06, "loss": 0.2017, "step": 15100 }, { "epoch": 2.849887133182844, "grad_norm": 7.003665447235107, "learning_rate": 1.0020065211938803e-06, "loss": 0.1263, "step": 15150 }, { "epoch": 2.8592927012791574, "grad_norm": 2.5569920539855957, "learning_rate": 9.393027338851267e-07, "loss": 0.2034, "step": 15200 }, { "epoch": 2.86869826937547, "grad_norm": 4.9097490310668945, "learning_rate": 8.765989465763733e-07, "loss": 0.1402, "step": 15250 }, { "epoch": 2.8781038374717833, "grad_norm": 2.4663660526275635, "learning_rate": 8.138951592676199e-07, "loss": 0.1558, "step": 15300 }, { "epoch": 2.887509405568096, "grad_norm": 2.791815996170044, "learning_rate": 7.511913719588663e-07, "loss": 0.129, "step": 15350 }, { "epoch": 2.8969149736644093, "grad_norm": 2.119055986404419, "learning_rate": 6.884875846501129e-07, "loss": 0.2119, "step": 15400 }, { "epoch": 2.9063205417607225, "grad_norm": 12.687973976135254, "learning_rate": 6.257837973413595e-07, "loss": 0.1316, "step": 15450 }, { "epoch": 2.9157261098570353, "grad_norm": 1.892902135848999, "learning_rate": 5.63080010032606e-07, "loss": 0.1586, "step": 15500 }, { "epoch": 2.9251316779533485, "grad_norm": 1.8958168029785156, "learning_rate": 5.003762227238526e-07, "loss": 0.1394, "step": 15550 }, { "epoch": 2.9345372460496613, "grad_norm": 3.460698366165161, "learning_rate": 4.3767243541509916e-07, "loss": 0.1432, "step": 15600 }, { "epoch": 2.9439428141459745, "grad_norm": 8.283778190612793, "learning_rate": 3.7496864810634567e-07, "loss": 0.1961, "step": 15650 }, { "epoch": 2.9533483822422877, "grad_norm": 4.339179515838623, "learning_rate": 3.122648607975922e-07, "loss": 0.1535, "step": 15700 }, { "epoch": 2.9627539503386005, "grad_norm": 4.080807685852051, "learning_rate": 2.4956107348883875e-07, "loss": 0.1448, "step": 15750 }, { "epoch": 2.972159518434913, "grad_norm": 1.3516851663589478, "learning_rate": 1.868572861800853e-07, "loss": 0.1516, "step": 15800 }, { "epoch": 2.9815650865312264, "grad_norm": 6.547214031219482, "learning_rate": 1.2415349887133183e-07, "loss": 0.1415, "step": 15850 }, { "epoch": 2.9909706546275396, "grad_norm": 2.244124412536621, "learning_rate": 6.144971156257838e-08, "loss": 0.1463, "step": 15900 }, { "epoch": 3.0, "step": 15948, "total_flos": 1.6575120153138816e+16, "train_loss": 0.21908686365553812, "train_runtime": 7613.9058, "train_samples_per_second": 8.378, "train_steps_per_second": 2.095 }, { "epoch": 3.0, "eval_accuracy": 0.9558278538012962, "eval_f1": 0.5528281962901715, "eval_loss": 0.3851251006126404, "eval_pos_rate_pred": 0.06444756279730685, "eval_pos_rate_true": 0.034333589843004275, "eval_precision": 0.42367006657301653, "eval_recall": 0.7952708512467755, "eval_runtime": 37.8976, "eval_samples_per_second": 63.381, "eval_steps_per_second": 7.942, "step": 15948 } ], "logging_steps": 50, "max_steps": 15948, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6575120153138816e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }