{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 8238, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012138868657441126, "grad_norm": 2.9070301055908203, "learning_rate": 1.975806451612903e-05, "loss": 2.9199, "step": 50 }, { "epoch": 0.024277737314882253, "grad_norm": 1.895039439201355, "learning_rate": 3.991935483870968e-05, "loss": 2.8553, "step": 100 }, { "epoch": 0.03641660597232338, "grad_norm": 2.2209153175354004, "learning_rate": 6.0080645161290325e-05, "loss": 2.8865, "step": 150 }, { "epoch": 0.048555474629764506, "grad_norm": 2.2095389366149902, "learning_rate": 8.024193548387097e-05, "loss": 2.7245, "step": 200 }, { "epoch": 0.06069434328720563, "grad_norm": 2.3479413986206055, "learning_rate": 9.999999613502945e-05, "loss": 2.7798, "step": 250 }, { "epoch": 0.07283321194464676, "grad_norm": 2.177093744277954, "learning_rate": 9.99899475483094e-05, "loss": 2.7718, "step": 300 }, { "epoch": 0.08497208060208788, "grad_norm": 2.8577401638031006, "learning_rate": 9.996057861608239e-05, "loss": 2.7981, "step": 350 }, { "epoch": 0.09711094925952901, "grad_norm": 2.592057466506958, "learning_rate": 9.991190068898889e-05, "loss": 2.7317, "step": 400 }, { "epoch": 0.10924981791697014, "grad_norm": 1.9039149284362793, "learning_rate": 9.98439325802986e-05, "loss": 2.6439, "step": 450 }, { "epoch": 0.12138868657441126, "grad_norm": 1.9637870788574219, "learning_rate": 9.975670055863974e-05, "loss": 2.7429, "step": 500 }, { "epoch": 0.1335275552318524, "grad_norm": 2.6924002170562744, "learning_rate": 9.965023833784636e-05, "loss": 2.7226, "step": 550 }, { "epoch": 0.14566642388929352, "grad_norm": 1.896791934967041, "learning_rate": 9.952458706392864e-05, "loss": 2.6811, "step": 600 }, { "epoch": 0.15780529254673464, "grad_norm": 2.0208559036254883, "learning_rate": 9.937979529917046e-05, "loss": 2.6905, "step": 650 }, { "epoch": 0.16994416120417577, "grad_norm": 2.337047815322876, "learning_rate": 9.921591900336092e-05, "loss": 2.716, "step": 700 }, { "epoch": 0.1820830298616169, "grad_norm": 1.9349219799041748, "learning_rate": 9.903302151216671e-05, "loss": 2.7061, "step": 750 }, { "epoch": 0.19422189851905802, "grad_norm": 1.7132676839828491, "learning_rate": 9.883117351265385e-05, "loss": 2.7762, "step": 800 }, { "epoch": 0.20636076717649915, "grad_norm": 2.1389896869659424, "learning_rate": 9.861045301596821e-05, "loss": 2.7318, "step": 850 }, { "epoch": 0.21849963583394028, "grad_norm": 3.0611305236816406, "learning_rate": 9.837094532718541e-05, "loss": 2.7319, "step": 900 }, { "epoch": 0.2306385044913814, "grad_norm": 1.7353436946868896, "learning_rate": 9.811274301234174e-05, "loss": 2.7076, "step": 950 }, { "epoch": 0.24277737314882253, "grad_norm": 2.3191990852355957, "learning_rate": 9.78359458626588e-05, "loss": 2.7457, "step": 1000 }, { "epoch": 0.25491624180626365, "grad_norm": 2.0282199382781982, "learning_rate": 9.754066085597576e-05, "loss": 2.638, "step": 1050 }, { "epoch": 0.2670551104637048, "grad_norm": 2.1684181690216064, "learning_rate": 9.722700211540394e-05, "loss": 2.6815, "step": 1100 }, { "epoch": 0.2791939791211459, "grad_norm": 1.9141716957092285, "learning_rate": 9.689509086522019e-05, "loss": 2.5845, "step": 1150 }, { "epoch": 0.29133284777858703, "grad_norm": 1.8547425270080566, "learning_rate": 9.65450553840154e-05, "loss": 2.686, "step": 1200 }, { "epoch": 0.30347171643602816, "grad_norm": 2.2245781421661377, "learning_rate": 9.617703095511691e-05, "loss": 2.757, "step": 1250 }, { "epoch": 0.3156105850934693, "grad_norm": 1.9658602476119995, "learning_rate": 9.579115981430349e-05, "loss": 2.6181, "step": 1300 }, { "epoch": 0.3277494537509104, "grad_norm": 2.5929782390594482, "learning_rate": 9.538759109483347e-05, "loss": 2.6221, "step": 1350 }, { "epoch": 0.33988832240835154, "grad_norm": 1.781426191329956, "learning_rate": 9.496648076980702e-05, "loss": 2.6583, "step": 1400 }, { "epoch": 0.35202719106579267, "grad_norm": 2.546909809112549, "learning_rate": 9.452799159188492e-05, "loss": 2.637, "step": 1450 }, { "epoch": 0.3641660597232338, "grad_norm": 2.062859058380127, "learning_rate": 9.407229303038719e-05, "loss": 2.6607, "step": 1500 }, { "epoch": 0.3763049283806749, "grad_norm": 2.205317735671997, "learning_rate": 9.359956120579578e-05, "loss": 2.6899, "step": 1550 }, { "epoch": 0.38844379703811605, "grad_norm": 1.6034296751022339, "learning_rate": 9.310997882168673e-05, "loss": 2.6986, "step": 1600 }, { "epoch": 0.40058266569555717, "grad_norm": 1.492854356765747, "learning_rate": 9.260373509411806e-05, "loss": 2.7071, "step": 1650 }, { "epoch": 0.4127215343529983, "grad_norm": 2.1287174224853516, "learning_rate": 9.208102567850063e-05, "loss": 2.6058, "step": 1700 }, { "epoch": 0.4248604030104394, "grad_norm": 2.4047040939331055, "learning_rate": 9.154205259398038e-05, "loss": 2.705, "step": 1750 }, { "epoch": 0.43699927166788055, "grad_norm": 1.8397117853164673, "learning_rate": 9.098702414536107e-05, "loss": 2.6512, "step": 1800 }, { "epoch": 0.4491381403253217, "grad_norm": 2.055699586868286, "learning_rate": 9.041615484259753e-05, "loss": 2.6701, "step": 1850 }, { "epoch": 0.4612770089827628, "grad_norm": 1.9400551319122314, "learning_rate": 8.982966531789105e-05, "loss": 2.6792, "step": 1900 }, { "epoch": 0.47341587764020393, "grad_norm": 2.668945074081421, "learning_rate": 8.922778224041835e-05, "loss": 2.6004, "step": 1950 }, { "epoch": 0.48555474629764506, "grad_norm": 2.9810187816619873, "learning_rate": 8.861073822872734e-05, "loss": 2.5851, "step": 2000 }, { "epoch": 0.4976936149550862, "grad_norm": 1.625508427619934, "learning_rate": 8.79787717608338e-05, "loss": 2.5802, "step": 2050 }, { "epoch": 0.5098324836125273, "grad_norm": 2.1407854557037354, "learning_rate": 8.733212708205321e-05, "loss": 2.5865, "step": 2100 }, { "epoch": 0.5219713522699685, "grad_norm": 2.2356784343719482, "learning_rate": 8.667105411060361e-05, "loss": 2.6538, "step": 2150 }, { "epoch": 0.5341102209274096, "grad_norm": 2.3607735633850098, "learning_rate": 8.599580834101625e-05, "loss": 2.5077, "step": 2200 }, { "epoch": 0.5462490895848507, "grad_norm": 2.3377416133880615, "learning_rate": 8.530665074539073e-05, "loss": 2.5979, "step": 2250 }, { "epoch": 0.5583879582422918, "grad_norm": 2.3431484699249268, "learning_rate": 8.460384767253331e-05, "loss": 2.4996, "step": 2300 }, { "epoch": 0.570526826899733, "grad_norm": 2.106093406677246, "learning_rate": 8.388767074501731e-05, "loss": 2.4795, "step": 2350 }, { "epoch": 0.5826656955571741, "grad_norm": 1.8955905437469482, "learning_rate": 8.3158396754205e-05, "loss": 2.5837, "step": 2400 }, { "epoch": 0.5948045642146153, "grad_norm": 1.9230371713638306, "learning_rate": 8.241630755327213e-05, "loss": 2.5845, "step": 2450 }, { "epoch": 0.6069434328720563, "grad_norm": 1.6631944179534912, "learning_rate": 8.166168994827599e-05, "loss": 2.6071, "step": 2500 }, { "epoch": 0.6190823015294975, "grad_norm": 2.2075533866882324, "learning_rate": 8.089483558730919e-05, "loss": 2.5412, "step": 2550 }, { "epoch": 0.6312211701869386, "grad_norm": 1.8824903964996338, "learning_rate": 8.011604084778229e-05, "loss": 2.5386, "step": 2600 }, { "epoch": 0.6433600388443798, "grad_norm": 2.604081869125366, "learning_rate": 7.932560672187839e-05, "loss": 2.6509, "step": 2650 }, { "epoch": 0.6554989075018208, "grad_norm": 2.0620648860931396, "learning_rate": 7.852383870022439e-05, "loss": 2.6403, "step": 2700 }, { "epoch": 0.667637776159262, "grad_norm": 2.0239202976226807, "learning_rate": 7.771104665382341e-05, "loss": 2.6965, "step": 2750 }, { "epoch": 0.6797766448167031, "grad_norm": 1.7837492227554321, "learning_rate": 7.688754471429456e-05, "loss": 2.5448, "step": 2800 }, { "epoch": 0.6919155134741443, "grad_norm": 1.9377483129501343, "learning_rate": 7.605365115246581e-05, "loss": 2.6333, "step": 2850 }, { "epoch": 0.7040543821315853, "grad_norm": 2.297499179840088, "learning_rate": 7.520968825536732e-05, "loss": 2.4747, "step": 2900 }, { "epoch": 0.7161932507890265, "grad_norm": 1.857254147529602, "learning_rate": 7.435598220167226e-05, "loss": 2.6631, "step": 2950 }, { "epoch": 0.7283321194464676, "grad_norm": 2.1972172260284424, "learning_rate": 7.349286293563402e-05, "loss": 2.5898, "step": 3000 }, { "epoch": 0.7404709881039088, "grad_norm": 2.267690896987915, "learning_rate": 7.26206640395677e-05, "loss": 2.4341, "step": 3050 }, { "epoch": 0.7526098567613498, "grad_norm": 1.6826646327972412, "learning_rate": 7.17397226049256e-05, "loss": 2.6269, "step": 3100 }, { "epoch": 0.764748725418791, "grad_norm": 2.3957300186157227, "learning_rate": 7.085037910201677e-05, "loss": 2.6107, "step": 3150 }, { "epoch": 0.7768875940762321, "grad_norm": 2.471625566482544, "learning_rate": 6.99529772484203e-05, "loss": 2.5767, "step": 3200 }, { "epoch": 0.7890264627336733, "grad_norm": 1.8939329385757446, "learning_rate": 6.904786387614382e-05, "loss": 2.5009, "step": 3250 }, { "epoch": 0.8011653313911143, "grad_norm": 2.498994827270508, "learning_rate": 6.813538879757828e-05, "loss": 2.5742, "step": 3300 }, { "epoch": 0.8133042000485555, "grad_norm": 2.3812406063079834, "learning_rate": 6.721590467030083e-05, "loss": 2.5011, "step": 3350 }, { "epoch": 0.8254430687059966, "grad_norm": 1.9224671125411987, "learning_rate": 6.62897668607781e-05, "loss": 2.5455, "step": 3400 }, { "epoch": 0.8375819373634378, "grad_norm": 1.811013102531433, "learning_rate": 6.535733330702254e-05, "loss": 2.5791, "step": 3450 }, { "epoch": 0.8497208060208788, "grad_norm": 1.4125910997390747, "learning_rate": 6.441896438025482e-05, "loss": 2.477, "step": 3500 }, { "epoch": 0.86185967467832, "grad_norm": 1.7109546661376953, "learning_rate": 6.3475022745626e-05, "loss": 2.4967, "step": 3550 }, { "epoch": 0.8739985433357611, "grad_norm": 1.8944520950317383, "learning_rate": 6.252587322205299e-05, "loss": 2.6007, "step": 3600 }, { "epoch": 0.8861374119932023, "grad_norm": 2.4895029067993164, "learning_rate": 6.157188264122153e-05, "loss": 2.5122, "step": 3650 }, { "epoch": 0.8982762806506434, "grad_norm": 2.2736401557922363, "learning_rate": 6.061341970581165e-05, "loss": 2.5942, "step": 3700 }, { "epoch": 0.9104151493080845, "grad_norm": 2.258389711380005, "learning_rate": 5.9650854846999495e-05, "loss": 2.4973, "step": 3750 }, { "epoch": 0.9225540179655256, "grad_norm": 2.1070783138275146, "learning_rate": 5.868456008129154e-05, "loss": 2.5858, "step": 3800 }, { "epoch": 0.9346928866229668, "grad_norm": 1.8113417625427246, "learning_rate": 5.7714908866745864e-05, "loss": 2.5253, "step": 3850 }, { "epoch": 0.9468317552804079, "grad_norm": 1.8022534847259521, "learning_rate": 5.674227595863638e-05, "loss": 2.5297, "step": 3900 }, { "epoch": 0.958970623937849, "grad_norm": 2.208134174346924, "learning_rate": 5.5767037264615686e-05, "loss": 2.5352, "step": 3950 }, { "epoch": 0.9711094925952901, "grad_norm": 1.7783771753311157, "learning_rate": 5.478956969943252e-05, "loss": 2.622, "step": 4000 }, { "epoch": 0.9832483612527313, "grad_norm": 1.889061689376831, "learning_rate": 5.3810251039260026e-05, "loss": 2.5766, "step": 4050 }, { "epoch": 0.9953872299101724, "grad_norm": 1.7664889097213745, "learning_rate": 5.2829459775691124e-05, "loss": 2.5343, "step": 4100 }, { "epoch": 1.0075260985676135, "grad_norm": 2.389195442199707, "learning_rate": 5.184757496945726e-05, "loss": 2.4996, "step": 4150 }, { "epoch": 1.0196649672250546, "grad_norm": 2.4707448482513428, "learning_rate": 5.086497610392723e-05, "loss": 2.3471, "step": 4200 }, { "epoch": 1.0318038358824957, "grad_norm": 2.3839166164398193, "learning_rate": 4.988204293844289e-05, "loss": 2.3737, "step": 4250 }, { "epoch": 1.043942704539937, "grad_norm": 2.6970324516296387, "learning_rate": 4.889915536154776e-05, "loss": 2.3854, "step": 4300 }, { "epoch": 1.056081573197378, "grad_norm": 1.623435616493225, "learning_rate": 4.7916693244166126e-05, "loss": 2.3536, "step": 4350 }, { "epoch": 1.0682204418548191, "grad_norm": 2.695117473602295, "learning_rate": 4.693503629278875e-05, "loss": 2.3699, "step": 4400 }, { "epoch": 1.0803593105122602, "grad_norm": 2.7556312084198, "learning_rate": 4.595456390272207e-05, "loss": 2.3021, "step": 4450 }, { "epoch": 1.0924981791697013, "grad_norm": 2.1368134021759033, "learning_rate": 4.4975655011457815e-05, "loss": 2.3003, "step": 4500 }, { "epoch": 1.1046370478271426, "grad_norm": 1.6469930410385132, "learning_rate": 4.399868795221951e-05, "loss": 2.3007, "step": 4550 }, { "epoch": 1.1167759164845836, "grad_norm": 1.8031399250030518, "learning_rate": 4.302404030774248e-05, "loss": 2.4757, "step": 4600 }, { "epoch": 1.1289147851420247, "grad_norm": 2.02652907371521, "learning_rate": 4.205208876434389e-05, "loss": 2.2888, "step": 4650 }, { "epoch": 1.141053653799466, "grad_norm": 1.9721205234527588, "learning_rate": 4.108320896633937e-05, "loss": 2.3307, "step": 4700 }, { "epoch": 1.153192522456907, "grad_norm": 2.1819326877593994, "learning_rate": 4.011777537086219e-05, "loss": 2.3219, "step": 4750 }, { "epoch": 1.1653313911143481, "grad_norm": 2.973172187805176, "learning_rate": 3.915616110314142e-05, "loss": 2.252, "step": 4800 }, { "epoch": 1.1774702597717892, "grad_norm": 2.2087929248809814, "learning_rate": 3.8198737812294675e-05, "loss": 2.3202, "step": 4850 }, { "epoch": 1.1896091284292303, "grad_norm": 2.286069869995117, "learning_rate": 3.724587552769152e-05, "loss": 2.3541, "step": 4900 }, { "epoch": 1.2017479970866716, "grad_norm": 2.08137583732605, "learning_rate": 3.6297942515942776e-05, "loss": 2.3576, "step": 4950 }, { "epoch": 1.2138868657441126, "grad_norm": 2.029747724533081, "learning_rate": 3.535530513857115e-05, "loss": 2.3344, "step": 5000 }, { "epoch": 1.2260257344015537, "grad_norm": 2.449650764465332, "learning_rate": 3.441832771041818e-05, "loss": 2.3351, "step": 5050 }, { "epoch": 1.238164603058995, "grad_norm": 2.0461597442626953, "learning_rate": 3.34873723588421e-05, "loss": 2.2197, "step": 5100 }, { "epoch": 1.250303471716436, "grad_norm": 1.7304949760437012, "learning_rate": 3.25627988837612e-05, "loss": 2.3097, "step": 5150 }, { "epoch": 1.2624423403738771, "grad_norm": 2.58225417137146, "learning_rate": 3.164496461859673e-05, "loss": 2.4066, "step": 5200 }, { "epoch": 1.2745812090313182, "grad_norm": 1.7446330785751343, "learning_rate": 3.0734224292169e-05, "loss": 2.3252, "step": 5250 }, { "epoch": 1.2867200776887593, "grad_norm": 1.8611998558044434, "learning_rate": 2.9830929891600177e-05, "loss": 2.2757, "step": 5300 }, { "epoch": 1.2988589463462006, "grad_norm": 1.8992869853973389, "learning_rate": 2.8935430526276586e-05, "loss": 2.3245, "step": 5350 }, { "epoch": 1.3109978150036417, "grad_norm": 2.460495710372925, "learning_rate": 2.8048072292923465e-05, "loss": 2.2645, "step": 5400 }, { "epoch": 1.3231366836610827, "grad_norm": 2.6929290294647217, "learning_rate": 2.7169198141843767e-05, "loss": 2.2588, "step": 5450 }, { "epoch": 1.335275552318524, "grad_norm": 3.0288407802581787, "learning_rate": 2.6299147744373193e-05, "loss": 2.2605, "step": 5500 }, { "epoch": 1.347414420975965, "grad_norm": 1.7983629703521729, "learning_rate": 2.5438257361602474e-05, "loss": 2.2654, "step": 5550 }, { "epoch": 1.3595532896334062, "grad_norm": 2.5929248332977295, "learning_rate": 2.4586859714417594e-05, "loss": 2.2965, "step": 5600 }, { "epoch": 1.3716921582908472, "grad_norm": 1.558080792427063, "learning_rate": 2.3745283854908305e-05, "loss": 2.3072, "step": 5650 }, { "epoch": 1.3838310269482883, "grad_norm": 1.997135877609253, "learning_rate": 2.2913855039194553e-05, "loss": 2.3047, "step": 5700 }, { "epoch": 1.3959698956057296, "grad_norm": 1.9210643768310547, "learning_rate": 2.2092894601720005e-05, "loss": 2.2756, "step": 5750 }, { "epoch": 1.4081087642631707, "grad_norm": 2.384209156036377, "learning_rate": 2.128271983106121e-05, "loss": 2.2948, "step": 5800 }, { "epoch": 1.4202476329206117, "grad_norm": 2.263803482055664, "learning_rate": 2.0483643847300453e-05, "loss": 2.3062, "step": 5850 }, { "epoch": 1.432386501578053, "grad_norm": 2.315314769744873, "learning_rate": 1.9695975481009683e-05, "loss": 2.3215, "step": 5900 }, { "epoch": 1.444525370235494, "grad_norm": 2.041764497756958, "learning_rate": 1.89200191538922e-05, "loss": 2.3256, "step": 5950 }, { "epoch": 1.4566642388929352, "grad_norm": 2.1705563068389893, "learning_rate": 1.8156074761128454e-05, "loss": 2.2912, "step": 6000 }, { "epoch": 1.4688031075503762, "grad_norm": 2.304280996322632, "learning_rate": 1.7404437555471003e-05, "loss": 2.309, "step": 6050 }, { "epoch": 1.4809419762078173, "grad_norm": 2.4376580715179443, "learning_rate": 1.6665398033134034e-05, "loss": 2.345, "step": 6100 }, { "epoch": 1.4930808448652586, "grad_norm": 2.959686279296875, "learning_rate": 1.5939241821520952e-05, "loss": 2.2565, "step": 6150 }, { "epoch": 1.5052197135226997, "grad_norm": 1.8753809928894043, "learning_rate": 1.5226249568833794e-05, "loss": 2.3363, "step": 6200 }, { "epoch": 1.5173585821801407, "grad_norm": 1.8722175359725952, "learning_rate": 1.452669683560709e-05, "loss": 2.3196, "step": 6250 }, { "epoch": 1.529497450837582, "grad_norm": 2.468750238418579, "learning_rate": 1.3840853988207847e-05, "loss": 2.3277, "step": 6300 }, { "epoch": 1.541636319495023, "grad_norm": 1.8061391115188599, "learning_rate": 1.316898609434319e-05, "loss": 2.2795, "step": 6350 }, { "epoch": 1.5537751881524642, "grad_norm": 1.9603863954544067, "learning_rate": 1.2511352820615691e-05, "loss": 2.326, "step": 6400 }, { "epoch": 1.5659140568099055, "grad_norm": 2.2773890495300293, "learning_rate": 1.1868208332166336e-05, "loss": 2.2427, "step": 6450 }, { "epoch": 1.5780529254673463, "grad_norm": 1.9823254346847534, "learning_rate": 1.1239801194443506e-05, "loss": 2.2775, "step": 6500 }, { "epoch": 1.5901917941247876, "grad_norm": 2.00081205368042, "learning_rate": 1.0626374277136342e-05, "loss": 2.3023, "step": 6550 }, { "epoch": 1.6023306627822287, "grad_norm": 2.134455919265747, "learning_rate": 1.0028164660309259e-05, "loss": 2.4271, "step": 6600 }, { "epoch": 1.6144695314396698, "grad_norm": 2.493212938308716, "learning_rate": 9.445403542774206e-06, "loss": 2.2615, "step": 6650 }, { "epoch": 1.626608400097111, "grad_norm": 2.063344955444336, "learning_rate": 8.878316152735888e-06, "loss": 2.2552, "step": 6700 }, { "epoch": 1.6387472687545521, "grad_norm": 2.2275609970092773, "learning_rate": 8.327121660744452e-06, "loss": 2.3427, "step": 6750 }, { "epoch": 1.6508861374119932, "grad_norm": 2.143228769302368, "learning_rate": 7.792033094989593e-06, "loss": 2.2294, "step": 6800 }, { "epoch": 1.6630250060694345, "grad_norm": 1.6725349426269531, "learning_rate": 7.273257258968275e-06, "loss": 2.3335, "step": 6850 }, { "epoch": 1.6751638747268753, "grad_norm": 1.7002774477005005, "learning_rate": 6.77099465155846e-06, "loss": 2.3019, "step": 6900 }, { "epoch": 1.6873027433843166, "grad_norm": 2.0058093070983887, "learning_rate": 6.285439389529346e-06, "loss": 2.2801, "step": 6950 }, { "epoch": 1.6994416120417577, "grad_norm": 2.444603681564331, "learning_rate": 5.816779132518224e-06, "loss": 2.2837, "step": 7000 }, { "epoch": 1.7115804806991988, "grad_norm": 2.3724894523620605, "learning_rate": 5.365195010502916e-06, "loss": 2.3238, "step": 7050 }, { "epoch": 1.72371934935664, "grad_norm": 2.394784450531006, "learning_rate": 4.930861553797822e-06, "loss": 2.2119, "step": 7100 }, { "epoch": 1.7358582180140811, "grad_norm": 1.8876112699508667, "learning_rate": 4.5139466256006625e-06, "loss": 2.3293, "step": 7150 }, { "epoch": 1.7479970866715222, "grad_norm": 2.4736382961273193, "learning_rate": 4.1146113571158995e-06, "loss": 2.2619, "step": 7200 }, { "epoch": 1.7601359553289635, "grad_norm": 2.3860538005828857, "learning_rate": 3.733010085280031e-06, "loss": 2.2628, "step": 7250 }, { "epoch": 1.7722748239864043, "grad_norm": 2.2846248149871826, "learning_rate": 3.3692902931127256e-06, "loss": 2.2636, "step": 7300 }, { "epoch": 1.7844136926438456, "grad_norm": 1.9925642013549805, "learning_rate": 3.0235925527169196e-06, "loss": 2.2772, "step": 7350 }, { "epoch": 1.7965525613012867, "grad_norm": 2.708155870437622, "learning_rate": 2.696050470949857e-06, "loss": 2.2776, "step": 7400 }, { "epoch": 1.8086914299587278, "grad_norm": 1.6095919609069824, "learning_rate": 2.386790637786085e-06, "loss": 2.3365, "step": 7450 }, { "epoch": 1.820830298616169, "grad_norm": 1.8871222734451294, "learning_rate": 2.0959325773923732e-06, "loss": 2.3408, "step": 7500 }, { "epoch": 1.8329691672736101, "grad_norm": 2.4641993045806885, "learning_rate": 1.8235887019334985e-06, "loss": 2.2675, "step": 7550 }, { "epoch": 1.8451080359310512, "grad_norm": 2.003045082092285, "learning_rate": 1.569864268126614e-06, "loss": 2.3028, "step": 7600 }, { "epoch": 1.8572469045884925, "grad_norm": 2.2934603691101074, "learning_rate": 1.3348573365612184e-06, "loss": 2.3406, "step": 7650 }, { "epoch": 1.8693857732459334, "grad_norm": 1.78590989112854, "learning_rate": 1.118658733800193e-06, "loss": 2.2264, "step": 7700 }, { "epoch": 1.8815246419033747, "grad_norm": 1.7274677753448486, "learning_rate": 9.213520172767332e-07, "loss": 2.3045, "step": 7750 }, { "epoch": 1.8936635105608157, "grad_norm": 1.8418123722076416, "learning_rate": 7.43013443000734e-07, "loss": 2.2462, "step": 7800 }, { "epoch": 1.9058023792182568, "grad_norm": 2.445272445678711, "learning_rate": 5.837119360869503e-07, "loss": 2.4228, "step": 7850 }, { "epoch": 1.917941247875698, "grad_norm": 2.3169541358947754, "learning_rate": 4.435090641165651e-07, "loss": 2.3271, "step": 7900 }, { "epoch": 1.9300801165331392, "grad_norm": 1.9685901403427124, "learning_rate": 3.2245901334221895e-07, "loss": 2.2368, "step": 7950 }, { "epoch": 1.9422189851905802, "grad_norm": 2.039243221282959, "learning_rate": 2.2060856774587803e-07, "loss": 2.3857, "step": 8000 }, { "epoch": 1.9543578538480215, "grad_norm": 2.139963150024414, "learning_rate": 1.3799709095754232e-07, "loss": 2.3981, "step": 8050 }, { "epoch": 1.9664967225054624, "grad_norm": 2.1113266944885254, "learning_rate": 7.46565110417985e-08, "loss": 2.306, "step": 8100 }, { "epoch": 1.9786355911629037, "grad_norm": 2.550076723098755, "learning_rate": 3.06113081581405e-08, "loss": 2.2583, "step": 8150 }, { "epoch": 1.9907744598203447, "grad_norm": 1.6660057306289673, "learning_rate": 5.878505099732312e-09, "loss": 2.3201, "step": 8200 } ], "logging_steps": 50, "max_steps": 8238, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 4000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.2322467312492544e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }