| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 500, | |
| "global_step": 8238, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.012138868657441126, | |
| "grad_norm": 2.9070301055908203, | |
| "learning_rate": 1.975806451612903e-05, | |
| "loss": 2.9199, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.024277737314882253, | |
| "grad_norm": 1.895039439201355, | |
| "learning_rate": 3.991935483870968e-05, | |
| "loss": 2.8553, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.03641660597232338, | |
| "grad_norm": 2.2209153175354004, | |
| "learning_rate": 6.0080645161290325e-05, | |
| "loss": 2.8865, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.048555474629764506, | |
| "grad_norm": 2.2095389366149902, | |
| "learning_rate": 8.024193548387097e-05, | |
| "loss": 2.7245, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.06069434328720563, | |
| "grad_norm": 2.3479413986206055, | |
| "learning_rate": 9.999999613502945e-05, | |
| "loss": 2.7798, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.07283321194464676, | |
| "grad_norm": 2.177093744277954, | |
| "learning_rate": 9.99899475483094e-05, | |
| "loss": 2.7718, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.08497208060208788, | |
| "grad_norm": 2.8577401638031006, | |
| "learning_rate": 9.996057861608239e-05, | |
| "loss": 2.7981, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.09711094925952901, | |
| "grad_norm": 2.592057466506958, | |
| "learning_rate": 9.991190068898889e-05, | |
| "loss": 2.7317, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.10924981791697014, | |
| "grad_norm": 1.9039149284362793, | |
| "learning_rate": 9.98439325802986e-05, | |
| "loss": 2.6439, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.12138868657441126, | |
| "grad_norm": 1.9637870788574219, | |
| "learning_rate": 9.975670055863974e-05, | |
| "loss": 2.7429, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.1335275552318524, | |
| "grad_norm": 2.6924002170562744, | |
| "learning_rate": 9.965023833784636e-05, | |
| "loss": 2.7226, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.14566642388929352, | |
| "grad_norm": 1.896791934967041, | |
| "learning_rate": 9.952458706392864e-05, | |
| "loss": 2.6811, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.15780529254673464, | |
| "grad_norm": 2.0208559036254883, | |
| "learning_rate": 9.937979529917046e-05, | |
| "loss": 2.6905, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.16994416120417577, | |
| "grad_norm": 2.337047815322876, | |
| "learning_rate": 9.921591900336092e-05, | |
| "loss": 2.716, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.1820830298616169, | |
| "grad_norm": 1.9349219799041748, | |
| "learning_rate": 9.903302151216671e-05, | |
| "loss": 2.7061, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.19422189851905802, | |
| "grad_norm": 1.7132676839828491, | |
| "learning_rate": 9.883117351265385e-05, | |
| "loss": 2.7762, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.20636076717649915, | |
| "grad_norm": 2.1389896869659424, | |
| "learning_rate": 9.861045301596821e-05, | |
| "loss": 2.7318, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.21849963583394028, | |
| "grad_norm": 3.0611305236816406, | |
| "learning_rate": 9.837094532718541e-05, | |
| "loss": 2.7319, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.2306385044913814, | |
| "grad_norm": 1.7353436946868896, | |
| "learning_rate": 9.811274301234174e-05, | |
| "loss": 2.7076, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.24277737314882253, | |
| "grad_norm": 2.3191990852355957, | |
| "learning_rate": 9.78359458626588e-05, | |
| "loss": 2.7457, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.25491624180626365, | |
| "grad_norm": 2.0282199382781982, | |
| "learning_rate": 9.754066085597576e-05, | |
| "loss": 2.638, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.2670551104637048, | |
| "grad_norm": 2.1684181690216064, | |
| "learning_rate": 9.722700211540394e-05, | |
| "loss": 2.6815, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.2791939791211459, | |
| "grad_norm": 1.9141716957092285, | |
| "learning_rate": 9.689509086522019e-05, | |
| "loss": 2.5845, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.29133284777858703, | |
| "grad_norm": 1.8547425270080566, | |
| "learning_rate": 9.65450553840154e-05, | |
| "loss": 2.686, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.30347171643602816, | |
| "grad_norm": 2.2245781421661377, | |
| "learning_rate": 9.617703095511691e-05, | |
| "loss": 2.757, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.3156105850934693, | |
| "grad_norm": 1.9658602476119995, | |
| "learning_rate": 9.579115981430349e-05, | |
| "loss": 2.6181, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.3277494537509104, | |
| "grad_norm": 2.5929782390594482, | |
| "learning_rate": 9.538759109483347e-05, | |
| "loss": 2.6221, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.33988832240835154, | |
| "grad_norm": 1.781426191329956, | |
| "learning_rate": 9.496648076980702e-05, | |
| "loss": 2.6583, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.35202719106579267, | |
| "grad_norm": 2.546909809112549, | |
| "learning_rate": 9.452799159188492e-05, | |
| "loss": 2.637, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.3641660597232338, | |
| "grad_norm": 2.062859058380127, | |
| "learning_rate": 9.407229303038719e-05, | |
| "loss": 2.6607, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.3763049283806749, | |
| "grad_norm": 2.205317735671997, | |
| "learning_rate": 9.359956120579578e-05, | |
| "loss": 2.6899, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.38844379703811605, | |
| "grad_norm": 1.6034296751022339, | |
| "learning_rate": 9.310997882168673e-05, | |
| "loss": 2.6986, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.40058266569555717, | |
| "grad_norm": 1.492854356765747, | |
| "learning_rate": 9.260373509411806e-05, | |
| "loss": 2.7071, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.4127215343529983, | |
| "grad_norm": 2.1287174224853516, | |
| "learning_rate": 9.208102567850063e-05, | |
| "loss": 2.6058, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.4248604030104394, | |
| "grad_norm": 2.4047040939331055, | |
| "learning_rate": 9.154205259398038e-05, | |
| "loss": 2.705, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.43699927166788055, | |
| "grad_norm": 1.8397117853164673, | |
| "learning_rate": 9.098702414536107e-05, | |
| "loss": 2.6512, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.4491381403253217, | |
| "grad_norm": 2.055699586868286, | |
| "learning_rate": 9.041615484259753e-05, | |
| "loss": 2.6701, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.4612770089827628, | |
| "grad_norm": 1.9400551319122314, | |
| "learning_rate": 8.982966531789105e-05, | |
| "loss": 2.6792, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.47341587764020393, | |
| "grad_norm": 2.668945074081421, | |
| "learning_rate": 8.922778224041835e-05, | |
| "loss": 2.6004, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.48555474629764506, | |
| "grad_norm": 2.9810187816619873, | |
| "learning_rate": 8.861073822872734e-05, | |
| "loss": 2.5851, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.4976936149550862, | |
| "grad_norm": 1.625508427619934, | |
| "learning_rate": 8.79787717608338e-05, | |
| "loss": 2.5802, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.5098324836125273, | |
| "grad_norm": 2.1407854557037354, | |
| "learning_rate": 8.733212708205321e-05, | |
| "loss": 2.5865, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.5219713522699685, | |
| "grad_norm": 2.2356784343719482, | |
| "learning_rate": 8.667105411060361e-05, | |
| "loss": 2.6538, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.5341102209274096, | |
| "grad_norm": 2.3607735633850098, | |
| "learning_rate": 8.599580834101625e-05, | |
| "loss": 2.5077, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.5462490895848507, | |
| "grad_norm": 2.3377416133880615, | |
| "learning_rate": 8.530665074539073e-05, | |
| "loss": 2.5979, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.5583879582422918, | |
| "grad_norm": 2.3431484699249268, | |
| "learning_rate": 8.460384767253331e-05, | |
| "loss": 2.4996, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.570526826899733, | |
| "grad_norm": 2.106093406677246, | |
| "learning_rate": 8.388767074501731e-05, | |
| "loss": 2.4795, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.5826656955571741, | |
| "grad_norm": 1.8955905437469482, | |
| "learning_rate": 8.3158396754205e-05, | |
| "loss": 2.5837, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.5948045642146153, | |
| "grad_norm": 1.9230371713638306, | |
| "learning_rate": 8.241630755327213e-05, | |
| "loss": 2.5845, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.6069434328720563, | |
| "grad_norm": 1.6631944179534912, | |
| "learning_rate": 8.166168994827599e-05, | |
| "loss": 2.6071, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.6190823015294975, | |
| "grad_norm": 2.2075533866882324, | |
| "learning_rate": 8.089483558730919e-05, | |
| "loss": 2.5412, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.6312211701869386, | |
| "grad_norm": 1.8824903964996338, | |
| "learning_rate": 8.011604084778229e-05, | |
| "loss": 2.5386, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.6433600388443798, | |
| "grad_norm": 2.604081869125366, | |
| "learning_rate": 7.932560672187839e-05, | |
| "loss": 2.6509, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.6554989075018208, | |
| "grad_norm": 2.0620648860931396, | |
| "learning_rate": 7.852383870022439e-05, | |
| "loss": 2.6403, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.667637776159262, | |
| "grad_norm": 2.0239202976226807, | |
| "learning_rate": 7.771104665382341e-05, | |
| "loss": 2.6965, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.6797766448167031, | |
| "grad_norm": 1.7837492227554321, | |
| "learning_rate": 7.688754471429456e-05, | |
| "loss": 2.5448, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.6919155134741443, | |
| "grad_norm": 1.9377483129501343, | |
| "learning_rate": 7.605365115246581e-05, | |
| "loss": 2.6333, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.7040543821315853, | |
| "grad_norm": 2.297499179840088, | |
| "learning_rate": 7.520968825536732e-05, | |
| "loss": 2.4747, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.7161932507890265, | |
| "grad_norm": 1.857254147529602, | |
| "learning_rate": 7.435598220167226e-05, | |
| "loss": 2.6631, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.7283321194464676, | |
| "grad_norm": 2.1972172260284424, | |
| "learning_rate": 7.349286293563402e-05, | |
| "loss": 2.5898, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.7404709881039088, | |
| "grad_norm": 2.267690896987915, | |
| "learning_rate": 7.26206640395677e-05, | |
| "loss": 2.4341, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.7526098567613498, | |
| "grad_norm": 1.6826646327972412, | |
| "learning_rate": 7.17397226049256e-05, | |
| "loss": 2.6269, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.764748725418791, | |
| "grad_norm": 2.3957300186157227, | |
| "learning_rate": 7.085037910201677e-05, | |
| "loss": 2.6107, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.7768875940762321, | |
| "grad_norm": 2.471625566482544, | |
| "learning_rate": 6.99529772484203e-05, | |
| "loss": 2.5767, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.7890264627336733, | |
| "grad_norm": 1.8939329385757446, | |
| "learning_rate": 6.904786387614382e-05, | |
| "loss": 2.5009, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.8011653313911143, | |
| "grad_norm": 2.498994827270508, | |
| "learning_rate": 6.813538879757828e-05, | |
| "loss": 2.5742, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.8133042000485555, | |
| "grad_norm": 2.3812406063079834, | |
| "learning_rate": 6.721590467030083e-05, | |
| "loss": 2.5011, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.8254430687059966, | |
| "grad_norm": 1.9224671125411987, | |
| "learning_rate": 6.62897668607781e-05, | |
| "loss": 2.5455, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.8375819373634378, | |
| "grad_norm": 1.811013102531433, | |
| "learning_rate": 6.535733330702254e-05, | |
| "loss": 2.5791, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.8497208060208788, | |
| "grad_norm": 1.4125910997390747, | |
| "learning_rate": 6.441896438025482e-05, | |
| "loss": 2.477, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.86185967467832, | |
| "grad_norm": 1.7109546661376953, | |
| "learning_rate": 6.3475022745626e-05, | |
| "loss": 2.4967, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.8739985433357611, | |
| "grad_norm": 1.8944520950317383, | |
| "learning_rate": 6.252587322205299e-05, | |
| "loss": 2.6007, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.8861374119932023, | |
| "grad_norm": 2.4895029067993164, | |
| "learning_rate": 6.157188264122153e-05, | |
| "loss": 2.5122, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.8982762806506434, | |
| "grad_norm": 2.2736401557922363, | |
| "learning_rate": 6.061341970581165e-05, | |
| "loss": 2.5942, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.9104151493080845, | |
| "grad_norm": 2.258389711380005, | |
| "learning_rate": 5.9650854846999495e-05, | |
| "loss": 2.4973, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.9225540179655256, | |
| "grad_norm": 2.1070783138275146, | |
| "learning_rate": 5.868456008129154e-05, | |
| "loss": 2.5858, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.9346928866229668, | |
| "grad_norm": 1.8113417625427246, | |
| "learning_rate": 5.7714908866745864e-05, | |
| "loss": 2.5253, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.9468317552804079, | |
| "grad_norm": 1.8022534847259521, | |
| "learning_rate": 5.674227595863638e-05, | |
| "loss": 2.5297, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.958970623937849, | |
| "grad_norm": 2.208134174346924, | |
| "learning_rate": 5.5767037264615686e-05, | |
| "loss": 2.5352, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.9711094925952901, | |
| "grad_norm": 1.7783771753311157, | |
| "learning_rate": 5.478956969943252e-05, | |
| "loss": 2.622, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.9832483612527313, | |
| "grad_norm": 1.889061689376831, | |
| "learning_rate": 5.3810251039260026e-05, | |
| "loss": 2.5766, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.9953872299101724, | |
| "grad_norm": 1.7664889097213745, | |
| "learning_rate": 5.2829459775691124e-05, | |
| "loss": 2.5343, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 1.0075260985676135, | |
| "grad_norm": 2.389195442199707, | |
| "learning_rate": 5.184757496945726e-05, | |
| "loss": 2.4996, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 1.0196649672250546, | |
| "grad_norm": 2.4707448482513428, | |
| "learning_rate": 5.086497610392723e-05, | |
| "loss": 2.3471, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 1.0318038358824957, | |
| "grad_norm": 2.3839166164398193, | |
| "learning_rate": 4.988204293844289e-05, | |
| "loss": 2.3737, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 1.043942704539937, | |
| "grad_norm": 2.6970324516296387, | |
| "learning_rate": 4.889915536154776e-05, | |
| "loss": 2.3854, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 1.056081573197378, | |
| "grad_norm": 1.623435616493225, | |
| "learning_rate": 4.7916693244166126e-05, | |
| "loss": 2.3536, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 1.0682204418548191, | |
| "grad_norm": 2.695117473602295, | |
| "learning_rate": 4.693503629278875e-05, | |
| "loss": 2.3699, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 1.0803593105122602, | |
| "grad_norm": 2.7556312084198, | |
| "learning_rate": 4.595456390272207e-05, | |
| "loss": 2.3021, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 1.0924981791697013, | |
| "grad_norm": 2.1368134021759033, | |
| "learning_rate": 4.4975655011457815e-05, | |
| "loss": 2.3003, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 1.1046370478271426, | |
| "grad_norm": 1.6469930410385132, | |
| "learning_rate": 4.399868795221951e-05, | |
| "loss": 2.3007, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 1.1167759164845836, | |
| "grad_norm": 1.8031399250030518, | |
| "learning_rate": 4.302404030774248e-05, | |
| "loss": 2.4757, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 1.1289147851420247, | |
| "grad_norm": 2.02652907371521, | |
| "learning_rate": 4.205208876434389e-05, | |
| "loss": 2.2888, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 1.141053653799466, | |
| "grad_norm": 1.9721205234527588, | |
| "learning_rate": 4.108320896633937e-05, | |
| "loss": 2.3307, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 1.153192522456907, | |
| "grad_norm": 2.1819326877593994, | |
| "learning_rate": 4.011777537086219e-05, | |
| "loss": 2.3219, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 1.1653313911143481, | |
| "grad_norm": 2.973172187805176, | |
| "learning_rate": 3.915616110314142e-05, | |
| "loss": 2.252, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 1.1774702597717892, | |
| "grad_norm": 2.2087929248809814, | |
| "learning_rate": 3.8198737812294675e-05, | |
| "loss": 2.3202, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 1.1896091284292303, | |
| "grad_norm": 2.286069869995117, | |
| "learning_rate": 3.724587552769152e-05, | |
| "loss": 2.3541, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 1.2017479970866716, | |
| "grad_norm": 2.08137583732605, | |
| "learning_rate": 3.6297942515942776e-05, | |
| "loss": 2.3576, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 1.2138868657441126, | |
| "grad_norm": 2.029747724533081, | |
| "learning_rate": 3.535530513857115e-05, | |
| "loss": 2.3344, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 1.2260257344015537, | |
| "grad_norm": 2.449650764465332, | |
| "learning_rate": 3.441832771041818e-05, | |
| "loss": 2.3351, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 1.238164603058995, | |
| "grad_norm": 2.0461597442626953, | |
| "learning_rate": 3.34873723588421e-05, | |
| "loss": 2.2197, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 1.250303471716436, | |
| "grad_norm": 1.7304949760437012, | |
| "learning_rate": 3.25627988837612e-05, | |
| "loss": 2.3097, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 1.2624423403738771, | |
| "grad_norm": 2.58225417137146, | |
| "learning_rate": 3.164496461859673e-05, | |
| "loss": 2.4066, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 1.2745812090313182, | |
| "grad_norm": 1.7446330785751343, | |
| "learning_rate": 3.0734224292169e-05, | |
| "loss": 2.3252, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 1.2867200776887593, | |
| "grad_norm": 1.8611998558044434, | |
| "learning_rate": 2.9830929891600177e-05, | |
| "loss": 2.2757, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 1.2988589463462006, | |
| "grad_norm": 1.8992869853973389, | |
| "learning_rate": 2.8935430526276586e-05, | |
| "loss": 2.3245, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 1.3109978150036417, | |
| "grad_norm": 2.460495710372925, | |
| "learning_rate": 2.8048072292923465e-05, | |
| "loss": 2.2645, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 1.3231366836610827, | |
| "grad_norm": 2.6929290294647217, | |
| "learning_rate": 2.7169198141843767e-05, | |
| "loss": 2.2588, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 1.335275552318524, | |
| "grad_norm": 3.0288407802581787, | |
| "learning_rate": 2.6299147744373193e-05, | |
| "loss": 2.2605, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 1.347414420975965, | |
| "grad_norm": 1.7983629703521729, | |
| "learning_rate": 2.5438257361602474e-05, | |
| "loss": 2.2654, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 1.3595532896334062, | |
| "grad_norm": 2.5929248332977295, | |
| "learning_rate": 2.4586859714417594e-05, | |
| "loss": 2.2965, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 1.3716921582908472, | |
| "grad_norm": 1.558080792427063, | |
| "learning_rate": 2.3745283854908305e-05, | |
| "loss": 2.3072, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 1.3838310269482883, | |
| "grad_norm": 1.997135877609253, | |
| "learning_rate": 2.2913855039194553e-05, | |
| "loss": 2.3047, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 1.3959698956057296, | |
| "grad_norm": 1.9210643768310547, | |
| "learning_rate": 2.2092894601720005e-05, | |
| "loss": 2.2756, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 1.4081087642631707, | |
| "grad_norm": 2.384209156036377, | |
| "learning_rate": 2.128271983106121e-05, | |
| "loss": 2.2948, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 1.4202476329206117, | |
| "grad_norm": 2.263803482055664, | |
| "learning_rate": 2.0483643847300453e-05, | |
| "loss": 2.3062, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 1.432386501578053, | |
| "grad_norm": 2.315314769744873, | |
| "learning_rate": 1.9695975481009683e-05, | |
| "loss": 2.3215, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 1.444525370235494, | |
| "grad_norm": 2.041764497756958, | |
| "learning_rate": 1.89200191538922e-05, | |
| "loss": 2.3256, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 1.4566642388929352, | |
| "grad_norm": 2.1705563068389893, | |
| "learning_rate": 1.8156074761128454e-05, | |
| "loss": 2.2912, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 1.4688031075503762, | |
| "grad_norm": 2.304280996322632, | |
| "learning_rate": 1.7404437555471003e-05, | |
| "loss": 2.309, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 1.4809419762078173, | |
| "grad_norm": 2.4376580715179443, | |
| "learning_rate": 1.6665398033134034e-05, | |
| "loss": 2.345, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 1.4930808448652586, | |
| "grad_norm": 2.959686279296875, | |
| "learning_rate": 1.5939241821520952e-05, | |
| "loss": 2.2565, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 1.5052197135226997, | |
| "grad_norm": 1.8753809928894043, | |
| "learning_rate": 1.5226249568833794e-05, | |
| "loss": 2.3363, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 1.5173585821801407, | |
| "grad_norm": 1.8722175359725952, | |
| "learning_rate": 1.452669683560709e-05, | |
| "loss": 2.3196, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 1.529497450837582, | |
| "grad_norm": 2.468750238418579, | |
| "learning_rate": 1.3840853988207847e-05, | |
| "loss": 2.3277, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 1.541636319495023, | |
| "grad_norm": 1.8061391115188599, | |
| "learning_rate": 1.316898609434319e-05, | |
| "loss": 2.2795, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 1.5537751881524642, | |
| "grad_norm": 1.9603863954544067, | |
| "learning_rate": 1.2511352820615691e-05, | |
| "loss": 2.326, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 1.5659140568099055, | |
| "grad_norm": 2.2773890495300293, | |
| "learning_rate": 1.1868208332166336e-05, | |
| "loss": 2.2427, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 1.5780529254673463, | |
| "grad_norm": 1.9823254346847534, | |
| "learning_rate": 1.1239801194443506e-05, | |
| "loss": 2.2775, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 1.5901917941247876, | |
| "grad_norm": 2.00081205368042, | |
| "learning_rate": 1.0626374277136342e-05, | |
| "loss": 2.3023, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 1.6023306627822287, | |
| "grad_norm": 2.134455919265747, | |
| "learning_rate": 1.0028164660309259e-05, | |
| "loss": 2.4271, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 1.6144695314396698, | |
| "grad_norm": 2.493212938308716, | |
| "learning_rate": 9.445403542774206e-06, | |
| "loss": 2.2615, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 1.626608400097111, | |
| "grad_norm": 2.063344955444336, | |
| "learning_rate": 8.878316152735888e-06, | |
| "loss": 2.2552, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 1.6387472687545521, | |
| "grad_norm": 2.2275609970092773, | |
| "learning_rate": 8.327121660744452e-06, | |
| "loss": 2.3427, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 1.6508861374119932, | |
| "grad_norm": 2.143228769302368, | |
| "learning_rate": 7.792033094989593e-06, | |
| "loss": 2.2294, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 1.6630250060694345, | |
| "grad_norm": 1.6725349426269531, | |
| "learning_rate": 7.273257258968275e-06, | |
| "loss": 2.3335, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 1.6751638747268753, | |
| "grad_norm": 1.7002774477005005, | |
| "learning_rate": 6.77099465155846e-06, | |
| "loss": 2.3019, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 1.6873027433843166, | |
| "grad_norm": 2.0058093070983887, | |
| "learning_rate": 6.285439389529346e-06, | |
| "loss": 2.2801, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 1.6994416120417577, | |
| "grad_norm": 2.444603681564331, | |
| "learning_rate": 5.816779132518224e-06, | |
| "loss": 2.2837, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.7115804806991988, | |
| "grad_norm": 2.3724894523620605, | |
| "learning_rate": 5.365195010502916e-06, | |
| "loss": 2.3238, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 1.72371934935664, | |
| "grad_norm": 2.394784450531006, | |
| "learning_rate": 4.930861553797822e-06, | |
| "loss": 2.2119, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 1.7358582180140811, | |
| "grad_norm": 1.8876112699508667, | |
| "learning_rate": 4.5139466256006625e-06, | |
| "loss": 2.3293, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 1.7479970866715222, | |
| "grad_norm": 2.4736382961273193, | |
| "learning_rate": 4.1146113571158995e-06, | |
| "loss": 2.2619, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 1.7601359553289635, | |
| "grad_norm": 2.3860538005828857, | |
| "learning_rate": 3.733010085280031e-06, | |
| "loss": 2.2628, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 1.7722748239864043, | |
| "grad_norm": 2.2846248149871826, | |
| "learning_rate": 3.3692902931127256e-06, | |
| "loss": 2.2636, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 1.7844136926438456, | |
| "grad_norm": 1.9925642013549805, | |
| "learning_rate": 3.0235925527169196e-06, | |
| "loss": 2.2772, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 1.7965525613012867, | |
| "grad_norm": 2.708155870437622, | |
| "learning_rate": 2.696050470949857e-06, | |
| "loss": 2.2776, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 1.8086914299587278, | |
| "grad_norm": 1.6095919609069824, | |
| "learning_rate": 2.386790637786085e-06, | |
| "loss": 2.3365, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 1.820830298616169, | |
| "grad_norm": 1.8871222734451294, | |
| "learning_rate": 2.0959325773923732e-06, | |
| "loss": 2.3408, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 1.8329691672736101, | |
| "grad_norm": 2.4641993045806885, | |
| "learning_rate": 1.8235887019334985e-06, | |
| "loss": 2.2675, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 1.8451080359310512, | |
| "grad_norm": 2.003045082092285, | |
| "learning_rate": 1.569864268126614e-06, | |
| "loss": 2.3028, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 1.8572469045884925, | |
| "grad_norm": 2.2934603691101074, | |
| "learning_rate": 1.3348573365612184e-06, | |
| "loss": 2.3406, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 1.8693857732459334, | |
| "grad_norm": 1.78590989112854, | |
| "learning_rate": 1.118658733800193e-06, | |
| "loss": 2.2264, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 1.8815246419033747, | |
| "grad_norm": 1.7274677753448486, | |
| "learning_rate": 9.213520172767332e-07, | |
| "loss": 2.3045, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 1.8936635105608157, | |
| "grad_norm": 1.8418123722076416, | |
| "learning_rate": 7.43013443000734e-07, | |
| "loss": 2.2462, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 1.9058023792182568, | |
| "grad_norm": 2.445272445678711, | |
| "learning_rate": 5.837119360869503e-07, | |
| "loss": 2.4228, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 1.917941247875698, | |
| "grad_norm": 2.3169541358947754, | |
| "learning_rate": 4.435090641165651e-07, | |
| "loss": 2.3271, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 1.9300801165331392, | |
| "grad_norm": 1.9685901403427124, | |
| "learning_rate": 3.2245901334221895e-07, | |
| "loss": 2.2368, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 1.9422189851905802, | |
| "grad_norm": 2.039243221282959, | |
| "learning_rate": 2.2060856774587803e-07, | |
| "loss": 2.3857, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.9543578538480215, | |
| "grad_norm": 2.139963150024414, | |
| "learning_rate": 1.3799709095754232e-07, | |
| "loss": 2.3981, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 1.9664967225054624, | |
| "grad_norm": 2.1113266944885254, | |
| "learning_rate": 7.46565110417985e-08, | |
| "loss": 2.306, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 1.9786355911629037, | |
| "grad_norm": 2.550076723098755, | |
| "learning_rate": 3.06113081581405e-08, | |
| "loss": 2.2583, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 1.9907744598203447, | |
| "grad_norm": 1.6660057306289673, | |
| "learning_rate": 5.878505099732312e-09, | |
| "loss": 2.3201, | |
| "step": 8200 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 8238, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 4000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.2322467312492544e+17, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |