{ "best_global_step": 6586, "best_metric": 0.847490661036219, "best_model_checkpoint": "outputs/final-run/checkpoint-6586", "epoch": 2.0, "eval_steps": 500, "global_step": 6586, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015186028853454821, "grad_norm": 2.3480491638183594, "learning_rate": 9.999658552822536e-06, "loss": 0.6629, "step": 50 }, { "epoch": 0.030372057706909643, "grad_norm": 3.3713912963867188, "learning_rate": 9.998606244733398e-06, "loss": 0.5702, "step": 100 }, { "epoch": 0.04555808656036447, "grad_norm": 5.710732460021973, "learning_rate": 9.996843083169648e-06, "loss": 0.5245, "step": 150 }, { "epoch": 0.060744115413819286, "grad_norm": 5.55316686630249, "learning_rate": 9.994369318871088e-06, "loss": 0.4684, "step": 200 }, { "epoch": 0.07593014426727411, "grad_norm": 8.149177551269531, "learning_rate": 9.991185303632574e-06, "loss": 0.4783, "step": 250 }, { "epoch": 0.09111617312072894, "grad_norm": 7.209503650665283, "learning_rate": 9.987291490253976e-06, "loss": 0.4589, "step": 300 }, { "epoch": 0.10630220197418375, "grad_norm": 7.983670234680176, "learning_rate": 9.98268843247581e-06, "loss": 0.4623, "step": 350 }, { "epoch": 0.12148823082763857, "grad_norm": 4.2466206550598145, "learning_rate": 9.977376784900465e-06, "loss": 0.4694, "step": 400 }, { "epoch": 0.1366742596810934, "grad_norm": 6.11989164352417, "learning_rate": 9.971357302899133e-06, "loss": 0.4965, "step": 450 }, { "epoch": 0.15186028853454822, "grad_norm": 4.606278896331787, "learning_rate": 9.964630842504372e-06, "loss": 0.4919, "step": 500 }, { "epoch": 0.16704631738800305, "grad_norm": 5.91365909576416, "learning_rate": 9.957198360288374e-06, "loss": 0.4536, "step": 550 }, { "epoch": 0.18223234624145787, "grad_norm": 5.541100025177002, "learning_rate": 9.949060913226936e-06, "loss": 0.4719, "step": 600 }, { "epoch": 0.19741837509491267, "grad_norm": 4.289076805114746, "learning_rate": 9.94021965854914e-06, "loss": 0.4136, "step": 650 }, { "epoch": 0.2126044039483675, "grad_norm": 14.322737693786621, "learning_rate": 9.930675853572787e-06, "loss": 0.4705, "step": 700 }, { "epoch": 0.22779043280182232, "grad_norm": 6.375570297241211, "learning_rate": 9.920430855525589e-06, "loss": 0.4701, "step": 750 }, { "epoch": 0.24297646165527714, "grad_norm": 3.261223316192627, "learning_rate": 9.909486121352163e-06, "loss": 0.4528, "step": 800 }, { "epoch": 0.25816249050873197, "grad_norm": 3.854436159133911, "learning_rate": 9.89784320750684e-06, "loss": 0.4265, "step": 850 }, { "epoch": 0.2733485193621868, "grad_norm": 10.977453231811523, "learning_rate": 9.885503769732304e-06, "loss": 0.4329, "step": 900 }, { "epoch": 0.2885345482156416, "grad_norm": 6.016301155090332, "learning_rate": 9.872469562824157e-06, "loss": 0.4147, "step": 950 }, { "epoch": 0.30372057706909644, "grad_norm": 5.806178569793701, "learning_rate": 9.858742440381343e-06, "loss": 0.4718, "step": 1000 }, { "epoch": 0.31890660592255127, "grad_norm": 6.394021987915039, "learning_rate": 9.844324354542558e-06, "loss": 0.3912, "step": 1050 }, { "epoch": 0.3340926347760061, "grad_norm": 7.948565483093262, "learning_rate": 9.82921735570864e-06, "loss": 0.4223, "step": 1100 }, { "epoch": 0.3492786636294609, "grad_norm": 5.151625156402588, "learning_rate": 9.813423592250969e-06, "loss": 0.4079, "step": 1150 }, { "epoch": 0.36446469248291574, "grad_norm": 6.724940299987793, "learning_rate": 9.796945310205958e-06, "loss": 0.4306, "step": 1200 }, { "epoch": 0.37965072133637057, "grad_norm": 3.623936176300049, "learning_rate": 9.779784852955636e-06, "loss": 0.438, "step": 1250 }, { "epoch": 0.39483675018982534, "grad_norm": 6.648692607879639, "learning_rate": 9.761944660894397e-06, "loss": 0.4515, "step": 1300 }, { "epoch": 0.41002277904328016, "grad_norm": 5.989009380340576, "learning_rate": 9.743427271081954e-06, "loss": 0.3911, "step": 1350 }, { "epoch": 0.425208807896735, "grad_norm": 3.3359053134918213, "learning_rate": 9.724235316882537e-06, "loss": 0.4454, "step": 1400 }, { "epoch": 0.4403948367501898, "grad_norm": 11.017061233520508, "learning_rate": 9.704371527590404e-06, "loss": 0.4022, "step": 1450 }, { "epoch": 0.45558086560364464, "grad_norm": 7.159852981567383, "learning_rate": 9.68383872804171e-06, "loss": 0.4464, "step": 1500 }, { "epoch": 0.47076689445709946, "grad_norm": 4.259323596954346, "learning_rate": 9.662639838212781e-06, "loss": 0.3829, "step": 1550 }, { "epoch": 0.4859529233105543, "grad_norm": 7.917912483215332, "learning_rate": 9.640777872804868e-06, "loss": 0.4186, "step": 1600 }, { "epoch": 0.5011389521640092, "grad_norm": 8.7236967086792, "learning_rate": 9.61825594081542e-06, "loss": 0.3766, "step": 1650 }, { "epoch": 0.5163249810174639, "grad_norm": 4.761518478393555, "learning_rate": 9.595077245095959e-06, "loss": 0.4057, "step": 1700 }, { "epoch": 0.5315110098709187, "grad_norm": 2.5256729125976562, "learning_rate": 9.571245081896594e-06, "loss": 0.4321, "step": 1750 }, { "epoch": 0.5466970387243736, "grad_norm": 9.82975959777832, "learning_rate": 9.546762840397268e-06, "loss": 0.4067, "step": 1800 }, { "epoch": 0.5618830675778284, "grad_norm": 4.607714653015137, "learning_rate": 9.521634002225774e-06, "loss": 0.3834, "step": 1850 }, { "epoch": 0.5770690964312832, "grad_norm": 8.330415725708008, "learning_rate": 9.495862140962638e-06, "loss": 0.374, "step": 1900 }, { "epoch": 0.592255125284738, "grad_norm": 5.7992634773254395, "learning_rate": 9.469450921632912e-06, "loss": 0.3852, "step": 1950 }, { "epoch": 0.6074411541381929, "grad_norm": 5.298435211181641, "learning_rate": 9.44240410018498e-06, "loss": 0.4345, "step": 2000 }, { "epoch": 0.6226271829916477, "grad_norm": 6.483381271362305, "learning_rate": 9.414725522956414e-06, "loss": 0.407, "step": 2050 }, { "epoch": 0.6378132118451025, "grad_norm": 5.179783821105957, "learning_rate": 9.386419126126983e-06, "loss": 0.432, "step": 2100 }, { "epoch": 0.6529992406985573, "grad_norm": 5.316011428833008, "learning_rate": 9.357488935158897e-06, "loss": 0.4071, "step": 2150 }, { "epoch": 0.6681852695520122, "grad_norm": 10.58410930633545, "learning_rate": 9.327939064224346e-06, "loss": 0.3772, "step": 2200 }, { "epoch": 0.683371298405467, "grad_norm": 4.013734817504883, "learning_rate": 9.297773715620406e-06, "loss": 0.4064, "step": 2250 }, { "epoch": 0.6985573272589218, "grad_norm": 9.252372741699219, "learning_rate": 9.266997179171442e-06, "loss": 0.3911, "step": 2300 }, { "epoch": 0.7137433561123766, "grad_norm": 8.192291259765625, "learning_rate": 9.235613831619052e-06, "loss": 0.3816, "step": 2350 }, { "epoch": 0.7289293849658315, "grad_norm": 4.068896770477295, "learning_rate": 9.203628135999643e-06, "loss": 0.4304, "step": 2400 }, { "epoch": 0.7441154138192863, "grad_norm": 2.9444737434387207, "learning_rate": 9.171044641009741e-06, "loss": 0.4231, "step": 2450 }, { "epoch": 0.7593014426727411, "grad_norm": 4.700106620788574, "learning_rate": 9.137867980359126e-06, "loss": 0.3982, "step": 2500 }, { "epoch": 0.7744874715261959, "grad_norm": 14.975322723388672, "learning_rate": 9.104102872111858e-06, "loss": 0.4241, "step": 2550 }, { "epoch": 0.7896735003796507, "grad_norm": 4.325404644012451, "learning_rate": 9.069754118015339e-06, "loss": 0.3725, "step": 2600 }, { "epoch": 0.8048595292331056, "grad_norm": 3.829643964767456, "learning_rate": 9.034826602817433e-06, "loss": 0.4048, "step": 2650 }, { "epoch": 0.8200455580865603, "grad_norm": 6.086367607116699, "learning_rate": 8.99932529357182e-06, "loss": 0.4333, "step": 2700 }, { "epoch": 0.8352315869400152, "grad_norm": 4.058459758758545, "learning_rate": 8.963255238931623e-06, "loss": 0.4004, "step": 2750 }, { "epoch": 0.85041761579347, "grad_norm": 4.049592971801758, "learning_rate": 8.926621568431442e-06, "loss": 0.4126, "step": 2800 }, { "epoch": 0.8656036446469249, "grad_norm": 3.434569835662842, "learning_rate": 8.889429491757872e-06, "loss": 0.4134, "step": 2850 }, { "epoch": 0.8807896735003796, "grad_norm": 5.300995349884033, "learning_rate": 8.851684298008642e-06, "loss": 0.4224, "step": 2900 }, { "epoch": 0.8959757023538345, "grad_norm": 8.158344268798828, "learning_rate": 8.813391354940445e-06, "loss": 0.3538, "step": 2950 }, { "epoch": 0.9111617312072893, "grad_norm": 6.747292518615723, "learning_rate": 8.77455610820559e-06, "loss": 0.3907, "step": 3000 }, { "epoch": 0.9263477600607442, "grad_norm": 6.279948711395264, "learning_rate": 8.735184080577569e-06, "loss": 0.4344, "step": 3050 }, { "epoch": 0.9415337889141989, "grad_norm": 4.355826377868652, "learning_rate": 8.69528087116567e-06, "loss": 0.4082, "step": 3100 }, { "epoch": 0.9567198177676538, "grad_norm": 6.685491561889648, "learning_rate": 8.65485215461872e-06, "loss": 0.3851, "step": 3150 }, { "epoch": 0.9719058466211086, "grad_norm": 5.933023452758789, "learning_rate": 8.61390368031809e-06, "loss": 0.3734, "step": 3200 }, { "epoch": 0.9870918754745635, "grad_norm": 9.179722785949707, "learning_rate": 8.572441271560077e-06, "loss": 0.3934, "step": 3250 }, { "epoch": 1.0, "eval_f1": 0.8445935154128733, "eval_loss": 0.37980714440345764, "eval_runtime": 7.8494, "eval_samples_per_second": 745.663, "eval_steps_per_second": 23.314, "step": 3293 }, { "epoch": 1.0021260440394837, "grad_norm": 1.5981299877166748, "learning_rate": 8.53047082472777e-06, "loss": 0.3967, "step": 3300 }, { "epoch": 1.0173120728929386, "grad_norm": 5.159671783447266, "learning_rate": 8.487998308452525e-06, "loss": 0.3125, "step": 3350 }, { "epoch": 1.0324981017463932, "grad_norm": 8.904830932617188, "learning_rate": 8.445029762765159e-06, "loss": 0.3201, "step": 3400 }, { "epoch": 1.047684130599848, "grad_norm": 4.215548992156982, "learning_rate": 8.401571298237e-06, "loss": 0.3043, "step": 3450 }, { "epoch": 1.062870159453303, "grad_norm": 2.9603254795074463, "learning_rate": 8.357629095110906e-06, "loss": 0.307, "step": 3500 }, { "epoch": 1.0780561883067579, "grad_norm": 8.665258407592773, "learning_rate": 8.313209402422348e-06, "loss": 0.3081, "step": 3550 }, { "epoch": 1.0932422171602125, "grad_norm": 7.101922512054443, "learning_rate": 8.268318537110762e-06, "loss": 0.3536, "step": 3600 }, { "epoch": 1.1084282460136674, "grad_norm": 9.113100051879883, "learning_rate": 8.222962883121196e-06, "loss": 0.3557, "step": 3650 }, { "epoch": 1.1236142748671223, "grad_norm": 3.427243947982788, "learning_rate": 8.177148890496452e-06, "loss": 0.2984, "step": 3700 }, { "epoch": 1.138800303720577, "grad_norm": 6.6492695808410645, "learning_rate": 8.130883074459823e-06, "loss": 0.3407, "step": 3750 }, { "epoch": 1.1539863325740318, "grad_norm": 9.254618644714355, "learning_rate": 8.084172014488564e-06, "loss": 0.3487, "step": 3800 }, { "epoch": 1.1691723614274867, "grad_norm": 3.8507754802703857, "learning_rate": 8.037022353378218e-06, "loss": 0.3374, "step": 3850 }, { "epoch": 1.1843583902809416, "grad_norm": 18.62590217590332, "learning_rate": 7.989440796297943e-06, "loss": 0.3269, "step": 3900 }, { "epoch": 1.1995444191343965, "grad_norm": 14.359010696411133, "learning_rate": 7.941434109836968e-06, "loss": 0.3219, "step": 3950 }, { "epoch": 1.2147304479878511, "grad_norm": 8.173829078674316, "learning_rate": 7.893009121042314e-06, "loss": 0.2944, "step": 4000 }, { "epoch": 1.229916476841306, "grad_norm": 6.0913591384887695, "learning_rate": 7.844172716447918e-06, "loss": 0.366, "step": 4050 }, { "epoch": 1.2451025056947609, "grad_norm": 8.989174842834473, "learning_rate": 7.794931841095297e-06, "loss": 0.3223, "step": 4100 }, { "epoch": 1.2602885345482155, "grad_norm": 4.618454456329346, "learning_rate": 7.745293497545892e-06, "loss": 0.3718, "step": 4150 }, { "epoch": 1.2754745634016704, "grad_norm": 6.966646194458008, "learning_rate": 7.695264744885225e-06, "loss": 0.34, "step": 4200 }, { "epoch": 1.2906605922551253, "grad_norm": 8.476325988769531, "learning_rate": 7.64485269771903e-06, "loss": 0.309, "step": 4250 }, { "epoch": 1.3058466211085802, "grad_norm": 3.3412492275238037, "learning_rate": 7.594064525161487e-06, "loss": 0.3491, "step": 4300 }, { "epoch": 1.321032649962035, "grad_norm": 9.971606254577637, "learning_rate": 7.54290744981569e-06, "loss": 0.3097, "step": 4350 }, { "epoch": 1.3362186788154897, "grad_norm": 7.083515167236328, "learning_rate": 7.491388746746522e-06, "loss": 0.3446, "step": 4400 }, { "epoch": 1.3514047076689446, "grad_norm": 5.6028361320495605, "learning_rate": 7.439515742446065e-06, "loss": 0.3229, "step": 4450 }, { "epoch": 1.3665907365223995, "grad_norm": 9.373847961425781, "learning_rate": 7.387295813791705e-06, "loss": 0.3022, "step": 4500 }, { "epoch": 1.3817767653758541, "grad_norm": 5.378981590270996, "learning_rate": 7.334736386997049e-06, "loss": 0.2955, "step": 4550 }, { "epoch": 1.396962794229309, "grad_norm": 9.248358726501465, "learning_rate": 7.281844936555853e-06, "loss": 0.3562, "step": 4600 }, { "epoch": 1.412148823082764, "grad_norm": 6.579871654510498, "learning_rate": 7.228628984179068e-06, "loss": 0.3436, "step": 4650 }, { "epoch": 1.4273348519362186, "grad_norm": 2.5316176414489746, "learning_rate": 7.175096097725169e-06, "loss": 0.3464, "step": 4700 }, { "epoch": 1.4425208807896734, "grad_norm": 12.828206062316895, "learning_rate": 7.121253890123941e-06, "loss": 0.3333, "step": 4750 }, { "epoch": 1.4577069096431283, "grad_norm": 8.807774543762207, "learning_rate": 7.067110018293828e-06, "loss": 0.2955, "step": 4800 }, { "epoch": 1.4728929384965832, "grad_norm": 10.35312557220459, "learning_rate": 7.012672182053043e-06, "loss": 0.3321, "step": 4850 }, { "epoch": 1.488078967350038, "grad_norm": 2.2814652919769287, "learning_rate": 6.9579481230245835e-06, "loss": 0.3466, "step": 4900 }, { "epoch": 1.5032649962034927, "grad_norm": 5.442550182342529, "learning_rate": 6.9029456235352795e-06, "loss": 0.3321, "step": 4950 }, { "epoch": 1.5184510250569476, "grad_norm": 12.557025909423828, "learning_rate": 6.847672505509079e-06, "loss": 0.3429, "step": 5000 }, { "epoch": 1.5336370539104025, "grad_norm": 4.002285480499268, "learning_rate": 6.792136629354677e-06, "loss": 0.3274, "step": 5050 }, { "epoch": 1.5488230827638572, "grad_norm": 17.179048538208008, "learning_rate": 6.736345892847691e-06, "loss": 0.3472, "step": 5100 }, { "epoch": 1.564009111617312, "grad_norm": 8.354110717773438, "learning_rate": 6.680308230007521e-06, "loss": 0.3282, "step": 5150 }, { "epoch": 1.579195140470767, "grad_norm": 5.1743035316467285, "learning_rate": 6.624031609969036e-06, "loss": 0.3443, "step": 5200 }, { "epoch": 1.5943811693242216, "grad_norm": 6.959432601928711, "learning_rate": 6.567524035849293e-06, "loss": 0.35, "step": 5250 }, { "epoch": 1.6095671981776767, "grad_norm": 20.55417823791504, "learning_rate": 6.5107935436094076e-06, "loss": 0.3158, "step": 5300 }, { "epoch": 1.6247532270311313, "grad_norm": 14.025495529174805, "learning_rate": 6.453848200911752e-06, "loss": 0.3287, "step": 5350 }, { "epoch": 1.6399392558845862, "grad_norm": 12.094548225402832, "learning_rate": 6.396696105972655e-06, "loss": 0.3448, "step": 5400 }, { "epoch": 1.655125284738041, "grad_norm": 3.596747398376465, "learning_rate": 6.339345386410756e-06, "loss": 0.3544, "step": 5450 }, { "epoch": 1.6703113135914958, "grad_norm": 4.897212505340576, "learning_rate": 6.2818041980911635e-06, "loss": 0.3363, "step": 5500 }, { "epoch": 1.6854973424449506, "grad_norm": 2.7992074489593506, "learning_rate": 6.224080723965616e-06, "loss": 0.3405, "step": 5550 }, { "epoch": 1.7006833712984055, "grad_norm": 8.647635459899902, "learning_rate": 6.1661831729087705e-06, "loss": 0.3218, "step": 5600 }, { "epoch": 1.7158694001518602, "grad_norm": 16.2703800201416, "learning_rate": 6.1081197785508335e-06, "loss": 0.3569, "step": 5650 }, { "epoch": 1.731055429005315, "grad_norm": 9.62259578704834, "learning_rate": 6.049898798106636e-06, "loss": 0.3181, "step": 5700 }, { "epoch": 1.74624145785877, "grad_norm": 10.183274269104004, "learning_rate": 5.991528511201382e-06, "loss": 0.3191, "step": 5750 }, { "epoch": 1.7614274867122246, "grad_norm": 20.28440284729004, "learning_rate": 5.933017218693193e-06, "loss": 0.3162, "step": 5800 }, { "epoch": 1.7766135155656797, "grad_norm": 18.231319427490234, "learning_rate": 5.874373241492651e-06, "loss": 0.3788, "step": 5850 }, { "epoch": 1.7917995444191344, "grad_norm": 14.682201385498047, "learning_rate": 5.815604919379472e-06, "loss": 0.3242, "step": 5900 }, { "epoch": 1.8069855732725892, "grad_norm": 6.563547611236572, "learning_rate": 5.7567206098164965e-06, "loss": 0.3377, "step": 5950 }, { "epoch": 1.8221716021260441, "grad_norm": 8.406890869140625, "learning_rate": 5.697728686761189e-06, "loss": 0.3222, "step": 6000 }, { "epoch": 1.8373576309794988, "grad_norm": 5.706462860107422, "learning_rate": 5.638637539474758e-06, "loss": 0.3169, "step": 6050 }, { "epoch": 1.8525436598329537, "grad_norm": 3.566732883453369, "learning_rate": 5.579455571329128e-06, "loss": 0.2993, "step": 6100 }, { "epoch": 1.8677296886864085, "grad_norm": 21.842191696166992, "learning_rate": 5.520191198611883e-06, "loss": 0.3411, "step": 6150 }, { "epoch": 1.8829157175398632, "grad_norm": 7.155375957489014, "learning_rate": 5.460852849329394e-06, "loss": 0.3168, "step": 6200 }, { "epoch": 1.8981017463933183, "grad_norm": 5.166109085083008, "learning_rate": 5.401448962008262e-06, "loss": 0.3526, "step": 6250 }, { "epoch": 1.913287775246773, "grad_norm": 10.691755294799805, "learning_rate": 5.341987984495275e-06, "loss": 0.334, "step": 6300 }, { "epoch": 1.9284738041002278, "grad_norm": 1.8157846927642822, "learning_rate": 5.282478372756036e-06, "loss": 0.2981, "step": 6350 }, { "epoch": 1.9436598329536827, "grad_norm": 6.267528057098389, "learning_rate": 5.222928589672436e-06, "loss": 0.3443, "step": 6400 }, { "epoch": 1.9588458618071374, "grad_norm": 8.20384407043457, "learning_rate": 5.163347103839149e-06, "loss": 0.3196, "step": 6450 }, { "epoch": 1.9740318906605923, "grad_norm": 6.2834882736206055, "learning_rate": 5.10374238835931e-06, "loss": 0.3176, "step": 6500 }, { "epoch": 1.9892179195140471, "grad_norm": 7.512860298156738, "learning_rate": 5.0441229196395416e-06, "loss": 0.3216, "step": 6550 }, { "epoch": 2.0, "eval_f1": 0.847490661036219, "eval_loss": 0.39481809735298157, "eval_runtime": 7.8513, "eval_samples_per_second": 745.486, "eval_steps_per_second": 23.308, "step": 6586 } ], "logging_steps": 50, "max_steps": 13172, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.771769723795456e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }