{ "best_global_step": 900, "best_metric": 0.3843807876110077, "best_model_checkpoint": "./payment-extractor-optimized/checkpoint-900", "epoch": 4.391676866585067, "eval_steps": 100, "global_step": 900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 2.608897936344147, "epoch": 0.04895960832313342, "grad_norm": 3.6901261806488037, "learning_rate": 3.6000000000000003e-06, "loss": 3.7818, "mean_token_accuracy": 0.4824275210499763, "num_tokens": 38801.0, "step": 10 }, { "entropy": 2.6241641879081725, "epoch": 0.09791921664626684, "grad_norm": 3.3132846355438232, "learning_rate": 7.600000000000001e-06, "loss": 3.8589, "mean_token_accuracy": 0.4720219738781452, "num_tokens": 69778.0, "step": 20 }, { "entropy": 2.4173066079616548, "epoch": 0.14687882496940025, "grad_norm": 3.6249752044677734, "learning_rate": 1.16e-05, "loss": 3.3912, "mean_token_accuracy": 0.5040820851922035, "num_tokens": 96530.0, "step": 30 }, { "entropy": 2.40450359582901, "epoch": 0.19583843329253367, "grad_norm": 2.8998804092407227, "learning_rate": 1.5600000000000003e-05, "loss": 2.8447, "mean_token_accuracy": 0.5414798364043236, "num_tokens": 120256.0, "step": 40 }, { "entropy": 2.284638965129852, "epoch": 0.24479804161566707, "grad_norm": 3.2823739051818848, "learning_rate": 1.9600000000000002e-05, "loss": 1.994, "mean_token_accuracy": 0.6505917206406593, "num_tokens": 142198.0, "step": 50 }, { "entropy": 2.4689642548561097, "epoch": 0.2937576499388005, "grad_norm": 2.1198339462280273, "learning_rate": 1.9995795492789368e-05, "loss": 2.407, "mean_token_accuracy": 0.6436298653483391, "num_tokens": 182368.0, "step": 60 }, { "entropy": 1.9469897836446761, "epoch": 0.3427172582619339, "grad_norm": 1.6485199928283691, "learning_rate": 1.9981265932877486e-05, "loss": 1.822, "mean_token_accuracy": 0.7381727218627929, "num_tokens": 213403.0, "step": 70 }, { "entropy": 1.2083469659090043, "epoch": 0.39167686658506734, "grad_norm": 1.6277607679367065, "learning_rate": 1.995637449278864e-05, "loss": 1.1033, "mean_token_accuracy": 0.8396818682551384, "num_tokens": 239775.0, "step": 80 }, { "entropy": 0.8737979799509048, "epoch": 0.44063647490820074, "grad_norm": 1.428305745124817, "learning_rate": 1.9921147013144782e-05, "loss": 0.819, "mean_token_accuracy": 0.8934117943048477, "num_tokens": 263407.0, "step": 90 }, { "entropy": 0.5950189992785454, "epoch": 0.48959608323133413, "grad_norm": 1.842503547668457, "learning_rate": 1.98756200647502e-05, "loss": 0.5144, "mean_token_accuracy": 0.9327697649598121, "num_tokens": 285288.0, "step": 100 }, { "epoch": 0.48959608323133413, "eval_entropy": 1.026704891055238, "eval_loss": 1.094792127609253, "eval_mean_token_accuracy": 0.8390047088557598, "eval_num_tokens": 285288.0, "eval_runtime": 21.2516, "eval_samples_per_second": 19.199, "eval_steps_per_second": 4.8, "step": 100 }, { "entropy": 1.7036595433950423, "epoch": 0.5385556915544676, "grad_norm": 1.3429725170135498, "learning_rate": 1.9819840910626174e-05, "loss": 1.8116, "mean_token_accuracy": 0.7263100475072861, "num_tokens": 325823.0, "step": 110 }, { "entropy": 1.3686422437429429, "epoch": 0.587515299877601, "grad_norm": 1.5084576606750488, "learning_rate": 1.9753867456945653e-05, "loss": 1.2567, "mean_token_accuracy": 0.8075757145881652, "num_tokens": 356947.0, "step": 120 }, { "entropy": 0.9899184465408325, "epoch": 0.6364749082007344, "grad_norm": 1.1403902769088745, "learning_rate": 1.9677768192918973e-05, "loss": 0.9506, "mean_token_accuracy": 0.8574481099843979, "num_tokens": 384860.0, "step": 130 }, { "entropy": 0.6585730135440826, "epoch": 0.6854345165238678, "grad_norm": 1.179526925086975, "learning_rate": 1.9591622119692953e-05, "loss": 0.6207, "mean_token_accuracy": 0.9152766704559326, "num_tokens": 408668.0, "step": 140 }, { "entropy": 0.49890025779604913, "epoch": 0.7343941248470012, "grad_norm": 2.6344473361968994, "learning_rate": 1.9495518668337204e-05, "loss": 0.4502, "mean_token_accuracy": 0.9399106308817864, "num_tokens": 430716.0, "step": 150 }, { "entropy": 1.4263419717550279, "epoch": 0.7833537331701347, "grad_norm": 1.8926013708114624, "learning_rate": 1.9389557607002808e-05, "loss": 1.4959, "mean_token_accuracy": 0.7735099658370018, "num_tokens": 468866.0, "step": 160 }, { "entropy": 1.0953898191452027, "epoch": 0.8323133414932681, "grad_norm": 0.972065806388855, "learning_rate": 1.9273848937349712e-05, "loss": 0.9593, "mean_token_accuracy": 0.8498499140143394, "num_tokens": 499808.0, "step": 170 }, { "entropy": 0.7438366547226906, "epoch": 0.8812729498164015, "grad_norm": 1.1401448249816895, "learning_rate": 1.9148512780350384e-05, "loss": 0.7447, "mean_token_accuracy": 0.885815504193306, "num_tokens": 526616.0, "step": 180 }, { "entropy": 0.5698909372091293, "epoch": 0.9302325581395349, "grad_norm": 1.2068321704864502, "learning_rate": 1.9013679251588304e-05, "loss": 0.5473, "mean_token_accuracy": 0.9253638312220573, "num_tokens": 550407.0, "step": 190 }, { "entropy": 0.44005816280841825, "epoch": 0.9791921664626683, "grad_norm": 2.358927011489868, "learning_rate": 1.8869488326180682e-05, "loss": 0.4075, "mean_token_accuracy": 0.9496751800179482, "num_tokens": 572396.0, "step": 200 }, { "epoch": 0.9791921664626683, "eval_entropy": 0.7869388329632142, "eval_loss": 0.8091182112693787, "eval_mean_token_accuracy": 0.8807467134559855, "eval_num_tokens": 572396.0, "eval_runtime": 21.2312, "eval_samples_per_second": 19.217, "eval_steps_per_second": 4.804, "step": 200 }, { "entropy": 1.2279899853306848, "epoch": 1.0244798041615668, "grad_norm": 2.1428945064544678, "learning_rate": 1.8716089693465696e-05, "loss": 1.1313, "mean_token_accuracy": 0.8170017583950145, "num_tokens": 606225.0, "step": 210 }, { "entropy": 1.1651067942380906, "epoch": 1.0734394124847002, "grad_norm": 1.0429226160049438, "learning_rate": 1.855364260160507e-05, "loss": 1.0415, "mean_token_accuracy": 0.8387726441025734, "num_tokens": 639876.0, "step": 220 }, { "entropy": 0.8667519375681877, "epoch": 1.1223990208078336, "grad_norm": 1.605554461479187, "learning_rate": 1.8382315692263324e-05, "loss": 0.8179, "mean_token_accuracy": 0.8734943136572838, "num_tokens": 669254.0, "step": 230 }, { "entropy": 0.5002034060657025, "epoch": 1.171358629130967, "grad_norm": 1.766509771347046, "learning_rate": 1.820228682553533e-05, "loss": 0.5005, "mean_token_accuracy": 0.9262110635638237, "num_tokens": 693566.0, "step": 240 }, { "entropy": 0.4824553743004799, "epoch": 1.2203182374541004, "grad_norm": 1.3010213375091553, "learning_rate": 1.8013742895303883e-05, "loss": 0.4382, "mean_token_accuracy": 0.9407104596495628, "num_tokens": 716324.0, "step": 250 }, { "entropy": 0.886764119565487, "epoch": 1.2692778457772338, "grad_norm": 2.592423915863037, "learning_rate": 1.7816879635219028e-05, "loss": 0.8545, "mean_token_accuracy": 0.8653567478060722, "num_tokens": 749252.0, "step": 260 }, { "entropy": 0.9997715935111046, "epoch": 1.3182374541003672, "grad_norm": 1.1456091403961182, "learning_rate": 1.7611901415500536e-05, "loss": 0.9247, "mean_token_accuracy": 0.8514459431171417, "num_tokens": 782122.0, "step": 270 }, { "entropy": 0.8610424906015396, "epoch": 1.3671970624235006, "grad_norm": 1.3674919605255127, "learning_rate": 1.7399021030774443e-05, "loss": 0.7975, "mean_token_accuracy": 0.8724059730768203, "num_tokens": 812066.0, "step": 280 }, { "entropy": 0.4803216062486172, "epoch": 1.416156670746634, "grad_norm": 1.0078914165496826, "learning_rate": 1.717845947916398e-05, "loss": 0.4703, "mean_token_accuracy": 0.9251162573695183, "num_tokens": 836705.0, "step": 290 }, { "entropy": 0.477826127409935, "epoch": 1.4651162790697674, "grad_norm": 1.0581692457199097, "learning_rate": 1.695044573286413e-05, "loss": 0.4365, "mean_token_accuracy": 0.9394737258553505, "num_tokens": 859600.0, "step": 300 }, { "epoch": 1.4651162790697674, "eval_entropy": 0.7299714935760871, "eval_loss": 0.6802113056182861, "eval_mean_token_accuracy": 0.8902294659147075, "eval_num_tokens": 859600.0, "eval_runtime": 21.2433, "eval_samples_per_second": 19.206, "eval_steps_per_second": 4.802, "step": 300 }, { "entropy": 0.8674551732838154, "epoch": 1.514075887392901, "grad_norm": 2.246056079864502, "learning_rate": 1.6715216500438093e-05, "loss": 0.7837, "mean_token_accuracy": 0.874087019264698, "num_tokens": 891961.0, "step": 310 }, { "entropy": 0.9596038445830345, "epoch": 1.5630354957160342, "grad_norm": 1.619541883468628, "learning_rate": 1.647301598108234e-05, "loss": 0.8646, "mean_token_accuracy": 0.857255433499813, "num_tokens": 924600.0, "step": 320 }, { "entropy": 0.7986739322543144, "epoch": 1.6119951040391678, "grad_norm": 2.2831315994262695, "learning_rate": 1.6224095611115385e-05, "loss": 0.7373, "mean_token_accuracy": 0.8797152638435364, "num_tokens": 954438.0, "step": 330 }, { "entropy": 0.47184758856892584, "epoch": 1.660954712362301, "grad_norm": 2.6723692417144775, "learning_rate": 1.596871380295351e-05, "loss": 0.4218, "mean_token_accuracy": 0.9275540292263031, "num_tokens": 979015.0, "step": 340 }, { "entropy": 0.45431587770581244, "epoch": 1.7099143206854346, "grad_norm": 2.8102011680603027, "learning_rate": 1.570713567684432e-05, "loss": 0.3684, "mean_token_accuracy": 0.9440548986196518, "num_tokens": 1001989.0, "step": 350 }, { "entropy": 0.7829712487757206, "epoch": 1.758873929008568, "grad_norm": 1.7116206884384155, "learning_rate": 1.5439632785636707e-05, "loss": 0.663, "mean_token_accuracy": 0.8908370733261108, "num_tokens": 1034477.0, "step": 360 }, { "entropy": 0.8193635582923889, "epoch": 1.8078335373317014, "grad_norm": 0.9935292601585388, "learning_rate": 1.5166482832872923e-05, "loss": 0.7506, "mean_token_accuracy": 0.8711828157305718, "num_tokens": 1067534.0, "step": 370 }, { "entropy": 0.7036949403584003, "epoch": 1.8567931456548348, "grad_norm": 1.7203794717788696, "learning_rate": 1.4887969384495403e-05, "loss": 0.6792, "mean_token_accuracy": 0.88548723757267, "num_tokens": 1097280.0, "step": 380 }, { "entropy": 0.3680045209825039, "epoch": 1.9057527539779682, "grad_norm": 1.4336254596710205, "learning_rate": 1.4604381574467616e-05, "loss": 0.3249, "mean_token_accuracy": 0.9348379656672478, "num_tokens": 1121920.0, "step": 390 }, { "entropy": 0.37416374161839483, "epoch": 1.9547123623011016, "grad_norm": 1.0702754259109497, "learning_rate": 1.4316013804614644e-05, "loss": 0.2929, "mean_token_accuracy": 0.9463742494583129, "num_tokens": 1145002.0, "step": 400 }, { "epoch": 1.9547123623011016, "eval_entropy": 0.6033922105151064, "eval_loss": 0.49456480145454407, "eval_mean_token_accuracy": 0.9079791880121418, "eval_num_tokens": 1145002.0, "eval_runtime": 21.1773, "eval_samples_per_second": 19.266, "eval_steps_per_second": 4.816, "step": 400 }, { "entropy": 0.41346504196927353, "epoch": 2.0, "grad_norm": 0.9998394250869751, "learning_rate": 1.4023165438994933e-05, "loss": 0.3152, "mean_token_accuracy": 0.9379849611101924, "num_tokens": 1167618.0, "step": 410 }, { "entropy": 1.0350674405694007, "epoch": 2.0489596083231336, "grad_norm": 1.5349912643432617, "learning_rate": 1.3726140493120639e-05, "loss": 0.8548, "mean_token_accuracy": 0.8497318252921104, "num_tokens": 1208219.0, "step": 420 }, { "entropy": 0.6850787699222565, "epoch": 2.097919216646267, "grad_norm": 0.9396725296974182, "learning_rate": 1.3425247318349137e-05, "loss": 0.6238, "mean_token_accuracy": 0.8850513309240341, "num_tokens": 1239429.0, "step": 430 }, { "entropy": 0.5261640004813671, "epoch": 2.1468788249694004, "grad_norm": 1.1289340257644653, "learning_rate": 1.3120798281773346e-05, "loss": 0.4823, "mean_token_accuracy": 0.9071076571941376, "num_tokens": 1266803.0, "step": 440 }, { "entropy": 0.3201193898916245, "epoch": 2.1958384332925336, "grad_norm": 0.8056601285934448, "learning_rate": 1.2813109441943166e-05, "loss": 0.2848, "mean_token_accuracy": 0.943960489332676, "num_tokens": 1290548.0, "step": 450 }, { "entropy": 0.24066586568951606, "epoch": 2.244798041615667, "grad_norm": 1.4606080055236816, "learning_rate": 1.2502500220754736e-05, "loss": 0.198, "mean_token_accuracy": 0.9593303337693214, "num_tokens": 1312508.0, "step": 460 }, { "entropy": 0.9151711657643318, "epoch": 2.2937576499388004, "grad_norm": 1.4147366285324097, "learning_rate": 1.2189293071848051e-05, "loss": 0.7861, "mean_token_accuracy": 0.8564901977777482, "num_tokens": 1351246.0, "step": 470 }, { "entropy": 0.6647965341806412, "epoch": 2.342717258261934, "grad_norm": 0.8306525349617004, "learning_rate": 1.187381314585725e-05, "loss": 0.6222, "mean_token_accuracy": 0.8851410359144211, "num_tokens": 1382196.0, "step": 480 }, { "entropy": 0.4626566760241985, "epoch": 2.391676866585067, "grad_norm": 1.2030415534973145, "learning_rate": 1.1556387952861036e-05, "loss": 0.4374, "mean_token_accuracy": 0.9146721437573433, "num_tokens": 1409050.0, "step": 490 }, { "entropy": 0.292323163151741, "epoch": 2.440636474908201, "grad_norm": 0.6598659753799438, "learning_rate": 1.1237347022383747e-05, "loss": 0.2684, "mean_token_accuracy": 0.9467211216688156, "num_tokens": 1432781.0, "step": 500 }, { "epoch": 2.440636474908201, "eval_entropy": 0.5070152823247162, "eval_loss": 0.4397956132888794, "eval_mean_token_accuracy": 0.9159132575287539, "eval_num_tokens": 1432781.0, "eval_runtime": 21.252, "eval_samples_per_second": 19.198, "eval_steps_per_second": 4.8, "step": 500 }, { "entropy": 0.22589576356112956, "epoch": 2.489596083231334, "grad_norm": 0.9443103075027466, "learning_rate": 1.0917021561299864e-05, "loss": 0.191, "mean_token_accuracy": 0.9594075292348861, "num_tokens": 1454735.0, "step": 510 }, { "entropy": 0.8820951759815217, "epoch": 2.5385556915544676, "grad_norm": 1.3829594850540161, "learning_rate": 1.0595744109997326e-05, "loss": 0.7404, "mean_token_accuracy": 0.8678841248154641, "num_tokens": 1494090.0, "step": 520 }, { "entropy": 0.6331574842333794, "epoch": 2.5875152998776008, "grad_norm": 1.056781530380249, "learning_rate": 1.0273848197156401e-05, "loss": 0.5978, "mean_token_accuracy": 0.888825386762619, "num_tokens": 1525062.0, "step": 530 }, { "entropy": 0.4361429732292891, "epoch": 2.6364749082007344, "grad_norm": 1.1021668910980225, "learning_rate": 9.951667993502599e-06, "loss": 0.3999, "mean_token_accuracy": 0.9207724243402481, "num_tokens": 1551813.0, "step": 540 }, { "entropy": 0.2877490069717169, "epoch": 2.685434516523868, "grad_norm": 0.7195802927017212, "learning_rate": 9.629537964893063e-06, "loss": 0.2665, "mean_token_accuracy": 0.9467098742723465, "num_tokens": 1575433.0, "step": 550 }, { "entropy": 0.2189638450741768, "epoch": 2.734394124847001, "grad_norm": 1.2461501359939575, "learning_rate": 9.307792525096582e-06, "loss": 0.1899, "mean_token_accuracy": 0.9602582737803459, "num_tokens": 1597306.0, "step": 560 }, { "entropy": 0.825089768320322, "epoch": 2.783353733170135, "grad_norm": 1.1732733249664307, "learning_rate": 8.986765688627652e-06, "loss": 0.6871, "mean_token_accuracy": 0.8730710700154305, "num_tokens": 1636454.0, "step": 570 }, { "entropy": 0.6250899910926819, "epoch": 2.832313341493268, "grad_norm": 0.8510639071464539, "learning_rate": 8.666790723995043e-06, "loss": 0.597, "mean_token_accuracy": 0.8902681171894073, "num_tokens": 1667449.0, "step": 580 }, { "entropy": 0.4255227465182543, "epoch": 2.8812729498164016, "grad_norm": 0.7777973413467407, "learning_rate": 8.348199807724806e-06, "loss": 0.3826, "mean_token_accuracy": 0.921813291311264, "num_tokens": 1694019.0, "step": 590 }, { "entropy": 0.280773538723588, "epoch": 2.9302325581395348, "grad_norm": 0.7043775916099548, "learning_rate": 8.0313236795169e-06, "loss": 0.2658, "mean_token_accuracy": 0.9461780115962029, "num_tokens": 1717763.0, "step": 600 }, { "epoch": 2.9302325581395348, "eval_entropy": 0.4671570363582349, "eval_loss": 0.41600170731544495, "eval_mean_token_accuracy": 0.919863361938327, "eval_num_tokens": 1717763.0, "eval_runtime": 21.0643, "eval_samples_per_second": 19.369, "eval_steps_per_second": 4.842, "step": 600 }, { "entropy": 0.22502716928720473, "epoch": 2.9791921664626684, "grad_norm": 1.245797872543335, "learning_rate": 7.716491298893443e-06, "loss": 0.2014, "mean_token_accuracy": 0.9587241023778915, "num_tokens": 1739700.0, "step": 610 }, { "entropy": 0.7121120212045876, "epoch": 3.0244798041615666, "grad_norm": 1.578238606452942, "learning_rate": 7.404029503695028e-06, "loss": 0.592, "mean_token_accuracy": 0.884317248254209, "num_tokens": 1773279.0, "step": 620 }, { "entropy": 0.6201752804219722, "epoch": 3.0734394124847, "grad_norm": 0.9049448370933533, "learning_rate": 7.094262670779611e-06, "loss": 0.5563, "mean_token_accuracy": 0.8980106115341187, "num_tokens": 1806614.0, "step": 630 }, { "entropy": 0.5976772040128708, "epoch": 3.1223990208078334, "grad_norm": 1.2108434438705444, "learning_rate": 6.78751237927623e-06, "loss": 0.5527, "mean_token_accuracy": 0.898520989716053, "num_tokens": 1836366.0, "step": 640 }, { "entropy": 0.26630178317427633, "epoch": 3.171358629130967, "grad_norm": 0.9259095788002014, "learning_rate": 6.48409707674317e-06, "loss": 0.2551, "mean_token_accuracy": 0.9447213500738144, "num_tokens": 1860872.0, "step": 650 }, { "entropy": 0.2778078857809305, "epoch": 3.2203182374541, "grad_norm": 1.1952687501907349, "learning_rate": 6.18433174857705e-06, "loss": 0.2529, "mean_token_accuracy": 0.9497045069932938, "num_tokens": 1883874.0, "step": 660 }, { "entropy": 0.5209752138704061, "epoch": 3.269277845777234, "grad_norm": 1.9098657369613647, "learning_rate": 5.8885275910161574e-06, "loss": 0.4378, "mean_token_accuracy": 0.9142355337738991, "num_tokens": 1917087.0, "step": 670 }, { "entropy": 0.64239452034235, "epoch": 3.318237454100367, "grad_norm": 0.8732991814613342, "learning_rate": 5.596991688077409e-06, "loss": 0.5759, "mean_token_accuracy": 0.8934120118618012, "num_tokens": 1949777.0, "step": 680 }, { "entropy": 0.5809963166713714, "epoch": 3.3671970624235006, "grad_norm": 0.8436909317970276, "learning_rate": 5.310026692762316e-06, "loss": 0.5335, "mean_token_accuracy": 0.8995566830039025, "num_tokens": 1978937.0, "step": 690 }, { "entropy": 0.2611778501421213, "epoch": 3.416156670746634, "grad_norm": 1.1178691387176514, "learning_rate": 5.027930512862976e-06, "loss": 0.2472, "mean_token_accuracy": 0.9472135573625564, "num_tokens": 2003325.0, "step": 700 }, { "epoch": 3.416156670746634, "eval_entropy": 0.4348171895333365, "eval_loss": 0.39616772532463074, "eval_mean_token_accuracy": 0.9233323300586027, "eval_num_tokens": 2003325.0, "eval_runtime": 21.154, "eval_samples_per_second": 19.287, "eval_steps_per_second": 4.822, "step": 700 }, { "entropy": 0.25852978341281413, "epoch": 3.4651162790697674, "grad_norm": 1.280714511871338, "learning_rate": 4.750996001694215e-06, "loss": 0.2374, "mean_token_accuracy": 0.9529847070574761, "num_tokens": 2026113.0, "step": 710 }, { "entropy": 0.5000423431396485, "epoch": 3.514075887392901, "grad_norm": 1.6330279111862183, "learning_rate": 4.479510654072909e-06, "loss": 0.4167, "mean_token_accuracy": 0.9191933527588845, "num_tokens": 2058740.0, "step": 720 }, { "entropy": 0.5997287526726722, "epoch": 3.563035495716034, "grad_norm": 0.8560138940811157, "learning_rate": 4.213756307860175e-06, "loss": 0.5358, "mean_token_accuracy": 0.8984866350889206, "num_tokens": 2091634.0, "step": 730 }, { "entropy": 0.5770318634808064, "epoch": 3.611995104039168, "grad_norm": 1.2353603839874268, "learning_rate": 3.954008851376252e-06, "loss": 0.5377, "mean_token_accuracy": 0.9002507776021957, "num_tokens": 2121380.0, "step": 740 }, { "entropy": 0.26896645687520504, "epoch": 3.660954712362301, "grad_norm": 0.5919920802116394, "learning_rate": 3.700537936991733e-06, "loss": 0.2541, "mean_token_accuracy": 0.9460319548845291, "num_tokens": 2145937.0, "step": 750 }, { "entropy": 0.27403030432760717, "epoch": 3.7099143206854346, "grad_norm": 0.9070358276367188, "learning_rate": 3.4536067011925945e-06, "loss": 0.2555, "mean_token_accuracy": 0.9493874981999397, "num_tokens": 2168886.0, "step": 760 }, { "entropy": 0.49704274982213975, "epoch": 3.758873929008568, "grad_norm": 1.7363125085830688, "learning_rate": 3.213471491409568e-06, "loss": 0.42, "mean_token_accuracy": 0.9207542642951012, "num_tokens": 2201618.0, "step": 770 }, { "entropy": 0.6181983411312103, "epoch": 3.8078335373317014, "grad_norm": 0.8889790773391724, "learning_rate": 2.9803815998954334e-06, "loss": 0.5448, "mean_token_accuracy": 0.8981723725795746, "num_tokens": 2234716.0, "step": 780 }, { "entropy": 0.5966475777328014, "epoch": 3.8567931456548346, "grad_norm": 1.2056671380996704, "learning_rate": 2.7545790049265506e-06, "loss": 0.5489, "mean_token_accuracy": 0.8978816866874695, "num_tokens": 2264636.0, "step": 790 }, { "entropy": 0.2779835805296898, "epoch": 3.905752753977968, "grad_norm": 0.9485862851142883, "learning_rate": 2.5362981195972627e-06, "loss": 0.2623, "mean_token_accuracy": 0.9445345297455787, "num_tokens": 2289186.0, "step": 800 }, { "epoch": 3.905752753977968, "eval_entropy": 0.42678100907919453, "eval_loss": 0.3873916268348694, "eval_mean_token_accuracy": 0.9257115113968942, "eval_num_tokens": 2289186.0, "eval_runtime": 21.0498, "eval_samples_per_second": 19.383, "eval_steps_per_second": 4.846, "step": 800 }, { "entropy": 0.26332913562655447, "epoch": 3.954712362301102, "grad_norm": 0.8345707058906555, "learning_rate": 2.3257655484679376e-06, "loss": 0.2408, "mean_token_accuracy": 0.9522591263055802, "num_tokens": 2312074.0, "step": 810 }, { "entropy": 0.31254331364824967, "epoch": 4.0, "grad_norm": 1.0298808813095093, "learning_rate": 2.123199852319352e-06, "loss": 0.2697, "mean_token_accuracy": 0.9450424829044858, "num_tokens": 2335236.0, "step": 820 }, { "entropy": 0.6749883458018303, "epoch": 4.048959608323133, "grad_norm": 0.9510061740875244, "learning_rate": 1.9288113212575454e-06, "loss": 0.5564, "mean_token_accuracy": 0.8976946637034416, "num_tokens": 2374400.0, "step": 830 }, { "entropy": 0.5979305505752563, "epoch": 4.097919216646267, "grad_norm": 0.8506686687469482, "learning_rate": 1.7428017564047594e-06, "loss": 0.562, "mean_token_accuracy": 0.8973473310470581, "num_tokens": 2405288.0, "step": 840 }, { "entropy": 0.38541266694664955, "epoch": 4.1468788249694, "grad_norm": 0.8283617496490479, "learning_rate": 1.565364260403055e-06, "loss": 0.3559, "mean_token_accuracy": 0.9307585805654526, "num_tokens": 2431701.0, "step": 850 }, { "entropy": 0.26541123948991296, "epoch": 4.195838433292534, "grad_norm": 0.6910677552223206, "learning_rate": 1.3966830369481231e-06, "loss": 0.2495, "mean_token_accuracy": 0.9491465017199516, "num_tokens": 2455249.0, "step": 860 }, { "entropy": 0.19118896164000035, "epoch": 4.244798041615667, "grad_norm": 0.9370400905609131, "learning_rate": 1.2369331995613664e-06, "loss": 0.1709, "mean_token_accuracy": 0.9622136607766152, "num_tokens": 2477038.0, "step": 870 }, { "entropy": 0.7027731344103814, "epoch": 4.293757649938801, "grad_norm": 1.2064703702926636, "learning_rate": 1.0862805897987894e-06, "loss": 0.5842, "mean_token_accuracy": 0.8925257995724678, "num_tokens": 2516496.0, "step": 880 }, { "entropy": 0.5964450292289257, "epoch": 4.342717258261934, "grad_norm": 1.0601837635040283, "learning_rate": 9.448816050854559e-07, "loss": 0.5577, "mean_token_accuracy": 0.8974330082535744, "num_tokens": 2547663.0, "step": 890 }, { "entropy": 0.4454402156174183, "epoch": 4.391676866585067, "grad_norm": 0.7408022880554199, "learning_rate": 8.128830363541574e-07, "loss": 0.4126, "mean_token_accuracy": 0.9200813502073288, "num_tokens": 2575037.0, "step": 900 }, { "epoch": 4.391676866585067, "eval_entropy": 0.4241247632924248, "eval_loss": 0.3843807876110077, "eval_mean_token_accuracy": 0.9258733800813264, "eval_num_tokens": 2575037.0, "eval_runtime": 21.0482, "eval_samples_per_second": 19.384, "eval_steps_per_second": 4.846, "step": 900 } ], "logging_steps": 10, "max_steps": 1025, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.525760757970944e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }