{ "best_global_step": 562, "best_metric": 0.15477769076824188, "best_model_checkpoint": "/app/output/bf-router-v0.5/checkpoint-562", "epoch": 2.0, "eval_steps": 500, "global_step": 562, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.4157426357269287, "epoch": 0.003561887800534283, "grad_norm": 1.203125, "learning_rate": 0.0, "loss": 4.336674213409424, "mean_token_accuracy": 0.4680640399456024, "num_tokens": 3092.0, "step": 1 }, { "entropy": 1.5884604156017303, "epoch": 0.03561887800534283, "grad_norm": 0.59375, "learning_rate": 6.923076923076924e-05, "loss": 3.986017862955729, "mean_token_accuracy": 0.4735516524977154, "num_tokens": 30650.0, "step": 10 }, { "entropy": 2.2315500736236573, "epoch": 0.07123775601068566, "grad_norm": 0.26953125, "learning_rate": 0.00014615384615384615, "loss": 2.16544189453125, "mean_token_accuracy": 0.6272616922855377, "num_tokens": 61735.0, "step": 20 }, { "entropy": 0.8066075079143047, "epoch": 0.10685663401602849, "grad_norm": 0.298828125, "learning_rate": 0.00019999334629824895, "loss": 0.7935717582702637, "mean_token_accuracy": 0.857068446278572, "num_tokens": 92484.0, "step": 30 }, { "entropy": 0.5340980306267739, "epoch": 0.14247551202137132, "grad_norm": 0.10693359375, "learning_rate": 0.00019987508289735716, "loss": 0.5261529922485352, "mean_token_accuracy": 0.9004581302404404, "num_tokens": 123010.0, "step": 40 }, { "entropy": 0.4001643668860197, "epoch": 0.17809439002671415, "grad_norm": 0.08251953125, "learning_rate": 0.0001996091607179287, "loss": 0.3956298351287842, "mean_token_accuracy": 0.9209299519658088, "num_tokens": 153710.0, "step": 50 }, { "entropy": 0.34828933551907537, "epoch": 0.21371326803205698, "grad_norm": 0.10009765625, "learning_rate": 0.00019919597290851538, "loss": 0.3648662090301514, "mean_token_accuracy": 0.9263768374919892, "num_tokens": 184577.0, "step": 60 }, { "entropy": 0.2983129292726517, "epoch": 0.2493321460373998, "grad_norm": 0.08447265625, "learning_rate": 0.00019863613034027224, "loss": 0.29642860889434813, "mean_token_accuracy": 0.9371836110949516, "num_tokens": 215040.0, "step": 70 }, { "entropy": 0.280751570686698, "epoch": 0.28495102404274264, "grad_norm": 0.099609375, "learning_rate": 0.00019793046070382437, "loss": 0.2813178300857544, "mean_token_accuracy": 0.9388988897204399, "num_tokens": 245683.0, "step": 80 }, { "entropy": 0.2960826952010393, "epoch": 0.3205699020480855, "grad_norm": 0.11962890625, "learning_rate": 0.00019708000728558064, "loss": 0.31115410327911375, "mean_token_accuracy": 0.9335825845599175, "num_tokens": 276430.0, "step": 90 }, { "entropy": 0.28416929207742214, "epoch": 0.3561887800534283, "grad_norm": 0.103515625, "learning_rate": 0.00019608602742530283, "loss": 0.27938365936279297, "mean_token_accuracy": 0.9393619552254677, "num_tokens": 307296.0, "step": 100 }, { "entropy": 0.23417662251740695, "epoch": 0.39180765805877116, "grad_norm": 0.06298828125, "learning_rate": 0.00019494999065721108, "loss": 0.23734774589538574, "mean_token_accuracy": 0.9487677246332169, "num_tokens": 337962.0, "step": 110 }, { "entropy": 0.24019648060202597, "epoch": 0.42742653606411396, "grad_norm": 0.06640625, "learning_rate": 0.0001936735765373737, "loss": 0.245324969291687, "mean_token_accuracy": 0.9457737937569618, "num_tokens": 368782.0, "step": 120 }, { "entropy": 0.269666058011353, "epoch": 0.4630454140694568, "grad_norm": 0.09765625, "learning_rate": 0.00019225867216059325, "loss": 0.2686375617980957, "mean_token_accuracy": 0.9398508563637733, "num_tokens": 399329.0, "step": 130 }, { "entropy": 0.18328354582190515, "epoch": 0.4986642920747996, "grad_norm": 0.06298828125, "learning_rate": 0.00019070736937046035, "loss": 0.18070420026779174, "mean_token_accuracy": 0.9568557634949684, "num_tokens": 429759.0, "step": 140 }, { "entropy": 0.21165105439722537, "epoch": 0.5342831700801425, "grad_norm": 0.0810546875, "learning_rate": 0.0001890219616666997, "loss": 0.22004930973052977, "mean_token_accuracy": 0.9494227185845375, "num_tokens": 460409.0, "step": 150 }, { "entropy": 0.23916025627404452, "epoch": 0.5699020480854853, "grad_norm": 0.07568359375, "learning_rate": 0.00018720494081438078, "loss": 0.2379377841949463, "mean_token_accuracy": 0.9455192387104034, "num_tokens": 491344.0, "step": 160 }, { "entropy": 0.2238690486177802, "epoch": 0.6055209260908282, "grad_norm": 0.08984375, "learning_rate": 0.00018525899316000608, "loss": 0.2245168685913086, "mean_token_accuracy": 0.9485938593745231, "num_tokens": 522029.0, "step": 170 }, { "entropy": 0.17032606173306702, "epoch": 0.641139804096171, "grad_norm": 0.080078125, "learning_rate": 0.00018318699565992357, "loss": 0.17549347877502441, "mean_token_accuracy": 0.9574712902307511, "num_tokens": 552306.0, "step": 180 }, { "entropy": 0.21890966054052113, "epoch": 0.6767586821015138, "grad_norm": 0.0673828125, "learning_rate": 0.00018099201162693476, "loss": 0.21968293190002441, "mean_token_accuracy": 0.9482985377311707, "num_tokens": 583181.0, "step": 190 }, { "entropy": 0.20646683853119613, "epoch": 0.7123775601068566, "grad_norm": 0.09765625, "learning_rate": 0.00017867728620138708, "loss": 0.20968315601348878, "mean_token_accuracy": 0.9519409075379371, "num_tokens": 613818.0, "step": 200 }, { "entropy": 0.23000301327556372, "epoch": 0.7479964381121995, "grad_norm": 0.08935546875, "learning_rate": 0.00017624624155344626, "loss": 0.2383474349975586, "mean_token_accuracy": 0.9458372846245766, "num_tokens": 644576.0, "step": 210 }, { "entropy": 0.21420854832977057, "epoch": 0.7836153161175423, "grad_norm": 0.0966796875, "learning_rate": 0.0001737024718236413, "loss": 0.21017465591430665, "mean_token_accuracy": 0.9511569887399673, "num_tokens": 675564.0, "step": 220 }, { "entropy": 0.19280126914381981, "epoch": 0.8192341941228851, "grad_norm": 0.08154296875, "learning_rate": 0.00017104973780916294, "loss": 0.1936139941215515, "mean_token_accuracy": 0.954449312388897, "num_tokens": 706309.0, "step": 230 }, { "entropy": 0.22524890769273043, "epoch": 0.8548530721282279, "grad_norm": 0.10546875, "learning_rate": 0.00016829196140377085, "loss": 0.22950620651245118, "mean_token_accuracy": 0.9476036429405212, "num_tokens": 737135.0, "step": 240 }, { "entropy": 0.18842433094978334, "epoch": 0.8904719501335708, "grad_norm": 0.087890625, "learning_rate": 0.00016543321979953007, "loss": 0.19049547910690307, "mean_token_accuracy": 0.9543813273310662, "num_tokens": 767853.0, "step": 250 }, { "entropy": 0.1991994746029377, "epoch": 0.9260908281389136, "grad_norm": 0.07763671875, "learning_rate": 0.00016247773945894962, "loss": 0.19527161121368408, "mean_token_accuracy": 0.9535173490643502, "num_tokens": 798511.0, "step": 260 }, { "entropy": 0.18196075949817897, "epoch": 0.9617097061442564, "grad_norm": 0.0771484375, "learning_rate": 0.00015942988986643352, "loss": 0.19091761112213135, "mean_token_accuracy": 0.9535432115197182, "num_tokens": 829447.0, "step": 270 }, { "entropy": 0.18307169582694768, "epoch": 0.9973285841495992, "grad_norm": 0.08837890625, "learning_rate": 0.00015629417706828423, "loss": 0.1818631410598755, "mean_token_accuracy": 0.955574706196785, "num_tokens": 860142.0, "step": 280 }, { "epoch": 1.0, "eval_entropy": 0.18207345770817276, "eval_loss": 0.18149949610233307, "eval_mean_token_accuracy": 0.9555717660180221, "eval_num_tokens": 862512.0, "eval_runtime": 35.3256, "eval_samples_per_second": 15.881, "eval_steps_per_second": 3.991, "step": 281 }, { "entropy": 0.1545313375118451, "epoch": 1.0320569902048085, "grad_norm": 0.064453125, "learning_rate": 0.00015307523701080768, "loss": 0.14675980806350708, "mean_token_accuracy": 0.9623546004295349, "num_tokens": 889907.0, "step": 290 }, { "entropy": 0.13382284864783286, "epoch": 1.0676758682101515, "grad_norm": 0.060546875, "learning_rate": 0.00014977782868636999, "loss": 0.13706474304199218, "mean_token_accuracy": 0.9633408606052398, "num_tokens": 920639.0, "step": 300 }, { "entropy": 0.16596811451017857, "epoch": 1.1032947462154943, "grad_norm": 0.0791015625, "learning_rate": 0.00014640682709753832, "loss": 0.16318607330322266, "mean_token_accuracy": 0.9578790530562401, "num_tokens": 951323.0, "step": 310 }, { "entropy": 0.1480622159317136, "epoch": 1.138913624220837, "grad_norm": 0.057861328125, "learning_rate": 0.0001429672160497085, "loss": 0.1488279938697815, "mean_token_accuracy": 0.9609164595603943, "num_tokens": 981911.0, "step": 320 }, { "entropy": 0.1655621325597167, "epoch": 1.1745325022261799, "grad_norm": 0.07275390625, "learning_rate": 0.00013946408078287462, "loss": 0.16427644491195678, "mean_token_accuracy": 0.9576473370194435, "num_tokens": 1012535.0, "step": 330 }, { "entropy": 0.18122829273343086, "epoch": 1.2101513802315227, "grad_norm": 0.06689453125, "learning_rate": 0.00013590260045343432, "loss": 0.176006543636322, "mean_token_accuracy": 0.9551453411579132, "num_tokens": 1043503.0, "step": 340 }, { "entropy": 0.15644419118762015, "epoch": 1.2457702582368655, "grad_norm": 0.06201171875, "learning_rate": 0.00013228804047714463, "loss": 0.1652477502822876, "mean_token_accuracy": 0.9575047269463539, "num_tokens": 1074168.0, "step": 350 }, { "entropy": 0.14828295167535543, "epoch": 1.2813891362422083, "grad_norm": 0.06982421875, "learning_rate": 0.00012862574474454928, "loss": 0.14066768884658815, "mean_token_accuracy": 0.9639188721776009, "num_tokens": 1104681.0, "step": 360 }, { "entropy": 0.19175196047872306, "epoch": 1.317008014247551, "grad_norm": 0.09619140625, "learning_rate": 0.0001249211277203859, "loss": 0.1917089343070984, "mean_token_accuracy": 0.9522068575024605, "num_tokens": 1135361.0, "step": 370 }, { "entropy": 0.1644950734451413, "epoch": 1.3526268922528941, "grad_norm": 0.0849609375, "learning_rate": 0.00012117966643865398, "loss": 0.16864393949508666, "mean_token_accuracy": 0.9572859182953835, "num_tokens": 1166125.0, "step": 380 }, { "entropy": 0.16431492734700442, "epoch": 1.388245770258237, "grad_norm": 0.08056640625, "learning_rate": 0.00011740689240517837, "loss": 0.1597315788269043, "mean_token_accuracy": 0.9589557304978371, "num_tokens": 1197232.0, "step": 390 }, { "entropy": 0.15399955678731203, "epoch": 1.4238646482635797, "grad_norm": 0.07421875, "learning_rate": 0.00011360838341963964, "loss": 0.15647656917572023, "mean_token_accuracy": 0.9591503396630288, "num_tokens": 1227801.0, "step": 400 }, { "entropy": 0.15702628958970308, "epoch": 1.4594835262689225, "grad_norm": 0.0888671875, "learning_rate": 0.00010978975532916189, "loss": 0.16044070720672607, "mean_token_accuracy": 0.958300518989563, "num_tokens": 1258629.0, "step": 410 }, { "entropy": 0.12639004811644555, "epoch": 1.4951024042742653, "grad_norm": 0.0654296875, "learning_rate": 0.00010595665372565027, "loss": 0.12638626098632813, "mean_token_accuracy": 0.9659075498580932, "num_tokens": 1289072.0, "step": 420 }, { "entropy": 0.14171069134026765, "epoch": 1.5307212822796084, "grad_norm": 0.08056640625, "learning_rate": 0.00010211474559915233, "loss": 0.13931651115417482, "mean_token_accuracy": 0.9626615524291993, "num_tokens": 1319747.0, "step": 430 }, { "entropy": 0.16094463262706996, "epoch": 1.566340160284951, "grad_norm": 0.07958984375, "learning_rate": 9.826971095958395e-05, "loss": 0.15868637561798096, "mean_token_accuracy": 0.9590194016695023, "num_tokens": 1350664.0, "step": 440 }, { "entropy": 0.15832340456545352, "epoch": 1.601959038290294, "grad_norm": 0.0673828125, "learning_rate": 9.442723443920623e-05, "loss": 0.16317789554595946, "mean_token_accuracy": 0.9585595563054085, "num_tokens": 1381628.0, "step": 450 }, { "entropy": 0.16448966227471828, "epoch": 1.6375779162956365, "grad_norm": 0.0751953125, "learning_rate": 9.059299688826816e-05, "loss": 0.16371761560440062, "mean_token_accuracy": 0.9589671149849892, "num_tokens": 1412506.0, "step": 460 }, { "entropy": 0.15921370945870877, "epoch": 1.6731967943009796, "grad_norm": 0.0732421875, "learning_rate": 8.677266697624138e-05, "loss": 0.15553101301193237, "mean_token_accuracy": 0.9597976416349411, "num_tokens": 1442986.0, "step": 470 }, { "entropy": 0.14739455822855235, "epoch": 1.7088156723063224, "grad_norm": 0.0751953125, "learning_rate": 8.297189281106278e-05, "loss": 0.14005433320999144, "mean_token_accuracy": 0.9631642028689384, "num_tokens": 1473444.0, "step": 480 }, { "entropy": 0.1320339234545827, "epoch": 1.7444345503116652, "grad_norm": 0.07275390625, "learning_rate": 7.919629358877657e-05, "loss": 0.13607435226440429, "mean_token_accuracy": 0.9643332988023758, "num_tokens": 1504098.0, "step": 490 }, { "entropy": 0.12228272054344416, "epoch": 1.780053428317008, "grad_norm": 0.0888671875, "learning_rate": 7.54514512859201e-05, "loss": 0.12318435907363892, "mean_token_accuracy": 0.9668563097715378, "num_tokens": 1534785.0, "step": 500 }, { "entropy": 0.14428242221474646, "epoch": 1.8156723063223508, "grad_norm": 0.07861328125, "learning_rate": 7.174290240693689e-05, "loss": 0.14451712369918823, "mean_token_accuracy": 0.9615424692630767, "num_tokens": 1565500.0, "step": 510 }, { "entropy": 0.1427522897720337, "epoch": 1.8512911843276938, "grad_norm": 0.056884765625, "learning_rate": 6.807612979881661e-05, "loss": 0.1388334035873413, "mean_token_accuracy": 0.9621888875961304, "num_tokens": 1596446.0, "step": 520 }, { "entropy": 0.15157688688486814, "epoch": 1.8869100623330364, "grad_norm": 0.08349609375, "learning_rate": 6.445655454506465e-05, "loss": 0.15446548461914061, "mean_token_accuracy": 0.9599128782749176, "num_tokens": 1627189.0, "step": 530 }, { "entropy": 0.1300442773848772, "epoch": 1.9225289403383794, "grad_norm": 0.059326171875, "learning_rate": 6.0889527950984416e-05, "loss": 0.1254338026046753, "mean_token_accuracy": 0.9661760002374649, "num_tokens": 1657858.0, "step": 540 }, { "entropy": 0.1557828625664115, "epoch": 1.9581478183437222, "grad_norm": 0.06689453125, "learning_rate": 5.738032363212258e-05, "loss": 0.16083067655563354, "mean_token_accuracy": 0.9595108240842819, "num_tokens": 1688776.0, "step": 550 }, { "entropy": 0.13151839561760426, "epoch": 1.993766696349065, "grad_norm": 0.0810546875, "learning_rate": 5.3934129717573165e-05, "loss": 0.12607554197311402, "mean_token_accuracy": 0.966105441749096, "num_tokens": 1719573.0, "step": 560 }, { "epoch": 2.0, "eval_entropy": 0.14480842649936676, "eval_loss": 0.15477769076824188, "eval_mean_token_accuracy": 0.9607759993127052, "eval_num_tokens": 1725024.0, "eval_runtime": 35.306, "eval_samples_per_second": 15.89, "eval_steps_per_second": 3.994, "step": 562 } ], "logging_steps": 10, "max_steps": 843, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.057603725318144e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }