| { |
| "best_global_step": 562, |
| "best_metric": 0.15477769076824188, |
| "best_model_checkpoint": "/app/output/bf-router-v0.5/checkpoint-562", |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 843, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 1.4157426357269287, |
| "epoch": 0.003561887800534283, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.0, |
| "loss": 4.336674213409424, |
| "mean_token_accuracy": 0.4680640399456024, |
| "num_tokens": 3092.0, |
| "step": 1 |
| }, |
| { |
| "entropy": 1.5884604156017303, |
| "epoch": 0.03561887800534283, |
| "grad_norm": 0.59375, |
| "learning_rate": 6.923076923076924e-05, |
| "loss": 3.986017862955729, |
| "mean_token_accuracy": 0.4735516524977154, |
| "num_tokens": 30650.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 2.2315500736236573, |
| "epoch": 0.07123775601068566, |
| "grad_norm": 0.26953125, |
| "learning_rate": 0.00014615384615384615, |
| "loss": 2.16544189453125, |
| "mean_token_accuracy": 0.6272616922855377, |
| "num_tokens": 61735.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 0.8066075079143047, |
| "epoch": 0.10685663401602849, |
| "grad_norm": 0.298828125, |
| "learning_rate": 0.00019999334629824895, |
| "loss": 0.7935717582702637, |
| "mean_token_accuracy": 0.857068446278572, |
| "num_tokens": 92484.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 0.5340980306267739, |
| "epoch": 0.14247551202137132, |
| "grad_norm": 0.10693359375, |
| "learning_rate": 0.00019987508289735716, |
| "loss": 0.5261529922485352, |
| "mean_token_accuracy": 0.9004581302404404, |
| "num_tokens": 123010.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.4001643668860197, |
| "epoch": 0.17809439002671415, |
| "grad_norm": 0.08251953125, |
| "learning_rate": 0.0001996091607179287, |
| "loss": 0.3956298351287842, |
| "mean_token_accuracy": 0.9209299519658088, |
| "num_tokens": 153710.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.34828933551907537, |
| "epoch": 0.21371326803205698, |
| "grad_norm": 0.10009765625, |
| "learning_rate": 0.00019919597290851538, |
| "loss": 0.3648662090301514, |
| "mean_token_accuracy": 0.9263768374919892, |
| "num_tokens": 184577.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.2983129292726517, |
| "epoch": 0.2493321460373998, |
| "grad_norm": 0.08447265625, |
| "learning_rate": 0.00019863613034027224, |
| "loss": 0.29642860889434813, |
| "mean_token_accuracy": 0.9371836110949516, |
| "num_tokens": 215040.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.280751570686698, |
| "epoch": 0.28495102404274264, |
| "grad_norm": 0.099609375, |
| "learning_rate": 0.00019793046070382437, |
| "loss": 0.2813178300857544, |
| "mean_token_accuracy": 0.9388988897204399, |
| "num_tokens": 245683.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.2960826952010393, |
| "epoch": 0.3205699020480855, |
| "grad_norm": 0.11962890625, |
| "learning_rate": 0.00019708000728558064, |
| "loss": 0.31115410327911375, |
| "mean_token_accuracy": 0.9335825845599175, |
| "num_tokens": 276430.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 0.28416929207742214, |
| "epoch": 0.3561887800534283, |
| "grad_norm": 0.103515625, |
| "learning_rate": 0.00019608602742530283, |
| "loss": 0.27938365936279297, |
| "mean_token_accuracy": 0.9393619552254677, |
| "num_tokens": 307296.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.23417662251740695, |
| "epoch": 0.39180765805877116, |
| "grad_norm": 0.06298828125, |
| "learning_rate": 0.00019494999065721108, |
| "loss": 0.23734774589538574, |
| "mean_token_accuracy": 0.9487677246332169, |
| "num_tokens": 337962.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 0.24019648060202597, |
| "epoch": 0.42742653606411396, |
| "grad_norm": 0.06640625, |
| "learning_rate": 0.0001936735765373737, |
| "loss": 0.245324969291687, |
| "mean_token_accuracy": 0.9457737937569618, |
| "num_tokens": 368782.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.269666058011353, |
| "epoch": 0.4630454140694568, |
| "grad_norm": 0.09765625, |
| "learning_rate": 0.00019225867216059325, |
| "loss": 0.2686375617980957, |
| "mean_token_accuracy": 0.9398508563637733, |
| "num_tokens": 399329.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 0.18328354582190515, |
| "epoch": 0.4986642920747996, |
| "grad_norm": 0.06298828125, |
| "learning_rate": 0.00019070736937046035, |
| "loss": 0.18070420026779174, |
| "mean_token_accuracy": 0.9568557634949684, |
| "num_tokens": 429759.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.21165105439722537, |
| "epoch": 0.5342831700801425, |
| "grad_norm": 0.0810546875, |
| "learning_rate": 0.0001890219616666997, |
| "loss": 0.22004930973052977, |
| "mean_token_accuracy": 0.9494227185845375, |
| "num_tokens": 460409.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.23916025627404452, |
| "epoch": 0.5699020480854853, |
| "grad_norm": 0.07568359375, |
| "learning_rate": 0.00018720494081438078, |
| "loss": 0.2379377841949463, |
| "mean_token_accuracy": 0.9455192387104034, |
| "num_tokens": 491344.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 0.2238690486177802, |
| "epoch": 0.6055209260908282, |
| "grad_norm": 0.08984375, |
| "learning_rate": 0.00018525899316000608, |
| "loss": 0.2245168685913086, |
| "mean_token_accuracy": 0.9485938593745231, |
| "num_tokens": 522029.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 0.17032606173306702, |
| "epoch": 0.641139804096171, |
| "grad_norm": 0.080078125, |
| "learning_rate": 0.00018318699565992357, |
| "loss": 0.17549347877502441, |
| "mean_token_accuracy": 0.9574712902307511, |
| "num_tokens": 552306.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 0.21890966054052113, |
| "epoch": 0.6767586821015138, |
| "grad_norm": 0.0673828125, |
| "learning_rate": 0.00018099201162693476, |
| "loss": 0.21968293190002441, |
| "mean_token_accuracy": 0.9482985377311707, |
| "num_tokens": 583181.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 0.20646683853119613, |
| "epoch": 0.7123775601068566, |
| "grad_norm": 0.09765625, |
| "learning_rate": 0.00017867728620138708, |
| "loss": 0.20968315601348878, |
| "mean_token_accuracy": 0.9519409075379371, |
| "num_tokens": 613818.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.23000301327556372, |
| "epoch": 0.7479964381121995, |
| "grad_norm": 0.08935546875, |
| "learning_rate": 0.00017624624155344626, |
| "loss": 0.2383474349975586, |
| "mean_token_accuracy": 0.9458372846245766, |
| "num_tokens": 644576.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 0.21420854832977057, |
| "epoch": 0.7836153161175423, |
| "grad_norm": 0.0966796875, |
| "learning_rate": 0.0001737024718236413, |
| "loss": 0.21017465591430665, |
| "mean_token_accuracy": 0.9511569887399673, |
| "num_tokens": 675564.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 0.19280126914381981, |
| "epoch": 0.8192341941228851, |
| "grad_norm": 0.08154296875, |
| "learning_rate": 0.00017104973780916294, |
| "loss": 0.1936139941215515, |
| "mean_token_accuracy": 0.954449312388897, |
| "num_tokens": 706309.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 0.22524890769273043, |
| "epoch": 0.8548530721282279, |
| "grad_norm": 0.10546875, |
| "learning_rate": 0.00016829196140377085, |
| "loss": 0.22950620651245118, |
| "mean_token_accuracy": 0.9476036429405212, |
| "num_tokens": 737135.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 0.18842433094978334, |
| "epoch": 0.8904719501335708, |
| "grad_norm": 0.087890625, |
| "learning_rate": 0.00016543321979953007, |
| "loss": 0.19049547910690307, |
| "mean_token_accuracy": 0.9543813273310662, |
| "num_tokens": 767853.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.1991994746029377, |
| "epoch": 0.9260908281389136, |
| "grad_norm": 0.07763671875, |
| "learning_rate": 0.00016247773945894962, |
| "loss": 0.19527161121368408, |
| "mean_token_accuracy": 0.9535173490643502, |
| "num_tokens": 798511.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 0.18196075949817897, |
| "epoch": 0.9617097061442564, |
| "grad_norm": 0.0771484375, |
| "learning_rate": 0.00015942988986643352, |
| "loss": 0.19091761112213135, |
| "mean_token_accuracy": 0.9535432115197182, |
| "num_tokens": 829447.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 0.18307169582694768, |
| "epoch": 0.9973285841495992, |
| "grad_norm": 0.08837890625, |
| "learning_rate": 0.00015629417706828423, |
| "loss": 0.1818631410598755, |
| "mean_token_accuracy": 0.955574706196785, |
| "num_tokens": 860142.0, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_entropy": 0.18207345770817276, |
| "eval_loss": 0.18149949610233307, |
| "eval_mean_token_accuracy": 0.9555717660180221, |
| "eval_num_tokens": 862512.0, |
| "eval_runtime": 35.3256, |
| "eval_samples_per_second": 15.881, |
| "eval_steps_per_second": 3.991, |
| "step": 281 |
| }, |
| { |
| "entropy": 0.1545313375118451, |
| "epoch": 1.0320569902048085, |
| "grad_norm": 0.064453125, |
| "learning_rate": 0.00015307523701080768, |
| "loss": 0.14675980806350708, |
| "mean_token_accuracy": 0.9623546004295349, |
| "num_tokens": 889907.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 0.13382284864783286, |
| "epoch": 1.0676758682101515, |
| "grad_norm": 0.060546875, |
| "learning_rate": 0.00014977782868636999, |
| "loss": 0.13706474304199218, |
| "mean_token_accuracy": 0.9633408606052398, |
| "num_tokens": 920639.0, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.16596811451017857, |
| "epoch": 1.1032947462154943, |
| "grad_norm": 0.0791015625, |
| "learning_rate": 0.00014640682709753832, |
| "loss": 0.16318607330322266, |
| "mean_token_accuracy": 0.9578790530562401, |
| "num_tokens": 951323.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 0.1480622159317136, |
| "epoch": 1.138913624220837, |
| "grad_norm": 0.057861328125, |
| "learning_rate": 0.0001429672160497085, |
| "loss": 0.1488279938697815, |
| "mean_token_accuracy": 0.9609164595603943, |
| "num_tokens": 981911.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 0.1655621325597167, |
| "epoch": 1.1745325022261799, |
| "grad_norm": 0.07275390625, |
| "learning_rate": 0.00013946408078287462, |
| "loss": 0.16427644491195678, |
| "mean_token_accuracy": 0.9576473370194435, |
| "num_tokens": 1012535.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 0.18122829273343086, |
| "epoch": 1.2101513802315227, |
| "grad_norm": 0.06689453125, |
| "learning_rate": 0.00013590260045343432, |
| "loss": 0.176006543636322, |
| "mean_token_accuracy": 0.9551453411579132, |
| "num_tokens": 1043503.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 0.15644419118762015, |
| "epoch": 1.2457702582368655, |
| "grad_norm": 0.06201171875, |
| "learning_rate": 0.00013228804047714463, |
| "loss": 0.1652477502822876, |
| "mean_token_accuracy": 0.9575047269463539, |
| "num_tokens": 1074168.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.14828295167535543, |
| "epoch": 1.2813891362422083, |
| "grad_norm": 0.06982421875, |
| "learning_rate": 0.00012862574474454928, |
| "loss": 0.14066768884658815, |
| "mean_token_accuracy": 0.9639188721776009, |
| "num_tokens": 1104681.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 0.19175196047872306, |
| "epoch": 1.317008014247551, |
| "grad_norm": 0.09619140625, |
| "learning_rate": 0.0001249211277203859, |
| "loss": 0.1917089343070984, |
| "mean_token_accuracy": 0.9522068575024605, |
| "num_tokens": 1135361.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 0.1644950734451413, |
| "epoch": 1.3526268922528941, |
| "grad_norm": 0.0849609375, |
| "learning_rate": 0.00012117966643865398, |
| "loss": 0.16864393949508666, |
| "mean_token_accuracy": 0.9572859182953835, |
| "num_tokens": 1166125.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 0.16431492734700442, |
| "epoch": 1.388245770258237, |
| "grad_norm": 0.08056640625, |
| "learning_rate": 0.00011740689240517837, |
| "loss": 0.1597315788269043, |
| "mean_token_accuracy": 0.9589557304978371, |
| "num_tokens": 1197232.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 0.15399955678731203, |
| "epoch": 1.4238646482635797, |
| "grad_norm": 0.07421875, |
| "learning_rate": 0.00011360838341963964, |
| "loss": 0.15647656917572023, |
| "mean_token_accuracy": 0.9591503396630288, |
| "num_tokens": 1227801.0, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.15702628958970308, |
| "epoch": 1.4594835262689225, |
| "grad_norm": 0.0888671875, |
| "learning_rate": 0.00010978975532916189, |
| "loss": 0.16044070720672607, |
| "mean_token_accuracy": 0.958300518989563, |
| "num_tokens": 1258629.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 0.12639004811644555, |
| "epoch": 1.4951024042742653, |
| "grad_norm": 0.0654296875, |
| "learning_rate": 0.00010595665372565027, |
| "loss": 0.12638626098632813, |
| "mean_token_accuracy": 0.9659075498580932, |
| "num_tokens": 1289072.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 0.14171069134026765, |
| "epoch": 1.5307212822796084, |
| "grad_norm": 0.08056640625, |
| "learning_rate": 0.00010211474559915233, |
| "loss": 0.13931651115417482, |
| "mean_token_accuracy": 0.9626615524291993, |
| "num_tokens": 1319747.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 0.16094463262706996, |
| "epoch": 1.566340160284951, |
| "grad_norm": 0.07958984375, |
| "learning_rate": 9.826971095958395e-05, |
| "loss": 0.15868637561798096, |
| "mean_token_accuracy": 0.9590194016695023, |
| "num_tokens": 1350664.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 0.15832340456545352, |
| "epoch": 1.601959038290294, |
| "grad_norm": 0.0673828125, |
| "learning_rate": 9.442723443920623e-05, |
| "loss": 0.16317789554595946, |
| "mean_token_accuracy": 0.9585595563054085, |
| "num_tokens": 1381628.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.16448966227471828, |
| "epoch": 1.6375779162956365, |
| "grad_norm": 0.0751953125, |
| "learning_rate": 9.059299688826816e-05, |
| "loss": 0.16371761560440062, |
| "mean_token_accuracy": 0.9589671149849892, |
| "num_tokens": 1412506.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 0.15921370945870877, |
| "epoch": 1.6731967943009796, |
| "grad_norm": 0.0732421875, |
| "learning_rate": 8.677266697624138e-05, |
| "loss": 0.15553101301193237, |
| "mean_token_accuracy": 0.9597976416349411, |
| "num_tokens": 1442986.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 0.14739455822855235, |
| "epoch": 1.7088156723063224, |
| "grad_norm": 0.0751953125, |
| "learning_rate": 8.297189281106278e-05, |
| "loss": 0.14005433320999144, |
| "mean_token_accuracy": 0.9631642028689384, |
| "num_tokens": 1473444.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 0.1320339234545827, |
| "epoch": 1.7444345503116652, |
| "grad_norm": 0.07275390625, |
| "learning_rate": 7.919629358877657e-05, |
| "loss": 0.13607435226440429, |
| "mean_token_accuracy": 0.9643332988023758, |
| "num_tokens": 1504098.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 0.12228272054344416, |
| "epoch": 1.780053428317008, |
| "grad_norm": 0.0888671875, |
| "learning_rate": 7.54514512859201e-05, |
| "loss": 0.12318435907363892, |
| "mean_token_accuracy": 0.9668563097715378, |
| "num_tokens": 1534785.0, |
| "step": 500 |
| }, |
| { |
| "entropy": 0.14428242221474646, |
| "epoch": 1.8156723063223508, |
| "grad_norm": 0.07861328125, |
| "learning_rate": 7.174290240693689e-05, |
| "loss": 0.14451712369918823, |
| "mean_token_accuracy": 0.9615424692630767, |
| "num_tokens": 1565500.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 0.1427522897720337, |
| "epoch": 1.8512911843276938, |
| "grad_norm": 0.056884765625, |
| "learning_rate": 6.807612979881661e-05, |
| "loss": 0.1388334035873413, |
| "mean_token_accuracy": 0.9621888875961304, |
| "num_tokens": 1596446.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 0.15157688688486814, |
| "epoch": 1.8869100623330364, |
| "grad_norm": 0.08349609375, |
| "learning_rate": 6.445655454506465e-05, |
| "loss": 0.15446548461914061, |
| "mean_token_accuracy": 0.9599128782749176, |
| "num_tokens": 1627189.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 0.1300442773848772, |
| "epoch": 1.9225289403383794, |
| "grad_norm": 0.059326171875, |
| "learning_rate": 6.0889527950984416e-05, |
| "loss": 0.1254338026046753, |
| "mean_token_accuracy": 0.9661760002374649, |
| "num_tokens": 1657858.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 0.1557828625664115, |
| "epoch": 1.9581478183437222, |
| "grad_norm": 0.06689453125, |
| "learning_rate": 5.738032363212258e-05, |
| "loss": 0.16083067655563354, |
| "mean_token_accuracy": 0.9595108240842819, |
| "num_tokens": 1688776.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 0.13151839561760426, |
| "epoch": 1.993766696349065, |
| "grad_norm": 0.0810546875, |
| "learning_rate": 5.3934129717573165e-05, |
| "loss": 0.12607554197311402, |
| "mean_token_accuracy": 0.966105441749096, |
| "num_tokens": 1719573.0, |
| "step": 560 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_entropy": 0.14480842649936676, |
| "eval_loss": 0.15477769076824188, |
| "eval_mean_token_accuracy": 0.9607759993127052, |
| "eval_num_tokens": 1725024.0, |
| "eval_runtime": 35.306, |
| "eval_samples_per_second": 15.89, |
| "eval_steps_per_second": 3.994, |
| "step": 562 |
| }, |
| { |
| "entropy": 0.13348992398151985, |
| "epoch": 2.0284951024042743, |
| "grad_norm": 0.09326171875, |
| "learning_rate": 5.0556041179668354e-05, |
| "loss": 0.12256227731704712, |
| "mean_token_accuracy": 0.9662964267608447, |
| "num_tokens": 1749504.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 0.13551258575171232, |
| "epoch": 2.064113980409617, |
| "grad_norm": 0.0732421875, |
| "learning_rate": 4.725105230139465e-05, |
| "loss": 0.12811031341552734, |
| "mean_token_accuracy": 0.9640967190265656, |
| "num_tokens": 1780206.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 0.11903938055038452, |
| "epoch": 2.09973285841496, |
| "grad_norm": 0.061279296875, |
| "learning_rate": 4.402404929267235e-05, |
| "loss": 0.1129370927810669, |
| "mean_token_accuracy": 0.9682316735386849, |
| "num_tokens": 1810985.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 0.11653038449585437, |
| "epoch": 2.135351736420303, |
| "grad_norm": 0.08056640625, |
| "learning_rate": 4.087980306641365e-05, |
| "loss": 0.11388391256332397, |
| "mean_token_accuracy": 0.9668685078620911, |
| "num_tokens": 1841597.0, |
| "step": 600 |
| }, |
| { |
| "entropy": 0.11204307321459055, |
| "epoch": 2.1709706144256455, |
| "grad_norm": 0.1015625, |
| "learning_rate": 3.7822962185039914e-05, |
| "loss": 0.11144263744354248, |
| "mean_token_accuracy": 0.9687378108501434, |
| "num_tokens": 1872199.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 0.12015030086040497, |
| "epoch": 2.2065894924309886, |
| "grad_norm": 0.0712890625, |
| "learning_rate": 3.4858045987886145e-05, |
| "loss": 0.11484370231628419, |
| "mean_token_accuracy": 0.967641019821167, |
| "num_tokens": 1903172.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 0.12499598301947117, |
| "epoch": 2.242208370436331, |
| "grad_norm": 0.06640625, |
| "learning_rate": 3.198943790965332e-05, |
| "loss": 0.12357065677642823, |
| "mean_token_accuracy": 0.9670094296336174, |
| "num_tokens": 1933713.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 0.1296376422047615, |
| "epoch": 2.277827248441674, |
| "grad_norm": 0.0673828125, |
| "learning_rate": 2.9221378999786853e-05, |
| "loss": 0.12340972423553467, |
| "mean_token_accuracy": 0.96511862128973, |
| "num_tokens": 1964556.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 0.12314727567136288, |
| "epoch": 2.3134461264470167, |
| "grad_norm": 0.080078125, |
| "learning_rate": 2.655796165236234e-05, |
| "loss": 0.11837244033813477, |
| "mean_token_accuracy": 0.9671967878937722, |
| "num_tokens": 1995252.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 0.11561896577477455, |
| "epoch": 2.3490650044523598, |
| "grad_norm": 0.1064453125, |
| "learning_rate": 2.400312355574853e-05, |
| "loss": 0.11029613018035889, |
| "mean_token_accuracy": 0.967219403386116, |
| "num_tokens": 2025889.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 0.11878675390034914, |
| "epoch": 2.3846838824577024, |
| "grad_norm": 0.080078125, |
| "learning_rate": 2.1560641870992616e-05, |
| "loss": 0.11299164295196533, |
| "mean_token_accuracy": 0.9675500631332398, |
| "num_tokens": 2056588.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 0.11508079580962657, |
| "epoch": 2.4203027604630454, |
| "grad_norm": 0.07080078125, |
| "learning_rate": 1.9234127647534604e-05, |
| "loss": 0.10480635166168213, |
| "mean_token_accuracy": 0.9691305905580521, |
| "num_tokens": 2087259.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 0.11411711536347866, |
| "epoch": 2.4559216384683884, |
| "grad_norm": 0.06640625, |
| "learning_rate": 1.7027020484506996e-05, |
| "loss": 0.11348260641098022, |
| "mean_token_accuracy": 0.9682735517621041, |
| "num_tokens": 2117728.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 0.11259160749614239, |
| "epoch": 2.491540516473731, |
| "grad_norm": 0.05224609375, |
| "learning_rate": 1.4942583445512103e-05, |
| "loss": 0.10898959636688232, |
| "mean_token_accuracy": 0.9685835257172585, |
| "num_tokens": 2148333.0, |
| "step": 700 |
| }, |
| { |
| "entropy": 0.12499043270945549, |
| "epoch": 2.527159394479074, |
| "grad_norm": 0.0849609375, |
| "learning_rate": 1.2983898234396308e-05, |
| "loss": 0.12133818864822388, |
| "mean_token_accuracy": 0.9664898782968521, |
| "num_tokens": 2178932.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 0.11083998121321201, |
| "epoch": 2.5627782724844166, |
| "grad_norm": 0.048828125, |
| "learning_rate": 1.1153860639152169e-05, |
| "loss": 0.10544953346252442, |
| "mean_token_accuracy": 0.9696028590202331, |
| "num_tokens": 2209610.0, |
| "step": 720 |
| }, |
| { |
| "entropy": 0.11835917010903359, |
| "epoch": 2.5983971504897596, |
| "grad_norm": 0.08203125, |
| "learning_rate": 9.455176250685338e-06, |
| "loss": 0.11609755754470825, |
| "mean_token_accuracy": 0.9664707094430923, |
| "num_tokens": 2240346.0, |
| "step": 730 |
| }, |
| { |
| "entropy": 0.12215895913541316, |
| "epoch": 2.634016028495102, |
| "grad_norm": 0.0673828125, |
| "learning_rate": 7.890356462775373e-06, |
| "loss": 0.11725597381591797, |
| "mean_token_accuracy": 0.9670157313346863, |
| "num_tokens": 2271054.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 0.12366384714841842, |
| "epoch": 2.669634906500445, |
| "grad_norm": 0.07861328125, |
| "learning_rate": 6.461714759144233e-06, |
| "loss": 0.1168657660484314, |
| "mean_token_accuracy": 0.9668171271681786, |
| "num_tokens": 2301971.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 0.10823128782212735, |
| "epoch": 2.7052537845057882, |
| "grad_norm": 0.05712890625, |
| "learning_rate": 5.171363293121901e-06, |
| "loss": 0.10103480815887451, |
| "mean_token_accuracy": 0.9696328729391098, |
| "num_tokens": 2332515.0, |
| "step": 760 |
| }, |
| { |
| "entropy": 0.10126303266733885, |
| "epoch": 2.740872662511131, |
| "grad_norm": 0.07080078125, |
| "learning_rate": 4.021209764965828e-06, |
| "loss": 0.09550071954727173, |
| "mean_token_accuracy": 0.9718543246388436, |
| "num_tokens": 2363176.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 0.10594012532383204, |
| "epoch": 2.776491540516474, |
| "grad_norm": 0.0625, |
| "learning_rate": 3.0129546014508567e-06, |
| "loss": 0.1027148962020874, |
| "mean_token_accuracy": 0.9698196157813073, |
| "num_tokens": 2393823.0, |
| "step": 780 |
| }, |
| { |
| "entropy": 0.11277557052671909, |
| "epoch": 2.8121104185218164, |
| "grad_norm": 0.057373046875, |
| "learning_rate": 2.1480884418993983e-06, |
| "loss": 0.10367020368576049, |
| "mean_token_accuracy": 0.9688401147723198, |
| "num_tokens": 2424575.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 0.13837510608136655, |
| "epoch": 2.8477292965271594, |
| "grad_norm": 0.0791015625, |
| "learning_rate": 1.4278899343687425e-06, |
| "loss": 0.13071552515029908, |
| "mean_token_accuracy": 0.9625913232564927, |
| "num_tokens": 2455901.0, |
| "step": 800 |
| }, |
| { |
| "entropy": 0.12289627455174923, |
| "epoch": 2.883348174532502, |
| "grad_norm": 0.09423828125, |
| "learning_rate": 8.534238452534759e-07, |
| "loss": 0.11349356174468994, |
| "mean_token_accuracy": 0.9673542976379395, |
| "num_tokens": 2486966.0, |
| "step": 810 |
| }, |
| { |
| "entropy": 0.11963035948574544, |
| "epoch": 2.918967052537845, |
| "grad_norm": 0.05419921875, |
| "learning_rate": 4.2553948509802545e-07, |
| "loss": 0.11257882118225097, |
| "mean_token_accuracy": 0.9673726871609688, |
| "num_tokens": 2517590.0, |
| "step": 820 |
| }, |
| { |
| "entropy": 0.13870051857084037, |
| "epoch": 2.954585930543188, |
| "grad_norm": 0.06298828125, |
| "learning_rate": 1.4486945294652776e-07, |
| "loss": 0.13519666194915772, |
| "mean_token_accuracy": 0.9639795452356339, |
| "num_tokens": 2548330.0, |
| "step": 830 |
| }, |
| { |
| "entropy": 0.10831217430531978, |
| "epoch": 2.9902048085485307, |
| "grad_norm": 0.06494140625, |
| "learning_rate": 1.1828701086558092e-08, |
| "loss": 0.10172584056854247, |
| "mean_token_accuracy": 0.9707534283399581, |
| "num_tokens": 2579075.0, |
| "step": 840 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_entropy": 0.12990308695651115, |
| "eval_loss": 0.15479549765586853, |
| "eval_mean_token_accuracy": 0.9612296595641062, |
| "eval_num_tokens": 2587536.0, |
| "eval_runtime": 35.3111, |
| "eval_samples_per_second": 15.887, |
| "eval_steps_per_second": 3.993, |
| "step": 843 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 843, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 6.085644413841408e+16, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|