{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.172316063938329,
  "eval_steps": 500,
  "global_step": 950,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 0.401060389354825,
      "epoch": 0.0018138533046139895,
      "grad_norm": 0.5078125,
      "learning_rate": 4.999991783232007e-05,
      "loss": 0.3909,
      "mean_token_accuracy": 0.8833676137030124,
      "num_tokens": 689094.0,
      "step": 10
    },
    {
      "entropy": 0.35119859240949153,
      "epoch": 0.003627706609227979,
      "grad_norm": 0.3515625,
      "learning_rate": 4.999963379658905e-05,
      "loss": 0.3488,
      "mean_token_accuracy": 0.8924659512937069,
      "num_tokens": 1431066.0,
      "step": 20
    },
    {
      "entropy": 0.330338497646153,
      "epoch": 0.005441559913841968,
      "grad_norm": 0.3359375,
      "learning_rate": 4.999914688069563e-05,
      "loss": 0.3263,
      "mean_token_accuracy": 0.8976220838725567,
      "num_tokens": 2149722.0,
      "step": 30
    },
    {
      "entropy": 0.3173067133873701,
      "epoch": 0.007255413218455958,
      "grad_norm": 0.37109375,
      "learning_rate": 4.9998457088591286e-05,
      "loss": 0.3126,
      "mean_token_accuracy": 0.900362791121006,
      "num_tokens": 2839517.0,
      "step": 40
    },
    {
      "entropy": 0.3035059319809079,
      "epoch": 0.009069266523069946,
      "grad_norm": 0.37890625,
      "learning_rate": 4.999756442587392e-05,
      "loss": 0.2985,
      "mean_token_accuracy": 0.9043247863650322,
      "num_tokens": 3562428.0,
      "step": 50
    },
    {
      "entropy": 0.31549433059990406,
      "epoch": 0.010883119827683935,
      "grad_norm": 0.447265625,
      "learning_rate": 4.999646889978778e-05,
      "loss": 0.3119,
      "mean_token_accuracy": 0.9018740214407444,
      "num_tokens": 679264.0,
      "step": 60
    },
    {
      "entropy": 0.3051564427092671,
      "epoch": 0.012696973132297925,
      "grad_norm": 0.349609375,
      "learning_rate": 4.99951705192234e-05,
      "loss": 0.3007,
      "mean_token_accuracy": 0.9044437751173973,
      "num_tokens": 1386187.0,
      "step": 70
    },
    {
      "entropy": 0.29480861593037844,
      "epoch": 0.014510826436911916,
      "grad_norm": 0.35546875,
      "learning_rate": 4.999366929471757e-05,
      "loss": 0.2925,
      "mean_token_accuracy": 0.9076969392597676,
      "num_tokens": 2069407.0,
      "step": 80
    },
    {
      "entropy": 0.29471962377429006,
      "epoch": 0.016324679741525905,
      "grad_norm": 0.392578125,
      "learning_rate": 4.999196523845322e-05,
      "loss": 0.2822,
      "mean_token_accuracy": 0.9081305578351021,
      "num_tokens": 2751799.0,
      "step": 90
    },
    {
      "entropy": 0.2778853852301836,
      "epoch": 0.018138533046139892,
      "grad_norm": 0.390625,
      "learning_rate": 4.9990058364259315e-05,
      "loss": 0.2757,
      "mean_token_accuracy": 0.9112294614315033,
      "num_tokens": 3432327.0,
      "step": 100
    },
    {
      "entropy": 0.2759243570268154,
      "epoch": 0.019952386350753883,
      "grad_norm": 0.33984375,
      "learning_rate": 4.998794868761076e-05,
      "loss": 0.2709,
      "mean_token_accuracy": 0.9125727131962776,
      "num_tokens": 4120004.0,
      "step": 110
    },
    {
      "entropy": 0.2782453015446663,
      "epoch": 0.02176623965536787,
      "grad_norm": 0.3515625,
      "learning_rate": 4.9985636225628276e-05,
      "loss": 0.2707,
      "mean_token_accuracy": 0.9114268697798252,
      "num_tokens": 4818399.0,
      "step": 120
    },
    {
      "entropy": 0.28113641776144505,
      "epoch": 0.023580092959981862,
      "grad_norm": 0.318359375,
      "learning_rate": 4.998312099707823e-05,
      "loss": 0.2816,
      "mean_token_accuracy": 0.9110292144119739,
      "num_tokens": 5530113.0,
      "step": 130
    },
    {
      "entropy": 0.2701249118894339,
      "epoch": 0.02539394626459585,
      "grad_norm": 0.384765625,
      "learning_rate": 4.998040302237253e-05,
      "loss": 0.2633,
      "mean_token_accuracy": 0.9143483161926269,
      "num_tokens": 6202520.0,
      "step": 140
    },
    {
      "entropy": 0.26040864232927563,
      "epoch": 0.02720779956920984,
      "grad_norm": 0.376953125,
      "learning_rate": 4.997748232356841e-05,
      "loss": 0.2595,
      "mean_token_accuracy": 0.9160367086529732,
      "num_tokens": 6897257.0,
      "step": 150
    },
    {
      "entropy": 0.26147676482796667,
      "epoch": 0.02902165287382383,
      "grad_norm": 0.37890625,
      "learning_rate": 4.997435892436832e-05,
      "loss": 0.2607,
      "mean_token_accuracy": 0.9158791072666645,
      "num_tokens": 7596780.0,
      "step": 160
    },
    {
      "entropy": 0.2591979356482625,
      "epoch": 0.03083550617843782,
      "grad_norm": 0.333984375,
      "learning_rate": 4.997103285011964e-05,
      "loss": 0.2498,
      "mean_token_accuracy": 0.917728378623724,
      "num_tokens": 8327856.0,
      "step": 170
    },
    {
      "entropy": 0.262632866948843,
      "epoch": 0.03264935948305181,
      "grad_norm": 0.35546875,
      "learning_rate": 4.996750412781454e-05,
      "loss": 0.2615,
      "mean_token_accuracy": 0.9157723136246204,
      "num_tokens": 9035609.0,
      "step": 180
    },
    {
      "entropy": 0.2631745539605618,
      "epoch": 0.0344632127876658,
      "grad_norm": 0.392578125,
      "learning_rate": 4.996377278608977e-05,
      "loss": 0.2632,
      "mean_token_accuracy": 0.9168937481939793,
      "num_tokens": 9761489.0,
      "step": 190
    },
    {
      "entropy": 0.2691297778859735,
      "epoch": 0.036277066092279785,
      "grad_norm": 0.349609375,
      "learning_rate": 4.995983885522636e-05,
      "loss": 0.267,
      "mean_token_accuracy": 0.9136621370911598,
      "num_tokens": 10480790.0,
      "step": 200
    },
    {
      "entropy": 0.2528717964887619,
      "epoch": 0.038090919396893776,
      "grad_norm": 0.3671875,
      "learning_rate": 4.995570236714945e-05,
      "loss": 0.2457,
      "mean_token_accuracy": 0.9190806046128273,
      "num_tokens": 11166729.0,
      "step": 210
    },
    {
      "entropy": 0.25353531800210477,
      "epoch": 0.03990477270150777,
      "grad_norm": 0.48046875,
      "learning_rate": 4.995136335542799e-05,
      "loss": 0.2538,
      "mean_token_accuracy": 0.9182398840785027,
      "num_tokens": 11883461.0,
      "step": 220
    },
    {
      "entropy": 0.2523710783571005,
      "epoch": 0.04171862600612176,
      "grad_norm": 0.380859375,
      "learning_rate": 4.9946821855274475e-05,
      "loss": 0.2471,
      "mean_token_accuracy": 0.9204144366085529,
      "num_tokens": 12586996.0,
      "step": 230
    },
    {
      "entropy": 0.24951837193220855,
      "epoch": 0.04353247931073574,
      "grad_norm": 0.375,
      "learning_rate": 4.994207790354464e-05,
      "loss": 0.2439,
      "mean_token_accuracy": 0.9190732814371586,
      "num_tokens": 13294833.0,
      "step": 240
    },
    {
      "entropy": 0.2435963459312916,
      "epoch": 0.04534633261534973,
      "grad_norm": 0.361328125,
      "learning_rate": 4.993713153873721e-05,
      "loss": 0.2387,
      "mean_token_accuracy": 0.9220220901072025,
      "num_tokens": 13964074.0,
      "step": 250
    },
    {
      "entropy": 0.24172000139951705,
      "epoch": 0.047160185919963724,
      "grad_norm": 0.36328125,
      "learning_rate": 4.9931982800993535e-05,
      "loss": 0.2409,
      "mean_token_accuracy": 0.9220294274389744,
      "num_tokens": 14699210.0,
      "step": 260
    },
    {
      "entropy": 0.24300440158694983,
      "epoch": 0.048974039224577715,
      "grad_norm": 0.3828125,
      "learning_rate": 4.992663173209732e-05,
      "loss": 0.2383,
      "mean_token_accuracy": 0.9216976344585419,
      "num_tokens": 15397171.0,
      "step": 270
    },
    {
      "entropy": 0.23483966048806906,
      "epoch": 0.0507878925291917,
      "grad_norm": 0.412109375,
      "learning_rate": 4.992107837547419e-05,
      "loss": 0.2312,
      "mean_token_accuracy": 0.9239355817437171,
      "num_tokens": 16084594.0,
      "step": 280
    },
    {
      "entropy": 0.22807897245511413,
      "epoch": 0.05260174583380569,
      "grad_norm": 0.4296875,
      "learning_rate": 4.991532277619149e-05,
      "loss": 0.225,
      "mean_token_accuracy": 0.9266773730516433,
      "num_tokens": 16814512.0,
      "step": 290
    },
    {
      "entropy": 0.23185804225504397,
      "epoch": 0.05441559913841968,
      "grad_norm": 0.396484375,
      "learning_rate": 4.9909364980957735e-05,
      "loss": 0.227,
      "mean_token_accuracy": 0.9256549589335918,
      "num_tokens": 17530989.0,
      "step": 300
    },
    {
      "entropy": 0.227366485260427,
      "epoch": 0.05622945244303367,
      "grad_norm": 0.392578125,
      "learning_rate": 4.99032050381224e-05,
      "loss": 0.2258,
      "mean_token_accuracy": 0.9261060066521167,
      "num_tokens": 18206500.0,
      "step": 310
    },
    {
      "entropy": 0.2307612099684775,
      "epoch": 0.05804330574764766,
      "grad_norm": 0.369140625,
      "learning_rate": 4.989684299767541e-05,
      "loss": 0.2252,
      "mean_token_accuracy": 0.9251494728028774,
      "num_tokens": 18876031.0,
      "step": 320
    },
    {
      "entropy": 0.23350342214107514,
      "epoch": 0.059857159052261646,
      "grad_norm": 0.349609375,
      "learning_rate": 4.9890278911246805e-05,
      "loss": 0.2307,
      "mean_token_accuracy": 0.9253196962177753,
      "num_tokens": 19628445.0,
      "step": 330
    },
    {
      "entropy": 0.22912344289943576,
      "epoch": 0.06167101235687564,
      "grad_norm": 0.365234375,
      "learning_rate": 4.9883512832106286e-05,
      "loss": 0.2253,
      "mean_token_accuracy": 0.926545312255621,
      "num_tokens": 20323150.0,
      "step": 340
    },
    {
      "entropy": 0.22221376914530994,
      "epoch": 0.06348486566148963,
      "grad_norm": 0.404296875,
      "learning_rate": 4.987654481516276e-05,
      "loss": 0.2179,
      "mean_token_accuracy": 0.92796630859375,
      "num_tokens": 21008346.0,
      "step": 350
    },
    {
      "entropy": 0.220701056253165,
      "epoch": 0.06529871896610362,
      "grad_norm": 0.38671875,
      "learning_rate": 4.9869374916963983e-05,
      "loss": 0.2147,
      "mean_token_accuracy": 0.9299234464764595,
      "num_tokens": 21709970.0,
      "step": 360
    },
    {
      "entropy": 0.2218360759317875,
      "epoch": 0.06711257227071761,
      "grad_norm": 0.349609375,
      "learning_rate": 4.9862003195696e-05,
      "loss": 0.2175,
      "mean_token_accuracy": 0.9282336570322514,
      "num_tokens": 22414407.0,
      "step": 370
    },
    {
      "entropy": 0.21832106141373514,
      "epoch": 0.0689264255753316,
      "grad_norm": 0.388671875,
      "learning_rate": 4.985442971118273e-05,
      "loss": 0.2154,
      "mean_token_accuracy": 0.9296296700835228,
      "num_tokens": 23105526.0,
      "step": 380
    },
    {
      "entropy": 0.22634686436504126,
      "epoch": 0.07074027887994558,
      "grad_norm": 0.41796875,
      "learning_rate": 4.984665452488545e-05,
      "loss": 0.2291,
      "mean_token_accuracy": 0.9264358282089233,
      "num_tokens": 23783734.0,
      "step": 390
    },
    {
      "entropy": 0.21892870962619781,
      "epoch": 0.07255413218455957,
      "grad_norm": 0.361328125,
      "learning_rate": 4.983867769990233e-05,
      "loss": 0.2138,
      "mean_token_accuracy": 0.929813839495182,
      "num_tokens": 24499302.0,
      "step": 400
    },
    {
      "entropy": 0.20870111417025328,
      "epoch": 0.07436798548917356,
      "grad_norm": 0.373046875,
      "learning_rate": 4.9830499300967924e-05,
      "loss": 0.2059,
      "mean_token_accuracy": 0.9323622785508633,
      "num_tokens": 25187005.0,
      "step": 410
    },
    {
      "entropy": 0.2046168447472155,
      "epoch": 0.07618183879378755,
      "grad_norm": 0.35546875,
      "learning_rate": 4.9822119394452575e-05,
      "loss": 0.2022,
      "mean_token_accuracy": 0.9341225482523441,
      "num_tokens": 25894789.0,
      "step": 420
    },
    {
      "entropy": 0.21055085053667427,
      "epoch": 0.07799569209840154,
      "grad_norm": 0.3984375,
      "learning_rate": 4.981353804836196e-05,
      "loss": 0.2088,
      "mean_token_accuracy": 0.9323237225413322,
      "num_tokens": 26605837.0,
      "step": 430
    },
    {
      "entropy": 0.22691676011309028,
      "epoch": 0.07980954540301553,
      "grad_norm": 0.3828125,
      "learning_rate": 4.98047553323365e-05,
      "loss": 0.2232,
      "mean_token_accuracy": 0.9282535433769226,
      "num_tokens": 27306059.0,
      "step": 440
    },
    {
      "entropy": 0.21169109037145972,
      "epoch": 0.08162339870762952,
      "grad_norm": 0.349609375,
      "learning_rate": 4.979577131765079e-05,
      "loss": 0.2094,
      "mean_token_accuracy": 0.9311828054487705,
      "num_tokens": 28027405.0,
      "step": 450
    },
    {
      "entropy": 0.21191262369975447,
      "epoch": 0.08343725201224352,
      "grad_norm": 0.408203125,
      "learning_rate": 4.978658607721301e-05,
      "loss": 0.2076,
      "mean_token_accuracy": 0.9319212548434734,
      "num_tokens": 28717583.0,
      "step": 460
    },
    {
      "entropy": 0.20386602552607655,
      "epoch": 0.08525110531685749,
      "grad_norm": 0.498046875,
      "learning_rate": 4.977719968556441e-05,
      "loss": 0.1973,
      "mean_token_accuracy": 0.9341823615133762,
      "num_tokens": 29415356.0,
      "step": 470
    },
    {
      "entropy": 0.21132566928863525,
      "epoch": 0.08706495862147148,
      "grad_norm": 0.400390625,
      "learning_rate": 4.976761221887859e-05,
      "loss": 0.2095,
      "mean_token_accuracy": 0.9307463005185127,
      "num_tokens": 30130018.0,
      "step": 480
    },
    {
      "entropy": 0.2104976624250412,
      "epoch": 0.08887881192608547,
      "grad_norm": 0.3359375,
      "learning_rate": 4.975782375496098e-05,
      "loss": 0.2057,
      "mean_token_accuracy": 0.9317945875227451,
      "num_tokens": 30871597.0,
      "step": 490
    },
    {
      "entropy": 0.20463952338322997,
      "epoch": 0.09069266523069947,
      "grad_norm": 0.392578125,
      "learning_rate": 4.974783437324812e-05,
      "loss": 0.203,
      "mean_token_accuracy": 0.934221388399601,
      "num_tokens": 31571911.0,
      "step": 500
    },
    {
      "entropy": 0.20768251903355123,
      "epoch": 0.09250651853531346,
      "grad_norm": 0.376953125,
      "learning_rate": 4.9737644154807126e-05,
      "loss": 0.2058,
      "mean_token_accuracy": 0.9322210021317006,
      "num_tokens": 32245409.0,
      "step": 510
    },
    {
      "entropy": 0.20007140738889576,
      "epoch": 0.09432037183992745,
      "grad_norm": 0.3984375,
      "learning_rate": 4.972725318233492e-05,
      "loss": 0.1977,
      "mean_token_accuracy": 0.9346926286816597,
      "num_tokens": 32960743.0,
      "step": 520
    },
    {
      "entropy": 0.20683543616905808,
      "epoch": 0.09613422514454144,
      "grad_norm": 0.376953125,
      "learning_rate": 4.971666154015762e-05,
      "loss": 0.2034,
      "mean_token_accuracy": 0.9335305042564869,
      "num_tokens": 33657841.0,
      "step": 530
    },
    {
      "entropy": 0.2172858018428087,
      "epoch": 0.09794807844915543,
      "grad_norm": 0.451171875,
      "learning_rate": 4.970586931422985e-05,
      "loss": 0.2102,
      "mean_token_accuracy": 0.9308790504932404,
      "num_tokens": 34304617.0,
      "step": 540
    },
    {
      "entropy": 0.19533313820138573,
      "epoch": 0.09976193175376942,
      "grad_norm": 0.443359375,
      "learning_rate": 4.969487659213404e-05,
      "loss": 0.1942,
      "mean_token_accuracy": 0.9365381173789501,
      "num_tokens": 35002676.0,
      "step": 550
    },
    {
      "entropy": 0.19501794883981347,
      "epoch": 0.1015757850583834,
      "grad_norm": 0.396484375,
      "learning_rate": 4.9683683463079716e-05,
      "loss": 0.194,
      "mean_token_accuracy": 0.9368167445063591,
      "num_tokens": 35721130.0,
      "step": 560
    },
    {
      "entropy": 0.19730768594890832,
      "epoch": 0.10338963836299739,
      "grad_norm": 0.4375,
      "learning_rate": 4.967229001790275e-05,
      "loss": 0.1912,
      "mean_token_accuracy": 0.9373390160501003,
      "num_tokens": 36452825.0,
      "step": 570
    },
    {
      "entropy": 0.20269922027364373,
      "epoch": 0.10520349166761138,
      "grad_norm": 0.416015625,
      "learning_rate": 4.966069634906465e-05,
      "loss": 0.1976,
      "mean_token_accuracy": 0.9344136849045753,
      "num_tokens": 37150583.0,
      "step": 580
    },
    {
      "entropy": 0.1830467650666833,
      "epoch": 0.10701734497222537,
      "grad_norm": 0.36328125,
      "learning_rate": 4.964890255065182e-05,
      "loss": 0.1813,
      "mean_token_accuracy": 0.9408909261226654,
      "num_tokens": 37856958.0,
      "step": 590
    },
    {
      "entropy": 0.18502615857869387,
      "epoch": 0.10883119827683936,
      "grad_norm": 0.38671875,
      "learning_rate": 4.9636908718374776e-05,
      "loss": 0.1811,
      "mean_token_accuracy": 0.9401354268193245,
      "num_tokens": 38557749.0,
      "step": 600
    },
    {
      "entropy": 0.17543947864323856,
      "epoch": 0.11064505158145335,
      "grad_norm": 0.3828125,
      "learning_rate": 4.962471494956736e-05,
      "loss": 0.1742,
      "mean_token_accuracy": 0.9421761430799961,
      "num_tokens": 39302134.0,
      "step": 610
    },
    {
      "entropy": 0.18896188419312238,
      "epoch": 0.11245890488606734,
      "grad_norm": 0.376953125,
      "learning_rate": 4.961232134318597e-05,
      "loss": 0.1848,
      "mean_token_accuracy": 0.9393556989729405,
      "num_tokens": 40012713.0,
      "step": 620
    },
    {
      "entropy": 0.19227341655641794,
      "epoch": 0.11427275819068133,
      "grad_norm": 0.408203125,
      "learning_rate": 4.9599727999808776e-05,
      "loss": 0.1813,
      "mean_token_accuracy": 0.9380827076733113,
      "num_tokens": 40728293.0,
      "step": 630
    },
    {
      "entropy": 0.17786815362051128,
      "epoch": 0.11608661149529532,
      "grad_norm": 0.75,
      "learning_rate": 4.9586935021634835e-05,
      "loss": 0.1764,
      "mean_token_accuracy": 0.9420537509024143,
      "num_tokens": 41434153.0,
      "step": 640
    },
    {
      "entropy": 0.18211883958429098,
      "epoch": 0.1179004647999093,
      "grad_norm": 0.412109375,
      "learning_rate": 4.957394251248334e-05,
      "loss": 0.1771,
      "mean_token_accuracy": 0.9409064181149006,
      "num_tokens": 42140372.0,
      "step": 650
    },
    {
      "entropy": 0.1796271875500679,
      "epoch": 0.11971431810452329,
      "grad_norm": 0.41796875,
      "learning_rate": 4.9560750577792715e-05,
      "loss": 0.1747,
      "mean_token_accuracy": 0.942291560024023,
      "num_tokens": 42838039.0,
      "step": 660
    },
    {
      "entropy": 0.17686794362962246,
      "epoch": 0.12152817140913728,
      "grad_norm": 0.388671875,
      "learning_rate": 4.954735932461982e-05,
      "loss": 0.1741,
      "mean_token_accuracy": 0.942273823171854,
      "num_tokens": 43550368.0,
      "step": 670
    },
    {
      "entropy": 0.18043181113898754,
      "epoch": 0.12334202471375127,
      "grad_norm": 0.458984375,
      "learning_rate": 4.9533768861639016e-05,
      "loss": 0.1758,
      "mean_token_accuracy": 0.9421767868101597,
      "num_tokens": 44249323.0,
      "step": 680
    },
    {
      "entropy": 0.17555835144594312,
      "epoch": 0.12515587801836525,
      "grad_norm": 0.453125,
      "learning_rate": 4.951997929914135e-05,
      "loss": 0.1712,
      "mean_token_accuracy": 0.9432690389454365,
      "num_tokens": 44934599.0,
      "step": 690
    },
    {
      "entropy": 0.1881471230648458,
      "epoch": 0.12696973132297926,
      "grad_norm": 0.44921875,
      "learning_rate": 4.9505990749033596e-05,
      "loss": 0.1814,
      "mean_token_accuracy": 0.9401827119290829,
      "num_tokens": 45627988.0,
      "step": 700
    },
    {
      "entropy": 0.1738990875892341,
      "epoch": 0.12878358462759323,
      "grad_norm": 0.396484375,
      "learning_rate": 4.9491803324837394e-05,
      "loss": 0.1669,
      "mean_token_accuracy": 0.9448154501616954,
      "num_tokens": 46344816.0,
      "step": 710
    },
    {
      "entropy": 0.1657118304632604,
      "epoch": 0.13059743793220724,
      "grad_norm": 0.39453125,
      "learning_rate": 4.94774171416883e-05,
      "loss": 0.1635,
      "mean_token_accuracy": 0.9460572391748429,
      "num_tokens": 47031930.0,
      "step": 720
    },
    {
      "entropy": 0.17199594508856536,
      "epoch": 0.13241129123682122,
      "grad_norm": 0.3828125,
      "learning_rate": 4.946283231633488e-05,
      "loss": 0.1677,
      "mean_token_accuracy": 0.9447118684649467,
      "num_tokens": 47700276.0,
      "step": 730
    },
    {
      "entropy": 0.17147760968655348,
      "epoch": 0.13422514454143522,
      "grad_norm": 0.380859375,
      "learning_rate": 4.944804896713773e-05,
      "loss": 0.1699,
      "mean_token_accuracy": 0.9442392319440842,
      "num_tokens": 48399953.0,
      "step": 740
    },
    {
      "entropy": 0.16508059948682785,
      "epoch": 0.1360389978460492,
      "grad_norm": 0.400390625,
      "learning_rate": 4.943306721406854e-05,
      "loss": 0.1605,
      "mean_token_accuracy": 0.9466995656490326,
      "num_tokens": 49117167.0,
      "step": 750
    },
    {
      "entropy": 0.1752813272178173,
      "epoch": 0.1378528511506632,
      "grad_norm": 0.39453125,
      "learning_rate": 4.941788717870912e-05,
      "loss": 0.1733,
      "mean_token_accuracy": 0.9437988288700581,
      "num_tokens": 49824093.0,
      "step": 760
    },
    {
      "entropy": 0.17272874722257256,
      "epoch": 0.13966670445527718,
      "grad_norm": 0.431640625,
      "learning_rate": 4.940250898425039e-05,
      "loss": 0.1678,
      "mean_token_accuracy": 0.9446949861943722,
      "num_tokens": 50511512.0,
      "step": 770
    },
    {
      "entropy": 0.16612791642546654,
      "epoch": 0.14148055775989116,
      "grad_norm": 0.421875,
      "learning_rate": 4.93869327554914e-05,
      "loss": 0.1624,
      "mean_token_accuracy": 0.9459903188049793,
      "num_tokens": 51215026.0,
      "step": 780
    },
    {
      "entropy": 0.16376803433522583,
      "epoch": 0.14329441106450516,
      "grad_norm": 0.369140625,
      "learning_rate": 4.937115861883831e-05,
      "loss": 0.1604,
      "mean_token_accuracy": 0.9473146192729474,
      "num_tokens": 51929249.0,
      "step": 790
    },
    {
      "entropy": 0.1707521677017212,
      "epoch": 0.14510826436911914,
      "grad_norm": 0.359375,
      "learning_rate": 4.935518670230339e-05,
      "loss": 0.1664,
      "mean_token_accuracy": 0.9445839419960975,
      "num_tokens": 52626389.0,
      "step": 800
    },
    {
      "entropy": 0.15680131250992418,
      "epoch": 0.14692211767373314,
      "grad_norm": 0.345703125,
      "learning_rate": 4.933901713550391e-05,
      "loss": 0.1565,
      "mean_token_accuracy": 0.9488645300269127,
      "num_tokens": 53324625.0,
      "step": 810
    },
    {
      "entropy": 0.16342241717502476,
      "epoch": 0.14873597097834712,
      "grad_norm": 0.353515625,
      "learning_rate": 4.932265004966119e-05,
      "loss": 0.1582,
      "mean_token_accuracy": 0.9477129466831684,
      "num_tokens": 54022972.0,
      "step": 820
    },
    {
      "entropy": 0.16008385811001064,
      "epoch": 0.15054982428296113,
      "grad_norm": 0.46484375,
      "learning_rate": 4.930608557759945e-05,
      "loss": 0.1552,
      "mean_token_accuracy": 0.9485097445547581,
      "num_tokens": 54705358.0,
      "step": 830
    },
    {
      "entropy": 0.15298983799293636,
      "epoch": 0.1523636775875751,
      "grad_norm": 0.35546875,
      "learning_rate": 4.928932385374477e-05,
      "loss": 0.15,
      "mean_token_accuracy": 0.9502383783459664,
      "num_tokens": 55415741.0,
      "step": 840
    },
    {
      "entropy": 0.16001109983772038,
      "epoch": 0.1541775308921891,
      "grad_norm": 0.39453125,
      "learning_rate": 4.927236501412399e-05,
      "loss": 0.1576,
      "mean_token_accuracy": 0.9483717873692512,
      "num_tokens": 56107468.0,
      "step": 850
    },
    {
      "entropy": 0.15251143919304014,
      "epoch": 0.15599138419680308,
      "grad_norm": 0.3828125,
      "learning_rate": 4.925520919636361e-05,
      "loss": 0.1469,
      "mean_token_accuracy": 0.9511617720127106,
      "num_tokens": 56823057.0,
      "step": 860
    },
    {
      "entropy": 0.1463920651935041,
      "epoch": 0.15780523750141706,
      "grad_norm": 0.349609375,
      "learning_rate": 4.923785653968868e-05,
      "loss": 0.1465,
      "mean_token_accuracy": 0.9523930512368679,
      "num_tokens": 57549593.0,
      "step": 870
    },
    {
      "entropy": 0.1542105100117624,
      "epoch": 0.15961909080603107,
      "grad_norm": 0.388671875,
      "learning_rate": 4.922030718492166e-05,
      "loss": 0.1522,
      "mean_token_accuracy": 0.9503716394305229,
      "num_tokens": 58266995.0,
      "step": 880
    },
    {
      "entropy": 0.16295539326965808,
      "epoch": 0.16143294411064504,
      "grad_norm": 0.36328125,
      "learning_rate": 4.920256127448125e-05,
      "loss": 0.1583,
      "mean_token_accuracy": 0.9478306032717227,
      "num_tokens": 58991460.0,
      "step": 890
    },
    {
      "entropy": 0.16300919009372591,
      "epoch": 0.16324679741525905,
      "grad_norm": 0.39453125,
      "learning_rate": 4.9184618952381314e-05,
      "loss": 0.161,
      "mean_token_accuracy": 0.9473481513559818,
      "num_tokens": 59703569.0,
      "step": 900
    },
    {
      "entropy": 0.1508354874327779,
      "epoch": 0.16506065071987303,
      "grad_norm": 0.357421875,
      "learning_rate": 4.916648036422962e-05,
      "loss": 0.1496,
      "mean_token_accuracy": 0.9515601739287376,
      "num_tokens": 60396836.0,
      "step": 910
    },
    {
      "entropy": 0.14858688032254577,
      "epoch": 0.16687450402448703,
      "grad_norm": 0.36328125,
      "learning_rate": 4.914814565722671e-05,
      "loss": 0.1445,
      "mean_token_accuracy": 0.9528601355850697,
      "num_tokens": 61112387.0,
      "step": 920
    },
    {
      "entropy": 0.15294391922652723,
      "epoch": 0.168688357329101,
      "grad_norm": 0.3828125,
      "learning_rate": 4.91296149801647e-05,
      "loss": 0.1515,
      "mean_token_accuracy": 0.9504486098885536,
      "num_tokens": 61840615.0,
      "step": 930
    },
    {
      "entropy": 0.16582977958023548,
      "epoch": 0.17050221063371498,
      "grad_norm": 0.341796875,
      "learning_rate": 4.9110888483426075e-05,
      "loss": 0.1642,
      "mean_token_accuracy": 0.948068980127573,
      "num_tokens": 62517757.0,
      "step": 940
    },
    {
      "entropy": 0.1455585315823555,
      "epoch": 0.172316063938329,
      "grad_norm": 0.337890625,
      "learning_rate": 4.909196631898244e-05,
      "loss": 0.1438,
      "mean_token_accuracy": 0.9531501352787017,
      "num_tokens": 63235939.0,
      "step": 950
    }
  ],
  "logging_steps": 10,
  "max_steps": 11028,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.033342770766072e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}