| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.96, |
| "eval_steps": 500, |
| "global_step": 3000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 1.553677573800087, |
| "epoch": 0.0032, |
| "grad_norm": 0.08532850444316864, |
| "learning_rate": 0.00019942400000000002, |
| "loss": 1.6543, |
| "mean_token_accuracy": 0.5980595760047436, |
| "num_tokens": 237514.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 1.5756538808345795, |
| "epoch": 0.0064, |
| "grad_norm": 0.08995039016008377, |
| "learning_rate": 0.00019878400000000003, |
| "loss": 1.54, |
| "mean_token_accuracy": 0.6125335305929184, |
| "num_tokens": 492416.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 1.5214510440826416, |
| "epoch": 0.0096, |
| "grad_norm": 0.08052819967269897, |
| "learning_rate": 0.000198144, |
| "loss": 1.5069, |
| "mean_token_accuracy": 0.6163223661482334, |
| "num_tokens": 750229.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 1.4752343490719795, |
| "epoch": 0.0128, |
| "grad_norm": 0.07612819969654083, |
| "learning_rate": 0.000197504, |
| "loss": 1.4543, |
| "mean_token_accuracy": 0.6286050908267498, |
| "num_tokens": 985946.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 1.450043585896492, |
| "epoch": 0.016, |
| "grad_norm": 0.07470866292715073, |
| "learning_rate": 0.000196864, |
| "loss": 1.4449, |
| "mean_token_accuracy": 0.629861956089735, |
| "num_tokens": 1221024.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 1.4457027241587639, |
| "epoch": 0.0192, |
| "grad_norm": 0.07191968709230423, |
| "learning_rate": 0.000196224, |
| "loss": 1.4317, |
| "mean_token_accuracy": 0.6348222590982914, |
| "num_tokens": 1465139.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 1.4202743485569953, |
| "epoch": 0.0224, |
| "grad_norm": 0.07370436936616898, |
| "learning_rate": 0.000195584, |
| "loss": 1.4058, |
| "mean_token_accuracy": 0.6382698558270932, |
| "num_tokens": 1707062.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 1.4351178482174873, |
| "epoch": 0.0256, |
| "grad_norm": 0.07945587486028671, |
| "learning_rate": 0.000194944, |
| "loss": 1.4106, |
| "mean_token_accuracy": 0.6350636854767799, |
| "num_tokens": 1943738.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 1.379793293774128, |
| "epoch": 0.0288, |
| "grad_norm": 0.08567175269126892, |
| "learning_rate": 0.00019430400000000002, |
| "loss": 1.3877, |
| "mean_token_accuracy": 0.6421510070562363, |
| "num_tokens": 2185125.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 1.4026792377233506, |
| "epoch": 0.032, |
| "grad_norm": 0.07803715765476227, |
| "learning_rate": 0.000193664, |
| "loss": 1.394, |
| "mean_token_accuracy": 0.6389409720897674, |
| "num_tokens": 2411471.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 1.390401628613472, |
| "epoch": 0.0352, |
| "grad_norm": 0.08585170656442642, |
| "learning_rate": 0.000193024, |
| "loss": 1.3835, |
| "mean_token_accuracy": 0.6402524076402187, |
| "num_tokens": 2668530.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 1.3789777308702469, |
| "epoch": 0.0384, |
| "grad_norm": 0.08998815715312958, |
| "learning_rate": 0.000192384, |
| "loss": 1.3678, |
| "mean_token_accuracy": 0.6434545509517193, |
| "num_tokens": 2916918.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 1.3781758308410645, |
| "epoch": 0.0416, |
| "grad_norm": 0.0936167761683464, |
| "learning_rate": 0.000191744, |
| "loss": 1.3695, |
| "mean_token_accuracy": 0.6434862360358238, |
| "num_tokens": 3166485.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 1.3970939204096795, |
| "epoch": 0.0448, |
| "grad_norm": 0.08567705750465393, |
| "learning_rate": 0.00019110400000000002, |
| "loss": 1.3852, |
| "mean_token_accuracy": 0.6414178721606731, |
| "num_tokens": 3395460.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 1.3698946446180345, |
| "epoch": 0.048, |
| "grad_norm": 0.08985050022602081, |
| "learning_rate": 0.00019046400000000002, |
| "loss": 1.3786, |
| "mean_token_accuracy": 0.6414492532610894, |
| "num_tokens": 3652903.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 1.393638737499714, |
| "epoch": 0.0512, |
| "grad_norm": 0.09392407536506653, |
| "learning_rate": 0.000189824, |
| "loss": 1.3885, |
| "mean_token_accuracy": 0.6388964027166366, |
| "num_tokens": 3879620.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 1.3732418417930603, |
| "epoch": 0.0544, |
| "grad_norm": 0.08198658376932144, |
| "learning_rate": 0.000189184, |
| "loss": 1.3721, |
| "mean_token_accuracy": 0.6454169787466526, |
| "num_tokens": 4126817.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 1.3810233503580094, |
| "epoch": 0.0576, |
| "grad_norm": 0.07432520389556885, |
| "learning_rate": 0.00018854400000000002, |
| "loss": 1.3731, |
| "mean_token_accuracy": 0.6420239724218846, |
| "num_tokens": 4378841.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 1.3441029533743858, |
| "epoch": 0.0608, |
| "grad_norm": 0.08733066916465759, |
| "learning_rate": 0.00018790400000000002, |
| "loss": 1.3312, |
| "mean_token_accuracy": 0.6496633291244507, |
| "num_tokens": 4638721.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 1.3665404744446277, |
| "epoch": 0.064, |
| "grad_norm": 0.09019674360752106, |
| "learning_rate": 0.00018726400000000003, |
| "loss": 1.3509, |
| "mean_token_accuracy": 0.6477358929812909, |
| "num_tokens": 4882310.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 1.3439898125827312, |
| "epoch": 0.0672, |
| "grad_norm": 0.07889644801616669, |
| "learning_rate": 0.000186624, |
| "loss": 1.339, |
| "mean_token_accuracy": 0.6494252569973469, |
| "num_tokens": 5135711.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 1.3742446891963482, |
| "epoch": 0.0704, |
| "grad_norm": 0.0959227904677391, |
| "learning_rate": 0.00018598400000000001, |
| "loss": 1.3651, |
| "mean_token_accuracy": 0.6457678973674774, |
| "num_tokens": 5386372.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 1.3562579050660133, |
| "epoch": 0.0736, |
| "grad_norm": 0.08159064501523972, |
| "learning_rate": 0.00018534400000000002, |
| "loss": 1.3371, |
| "mean_token_accuracy": 0.6496910750865936, |
| "num_tokens": 5635934.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 1.3549902424216271, |
| "epoch": 0.0768, |
| "grad_norm": 0.09278034418821335, |
| "learning_rate": 0.000184704, |
| "loss": 1.3463, |
| "mean_token_accuracy": 0.6507370501756669, |
| "num_tokens": 5881040.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 1.3526610367000103, |
| "epoch": 0.08, |
| "grad_norm": 0.08876577019691467, |
| "learning_rate": 0.000184064, |
| "loss": 1.3458, |
| "mean_token_accuracy": 0.6485695078969002, |
| "num_tokens": 6127082.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 1.348705020546913, |
| "epoch": 0.0832, |
| "grad_norm": 0.09106452763080597, |
| "learning_rate": 0.000183424, |
| "loss": 1.3325, |
| "mean_token_accuracy": 0.6495699658989906, |
| "num_tokens": 6384976.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 1.3516556322574615, |
| "epoch": 0.0864, |
| "grad_norm": 0.10168611258268356, |
| "learning_rate": 0.000182784, |
| "loss": 1.3385, |
| "mean_token_accuracy": 0.6495325975120068, |
| "num_tokens": 6633971.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 1.3481612212955951, |
| "epoch": 0.0896, |
| "grad_norm": 0.08860975503921509, |
| "learning_rate": 0.000182144, |
| "loss": 1.3378, |
| "mean_token_accuracy": 0.6494495809078217, |
| "num_tokens": 6904349.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 1.3710920095443726, |
| "epoch": 0.0928, |
| "grad_norm": 0.11280851066112518, |
| "learning_rate": 0.000181504, |
| "loss": 1.3694, |
| "mean_token_accuracy": 0.6439775034785271, |
| "num_tokens": 7139563.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 1.3453910604119301, |
| "epoch": 0.096, |
| "grad_norm": 0.09350915998220444, |
| "learning_rate": 0.000180864, |
| "loss": 1.3355, |
| "mean_token_accuracy": 0.6493184350430965, |
| "num_tokens": 7382567.0, |
| "step": 300 |
| }, |
| { |
| "entropy": 1.343386459350586, |
| "epoch": 0.0992, |
| "grad_norm": 0.10091142356395721, |
| "learning_rate": 0.00018022400000000001, |
| "loss": 1.343, |
| "mean_token_accuracy": 0.6506248451769352, |
| "num_tokens": 7620384.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 1.3504434123635292, |
| "epoch": 0.1024, |
| "grad_norm": 0.11306887865066528, |
| "learning_rate": 0.00017958400000000002, |
| "loss": 1.3423, |
| "mean_token_accuracy": 0.6478524334728718, |
| "num_tokens": 7854350.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 1.3544681049883365, |
| "epoch": 0.1056, |
| "grad_norm": 0.10049337148666382, |
| "learning_rate": 0.000178944, |
| "loss": 1.3323, |
| "mean_token_accuracy": 0.6498128823935986, |
| "num_tokens": 8091689.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 1.3563036635518073, |
| "epoch": 0.1088, |
| "grad_norm": 0.10255710035562515, |
| "learning_rate": 0.000178304, |
| "loss": 1.3462, |
| "mean_token_accuracy": 0.6476793691515923, |
| "num_tokens": 8330974.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 1.3327604666352273, |
| "epoch": 0.112, |
| "grad_norm": 0.10156874358654022, |
| "learning_rate": 0.000177664, |
| "loss": 1.3251, |
| "mean_token_accuracy": 0.6525948382914066, |
| "num_tokens": 8571477.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 1.3347194895148278, |
| "epoch": 0.1152, |
| "grad_norm": 0.10417470335960388, |
| "learning_rate": 0.00017702400000000002, |
| "loss": 1.3276, |
| "mean_token_accuracy": 0.6516825027763844, |
| "num_tokens": 8811667.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 1.3434858575463295, |
| "epoch": 0.1184, |
| "grad_norm": 0.10329274833202362, |
| "learning_rate": 0.00017638400000000002, |
| "loss": 1.337, |
| "mean_token_accuracy": 0.6512879721820355, |
| "num_tokens": 9045845.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 1.3232483133673667, |
| "epoch": 0.1216, |
| "grad_norm": 0.09415698796510696, |
| "learning_rate": 0.000175744, |
| "loss": 1.3264, |
| "mean_token_accuracy": 0.6521417684853077, |
| "num_tokens": 9289002.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 1.3317414239048957, |
| "epoch": 0.1248, |
| "grad_norm": 0.10082657635211945, |
| "learning_rate": 0.000175104, |
| "loss": 1.3267, |
| "mean_token_accuracy": 0.6535967506468296, |
| "num_tokens": 9534097.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 1.3344782829284667, |
| "epoch": 0.128, |
| "grad_norm": 0.08955828100442886, |
| "learning_rate": 0.00017446400000000002, |
| "loss": 1.3233, |
| "mean_token_accuracy": 0.6512827917933464, |
| "num_tokens": 9775060.0, |
| "step": 400 |
| }, |
| { |
| "entropy": 1.3362744271755218, |
| "epoch": 0.1312, |
| "grad_norm": 0.09589624404907227, |
| "learning_rate": 0.00017382400000000002, |
| "loss": 1.3345, |
| "mean_token_accuracy": 0.6508975014090538, |
| "num_tokens": 10011225.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 1.3263516038656236, |
| "epoch": 0.1344, |
| "grad_norm": 0.09233064204454422, |
| "learning_rate": 0.000173184, |
| "loss": 1.2972, |
| "mean_token_accuracy": 0.6545626476407052, |
| "num_tokens": 10265073.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 1.3442956805229187, |
| "epoch": 0.1376, |
| "grad_norm": 0.09690549969673157, |
| "learning_rate": 0.000172544, |
| "loss": 1.3377, |
| "mean_token_accuracy": 0.6507034592330456, |
| "num_tokens": 10506554.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 1.3446499943733214, |
| "epoch": 0.1408, |
| "grad_norm": 0.10441666841506958, |
| "learning_rate": 0.00017190399999999999, |
| "loss": 1.3413, |
| "mean_token_accuracy": 0.6506175689399243, |
| "num_tokens": 10744084.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 1.3303291231393815, |
| "epoch": 0.144, |
| "grad_norm": 0.09686215966939926, |
| "learning_rate": 0.000171264, |
| "loss": 1.3271, |
| "mean_token_accuracy": 0.6542476817965508, |
| "num_tokens": 10979917.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 1.3150670006871223, |
| "epoch": 0.1472, |
| "grad_norm": 0.1011364534497261, |
| "learning_rate": 0.000170624, |
| "loss": 1.3034, |
| "mean_token_accuracy": 0.6562925077974796, |
| "num_tokens": 11220883.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 1.3295040972530843, |
| "epoch": 0.1504, |
| "grad_norm": 0.10409437119960785, |
| "learning_rate": 0.000169984, |
| "loss": 1.3243, |
| "mean_token_accuracy": 0.6531910292804242, |
| "num_tokens": 11465575.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 1.3206181339919567, |
| "epoch": 0.1536, |
| "grad_norm": 0.09432035684585571, |
| "learning_rate": 0.000169344, |
| "loss": 1.3171, |
| "mean_token_accuracy": 0.6539786323904991, |
| "num_tokens": 11702675.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 1.3373061180114747, |
| "epoch": 0.1568, |
| "grad_norm": 0.10558689385652542, |
| "learning_rate": 0.00016870400000000002, |
| "loss": 1.3257, |
| "mean_token_accuracy": 0.6521159410476685, |
| "num_tokens": 11946162.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 1.3231555491685867, |
| "epoch": 0.16, |
| "grad_norm": 0.11124099045991898, |
| "learning_rate": 0.000168064, |
| "loss": 1.311, |
| "mean_token_accuracy": 0.6546669088304042, |
| "num_tokens": 12193584.0, |
| "step": 500 |
| }, |
| { |
| "entropy": 1.32535812407732, |
| "epoch": 0.1632, |
| "grad_norm": 0.12996409833431244, |
| "learning_rate": 0.000167424, |
| "loss": 1.3141, |
| "mean_token_accuracy": 0.6530978180468082, |
| "num_tokens": 12429911.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 1.3478160053491592, |
| "epoch": 0.1664, |
| "grad_norm": 0.0903463065624237, |
| "learning_rate": 0.000166784, |
| "loss": 1.3198, |
| "mean_token_accuracy": 0.6524186365306377, |
| "num_tokens": 12676017.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 1.330558653920889, |
| "epoch": 0.1696, |
| "grad_norm": 0.09826014935970306, |
| "learning_rate": 0.00016614400000000001, |
| "loss": 1.3219, |
| "mean_token_accuracy": 0.6538619548082352, |
| "num_tokens": 12922333.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 1.321587622910738, |
| "epoch": 0.1728, |
| "grad_norm": 0.10300284624099731, |
| "learning_rate": 0.00016550400000000002, |
| "loss": 1.3074, |
| "mean_token_accuracy": 0.655467725545168, |
| "num_tokens": 13177611.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 1.317962073534727, |
| "epoch": 0.176, |
| "grad_norm": 0.10007941722869873, |
| "learning_rate": 0.00016486400000000003, |
| "loss": 1.3201, |
| "mean_token_accuracy": 0.6558892779052258, |
| "num_tokens": 13423705.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 1.3344788908958436, |
| "epoch": 0.1792, |
| "grad_norm": 0.10150554031133652, |
| "learning_rate": 0.000164224, |
| "loss": 1.3282, |
| "mean_token_accuracy": 0.6522024109959602, |
| "num_tokens": 13677643.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 1.3231421291828156, |
| "epoch": 0.1824, |
| "grad_norm": 0.11198398470878601, |
| "learning_rate": 0.000163584, |
| "loss": 1.3173, |
| "mean_token_accuracy": 0.6538634590804577, |
| "num_tokens": 13920963.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 1.3234215386211872, |
| "epoch": 0.1856, |
| "grad_norm": 0.10237586498260498, |
| "learning_rate": 0.00016294400000000002, |
| "loss": 1.3198, |
| "mean_token_accuracy": 0.6537396013736725, |
| "num_tokens": 14175879.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 1.3299582540988921, |
| "epoch": 0.1888, |
| "grad_norm": 0.11097724735736847, |
| "learning_rate": 0.00016230400000000002, |
| "loss": 1.3079, |
| "mean_token_accuracy": 0.6532341055572033, |
| "num_tokens": 14426394.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 1.3175608664751053, |
| "epoch": 0.192, |
| "grad_norm": 0.09972016513347626, |
| "learning_rate": 0.000161664, |
| "loss": 1.3173, |
| "mean_token_accuracy": 0.6553133882582187, |
| "num_tokens": 14666379.0, |
| "step": 600 |
| }, |
| { |
| "entropy": 1.3370580792427063, |
| "epoch": 0.1952, |
| "grad_norm": 0.11125458031892776, |
| "learning_rate": 0.000161024, |
| "loss": 1.3359, |
| "mean_token_accuracy": 0.6516197249293327, |
| "num_tokens": 14911711.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 1.3226444989442825, |
| "epoch": 0.1984, |
| "grad_norm": 0.11607536673545837, |
| "learning_rate": 0.000160384, |
| "loss": 1.3197, |
| "mean_token_accuracy": 0.6528247050940991, |
| "num_tokens": 15152918.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 1.3154152296483517, |
| "epoch": 0.2016, |
| "grad_norm": 0.10487879067659378, |
| "learning_rate": 0.000159744, |
| "loss": 1.2988, |
| "mean_token_accuracy": 0.6569284565746785, |
| "num_tokens": 15401823.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 1.316512218117714, |
| "epoch": 0.2048, |
| "grad_norm": 0.09663492441177368, |
| "learning_rate": 0.000159104, |
| "loss": 1.2988, |
| "mean_token_accuracy": 0.6556785583496094, |
| "num_tokens": 15676507.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 1.320728339254856, |
| "epoch": 0.208, |
| "grad_norm": 0.1180514469742775, |
| "learning_rate": 0.000158464, |
| "loss": 1.3219, |
| "mean_token_accuracy": 0.6542477697134018, |
| "num_tokens": 15913478.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 1.3256668210029603, |
| "epoch": 0.2112, |
| "grad_norm": 0.12013403326272964, |
| "learning_rate": 0.000157824, |
| "loss": 1.3187, |
| "mean_token_accuracy": 0.6541241362690926, |
| "num_tokens": 16153317.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 1.3174795433878899, |
| "epoch": 0.2144, |
| "grad_norm": 0.0980040431022644, |
| "learning_rate": 0.000157184, |
| "loss": 1.3142, |
| "mean_token_accuracy": 0.6559794172644615, |
| "num_tokens": 16394720.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 1.3147821851074695, |
| "epoch": 0.2176, |
| "grad_norm": 0.10305086523294449, |
| "learning_rate": 0.000156544, |
| "loss": 1.3147, |
| "mean_token_accuracy": 0.6544689692556858, |
| "num_tokens": 16629218.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 1.32085572630167, |
| "epoch": 0.2208, |
| "grad_norm": 0.10918137431144714, |
| "learning_rate": 0.000155904, |
| "loss": 1.3147, |
| "mean_token_accuracy": 0.6540469415485859, |
| "num_tokens": 16881838.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 1.3132400080561637, |
| "epoch": 0.224, |
| "grad_norm": 0.10053646564483643, |
| "learning_rate": 0.000155264, |
| "loss": 1.3057, |
| "mean_token_accuracy": 0.6564008943736553, |
| "num_tokens": 17131467.0, |
| "step": 700 |
| }, |
| { |
| "entropy": 1.3165880754590034, |
| "epoch": 0.2272, |
| "grad_norm": 0.12175013870000839, |
| "learning_rate": 0.00015462400000000002, |
| "loss": 1.3029, |
| "mean_token_accuracy": 0.6550934337079525, |
| "num_tokens": 17397448.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 1.3092229522764682, |
| "epoch": 0.2304, |
| "grad_norm": 0.10413607209920883, |
| "learning_rate": 0.00015398400000000002, |
| "loss": 1.3036, |
| "mean_token_accuracy": 0.657696595042944, |
| "num_tokens": 17639732.0, |
| "step": 720 |
| }, |
| { |
| "entropy": 1.3083337277173996, |
| "epoch": 0.2336, |
| "grad_norm": 0.12531371414661407, |
| "learning_rate": 0.000153344, |
| "loss": 1.2973, |
| "mean_token_accuracy": 0.6588804170489311, |
| "num_tokens": 17883926.0, |
| "step": 730 |
| }, |
| { |
| "entropy": 1.321522592008114, |
| "epoch": 0.2368, |
| "grad_norm": 0.10225304961204529, |
| "learning_rate": 0.000152704, |
| "loss": 1.3117, |
| "mean_token_accuracy": 0.6564318485558033, |
| "num_tokens": 18132518.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 1.315239043533802, |
| "epoch": 0.24, |
| "grad_norm": 0.10350354015827179, |
| "learning_rate": 0.000152064, |
| "loss": 1.3051, |
| "mean_token_accuracy": 0.6568679049611091, |
| "num_tokens": 18382094.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 1.3136978909373282, |
| "epoch": 0.2432, |
| "grad_norm": 0.10933377593755722, |
| "learning_rate": 0.00015142400000000002, |
| "loss": 1.3124, |
| "mean_token_accuracy": 0.6560237430036068, |
| "num_tokens": 18615093.0, |
| "step": 760 |
| }, |
| { |
| "entropy": 1.3141875579953193, |
| "epoch": 0.2464, |
| "grad_norm": 0.11045292764902115, |
| "learning_rate": 0.00015078400000000003, |
| "loss": 1.3031, |
| "mean_token_accuracy": 0.6565131366252899, |
| "num_tokens": 18860828.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 1.3189457476139068, |
| "epoch": 0.2496, |
| "grad_norm": 0.11167466640472412, |
| "learning_rate": 0.000150144, |
| "loss": 1.3158, |
| "mean_token_accuracy": 0.6546958208084106, |
| "num_tokens": 19092521.0, |
| "step": 780 |
| }, |
| { |
| "entropy": 1.3183915361762046, |
| "epoch": 0.2528, |
| "grad_norm": 0.11039945483207703, |
| "learning_rate": 0.000149504, |
| "loss": 1.3102, |
| "mean_token_accuracy": 0.6547456443309784, |
| "num_tokens": 19340961.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 1.3195623025298118, |
| "epoch": 0.256, |
| "grad_norm": 0.11707881838083267, |
| "learning_rate": 0.00014886400000000002, |
| "loss": 1.3059, |
| "mean_token_accuracy": 0.6558867372572422, |
| "num_tokens": 19587605.0, |
| "step": 800 |
| }, |
| { |
| "entropy": 1.318366675078869, |
| "epoch": 0.2592, |
| "grad_norm": 0.11866453289985657, |
| "learning_rate": 0.000148224, |
| "loss": 1.3079, |
| "mean_token_accuracy": 0.6560817562043667, |
| "num_tokens": 19849177.0, |
| "step": 810 |
| }, |
| { |
| "entropy": 1.3193187534809112, |
| "epoch": 0.2624, |
| "grad_norm": 0.12462879717350006, |
| "learning_rate": 0.000147584, |
| "loss": 1.3096, |
| "mean_token_accuracy": 0.6570102378726006, |
| "num_tokens": 20077744.0, |
| "step": 820 |
| }, |
| { |
| "entropy": 1.2931833848357202, |
| "epoch": 0.2656, |
| "grad_norm": 0.11133814603090286, |
| "learning_rate": 0.000146944, |
| "loss": 1.2905, |
| "mean_token_accuracy": 0.6588832668960094, |
| "num_tokens": 20329962.0, |
| "step": 830 |
| }, |
| { |
| "entropy": 1.3287797823548317, |
| "epoch": 0.2688, |
| "grad_norm": 0.110688216984272, |
| "learning_rate": 0.000146304, |
| "loss": 1.3251, |
| "mean_token_accuracy": 0.6538114577531815, |
| "num_tokens": 20575282.0, |
| "step": 840 |
| }, |
| { |
| "entropy": 1.3191335260868073, |
| "epoch": 0.272, |
| "grad_norm": 0.08832383155822754, |
| "learning_rate": 0.000145664, |
| "loss": 1.3076, |
| "mean_token_accuracy": 0.6542156912386418, |
| "num_tokens": 20828532.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 1.3225988179445267, |
| "epoch": 0.2752, |
| "grad_norm": 0.10659767687320709, |
| "learning_rate": 0.000145024, |
| "loss": 1.3208, |
| "mean_token_accuracy": 0.6547297932207584, |
| "num_tokens": 21063969.0, |
| "step": 860 |
| }, |
| { |
| "entropy": 1.3056413672864438, |
| "epoch": 0.2784, |
| "grad_norm": 0.11102893203496933, |
| "learning_rate": 0.000144384, |
| "loss": 1.3019, |
| "mean_token_accuracy": 0.6585859775543212, |
| "num_tokens": 21305180.0, |
| "step": 870 |
| }, |
| { |
| "entropy": 1.295969295501709, |
| "epoch": 0.2816, |
| "grad_norm": 0.10383511334657669, |
| "learning_rate": 0.000143744, |
| "loss": 1.2865, |
| "mean_token_accuracy": 0.6606860637664795, |
| "num_tokens": 21551736.0, |
| "step": 880 |
| }, |
| { |
| "entropy": 1.3159167543053627, |
| "epoch": 0.2848, |
| "grad_norm": 0.1158025935292244, |
| "learning_rate": 0.00014310400000000002, |
| "loss": 1.3122, |
| "mean_token_accuracy": 0.6532071150839329, |
| "num_tokens": 21799151.0, |
| "step": 890 |
| }, |
| { |
| "entropy": 1.3213034585118293, |
| "epoch": 0.288, |
| "grad_norm": 0.12137997895479202, |
| "learning_rate": 0.000142464, |
| "loss": 1.307, |
| "mean_token_accuracy": 0.6579110652208329, |
| "num_tokens": 22042861.0, |
| "step": 900 |
| }, |
| { |
| "entropy": 1.3044739000499248, |
| "epoch": 0.2912, |
| "grad_norm": 0.11786556988954544, |
| "learning_rate": 0.000141824, |
| "loss": 1.2919, |
| "mean_token_accuracy": 0.6586256206035614, |
| "num_tokens": 22289424.0, |
| "step": 910 |
| }, |
| { |
| "entropy": 1.3048512905836105, |
| "epoch": 0.2944, |
| "grad_norm": 0.09443694353103638, |
| "learning_rate": 0.000141184, |
| "loss": 1.3045, |
| "mean_token_accuracy": 0.6565531671047211, |
| "num_tokens": 22532672.0, |
| "step": 920 |
| }, |
| { |
| "entropy": 1.3144661530852317, |
| "epoch": 0.2976, |
| "grad_norm": 0.10688792914152145, |
| "learning_rate": 0.00014054400000000002, |
| "loss": 1.3026, |
| "mean_token_accuracy": 0.6541644394397735, |
| "num_tokens": 22781407.0, |
| "step": 930 |
| }, |
| { |
| "entropy": 1.3230641946196555, |
| "epoch": 0.3008, |
| "grad_norm": 0.10468871146440506, |
| "learning_rate": 0.00013990400000000002, |
| "loss": 1.3116, |
| "mean_token_accuracy": 0.6554778039455413, |
| "num_tokens": 23024418.0, |
| "step": 940 |
| }, |
| { |
| "entropy": 1.3082950003445148, |
| "epoch": 0.304, |
| "grad_norm": 0.0945873111486435, |
| "learning_rate": 0.00013926400000000003, |
| "loss": 1.2946, |
| "mean_token_accuracy": 0.6569370336830616, |
| "num_tokens": 23278003.0, |
| "step": 950 |
| }, |
| { |
| "entropy": 1.2960704147815705, |
| "epoch": 0.3072, |
| "grad_norm": 0.11186746507883072, |
| "learning_rate": 0.000138624, |
| "loss": 1.2988, |
| "mean_token_accuracy": 0.6593247003853321, |
| "num_tokens": 23525101.0, |
| "step": 960 |
| }, |
| { |
| "entropy": 1.3077548265457153, |
| "epoch": 0.3104, |
| "grad_norm": 0.11378996819257736, |
| "learning_rate": 0.000137984, |
| "loss": 1.3029, |
| "mean_token_accuracy": 0.6561847567558289, |
| "num_tokens": 23764370.0, |
| "step": 970 |
| }, |
| { |
| "entropy": 1.3246932446956634, |
| "epoch": 0.3136, |
| "grad_norm": 0.11621101945638657, |
| "learning_rate": 0.00013734400000000002, |
| "loss": 1.3194, |
| "mean_token_accuracy": 0.6553701266646386, |
| "num_tokens": 23989373.0, |
| "step": 980 |
| }, |
| { |
| "entropy": 1.3070946156978607, |
| "epoch": 0.3168, |
| "grad_norm": 0.10978589206933975, |
| "learning_rate": 0.000136704, |
| "loss": 1.2936, |
| "mean_token_accuracy": 0.65978347286582, |
| "num_tokens": 24236128.0, |
| "step": 990 |
| }, |
| { |
| "entropy": 1.3108026057481765, |
| "epoch": 0.32, |
| "grad_norm": 0.12311512976884842, |
| "learning_rate": 0.000136064, |
| "loss": 1.3092, |
| "mean_token_accuracy": 0.6566934674978256, |
| "num_tokens": 24472782.0, |
| "step": 1000 |
| }, |
| { |
| "entropy": 1.2946225196123122, |
| "epoch": 0.3232, |
| "grad_norm": 0.1115647554397583, |
| "learning_rate": 0.000135424, |
| "loss": 1.2862, |
| "mean_token_accuracy": 0.6616561591625214, |
| "num_tokens": 24712655.0, |
| "step": 1010 |
| }, |
| { |
| "entropy": 1.311082622408867, |
| "epoch": 0.3264, |
| "grad_norm": 0.1018873080611229, |
| "learning_rate": 0.000134784, |
| "loss": 1.3179, |
| "mean_token_accuracy": 0.6546162769198418, |
| "num_tokens": 24957725.0, |
| "step": 1020 |
| }, |
| { |
| "entropy": 1.3099162042140962, |
| "epoch": 0.3296, |
| "grad_norm": 0.11416924744844437, |
| "learning_rate": 0.000134144, |
| "loss": 1.2975, |
| "mean_token_accuracy": 0.6587870210409165, |
| "num_tokens": 25200016.0, |
| "step": 1030 |
| }, |
| { |
| "entropy": 1.2903758212924004, |
| "epoch": 0.3328, |
| "grad_norm": 0.11968304961919785, |
| "learning_rate": 0.000133504, |
| "loss": 1.2969, |
| "mean_token_accuracy": 0.6600047402083874, |
| "num_tokens": 25442186.0, |
| "step": 1040 |
| }, |
| { |
| "entropy": 1.3093498826026917, |
| "epoch": 0.336, |
| "grad_norm": 0.10511133819818497, |
| "learning_rate": 0.000132864, |
| "loss": 1.2979, |
| "mean_token_accuracy": 0.659348300844431, |
| "num_tokens": 25688756.0, |
| "step": 1050 |
| }, |
| { |
| "entropy": 1.3099051028490067, |
| "epoch": 0.3392, |
| "grad_norm": 0.11748205870389938, |
| "learning_rate": 0.000132224, |
| "loss": 1.302, |
| "mean_token_accuracy": 0.6570538312196732, |
| "num_tokens": 25941106.0, |
| "step": 1060 |
| }, |
| { |
| "entropy": 1.2965721994638444, |
| "epoch": 0.3424, |
| "grad_norm": 0.1022774800658226, |
| "learning_rate": 0.000131584, |
| "loss": 1.2875, |
| "mean_token_accuracy": 0.661591324210167, |
| "num_tokens": 26194045.0, |
| "step": 1070 |
| }, |
| { |
| "entropy": 1.2942138247191906, |
| "epoch": 0.3456, |
| "grad_norm": 0.09989454597234726, |
| "learning_rate": 0.000130944, |
| "loss": 1.2754, |
| "mean_token_accuracy": 0.6626487828791141, |
| "num_tokens": 26436761.0, |
| "step": 1080 |
| }, |
| { |
| "entropy": 1.302545104175806, |
| "epoch": 0.3488, |
| "grad_norm": 0.10628174245357513, |
| "learning_rate": 0.000130304, |
| "loss": 1.286, |
| "mean_token_accuracy": 0.6594067342579365, |
| "num_tokens": 26680746.0, |
| "step": 1090 |
| }, |
| { |
| "entropy": 1.302603267133236, |
| "epoch": 0.352, |
| "grad_norm": 0.12155800312757492, |
| "learning_rate": 0.000129664, |
| "loss": 1.3027, |
| "mean_token_accuracy": 0.6587515436112881, |
| "num_tokens": 26907065.0, |
| "step": 1100 |
| }, |
| { |
| "entropy": 1.3198756337165833, |
| "epoch": 0.3552, |
| "grad_norm": 0.10880861431360245, |
| "learning_rate": 0.00012902400000000002, |
| "loss": 1.3082, |
| "mean_token_accuracy": 0.656289030611515, |
| "num_tokens": 27161233.0, |
| "step": 1110 |
| }, |
| { |
| "entropy": 1.2902938559651376, |
| "epoch": 0.3584, |
| "grad_norm": 0.11190807819366455, |
| "learning_rate": 0.00012838400000000002, |
| "loss": 1.282, |
| "mean_token_accuracy": 0.6609233863651752, |
| "num_tokens": 27405578.0, |
| "step": 1120 |
| }, |
| { |
| "entropy": 1.2958773463964461, |
| "epoch": 0.3616, |
| "grad_norm": 0.11322169005870819, |
| "learning_rate": 0.000127744, |
| "loss": 1.2913, |
| "mean_token_accuracy": 0.6599987909197808, |
| "num_tokens": 27637766.0, |
| "step": 1130 |
| }, |
| { |
| "entropy": 1.3140671238303185, |
| "epoch": 0.3648, |
| "grad_norm": 0.11261705309152603, |
| "learning_rate": 0.000127104, |
| "loss": 1.3017, |
| "mean_token_accuracy": 0.6572919100522995, |
| "num_tokens": 27875161.0, |
| "step": 1140 |
| }, |
| { |
| "entropy": 1.3053696781396866, |
| "epoch": 0.368, |
| "grad_norm": 0.11668220907449722, |
| "learning_rate": 0.00012646400000000001, |
| "loss": 1.2983, |
| "mean_token_accuracy": 0.6583772532641887, |
| "num_tokens": 28107893.0, |
| "step": 1150 |
| }, |
| { |
| "entropy": 1.3009184449911118, |
| "epoch": 0.3712, |
| "grad_norm": 0.10576856881380081, |
| "learning_rate": 0.00012582400000000002, |
| "loss": 1.2965, |
| "mean_token_accuracy": 0.6595028474926948, |
| "num_tokens": 28348673.0, |
| "step": 1160 |
| }, |
| { |
| "entropy": 1.31601525247097, |
| "epoch": 0.3744, |
| "grad_norm": 0.11733856052160263, |
| "learning_rate": 0.000125184, |
| "loss": 1.3111, |
| "mean_token_accuracy": 0.6571513615548611, |
| "num_tokens": 28594970.0, |
| "step": 1170 |
| }, |
| { |
| "entropy": 1.3011867359280587, |
| "epoch": 0.3776, |
| "grad_norm": 0.11323832720518112, |
| "learning_rate": 0.000124544, |
| "loss": 1.2909, |
| "mean_token_accuracy": 0.6589725501835346, |
| "num_tokens": 28844455.0, |
| "step": 1180 |
| }, |
| { |
| "entropy": 1.3101375102996826, |
| "epoch": 0.3808, |
| "grad_norm": 0.12091344594955444, |
| "learning_rate": 0.00012390399999999998, |
| "loss": 1.2974, |
| "mean_token_accuracy": 0.6579912699759006, |
| "num_tokens": 29086055.0, |
| "step": 1190 |
| }, |
| { |
| "entropy": 1.2863799035549164, |
| "epoch": 0.384, |
| "grad_norm": 0.11909841001033783, |
| "learning_rate": 0.000123264, |
| "loss": 1.2903, |
| "mean_token_accuracy": 0.6595034293830395, |
| "num_tokens": 29340160.0, |
| "step": 1200 |
| }, |
| { |
| "entropy": 1.31235663741827, |
| "epoch": 0.3872, |
| "grad_norm": 0.11971624195575714, |
| "learning_rate": 0.000122624, |
| "loss": 1.3001, |
| "mean_token_accuracy": 0.6574183270335198, |
| "num_tokens": 29570492.0, |
| "step": 1210 |
| }, |
| { |
| "entropy": 1.3069498613476753, |
| "epoch": 0.3904, |
| "grad_norm": 0.09152086079120636, |
| "learning_rate": 0.000121984, |
| "loss": 1.3003, |
| "mean_token_accuracy": 0.6592905893921852, |
| "num_tokens": 29823932.0, |
| "step": 1220 |
| }, |
| { |
| "entropy": 1.3035918101668358, |
| "epoch": 0.3936, |
| "grad_norm": 0.1179327666759491, |
| "learning_rate": 0.00012134400000000001, |
| "loss": 1.2902, |
| "mean_token_accuracy": 0.6589632578194141, |
| "num_tokens": 30083036.0, |
| "step": 1230 |
| }, |
| { |
| "entropy": 1.2851035431027413, |
| "epoch": 0.3968, |
| "grad_norm": 0.09468456357717514, |
| "learning_rate": 0.00012070399999999999, |
| "loss": 1.2718, |
| "mean_token_accuracy": 0.6635173566639423, |
| "num_tokens": 30330880.0, |
| "step": 1240 |
| }, |
| { |
| "entropy": 1.2939402066171168, |
| "epoch": 0.4, |
| "grad_norm": 0.11907203495502472, |
| "learning_rate": 0.000120064, |
| "loss": 1.2918, |
| "mean_token_accuracy": 0.6597561411559582, |
| "num_tokens": 30567694.0, |
| "step": 1250 |
| }, |
| { |
| "entropy": 1.2958165138959885, |
| "epoch": 0.4032, |
| "grad_norm": 0.12533923983573914, |
| "learning_rate": 0.000119424, |
| "loss": 1.2747, |
| "mean_token_accuracy": 0.6604633182287216, |
| "num_tokens": 30819173.0, |
| "step": 1260 |
| }, |
| { |
| "entropy": 1.3044821873307229, |
| "epoch": 0.4064, |
| "grad_norm": 0.11745740473270416, |
| "learning_rate": 0.000118784, |
| "loss": 1.2972, |
| "mean_token_accuracy": 0.658799535036087, |
| "num_tokens": 31067589.0, |
| "step": 1270 |
| }, |
| { |
| "entropy": 1.3022723242640495, |
| "epoch": 0.4096, |
| "grad_norm": 0.12718279659748077, |
| "learning_rate": 0.00011814400000000001, |
| "loss": 1.2948, |
| "mean_token_accuracy": 0.6585467241704464, |
| "num_tokens": 31312655.0, |
| "step": 1280 |
| }, |
| { |
| "entropy": 1.283016387373209, |
| "epoch": 0.4128, |
| "grad_norm": 0.11515259742736816, |
| "learning_rate": 0.00011750400000000002, |
| "loss": 1.2748, |
| "mean_token_accuracy": 0.663072568923235, |
| "num_tokens": 31579920.0, |
| "step": 1290 |
| }, |
| { |
| "entropy": 1.2980821460485459, |
| "epoch": 0.416, |
| "grad_norm": 0.10572729259729385, |
| "learning_rate": 0.000116864, |
| "loss": 1.2873, |
| "mean_token_accuracy": 0.6594301424920559, |
| "num_tokens": 31817578.0, |
| "step": 1300 |
| }, |
| { |
| "entropy": 1.3054782345890998, |
| "epoch": 0.4192, |
| "grad_norm": 0.11281997710466385, |
| "learning_rate": 0.000116224, |
| "loss": 1.3028, |
| "mean_token_accuracy": 0.6591689370572567, |
| "num_tokens": 32058048.0, |
| "step": 1310 |
| }, |
| { |
| "entropy": 1.2863758310675621, |
| "epoch": 0.4224, |
| "grad_norm": 0.11885910481214523, |
| "learning_rate": 0.00011558400000000001, |
| "loss": 1.2758, |
| "mean_token_accuracy": 0.659640759974718, |
| "num_tokens": 32312447.0, |
| "step": 1320 |
| }, |
| { |
| "entropy": 1.2836967006325721, |
| "epoch": 0.4256, |
| "grad_norm": 0.11872182786464691, |
| "learning_rate": 0.000114944, |
| "loss": 1.2721, |
| "mean_token_accuracy": 0.6634449824690819, |
| "num_tokens": 32550927.0, |
| "step": 1330 |
| }, |
| { |
| "entropy": 1.2840148255228996, |
| "epoch": 0.4288, |
| "grad_norm": 0.1084010899066925, |
| "learning_rate": 0.00011430400000000001, |
| "loss": 1.2836, |
| "mean_token_accuracy": 0.6622305043041706, |
| "num_tokens": 32807743.0, |
| "step": 1340 |
| }, |
| { |
| "entropy": 1.2817936651408672, |
| "epoch": 0.432, |
| "grad_norm": 0.11066750437021255, |
| "learning_rate": 0.00011366400000000001, |
| "loss": 1.2603, |
| "mean_token_accuracy": 0.6648251056671143, |
| "num_tokens": 33060740.0, |
| "step": 1350 |
| }, |
| { |
| "entropy": 1.2912034377455712, |
| "epoch": 0.4352, |
| "grad_norm": 0.09771846234798431, |
| "learning_rate": 0.000113024, |
| "loss": 1.2735, |
| "mean_token_accuracy": 0.6614834323525429, |
| "num_tokens": 33298003.0, |
| "step": 1360 |
| }, |
| { |
| "entropy": 1.2874145463109017, |
| "epoch": 0.4384, |
| "grad_norm": 0.10283275693655014, |
| "learning_rate": 0.000112384, |
| "loss": 1.2869, |
| "mean_token_accuracy": 0.6601925425231456, |
| "num_tokens": 33545132.0, |
| "step": 1370 |
| }, |
| { |
| "entropy": 1.3115147739648818, |
| "epoch": 0.4416, |
| "grad_norm": 0.1161857396364212, |
| "learning_rate": 0.000111744, |
| "loss": 1.3023, |
| "mean_token_accuracy": 0.6571636445820331, |
| "num_tokens": 33783413.0, |
| "step": 1380 |
| }, |
| { |
| "entropy": 1.2771015651524067, |
| "epoch": 0.4448, |
| "grad_norm": 0.10708731412887573, |
| "learning_rate": 0.00011110400000000001, |
| "loss": 1.2636, |
| "mean_token_accuracy": 0.6635805793106556, |
| "num_tokens": 34035248.0, |
| "step": 1390 |
| }, |
| { |
| "entropy": 1.2920770995318889, |
| "epoch": 0.448, |
| "grad_norm": 0.1167186051607132, |
| "learning_rate": 0.00011046400000000002, |
| "loss": 1.2838, |
| "mean_token_accuracy": 0.6611834704875946, |
| "num_tokens": 34293940.0, |
| "step": 1400 |
| }, |
| { |
| "entropy": 1.3105048850178718, |
| "epoch": 0.4512, |
| "grad_norm": 0.1058393269777298, |
| "learning_rate": 0.00010982400000000001, |
| "loss": 1.3017, |
| "mean_token_accuracy": 0.6579717807471752, |
| "num_tokens": 34536644.0, |
| "step": 1410 |
| }, |
| { |
| "entropy": 1.2866226516664028, |
| "epoch": 0.4544, |
| "grad_norm": 0.11724147945642471, |
| "learning_rate": 0.000109184, |
| "loss": 1.2775, |
| "mean_token_accuracy": 0.663629986345768, |
| "num_tokens": 34785178.0, |
| "step": 1420 |
| }, |
| { |
| "entropy": 1.2797075405716896, |
| "epoch": 0.4576, |
| "grad_norm": 0.12725114822387695, |
| "learning_rate": 0.000108544, |
| "loss": 1.2757, |
| "mean_token_accuracy": 0.6621624812483787, |
| "num_tokens": 35016972.0, |
| "step": 1430 |
| }, |
| { |
| "entropy": 1.2796621806919575, |
| "epoch": 0.4608, |
| "grad_norm": 0.11476690322160721, |
| "learning_rate": 0.000107904, |
| "loss": 1.2668, |
| "mean_token_accuracy": 0.6637491337954998, |
| "num_tokens": 35258105.0, |
| "step": 1440 |
| }, |
| { |
| "entropy": 1.3116767704486847, |
| "epoch": 0.464, |
| "grad_norm": 0.11448593437671661, |
| "learning_rate": 0.00010726400000000001, |
| "loss": 1.3041, |
| "mean_token_accuracy": 0.65796999335289, |
| "num_tokens": 35488734.0, |
| "step": 1450 |
| }, |
| { |
| "entropy": 1.2956475257873534, |
| "epoch": 0.4672, |
| "grad_norm": 0.09690966457128525, |
| "learning_rate": 0.00010662400000000001, |
| "loss": 1.2849, |
| "mean_token_accuracy": 0.6598352804780007, |
| "num_tokens": 35737361.0, |
| "step": 1460 |
| }, |
| { |
| "entropy": 1.2924716591835022, |
| "epoch": 0.4704, |
| "grad_norm": 0.12266408652067184, |
| "learning_rate": 0.000105984, |
| "loss": 1.2843, |
| "mean_token_accuracy": 0.6619208715856075, |
| "num_tokens": 35973111.0, |
| "step": 1470 |
| }, |
| { |
| "entropy": 1.2869246512651444, |
| "epoch": 0.4736, |
| "grad_norm": 0.12115157395601273, |
| "learning_rate": 0.000105344, |
| "loss": 1.2847, |
| "mean_token_accuracy": 0.6612910144031048, |
| "num_tokens": 36208327.0, |
| "step": 1480 |
| }, |
| { |
| "entropy": 1.298311972618103, |
| "epoch": 0.4768, |
| "grad_norm": 0.12120506167411804, |
| "learning_rate": 0.000104704, |
| "loss": 1.2876, |
| "mean_token_accuracy": 0.660318773984909, |
| "num_tokens": 36451671.0, |
| "step": 1490 |
| }, |
| { |
| "entropy": 1.2932984501123428, |
| "epoch": 0.48, |
| "grad_norm": 0.11744143813848495, |
| "learning_rate": 0.00010406400000000001, |
| "loss": 1.2886, |
| "mean_token_accuracy": 0.6611990183591843, |
| "num_tokens": 36683329.0, |
| "step": 1500 |
| }, |
| { |
| "entropy": 1.3003613620996475, |
| "epoch": 0.4832, |
| "grad_norm": 0.12476562708616257, |
| "learning_rate": 0.000103424, |
| "loss": 1.285, |
| "mean_token_accuracy": 0.6586381584405899, |
| "num_tokens": 36926726.0, |
| "step": 1510 |
| }, |
| { |
| "entropy": 1.2948037102818488, |
| "epoch": 0.4864, |
| "grad_norm": 0.12511593103408813, |
| "learning_rate": 0.00010278400000000001, |
| "loss": 1.2942, |
| "mean_token_accuracy": 0.6591917663812638, |
| "num_tokens": 37172097.0, |
| "step": 1520 |
| }, |
| { |
| "entropy": 1.2966615833342074, |
| "epoch": 0.4896, |
| "grad_norm": 0.11104147881269455, |
| "learning_rate": 0.00010214399999999999, |
| "loss": 1.2861, |
| "mean_token_accuracy": 0.6602436915040016, |
| "num_tokens": 37416447.0, |
| "step": 1530 |
| }, |
| { |
| "entropy": 1.296153624355793, |
| "epoch": 0.4928, |
| "grad_norm": 0.11017699539661407, |
| "learning_rate": 0.000101504, |
| "loss": 1.2953, |
| "mean_token_accuracy": 0.660629465430975, |
| "num_tokens": 37665260.0, |
| "step": 1540 |
| }, |
| { |
| "entropy": 1.3059000916779042, |
| "epoch": 0.496, |
| "grad_norm": 0.13431954383850098, |
| "learning_rate": 0.000100864, |
| "loss": 1.2863, |
| "mean_token_accuracy": 0.6589184135198594, |
| "num_tokens": 37900028.0, |
| "step": 1550 |
| }, |
| { |
| "entropy": 1.2858399972319603, |
| "epoch": 0.4992, |
| "grad_norm": 0.10387441515922546, |
| "learning_rate": 0.00010022400000000001, |
| "loss": 1.2844, |
| "mean_token_accuracy": 0.6599555842578411, |
| "num_tokens": 38143880.0, |
| "step": 1560 |
| }, |
| { |
| "entropy": 1.2912098079919816, |
| "epoch": 0.5024, |
| "grad_norm": 0.10686223208904266, |
| "learning_rate": 9.9584e-05, |
| "loss": 1.2773, |
| "mean_token_accuracy": 0.6621040225028991, |
| "num_tokens": 38385328.0, |
| "step": 1570 |
| }, |
| { |
| "entropy": 1.2873357623815536, |
| "epoch": 0.5056, |
| "grad_norm": 0.11096842586994171, |
| "learning_rate": 9.8944e-05, |
| "loss": 1.2696, |
| "mean_token_accuracy": 0.6641848027706146, |
| "num_tokens": 38623074.0, |
| "step": 1580 |
| }, |
| { |
| "entropy": 1.2860740795731544, |
| "epoch": 0.5088, |
| "grad_norm": 0.12370982766151428, |
| "learning_rate": 9.830400000000001e-05, |
| "loss": 1.2812, |
| "mean_token_accuracy": 0.6610544748604298, |
| "num_tokens": 38875248.0, |
| "step": 1590 |
| }, |
| { |
| "entropy": 1.273354247957468, |
| "epoch": 0.512, |
| "grad_norm": 0.09915758669376373, |
| "learning_rate": 9.7664e-05, |
| "loss": 1.2608, |
| "mean_token_accuracy": 0.6646574601531029, |
| "num_tokens": 39136593.0, |
| "step": 1600 |
| }, |
| { |
| "entropy": 1.2963866576552392, |
| "epoch": 0.5152, |
| "grad_norm": 0.1212090253829956, |
| "learning_rate": 9.7024e-05, |
| "loss": 1.2922, |
| "mean_token_accuracy": 0.6610896795988083, |
| "num_tokens": 39376811.0, |
| "step": 1610 |
| }, |
| { |
| "entropy": 1.2954113230109214, |
| "epoch": 0.5184, |
| "grad_norm": 0.12089242786169052, |
| "learning_rate": 9.6384e-05, |
| "loss": 1.2936, |
| "mean_token_accuracy": 0.6614227883517743, |
| "num_tokens": 39615619.0, |
| "step": 1620 |
| }, |
| { |
| "entropy": 1.294113714247942, |
| "epoch": 0.5216, |
| "grad_norm": 0.12379191815853119, |
| "learning_rate": 9.5744e-05, |
| "loss": 1.2821, |
| "mean_token_accuracy": 0.6602171882987022, |
| "num_tokens": 39853819.0, |
| "step": 1630 |
| }, |
| { |
| "entropy": 1.2999729424715043, |
| "epoch": 0.5248, |
| "grad_norm": 0.10780320316553116, |
| "learning_rate": 9.5104e-05, |
| "loss": 1.297, |
| "mean_token_accuracy": 0.6596785984933377, |
| "num_tokens": 40095019.0, |
| "step": 1640 |
| }, |
| { |
| "entropy": 1.2916208013892174, |
| "epoch": 0.528, |
| "grad_norm": 0.11127542704343796, |
| "learning_rate": 9.446400000000001e-05, |
| "loss": 1.2767, |
| "mean_token_accuracy": 0.6634647272527218, |
| "num_tokens": 40330639.0, |
| "step": 1650 |
| }, |
| { |
| "entropy": 1.2651664420962334, |
| "epoch": 0.5312, |
| "grad_norm": 0.1118161603808403, |
| "learning_rate": 9.3824e-05, |
| "loss": 1.2666, |
| "mean_token_accuracy": 0.665022649616003, |
| "num_tokens": 40576767.0, |
| "step": 1660 |
| }, |
| { |
| "entropy": 1.3101583018898963, |
| "epoch": 0.5344, |
| "grad_norm": 0.11668220162391663, |
| "learning_rate": 9.318400000000001e-05, |
| "loss": 1.3058, |
| "mean_token_accuracy": 0.6568942174315453, |
| "num_tokens": 40826990.0, |
| "step": 1670 |
| }, |
| { |
| "entropy": 1.3064464211463929, |
| "epoch": 0.5376, |
| "grad_norm": 0.11836805194616318, |
| "learning_rate": 9.254400000000001e-05, |
| "loss": 1.2986, |
| "mean_token_accuracy": 0.6598636016249657, |
| "num_tokens": 41059310.0, |
| "step": 1680 |
| }, |
| { |
| "entropy": 1.2753410398960114, |
| "epoch": 0.5408, |
| "grad_norm": 0.11756067723035812, |
| "learning_rate": 9.1904e-05, |
| "loss": 1.2631, |
| "mean_token_accuracy": 0.6639902092516422, |
| "num_tokens": 41314851.0, |
| "step": 1690 |
| }, |
| { |
| "entropy": 1.2974440984427928, |
| "epoch": 0.544, |
| "grad_norm": 0.12369421124458313, |
| "learning_rate": 9.1264e-05, |
| "loss": 1.2976, |
| "mean_token_accuracy": 0.6614777378737926, |
| "num_tokens": 41546600.0, |
| "step": 1700 |
| }, |
| { |
| "entropy": 1.2844402551651002, |
| "epoch": 0.5472, |
| "grad_norm": 0.11832479387521744, |
| "learning_rate": 9.0624e-05, |
| "loss": 1.2796, |
| "mean_token_accuracy": 0.6630353279411793, |
| "num_tokens": 41793602.0, |
| "step": 1710 |
| }, |
| { |
| "entropy": 1.291226226836443, |
| "epoch": 0.5504, |
| "grad_norm": 0.12497137486934662, |
| "learning_rate": 8.9984e-05, |
| "loss": 1.2871, |
| "mean_token_accuracy": 0.6616393506526947, |
| "num_tokens": 42029939.0, |
| "step": 1720 |
| }, |
| { |
| "entropy": 1.283573118597269, |
| "epoch": 0.5536, |
| "grad_norm": 0.11917891353368759, |
| "learning_rate": 8.9344e-05, |
| "loss": 1.2724, |
| "mean_token_accuracy": 0.6623697899281978, |
| "num_tokens": 42273631.0, |
| "step": 1730 |
| }, |
| { |
| "entropy": 1.2863767340779304, |
| "epoch": 0.5568, |
| "grad_norm": 0.11677709966897964, |
| "learning_rate": 8.870400000000001e-05, |
| "loss": 1.2743, |
| "mean_token_accuracy": 0.6619865775108338, |
| "num_tokens": 42524742.0, |
| "step": 1740 |
| }, |
| { |
| "entropy": 1.2735685274004935, |
| "epoch": 0.56, |
| "grad_norm": 0.12022030353546143, |
| "learning_rate": 8.8064e-05, |
| "loss": 1.2674, |
| "mean_token_accuracy": 0.6651507563889026, |
| "num_tokens": 42761081.0, |
| "step": 1750 |
| }, |
| { |
| "entropy": 1.292349175363779, |
| "epoch": 0.5632, |
| "grad_norm": 0.11252912878990173, |
| "learning_rate": 8.742400000000001e-05, |
| "loss": 1.2778, |
| "mean_token_accuracy": 0.6609838925302028, |
| "num_tokens": 43001402.0, |
| "step": 1760 |
| }, |
| { |
| "entropy": 1.2807253673672676, |
| "epoch": 0.5664, |
| "grad_norm": 0.11592043936252594, |
| "learning_rate": 8.6784e-05, |
| "loss": 1.2766, |
| "mean_token_accuracy": 0.6626990288496017, |
| "num_tokens": 43249266.0, |
| "step": 1770 |
| }, |
| { |
| "entropy": 1.2867185749113559, |
| "epoch": 0.5696, |
| "grad_norm": 0.1209031492471695, |
| "learning_rate": 8.614400000000001e-05, |
| "loss": 1.2709, |
| "mean_token_accuracy": 0.6622023105621337, |
| "num_tokens": 43501018.0, |
| "step": 1780 |
| }, |
| { |
| "entropy": 1.2948940724134446, |
| "epoch": 0.5728, |
| "grad_norm": 0.12464050203561783, |
| "learning_rate": 8.5504e-05, |
| "loss": 1.2816, |
| "mean_token_accuracy": 0.660938760638237, |
| "num_tokens": 43736409.0, |
| "step": 1790 |
| }, |
| { |
| "entropy": 1.2794981703162194, |
| "epoch": 0.576, |
| "grad_norm": 0.09791232645511627, |
| "learning_rate": 8.486399999999999e-05, |
| "loss": 1.2654, |
| "mean_token_accuracy": 0.6630588375031948, |
| "num_tokens": 43964780.0, |
| "step": 1800 |
| }, |
| { |
| "entropy": 1.3002039618790149, |
| "epoch": 0.5792, |
| "grad_norm": 0.10294368118047714, |
| "learning_rate": 8.4224e-05, |
| "loss": 1.2956, |
| "mean_token_accuracy": 0.6607864029705525, |
| "num_tokens": 44213749.0, |
| "step": 1810 |
| }, |
| { |
| "entropy": 1.2742962069809436, |
| "epoch": 0.5824, |
| "grad_norm": 0.0991392433643341, |
| "learning_rate": 8.3584e-05, |
| "loss": 1.25, |
| "mean_token_accuracy": 0.665590125322342, |
| "num_tokens": 44470739.0, |
| "step": 1820 |
| }, |
| { |
| "entropy": 1.2917442351579667, |
| "epoch": 0.5856, |
| "grad_norm": 0.12601912021636963, |
| "learning_rate": 8.2944e-05, |
| "loss": 1.2828, |
| "mean_token_accuracy": 0.6622489891946316, |
| "num_tokens": 44703199.0, |
| "step": 1830 |
| }, |
| { |
| "entropy": 1.275299820303917, |
| "epoch": 0.5888, |
| "grad_norm": 0.12390691041946411, |
| "learning_rate": 8.2304e-05, |
| "loss": 1.266, |
| "mean_token_accuracy": 0.6645067222416401, |
| "num_tokens": 44960600.0, |
| "step": 1840 |
| }, |
| { |
| "entropy": 1.2826978042721748, |
| "epoch": 0.592, |
| "grad_norm": 0.12506622076034546, |
| "learning_rate": 8.166400000000001e-05, |
| "loss": 1.2771, |
| "mean_token_accuracy": 0.6630433753132821, |
| "num_tokens": 45200255.0, |
| "step": 1850 |
| }, |
| { |
| "entropy": 1.2878143966197968, |
| "epoch": 0.5952, |
| "grad_norm": 0.11564245074987411, |
| "learning_rate": 8.1024e-05, |
| "loss": 1.2767, |
| "mean_token_accuracy": 0.6613390885293484, |
| "num_tokens": 45453962.0, |
| "step": 1860 |
| }, |
| { |
| "entropy": 1.2832387924194335, |
| "epoch": 0.5984, |
| "grad_norm": 0.1215086281299591, |
| "learning_rate": 8.038400000000001e-05, |
| "loss": 1.2823, |
| "mean_token_accuracy": 0.6630079805850982, |
| "num_tokens": 45683100.0, |
| "step": 1870 |
| }, |
| { |
| "entropy": 1.2782616093754768, |
| "epoch": 0.6016, |
| "grad_norm": 0.11948413401842117, |
| "learning_rate": 7.9744e-05, |
| "loss": 1.251, |
| "mean_token_accuracy": 0.6664728112518787, |
| "num_tokens": 45948341.0, |
| "step": 1880 |
| }, |
| { |
| "entropy": 1.282223752886057, |
| "epoch": 0.6048, |
| "grad_norm": 0.12758035957813263, |
| "learning_rate": 7.910400000000001e-05, |
| "loss": 1.2797, |
| "mean_token_accuracy": 0.6634828291833401, |
| "num_tokens": 46188142.0, |
| "step": 1890 |
| }, |
| { |
| "entropy": 1.2859626293182373, |
| "epoch": 0.608, |
| "grad_norm": 0.11322532594203949, |
| "learning_rate": 7.8464e-05, |
| "loss": 1.2839, |
| "mean_token_accuracy": 0.6610695406794548, |
| "num_tokens": 46432450.0, |
| "step": 1900 |
| }, |
| { |
| "entropy": 1.2739025503396988, |
| "epoch": 0.6112, |
| "grad_norm": 0.12269606441259384, |
| "learning_rate": 7.7824e-05, |
| "loss": 1.2569, |
| "mean_token_accuracy": 0.6647766470909119, |
| "num_tokens": 46675172.0, |
| "step": 1910 |
| }, |
| { |
| "entropy": 1.2879651993513108, |
| "epoch": 0.6144, |
| "grad_norm": 0.14067530632019043, |
| "learning_rate": 7.7184e-05, |
| "loss": 1.2839, |
| "mean_token_accuracy": 0.6605637408792973, |
| "num_tokens": 46921940.0, |
| "step": 1920 |
| }, |
| { |
| "entropy": 1.3045418858528137, |
| "epoch": 0.6176, |
| "grad_norm": 0.11205250024795532, |
| "learning_rate": 7.6544e-05, |
| "loss": 1.2965, |
| "mean_token_accuracy": 0.6595428980886936, |
| "num_tokens": 47156371.0, |
| "step": 1930 |
| }, |
| { |
| "entropy": 1.2783097460865975, |
| "epoch": 0.6208, |
| "grad_norm": 0.12043390423059464, |
| "learning_rate": 7.590400000000001e-05, |
| "loss": 1.2716, |
| "mean_token_accuracy": 0.6628904029726982, |
| "num_tokens": 47397196.0, |
| "step": 1940 |
| }, |
| { |
| "entropy": 1.2877770692110062, |
| "epoch": 0.624, |
| "grad_norm": 0.12139495462179184, |
| "learning_rate": 7.5264e-05, |
| "loss": 1.2804, |
| "mean_token_accuracy": 0.6629617258906364, |
| "num_tokens": 47631807.0, |
| "step": 1950 |
| }, |
| { |
| "entropy": 1.2847104147076607, |
| "epoch": 0.6272, |
| "grad_norm": 0.11467945575714111, |
| "learning_rate": 7.462400000000001e-05, |
| "loss": 1.2744, |
| "mean_token_accuracy": 0.6624419778585434, |
| "num_tokens": 47872281.0, |
| "step": 1960 |
| }, |
| { |
| "entropy": 1.2643931224942206, |
| "epoch": 0.6304, |
| "grad_norm": 0.11596507579088211, |
| "learning_rate": 7.398400000000002e-05, |
| "loss": 1.249, |
| "mean_token_accuracy": 0.6665705449879169, |
| "num_tokens": 48130031.0, |
| "step": 1970 |
| }, |
| { |
| "entropy": 1.2806773453950882, |
| "epoch": 0.6336, |
| "grad_norm": 0.12911729514598846, |
| "learning_rate": 7.334400000000001e-05, |
| "loss": 1.2723, |
| "mean_token_accuracy": 0.6623373091220855, |
| "num_tokens": 48374685.0, |
| "step": 1980 |
| }, |
| { |
| "entropy": 1.2845478609204293, |
| "epoch": 0.6368, |
| "grad_norm": 0.11935040354728699, |
| "learning_rate": 7.2704e-05, |
| "loss": 1.2704, |
| "mean_token_accuracy": 0.6626521065831185, |
| "num_tokens": 48634041.0, |
| "step": 1990 |
| }, |
| { |
| "entropy": 1.2804365545511245, |
| "epoch": 0.64, |
| "grad_norm": 0.11345125734806061, |
| "learning_rate": 7.206399999999999e-05, |
| "loss": 1.2778, |
| "mean_token_accuracy": 0.6626640096306801, |
| "num_tokens": 48875289.0, |
| "step": 2000 |
| }, |
| { |
| "entropy": 1.2774985551834106, |
| "epoch": 0.6432, |
| "grad_norm": 0.10802491754293442, |
| "learning_rate": 7.1424e-05, |
| "loss": 1.2629, |
| "mean_token_accuracy": 0.6640660047531128, |
| "num_tokens": 49120850.0, |
| "step": 2010 |
| }, |
| { |
| "entropy": 1.285240511596203, |
| "epoch": 0.6464, |
| "grad_norm": 0.12500284612178802, |
| "learning_rate": 7.0784e-05, |
| "loss": 1.2836, |
| "mean_token_accuracy": 0.6611738555133343, |
| "num_tokens": 49358432.0, |
| "step": 2020 |
| }, |
| { |
| "entropy": 1.2876662492752076, |
| "epoch": 0.6496, |
| "grad_norm": 0.13986480236053467, |
| "learning_rate": 7.0144e-05, |
| "loss": 1.2862, |
| "mean_token_accuracy": 0.6606057547032833, |
| "num_tokens": 49601601.0, |
| "step": 2030 |
| }, |
| { |
| "entropy": 1.3031430423259736, |
| "epoch": 0.6528, |
| "grad_norm": 0.12369693070650101, |
| "learning_rate": 6.9504e-05, |
| "loss": 1.2952, |
| "mean_token_accuracy": 0.6599393948912621, |
| "num_tokens": 49829767.0, |
| "step": 2040 |
| }, |
| { |
| "entropy": 1.281213691085577, |
| "epoch": 0.656, |
| "grad_norm": 0.09500250220298767, |
| "learning_rate": 6.886400000000001e-05, |
| "loss": 1.2717, |
| "mean_token_accuracy": 0.6652219220995903, |
| "num_tokens": 50090882.0, |
| "step": 2050 |
| }, |
| { |
| "entropy": 1.2874048799276352, |
| "epoch": 0.6592, |
| "grad_norm": 0.11683914810419083, |
| "learning_rate": 6.8224e-05, |
| "loss": 1.2825, |
| "mean_token_accuracy": 0.6623919330537319, |
| "num_tokens": 50326367.0, |
| "step": 2060 |
| }, |
| { |
| "entropy": 1.2961020797491074, |
| "epoch": 0.6624, |
| "grad_norm": 0.11372298747301102, |
| "learning_rate": 6.758400000000001e-05, |
| "loss": 1.2818, |
| "mean_token_accuracy": 0.6601490914821625, |
| "num_tokens": 50566139.0, |
| "step": 2070 |
| }, |
| { |
| "entropy": 1.291332770884037, |
| "epoch": 0.6656, |
| "grad_norm": 0.12454129010438919, |
| "learning_rate": 6.6944e-05, |
| "loss": 1.2839, |
| "mean_token_accuracy": 0.6620477616786957, |
| "num_tokens": 50799393.0, |
| "step": 2080 |
| }, |
| { |
| "entropy": 1.2960528805851936, |
| "epoch": 0.6688, |
| "grad_norm": 0.11593926697969437, |
| "learning_rate": 6.6304e-05, |
| "loss": 1.2924, |
| "mean_token_accuracy": 0.6605801820755005, |
| "num_tokens": 51041067.0, |
| "step": 2090 |
| }, |
| { |
| "entropy": 1.2801642760634422, |
| "epoch": 0.672, |
| "grad_norm": 0.12337479740381241, |
| "learning_rate": 6.5664e-05, |
| "loss": 1.2782, |
| "mean_token_accuracy": 0.6638985246419906, |
| "num_tokens": 51299375.0, |
| "step": 2100 |
| }, |
| { |
| "entropy": 1.2779830560088157, |
| "epoch": 0.6752, |
| "grad_norm": 0.1154545322060585, |
| "learning_rate": 6.5024e-05, |
| "loss": 1.253, |
| "mean_token_accuracy": 0.6645998746156693, |
| "num_tokens": 51548178.0, |
| "step": 2110 |
| }, |
| { |
| "entropy": 1.3021435037255287, |
| "epoch": 0.6784, |
| "grad_norm": 0.11757897585630417, |
| "learning_rate": 6.4384e-05, |
| "loss": 1.2996, |
| "mean_token_accuracy": 0.6572086110711097, |
| "num_tokens": 51778808.0, |
| "step": 2120 |
| }, |
| { |
| "entropy": 1.2716527745127677, |
| "epoch": 0.6816, |
| "grad_norm": 0.1235128715634346, |
| "learning_rate": 6.3744e-05, |
| "loss": 1.2629, |
| "mean_token_accuracy": 0.6644945830106735, |
| "num_tokens": 52032553.0, |
| "step": 2130 |
| }, |
| { |
| "entropy": 1.2885536648333074, |
| "epoch": 0.6848, |
| "grad_norm": 0.12500311434268951, |
| "learning_rate": 6.310400000000001e-05, |
| "loss": 1.2846, |
| "mean_token_accuracy": 0.6607552655041218, |
| "num_tokens": 52272493.0, |
| "step": 2140 |
| }, |
| { |
| "entropy": 1.2856855228543282, |
| "epoch": 0.688, |
| "grad_norm": 0.11195147782564163, |
| "learning_rate": 6.2464e-05, |
| "loss": 1.2834, |
| "mean_token_accuracy": 0.6618089392781258, |
| "num_tokens": 52523385.0, |
| "step": 2150 |
| }, |
| { |
| "entropy": 1.293137502670288, |
| "epoch": 0.6912, |
| "grad_norm": 0.10891924798488617, |
| "learning_rate": 6.182400000000001e-05, |
| "loss": 1.28, |
| "mean_token_accuracy": 0.6622749775648117, |
| "num_tokens": 52778433.0, |
| "step": 2160 |
| }, |
| { |
| "entropy": 1.2859415262937546, |
| "epoch": 0.6944, |
| "grad_norm": 0.12123312056064606, |
| "learning_rate": 6.1184e-05, |
| "loss": 1.2784, |
| "mean_token_accuracy": 0.6639839120209217, |
| "num_tokens": 53020978.0, |
| "step": 2170 |
| }, |
| { |
| "entropy": 1.2648396782577038, |
| "epoch": 0.6976, |
| "grad_norm": 0.1204434335231781, |
| "learning_rate": 6.0544e-05, |
| "loss": 1.2605, |
| "mean_token_accuracy": 0.6650677911937237, |
| "num_tokens": 53261503.0, |
| "step": 2180 |
| }, |
| { |
| "entropy": 1.27807125300169, |
| "epoch": 0.7008, |
| "grad_norm": 0.1071564257144928, |
| "learning_rate": 5.990400000000001e-05, |
| "loss": 1.266, |
| "mean_token_accuracy": 0.662727988511324, |
| "num_tokens": 53509387.0, |
| "step": 2190 |
| }, |
| { |
| "entropy": 1.263394307345152, |
| "epoch": 0.704, |
| "grad_norm": 0.11641150712966919, |
| "learning_rate": 5.9264e-05, |
| "loss": 1.2371, |
| "mean_token_accuracy": 0.6679573692381382, |
| "num_tokens": 53772653.0, |
| "step": 2200 |
| }, |
| { |
| "entropy": 1.3010064527392387, |
| "epoch": 0.7072, |
| "grad_norm": 0.11632030457258224, |
| "learning_rate": 5.8624e-05, |
| "loss": 1.3008, |
| "mean_token_accuracy": 0.6585525132715702, |
| "num_tokens": 54013373.0, |
| "step": 2210 |
| }, |
| { |
| "entropy": 1.276947945356369, |
| "epoch": 0.7104, |
| "grad_norm": 0.1137506440281868, |
| "learning_rate": 5.7984000000000006e-05, |
| "loss": 1.2752, |
| "mean_token_accuracy": 0.6626610539853572, |
| "num_tokens": 54266488.0, |
| "step": 2220 |
| }, |
| { |
| "entropy": 1.28578822016716, |
| "epoch": 0.7136, |
| "grad_norm": 0.12409249693155289, |
| "learning_rate": 5.7344e-05, |
| "loss": 1.279, |
| "mean_token_accuracy": 0.6634858660399914, |
| "num_tokens": 54507453.0, |
| "step": 2230 |
| }, |
| { |
| "entropy": 1.280213326215744, |
| "epoch": 0.7168, |
| "grad_norm": 0.11915239691734314, |
| "learning_rate": 5.6704000000000005e-05, |
| "loss": 1.2743, |
| "mean_token_accuracy": 0.6631876476109028, |
| "num_tokens": 54750374.0, |
| "step": 2240 |
| }, |
| { |
| "entropy": 1.2909489214420318, |
| "epoch": 0.72, |
| "grad_norm": 0.11880353093147278, |
| "learning_rate": 5.6064000000000004e-05, |
| "loss": 1.2852, |
| "mean_token_accuracy": 0.6620245948433876, |
| "num_tokens": 54979703.0, |
| "step": 2250 |
| }, |
| { |
| "entropy": 1.2845760613679886, |
| "epoch": 0.7232, |
| "grad_norm": 0.09207862615585327, |
| "learning_rate": 5.5423999999999997e-05, |
| "loss": 1.2767, |
| "mean_token_accuracy": 0.6643706910312176, |
| "num_tokens": 55222074.0, |
| "step": 2260 |
| }, |
| { |
| "entropy": 1.290702012181282, |
| "epoch": 0.7264, |
| "grad_norm": 0.1262422502040863, |
| "learning_rate": 5.4784e-05, |
| "loss": 1.283, |
| "mean_token_accuracy": 0.6628161288797856, |
| "num_tokens": 55463137.0, |
| "step": 2270 |
| }, |
| { |
| "entropy": 1.2771591052412987, |
| "epoch": 0.7296, |
| "grad_norm": 0.1214606836438179, |
| "learning_rate": 5.414400000000001e-05, |
| "loss": 1.2709, |
| "mean_token_accuracy": 0.6648481003940105, |
| "num_tokens": 55698634.0, |
| "step": 2280 |
| }, |
| { |
| "entropy": 1.2851046845316887, |
| "epoch": 0.7328, |
| "grad_norm": 0.11718755215406418, |
| "learning_rate": 5.3504e-05, |
| "loss": 1.2795, |
| "mean_token_accuracy": 0.661083023250103, |
| "num_tokens": 55942748.0, |
| "step": 2290 |
| }, |
| { |
| "entropy": 1.3050734251737595, |
| "epoch": 0.736, |
| "grad_norm": 0.11945287138223648, |
| "learning_rate": 5.2864e-05, |
| "loss": 1.2874, |
| "mean_token_accuracy": 0.660216660797596, |
| "num_tokens": 56181218.0, |
| "step": 2300 |
| }, |
| { |
| "entropy": 1.2771841265261172, |
| "epoch": 0.7392, |
| "grad_norm": 0.12805123627185822, |
| "learning_rate": 5.222400000000001e-05, |
| "loss": 1.2668, |
| "mean_token_accuracy": 0.6635241940617561, |
| "num_tokens": 56422553.0, |
| "step": 2310 |
| }, |
| { |
| "entropy": 1.2706456460058688, |
| "epoch": 0.7424, |
| "grad_norm": 0.1329023540019989, |
| "learning_rate": 5.1584e-05, |
| "loss": 1.2639, |
| "mean_token_accuracy": 0.6640631876885891, |
| "num_tokens": 56672803.0, |
| "step": 2320 |
| }, |
| { |
| "entropy": 1.2867251232266426, |
| "epoch": 0.7456, |
| "grad_norm": 0.12729530036449432, |
| "learning_rate": 5.0944000000000006e-05, |
| "loss": 1.2725, |
| "mean_token_accuracy": 0.661464573442936, |
| "num_tokens": 56915307.0, |
| "step": 2330 |
| }, |
| { |
| "entropy": 1.2946675971150399, |
| "epoch": 0.7488, |
| "grad_norm": 0.12116143107414246, |
| "learning_rate": 5.0304000000000005e-05, |
| "loss": 1.292, |
| "mean_token_accuracy": 0.6595002539455891, |
| "num_tokens": 57157892.0, |
| "step": 2340 |
| }, |
| { |
| "entropy": 1.275485384464264, |
| "epoch": 0.752, |
| "grad_norm": 0.12485181540250778, |
| "learning_rate": 4.9664000000000004e-05, |
| "loss": 1.2723, |
| "mean_token_accuracy": 0.6653809279203415, |
| "num_tokens": 57396351.0, |
| "step": 2350 |
| }, |
| { |
| "entropy": 1.2744866386055946, |
| "epoch": 0.7552, |
| "grad_norm": 0.1181459128856659, |
| "learning_rate": 4.9024000000000004e-05, |
| "loss": 1.2586, |
| "mean_token_accuracy": 0.6657501354813575, |
| "num_tokens": 57639198.0, |
| "step": 2360 |
| }, |
| { |
| "entropy": 1.2898824840784073, |
| "epoch": 0.7584, |
| "grad_norm": 0.11204960197210312, |
| "learning_rate": 4.8384e-05, |
| "loss": 1.284, |
| "mean_token_accuracy": 0.6609448194503784, |
| "num_tokens": 57883707.0, |
| "step": 2370 |
| }, |
| { |
| "entropy": 1.2666787058115005, |
| "epoch": 0.7616, |
| "grad_norm": 0.12575663626194, |
| "learning_rate": 4.7744e-05, |
| "loss": 1.2536, |
| "mean_token_accuracy": 0.6649810247123241, |
| "num_tokens": 58137011.0, |
| "step": 2380 |
| }, |
| { |
| "entropy": 1.2985649809241295, |
| "epoch": 0.7648, |
| "grad_norm": 0.11438199877738953, |
| "learning_rate": 4.7104e-05, |
| "loss": 1.2952, |
| "mean_token_accuracy": 0.6614246018230915, |
| "num_tokens": 58365454.0, |
| "step": 2390 |
| }, |
| { |
| "entropy": 1.2787313714623452, |
| "epoch": 0.768, |
| "grad_norm": 0.12201087176799774, |
| "learning_rate": 4.6464e-05, |
| "loss": 1.263, |
| "mean_token_accuracy": 0.6637052565813064, |
| "num_tokens": 58606240.0, |
| "step": 2400 |
| }, |
| { |
| "entropy": 1.2832006998360157, |
| "epoch": 0.7712, |
| "grad_norm": 0.12575478851795197, |
| "learning_rate": 4.5824e-05, |
| "loss": 1.2632, |
| "mean_token_accuracy": 0.662400508671999, |
| "num_tokens": 58862956.0, |
| "step": 2410 |
| }, |
| { |
| "entropy": 1.2945918813347816, |
| "epoch": 0.7744, |
| "grad_norm": 0.10888398438692093, |
| "learning_rate": 4.5184000000000006e-05, |
| "loss": 1.2929, |
| "mean_token_accuracy": 0.6602617390453815, |
| "num_tokens": 59106046.0, |
| "step": 2420 |
| }, |
| { |
| "entropy": 1.275023841112852, |
| "epoch": 0.7776, |
| "grad_norm": 0.12014330923557281, |
| "learning_rate": 4.4544e-05, |
| "loss": 1.27, |
| "mean_token_accuracy": 0.6666641846299172, |
| "num_tokens": 59345101.0, |
| "step": 2430 |
| }, |
| { |
| "entropy": 1.2761300414800645, |
| "epoch": 0.7808, |
| "grad_norm": 0.12748950719833374, |
| "learning_rate": 4.3904e-05, |
| "loss": 1.2662, |
| "mean_token_accuracy": 0.6619000285863876, |
| "num_tokens": 59594637.0, |
| "step": 2440 |
| }, |
| { |
| "entropy": 1.29071052223444, |
| "epoch": 0.784, |
| "grad_norm": 0.10636208206415176, |
| "learning_rate": 4.3264000000000005e-05, |
| "loss": 1.2806, |
| "mean_token_accuracy": 0.6633819214999676, |
| "num_tokens": 59832503.0, |
| "step": 2450 |
| }, |
| { |
| "entropy": 1.2922189198434353, |
| "epoch": 0.7872, |
| "grad_norm": 0.12413835525512695, |
| "learning_rate": 4.2624000000000004e-05, |
| "loss": 1.2794, |
| "mean_token_accuracy": 0.6620264105498791, |
| "num_tokens": 60066539.0, |
| "step": 2460 |
| }, |
| { |
| "entropy": 1.2840410895645618, |
| "epoch": 0.7904, |
| "grad_norm": 0.14140242338180542, |
| "learning_rate": 4.1984e-05, |
| "loss": 1.2783, |
| "mean_token_accuracy": 0.6627412438392639, |
| "num_tokens": 60315692.0, |
| "step": 2470 |
| }, |
| { |
| "entropy": 1.2861724182963372, |
| "epoch": 0.7936, |
| "grad_norm": 0.11666164547204971, |
| "learning_rate": 4.1344e-05, |
| "loss": 1.2778, |
| "mean_token_accuracy": 0.6631257057189941, |
| "num_tokens": 60552318.0, |
| "step": 2480 |
| }, |
| { |
| "entropy": 1.2788305684924126, |
| "epoch": 0.7968, |
| "grad_norm": 0.13158653676509857, |
| "learning_rate": 4.0704e-05, |
| "loss": 1.2819, |
| "mean_token_accuracy": 0.6623201578855514, |
| "num_tokens": 60785287.0, |
| "step": 2490 |
| }, |
| { |
| "entropy": 1.2820916578173638, |
| "epoch": 0.8, |
| "grad_norm": 0.11907195299863815, |
| "learning_rate": 4.0064e-05, |
| "loss": 1.2832, |
| "mean_token_accuracy": 0.6616691462695599, |
| "num_tokens": 61021867.0, |
| "step": 2500 |
| }, |
| { |
| "entropy": 1.2639753066003323, |
| "epoch": 0.8032, |
| "grad_norm": 0.09102090448141098, |
| "learning_rate": 3.9424e-05, |
| "loss": 1.251, |
| "mean_token_accuracy": 0.6675893403589725, |
| "num_tokens": 61282259.0, |
| "step": 2510 |
| }, |
| { |
| "entropy": 1.2987436592578887, |
| "epoch": 0.8064, |
| "grad_norm": 0.10720188170671463, |
| "learning_rate": 3.878400000000001e-05, |
| "loss": 1.296, |
| "mean_token_accuracy": 0.6576383277773857, |
| "num_tokens": 61525495.0, |
| "step": 2520 |
| }, |
| { |
| "entropy": 1.2969974979758263, |
| "epoch": 0.8096, |
| "grad_norm": 0.12770576775074005, |
| "learning_rate": 3.8144e-05, |
| "loss": 1.2796, |
| "mean_token_accuracy": 0.6609326675534248, |
| "num_tokens": 61768031.0, |
| "step": 2530 |
| }, |
| { |
| "entropy": 1.273153106123209, |
| "epoch": 0.8128, |
| "grad_norm": 0.10298594832420349, |
| "learning_rate": 3.7504e-05, |
| "loss": 1.2611, |
| "mean_token_accuracy": 0.6674535654485225, |
| "num_tokens": 62012030.0, |
| "step": 2540 |
| }, |
| { |
| "entropy": 1.2733975648880005, |
| "epoch": 0.816, |
| "grad_norm": 0.11955207586288452, |
| "learning_rate": 3.6864000000000005e-05, |
| "loss": 1.2623, |
| "mean_token_accuracy": 0.6660604678094387, |
| "num_tokens": 62267931.0, |
| "step": 2550 |
| }, |
| { |
| "entropy": 1.2702475264668465, |
| "epoch": 0.8192, |
| "grad_norm": 0.12374407052993774, |
| "learning_rate": 3.6224000000000004e-05, |
| "loss": 1.276, |
| "mean_token_accuracy": 0.6624728865921498, |
| "num_tokens": 62520167.0, |
| "step": 2560 |
| }, |
| { |
| "entropy": 1.2743104055523873, |
| "epoch": 0.8224, |
| "grad_norm": 0.10744906216859818, |
| "learning_rate": 3.5584000000000004e-05, |
| "loss": 1.257, |
| "mean_token_accuracy": 0.664283612370491, |
| "num_tokens": 62774798.0, |
| "step": 2570 |
| }, |
| { |
| "entropy": 1.2885457567870617, |
| "epoch": 0.8256, |
| "grad_norm": 0.12830103933811188, |
| "learning_rate": 3.4943999999999996e-05, |
| "loss": 1.2715, |
| "mean_token_accuracy": 0.6622392393648624, |
| "num_tokens": 63032335.0, |
| "step": 2580 |
| }, |
| { |
| "entropy": 1.2897769168019295, |
| "epoch": 0.8288, |
| "grad_norm": 0.11151009052991867, |
| "learning_rate": 3.4304e-05, |
| "loss": 1.2808, |
| "mean_token_accuracy": 0.6619915708899498, |
| "num_tokens": 63269843.0, |
| "step": 2590 |
| }, |
| { |
| "entropy": 1.279052458703518, |
| "epoch": 0.832, |
| "grad_norm": 0.1219983845949173, |
| "learning_rate": 3.3664e-05, |
| "loss": 1.2763, |
| "mean_token_accuracy": 0.663902149349451, |
| "num_tokens": 63520241.0, |
| "step": 2600 |
| }, |
| { |
| "entropy": 1.2832585543394088, |
| "epoch": 0.8352, |
| "grad_norm": 0.12476496398448944, |
| "learning_rate": 3.3024e-05, |
| "loss": 1.2757, |
| "mean_token_accuracy": 0.6641913838684559, |
| "num_tokens": 63754752.0, |
| "step": 2610 |
| }, |
| { |
| "entropy": 1.2999466940760613, |
| "epoch": 0.8384, |
| "grad_norm": 0.10892050713300705, |
| "learning_rate": 3.2384e-05, |
| "loss": 1.2921, |
| "mean_token_accuracy": 0.658152700215578, |
| "num_tokens": 63995881.0, |
| "step": 2620 |
| }, |
| { |
| "entropy": 1.296813391149044, |
| "epoch": 0.8416, |
| "grad_norm": 0.12568706274032593, |
| "learning_rate": 3.1744e-05, |
| "loss": 1.2849, |
| "mean_token_accuracy": 0.6614639803767204, |
| "num_tokens": 64233320.0, |
| "step": 2630 |
| }, |
| { |
| "entropy": 1.2962809160351754, |
| "epoch": 0.8448, |
| "grad_norm": 0.11287514120340347, |
| "learning_rate": 3.1104e-05, |
| "loss": 1.2969, |
| "mean_token_accuracy": 0.6604227140545845, |
| "num_tokens": 64474964.0, |
| "step": 2640 |
| }, |
| { |
| "entropy": 1.2853048376739025, |
| "epoch": 0.848, |
| "grad_norm": 0.12490761280059814, |
| "learning_rate": 3.0464000000000005e-05, |
| "loss": 1.2825, |
| "mean_token_accuracy": 0.6642357870936394, |
| "num_tokens": 64716572.0, |
| "step": 2650 |
| }, |
| { |
| "entropy": 1.2827091276645661, |
| "epoch": 0.8512, |
| "grad_norm": 0.11987301707267761, |
| "learning_rate": 2.9824e-05, |
| "loss": 1.2788, |
| "mean_token_accuracy": 0.6627641633152962, |
| "num_tokens": 64961857.0, |
| "step": 2660 |
| }, |
| { |
| "entropy": 1.2868135765194892, |
| "epoch": 0.8544, |
| "grad_norm": 0.1166943833231926, |
| "learning_rate": 2.9184e-05, |
| "loss": 1.2799, |
| "mean_token_accuracy": 0.6655416525900364, |
| "num_tokens": 65203952.0, |
| "step": 2670 |
| }, |
| { |
| "entropy": 1.28294907361269, |
| "epoch": 0.8576, |
| "grad_norm": 0.12952201068401337, |
| "learning_rate": 2.8544000000000003e-05, |
| "loss": 1.2868, |
| "mean_token_accuracy": 0.6606246560811997, |
| "num_tokens": 65441322.0, |
| "step": 2680 |
| }, |
| { |
| "entropy": 1.279330450296402, |
| "epoch": 0.8608, |
| "grad_norm": 0.10416481643915176, |
| "learning_rate": 2.7904000000000003e-05, |
| "loss": 1.279, |
| "mean_token_accuracy": 0.6641826786100864, |
| "num_tokens": 65689378.0, |
| "step": 2690 |
| }, |
| { |
| "entropy": 1.2880327992141247, |
| "epoch": 0.864, |
| "grad_norm": 0.13343755900859833, |
| "learning_rate": 2.7264000000000002e-05, |
| "loss": 1.279, |
| "mean_token_accuracy": 0.6617752760648727, |
| "num_tokens": 65923388.0, |
| "step": 2700 |
| }, |
| { |
| "entropy": 1.2682015240192412, |
| "epoch": 0.8672, |
| "grad_norm": 0.11383596807718277, |
| "learning_rate": 2.6623999999999998e-05, |
| "loss": 1.2649, |
| "mean_token_accuracy": 0.6657252870500088, |
| "num_tokens": 66168664.0, |
| "step": 2710 |
| }, |
| { |
| "entropy": 1.2920080795884132, |
| "epoch": 0.8704, |
| "grad_norm": 0.11232816427946091, |
| "learning_rate": 2.5984000000000004e-05, |
| "loss": 1.2855, |
| "mean_token_accuracy": 0.6608941502869129, |
| "num_tokens": 66405205.0, |
| "step": 2720 |
| }, |
| { |
| "entropy": 1.2816180631518364, |
| "epoch": 0.8736, |
| "grad_norm": 0.11804413795471191, |
| "learning_rate": 2.5344e-05, |
| "loss": 1.2704, |
| "mean_token_accuracy": 0.6637136444449425, |
| "num_tokens": 66647402.0, |
| "step": 2730 |
| }, |
| { |
| "entropy": 1.2769486971199513, |
| "epoch": 0.8768, |
| "grad_norm": 0.1137230172753334, |
| "learning_rate": 2.4704000000000003e-05, |
| "loss": 1.2731, |
| "mean_token_accuracy": 0.6625068850815297, |
| "num_tokens": 66883075.0, |
| "step": 2740 |
| }, |
| { |
| "entropy": 1.2738276951014995, |
| "epoch": 0.88, |
| "grad_norm": 0.12964418530464172, |
| "learning_rate": 2.4064000000000002e-05, |
| "loss": 1.2647, |
| "mean_token_accuracy": 0.6640062846243382, |
| "num_tokens": 67117146.0, |
| "step": 2750 |
| }, |
| { |
| "entropy": 1.2770246736705304, |
| "epoch": 0.8832, |
| "grad_norm": 0.11520498991012573, |
| "learning_rate": 2.3424e-05, |
| "loss": 1.2751, |
| "mean_token_accuracy": 0.6637044087052345, |
| "num_tokens": 67362921.0, |
| "step": 2760 |
| }, |
| { |
| "entropy": 1.2787333600223065, |
| "epoch": 0.8864, |
| "grad_norm": 0.11831101775169373, |
| "learning_rate": 2.2784e-05, |
| "loss": 1.276, |
| "mean_token_accuracy": 0.6626746043562889, |
| "num_tokens": 67611232.0, |
| "step": 2770 |
| }, |
| { |
| "entropy": 1.29137175232172, |
| "epoch": 0.8896, |
| "grad_norm": 0.12874531745910645, |
| "learning_rate": 2.2144e-05, |
| "loss": 1.2784, |
| "mean_token_accuracy": 0.6635870583355427, |
| "num_tokens": 67840663.0, |
| "step": 2780 |
| }, |
| { |
| "entropy": 1.276902187615633, |
| "epoch": 0.8928, |
| "grad_norm": 0.12204962968826294, |
| "learning_rate": 2.1504000000000003e-05, |
| "loss": 1.2641, |
| "mean_token_accuracy": 0.6631358481943608, |
| "num_tokens": 68087130.0, |
| "step": 2790 |
| }, |
| { |
| "entropy": 1.285895049571991, |
| "epoch": 0.896, |
| "grad_norm": 0.11658646166324615, |
| "learning_rate": 2.0864e-05, |
| "loss": 1.2802, |
| "mean_token_accuracy": 0.6612561024725437, |
| "num_tokens": 68325513.0, |
| "step": 2800 |
| }, |
| { |
| "entropy": 1.2554577991366387, |
| "epoch": 0.8992, |
| "grad_norm": 0.11913077533245087, |
| "learning_rate": 2.0224e-05, |
| "loss": 1.2507, |
| "mean_token_accuracy": 0.6675182059407234, |
| "num_tokens": 68572469.0, |
| "step": 2810 |
| }, |
| { |
| "entropy": 1.2792992070317268, |
| "epoch": 0.9024, |
| "grad_norm": 0.1174750030040741, |
| "learning_rate": 1.9584e-05, |
| "loss": 1.2654, |
| "mean_token_accuracy": 0.6631789304316044, |
| "num_tokens": 68809942.0, |
| "step": 2820 |
| }, |
| { |
| "entropy": 1.2754346296191215, |
| "epoch": 0.9056, |
| "grad_norm": 0.1153152734041214, |
| "learning_rate": 1.8944e-05, |
| "loss": 1.2601, |
| "mean_token_accuracy": 0.6639180190861225, |
| "num_tokens": 69060770.0, |
| "step": 2830 |
| }, |
| { |
| "entropy": 1.2921388849616051, |
| "epoch": 0.9088, |
| "grad_norm": 0.12659227848052979, |
| "learning_rate": 1.8304000000000003e-05, |
| "loss": 1.2856, |
| "mean_token_accuracy": 0.6612554118037224, |
| "num_tokens": 69292188.0, |
| "step": 2840 |
| }, |
| { |
| "entropy": 1.2787108525633812, |
| "epoch": 0.912, |
| "grad_norm": 0.13238121569156647, |
| "learning_rate": 1.7664e-05, |
| "loss": 1.2794, |
| "mean_token_accuracy": 0.6619273334741592, |
| "num_tokens": 69540898.0, |
| "step": 2850 |
| }, |
| { |
| "entropy": 1.287446926534176, |
| "epoch": 0.9152, |
| "grad_norm": 0.13604958355426788, |
| "learning_rate": 1.7024e-05, |
| "loss": 1.2845, |
| "mean_token_accuracy": 0.6607030496001244, |
| "num_tokens": 69774803.0, |
| "step": 2860 |
| }, |
| { |
| "entropy": 1.2738503232598304, |
| "epoch": 0.9184, |
| "grad_norm": 0.12944011390209198, |
| "learning_rate": 1.6384e-05, |
| "loss": 1.2611, |
| "mean_token_accuracy": 0.6632089488208294, |
| "num_tokens": 70017895.0, |
| "step": 2870 |
| }, |
| { |
| "entropy": 1.2791565768420696, |
| "epoch": 0.9216, |
| "grad_norm": 0.11483687162399292, |
| "learning_rate": 1.5744e-05, |
| "loss": 1.271, |
| "mean_token_accuracy": 0.6649341732263565, |
| "num_tokens": 70257937.0, |
| "step": 2880 |
| }, |
| { |
| "entropy": 1.298772156983614, |
| "epoch": 0.9248, |
| "grad_norm": 0.1275722235441208, |
| "learning_rate": 1.5104000000000001e-05, |
| "loss": 1.2851, |
| "mean_token_accuracy": 0.6598134271800518, |
| "num_tokens": 70496009.0, |
| "step": 2890 |
| }, |
| { |
| "entropy": 1.2867298275232315, |
| "epoch": 0.928, |
| "grad_norm": 0.11700791865587234, |
| "learning_rate": 1.4463999999999999e-05, |
| "loss": 1.2702, |
| "mean_token_accuracy": 0.6639612965285778, |
| "num_tokens": 70737350.0, |
| "step": 2900 |
| }, |
| { |
| "entropy": 1.268957207351923, |
| "epoch": 0.9312, |
| "grad_norm": 0.11532899737358093, |
| "learning_rate": 1.3824e-05, |
| "loss": 1.2596, |
| "mean_token_accuracy": 0.6657577298581601, |
| "num_tokens": 70988734.0, |
| "step": 2910 |
| }, |
| { |
| "entropy": 1.272731138765812, |
| "epoch": 0.9344, |
| "grad_norm": 0.11290522664785385, |
| "learning_rate": 1.3184000000000001e-05, |
| "loss": 1.2692, |
| "mean_token_accuracy": 0.6658322259783744, |
| "num_tokens": 71240907.0, |
| "step": 2920 |
| }, |
| { |
| "entropy": 1.2795393958687782, |
| "epoch": 0.9376, |
| "grad_norm": 0.10183095932006836, |
| "learning_rate": 1.2544e-05, |
| "loss": 1.2595, |
| "mean_token_accuracy": 0.6654788628220558, |
| "num_tokens": 71500533.0, |
| "step": 2930 |
| }, |
| { |
| "entropy": 1.2860508039593697, |
| "epoch": 0.9408, |
| "grad_norm": 0.12720970809459686, |
| "learning_rate": 1.1904000000000002e-05, |
| "loss": 1.2732, |
| "mean_token_accuracy": 0.6631670542061329, |
| "num_tokens": 71749745.0, |
| "step": 2940 |
| }, |
| { |
| "entropy": 1.2804876193404198, |
| "epoch": 0.944, |
| "grad_norm": 0.119963638484478, |
| "learning_rate": 1.1264000000000001e-05, |
| "loss": 1.2721, |
| "mean_token_accuracy": 0.6632199361920357, |
| "num_tokens": 71987469.0, |
| "step": 2950 |
| }, |
| { |
| "entropy": 1.2801647558808327, |
| "epoch": 0.9472, |
| "grad_norm": 0.1231529638171196, |
| "learning_rate": 1.0624e-05, |
| "loss": 1.2809, |
| "mean_token_accuracy": 0.6618592575192451, |
| "num_tokens": 72226054.0, |
| "step": 2960 |
| }, |
| { |
| "entropy": 1.2606345623731614, |
| "epoch": 0.9504, |
| "grad_norm": 0.12572775781154633, |
| "learning_rate": 9.984e-06, |
| "loss": 1.2572, |
| "mean_token_accuracy": 0.6675226472318172, |
| "num_tokens": 72463869.0, |
| "step": 2970 |
| }, |
| { |
| "entropy": 1.2803600803017616, |
| "epoch": 0.9536, |
| "grad_norm": 0.12920019030570984, |
| "learning_rate": 9.344e-06, |
| "loss": 1.276, |
| "mean_token_accuracy": 0.6623409941792489, |
| "num_tokens": 72703143.0, |
| "step": 2980 |
| }, |
| { |
| "entropy": 1.2736871719360352, |
| "epoch": 0.9568, |
| "grad_norm": 0.10845527052879333, |
| "learning_rate": 8.704000000000002e-06, |
| "loss": 1.2586, |
| "mean_token_accuracy": 0.6645304918289184, |
| "num_tokens": 72958313.0, |
| "step": 2990 |
| }, |
| { |
| "entropy": 1.272608993947506, |
| "epoch": 0.96, |
| "grad_norm": 0.13339777290821075, |
| "learning_rate": 8.064000000000001e-06, |
| "loss": 1.2639, |
| "mean_token_accuracy": 0.6631217822432518, |
| "num_tokens": 73197222.0, |
| "step": 3000 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 3125, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 300, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.66801090262688e+17, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |