| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 732, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 1.3894134521484376, |
| "epoch": 0.040983606557377046, |
| "grad_norm": 13.141048431396484, |
| "learning_rate": 1.975409836065574e-05, |
| "loss": 1.25, |
| "mean_token_accuracy": 0.67734375, |
| "num_tokens": 12480.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 0.900828194618225, |
| "epoch": 0.08196721311475409, |
| "grad_norm": 52.69793701171875, |
| "learning_rate": 1.9480874316939892e-05, |
| "loss": 0.4053, |
| "mean_token_accuracy": 0.834375, |
| "num_tokens": 24960.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 0.874811452627182, |
| "epoch": 0.12295081967213115, |
| "grad_norm": 34.038414001464844, |
| "learning_rate": 1.9207650273224046e-05, |
| "loss": 0.2804, |
| "mean_token_accuracy": 0.88125, |
| "num_tokens": 37440.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 0.9816662311553955, |
| "epoch": 0.16393442622950818, |
| "grad_norm": 14.941110610961914, |
| "learning_rate": 1.89344262295082e-05, |
| "loss": 0.2195, |
| "mean_token_accuracy": 0.90625, |
| "num_tokens": 49920.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 0.8901944577693939, |
| "epoch": 0.20491803278688525, |
| "grad_norm": 23.074167251586914, |
| "learning_rate": 1.866120218579235e-05, |
| "loss": 0.1932, |
| "mean_token_accuracy": 0.9109375, |
| "num_tokens": 62400.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.8397292912006378, |
| "epoch": 0.2459016393442623, |
| "grad_norm": 17.52640724182129, |
| "learning_rate": 1.8387978142076503e-05, |
| "loss": 0.2027, |
| "mean_token_accuracy": 0.909375, |
| "num_tokens": 74880.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.8746075868606568, |
| "epoch": 0.28688524590163933, |
| "grad_norm": 12.406326293945312, |
| "learning_rate": 1.8114754098360656e-05, |
| "loss": 0.1185, |
| "mean_token_accuracy": 0.95390625, |
| "num_tokens": 87360.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.8339551448822021, |
| "epoch": 0.32786885245901637, |
| "grad_norm": 8.674996376037598, |
| "learning_rate": 1.784153005464481e-05, |
| "loss": 0.1133, |
| "mean_token_accuracy": 0.95390625, |
| "num_tokens": 99840.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 0.8026882886886597, |
| "epoch": 0.36885245901639346, |
| "grad_norm": 27.134490966796875, |
| "learning_rate": 1.7568306010928963e-05, |
| "loss": 0.1526, |
| "mean_token_accuracy": 0.94296875, |
| "num_tokens": 112320.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 0.8337083220481872, |
| "epoch": 0.4098360655737705, |
| "grad_norm": 12.572772026062012, |
| "learning_rate": 1.7295081967213117e-05, |
| "loss": 0.069, |
| "mean_token_accuracy": 0.96796875, |
| "num_tokens": 124800.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 0.789594167470932, |
| "epoch": 0.45081967213114754, |
| "grad_norm": 11.466198921203613, |
| "learning_rate": 1.702185792349727e-05, |
| "loss": 0.0618, |
| "mean_token_accuracy": 0.978125, |
| "num_tokens": 137280.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 0.7361592233181, |
| "epoch": 0.4918032786885246, |
| "grad_norm": 31.19264793395996, |
| "learning_rate": 1.674863387978142e-05, |
| "loss": 0.1167, |
| "mean_token_accuracy": 0.95078125, |
| "num_tokens": 149760.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 0.722903597354889, |
| "epoch": 0.5327868852459017, |
| "grad_norm": 13.952592849731445, |
| "learning_rate": 1.6475409836065574e-05, |
| "loss": 0.0195, |
| "mean_token_accuracy": 0.99375, |
| "num_tokens": 162240.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 0.6958432495594025, |
| "epoch": 0.5737704918032787, |
| "grad_norm": 1.7772663831710815, |
| "learning_rate": 1.6202185792349728e-05, |
| "loss": 0.0258, |
| "mean_token_accuracy": 0.98984375, |
| "num_tokens": 174720.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 0.7353324055671692, |
| "epoch": 0.6147540983606558, |
| "grad_norm": 47.304779052734375, |
| "learning_rate": 1.592896174863388e-05, |
| "loss": 0.0353, |
| "mean_token_accuracy": 0.98828125, |
| "num_tokens": 187200.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 0.6916000425815583, |
| "epoch": 0.6557377049180327, |
| "grad_norm": 30.046722412109375, |
| "learning_rate": 1.5655737704918035e-05, |
| "loss": 0.0376, |
| "mean_token_accuracy": 0.9875, |
| "num_tokens": 199680.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 0.6607187688350677, |
| "epoch": 0.6967213114754098, |
| "grad_norm": 18.224321365356445, |
| "learning_rate": 1.538251366120219e-05, |
| "loss": 0.0174, |
| "mean_token_accuracy": 0.99609375, |
| "num_tokens": 212160.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 0.641388189792633, |
| "epoch": 0.7377049180327869, |
| "grad_norm": 0.8001788854598999, |
| "learning_rate": 1.510928961748634e-05, |
| "loss": 0.0226, |
| "mean_token_accuracy": 0.99296875, |
| "num_tokens": 224640.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 0.5654755473136902, |
| "epoch": 0.7786885245901639, |
| "grad_norm": 6.179544448852539, |
| "learning_rate": 1.4836065573770492e-05, |
| "loss": 0.0184, |
| "mean_token_accuracy": 0.990625, |
| "num_tokens": 237120.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 0.5690807163715362, |
| "epoch": 0.819672131147541, |
| "grad_norm": 17.677095413208008, |
| "learning_rate": 1.4562841530054646e-05, |
| "loss": 0.0554, |
| "mean_token_accuracy": 0.9828125, |
| "num_tokens": 249600.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 0.6085642755031586, |
| "epoch": 0.860655737704918, |
| "grad_norm": 15.046479225158691, |
| "learning_rate": 1.4289617486338798e-05, |
| "loss": 0.0131, |
| "mean_token_accuracy": 0.99609375, |
| "num_tokens": 262080.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 0.5867221057415009, |
| "epoch": 0.9016393442622951, |
| "grad_norm": 9.699835777282715, |
| "learning_rate": 1.4016393442622951e-05, |
| "loss": 0.0116, |
| "mean_token_accuracy": 0.996875, |
| "num_tokens": 274560.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 0.5684578359127045, |
| "epoch": 0.9426229508196722, |
| "grad_norm": 0.10982056707143784, |
| "learning_rate": 1.3743169398907106e-05, |
| "loss": 0.0051, |
| "mean_token_accuracy": 0.99765625, |
| "num_tokens": 287040.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 0.5817050397396087, |
| "epoch": 0.9836065573770492, |
| "grad_norm": 0.11507736891508102, |
| "learning_rate": 1.3469945355191258e-05, |
| "loss": 0.0025, |
| "mean_token_accuracy": 0.9984375, |
| "num_tokens": 299520.0, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_entropy": 0.5465924368964301, |
| "eval_loss": 0.004227596800774336, |
| "eval_mean_token_accuracy": 0.9989583333333333, |
| "eval_num_tokens": 304200.0, |
| "eval_runtime": 4.2641, |
| "eval_samples_per_second": 333.484, |
| "eval_steps_per_second": 10.553, |
| "step": 244 |
| }, |
| { |
| "entropy": 0.5748404681682586, |
| "epoch": 1.0245901639344261, |
| "grad_norm": 0.04080405831336975, |
| "learning_rate": 1.3196721311475412e-05, |
| "loss": 0.0116, |
| "mean_token_accuracy": 0.996875, |
| "num_tokens": 311688.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 0.5690902054309845, |
| "epoch": 1.0655737704918034, |
| "grad_norm": 25.26702880859375, |
| "learning_rate": 1.2923497267759564e-05, |
| "loss": 0.0152, |
| "mean_token_accuracy": 0.99453125, |
| "num_tokens": 324168.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 0.5449742019176483, |
| "epoch": 1.1065573770491803, |
| "grad_norm": 0.015380386263132095, |
| "learning_rate": 1.2650273224043717e-05, |
| "loss": 0.0049, |
| "mean_token_accuracy": 0.9984375, |
| "num_tokens": 336648.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 0.5294660866260529, |
| "epoch": 1.1475409836065573, |
| "grad_norm": 16.71912384033203, |
| "learning_rate": 1.2377049180327869e-05, |
| "loss": 0.022, |
| "mean_token_accuracy": 0.98984375, |
| "num_tokens": 349128.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 0.5246538281440735, |
| "epoch": 1.1885245901639343, |
| "grad_norm": 0.05239957571029663, |
| "learning_rate": 1.2103825136612023e-05, |
| "loss": 0.0229, |
| "mean_token_accuracy": 0.99296875, |
| "num_tokens": 361608.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 0.557120931148529, |
| "epoch": 1.2295081967213115, |
| "grad_norm": 6.907280445098877, |
| "learning_rate": 1.1830601092896176e-05, |
| "loss": 0.003, |
| "mean_token_accuracy": 0.9984375, |
| "num_tokens": 374088.0, |
| "step": 300 |
| }, |
| { |
| "entropy": 0.5478886723518371, |
| "epoch": 1.2704918032786885, |
| "grad_norm": 0.22664949297904968, |
| "learning_rate": 1.155737704918033e-05, |
| "loss": 0.0051, |
| "mean_token_accuracy": 0.99921875, |
| "num_tokens": 386568.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 0.5443004906177521, |
| "epoch": 1.3114754098360657, |
| "grad_norm": 0.20651134848594666, |
| "learning_rate": 1.1284153005464482e-05, |
| "loss": 0.0048, |
| "mean_token_accuracy": 0.9984375, |
| "num_tokens": 399048.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 0.5586306273937225, |
| "epoch": 1.3524590163934427, |
| "grad_norm": 0.1290273219347, |
| "learning_rate": 1.1010928961748635e-05, |
| "loss": 0.0033, |
| "mean_token_accuracy": 0.99921875, |
| "num_tokens": 411528.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 0.563942140340805, |
| "epoch": 1.3934426229508197, |
| "grad_norm": 5.307101249694824, |
| "learning_rate": 1.0737704918032787e-05, |
| "loss": 0.0015, |
| "mean_token_accuracy": 0.99921875, |
| "num_tokens": 424008.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 0.5790425062179565, |
| "epoch": 1.4344262295081966, |
| "grad_norm": 0.037372056394815445, |
| "learning_rate": 1.046448087431694e-05, |
| "loss": 0.0004, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 436488.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 0.5586484789848327, |
| "epoch": 1.4754098360655736, |
| "grad_norm": 22.3153133392334, |
| "learning_rate": 1.0191256830601092e-05, |
| "loss": 0.0068, |
| "mean_token_accuracy": 0.996875, |
| "num_tokens": 448968.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 0.5542426824569702, |
| "epoch": 1.5163934426229508, |
| "grad_norm": 0.38335874676704407, |
| "learning_rate": 9.918032786885246e-06, |
| "loss": 0.0045, |
| "mean_token_accuracy": 0.99921875, |
| "num_tokens": 461448.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 0.5510773122310638, |
| "epoch": 1.5573770491803278, |
| "grad_norm": 5.615074157714844, |
| "learning_rate": 9.6448087431694e-06, |
| "loss": 0.0008, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 473928.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 0.5534482300281525, |
| "epoch": 1.598360655737705, |
| "grad_norm": 1.5892225503921509, |
| "learning_rate": 9.371584699453553e-06, |
| "loss": 0.0046, |
| "mean_token_accuracy": 0.99765625, |
| "num_tokens": 486408.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 0.5506496012210846, |
| "epoch": 1.639344262295082, |
| "grad_norm": 0.008472824469208717, |
| "learning_rate": 9.098360655737707e-06, |
| "loss": 0.0044, |
| "mean_token_accuracy": 0.9984375, |
| "num_tokens": 498888.0, |
| "step": 400 |
| }, |
| { |
| "entropy": 0.5349482536315918, |
| "epoch": 1.680327868852459, |
| "grad_norm": 1.03555166721344, |
| "learning_rate": 8.825136612021858e-06, |
| "loss": 0.0003, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 511368.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 0.5442528307437897, |
| "epoch": 1.721311475409836, |
| "grad_norm": 0.12735722959041595, |
| "learning_rate": 8.551912568306012e-06, |
| "loss": 0.0007, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 523848.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 0.5300421416759491, |
| "epoch": 1.762295081967213, |
| "grad_norm": 0.1337188482284546, |
| "learning_rate": 8.278688524590165e-06, |
| "loss": 0.0008, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 536328.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 0.5553384840488433, |
| "epoch": 1.8032786885245902, |
| "grad_norm": 0.01123378612101078, |
| "learning_rate": 8.005464480874317e-06, |
| "loss": 0.0019, |
| "mean_token_accuracy": 0.99921875, |
| "num_tokens": 548808.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 0.5755411803722381, |
| "epoch": 1.8442622950819674, |
| "grad_norm": 0.002041811356320977, |
| "learning_rate": 7.732240437158471e-06, |
| "loss": 0.0001, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 561288.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 0.564773005247116, |
| "epoch": 1.8852459016393444, |
| "grad_norm": 0.004369141533970833, |
| "learning_rate": 7.459016393442624e-06, |
| "loss": 0.0001, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 573768.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 0.5664382457733155, |
| "epoch": 1.9262295081967213, |
| "grad_norm": 0.1301860511302948, |
| "learning_rate": 7.185792349726777e-06, |
| "loss": 0.0002, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 586248.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 0.5698906660079956, |
| "epoch": 1.9672131147540983, |
| "grad_norm": 0.011921355500817299, |
| "learning_rate": 6.91256830601093e-06, |
| "loss": 0.0001, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 598728.0, |
| "step": 480 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_entropy": 0.5514066629939609, |
| "eval_loss": 0.0009000992868095636, |
| "eval_mean_token_accuracy": 0.9998263888888889, |
| "eval_num_tokens": 608400.0, |
| "eval_runtime": 4.2684, |
| "eval_samples_per_second": 333.145, |
| "eval_steps_per_second": 10.543, |
| "step": 488 |
| }, |
| { |
| "entropy": 0.5585790574550629, |
| "epoch": 2.0081967213114753, |
| "grad_norm": 0.16178520023822784, |
| "learning_rate": 6.6393442622950825e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 610896.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 0.5657597005367279, |
| "epoch": 2.0491803278688523, |
| "grad_norm": 0.0013239571126177907, |
| "learning_rate": 6.366120218579236e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 623376.0, |
| "step": 500 |
| }, |
| { |
| "entropy": 0.555700272321701, |
| "epoch": 2.0901639344262297, |
| "grad_norm": 0.0003598702314775437, |
| "learning_rate": 6.092896174863389e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 635856.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 0.5675460159778595, |
| "epoch": 2.1311475409836067, |
| "grad_norm": 0.0006554612773470581, |
| "learning_rate": 5.8196721311475415e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 648336.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 0.5628720939159393, |
| "epoch": 2.1721311475409837, |
| "grad_norm": 0.0010637306841090322, |
| "learning_rate": 5.546448087431694e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 660816.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 0.5665178418159484, |
| "epoch": 2.2131147540983607, |
| "grad_norm": 0.0005588372005149722, |
| "learning_rate": 5.273224043715848e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 673296.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 0.572381979227066, |
| "epoch": 2.2540983606557377, |
| "grad_norm": 0.0007820471655577421, |
| "learning_rate": 5e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 685776.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 0.5639007449150085, |
| "epoch": 2.2950819672131146, |
| "grad_norm": 0.0003962396876886487, |
| "learning_rate": 4.726775956284154e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 698256.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 0.5546983301639556, |
| "epoch": 2.3360655737704916, |
| "grad_norm": 0.0002764791715890169, |
| "learning_rate": 4.453551912568307e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 710736.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 0.5563873648643494, |
| "epoch": 2.3770491803278686, |
| "grad_norm": 0.0006734775961376727, |
| "learning_rate": 4.180327868852459e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 723216.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 0.5603203475475311, |
| "epoch": 2.418032786885246, |
| "grad_norm": 0.001175143290311098, |
| "learning_rate": 3.907103825136612e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 735696.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 0.5609167218208313, |
| "epoch": 2.459016393442623, |
| "grad_norm": 0.0010104449465870857, |
| "learning_rate": 3.6338797814207656e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 748176.0, |
| "step": 600 |
| }, |
| { |
| "entropy": 0.5623033583164215, |
| "epoch": 2.5, |
| "grad_norm": 0.0007569916197098792, |
| "learning_rate": 3.3606557377049183e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 760656.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 0.5588847517967224, |
| "epoch": 2.540983606557377, |
| "grad_norm": 0.00043811326031573117, |
| "learning_rate": 3.0874316939890714e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 773136.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 0.5473314583301544, |
| "epoch": 2.581967213114754, |
| "grad_norm": 0.00015868025366216898, |
| "learning_rate": 2.814207650273224e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 785616.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 0.5686765968799591, |
| "epoch": 2.6229508196721314, |
| "grad_norm": 0.000258870713878423, |
| "learning_rate": 2.5409836065573773e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 798096.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 0.5574231624603272, |
| "epoch": 2.663934426229508, |
| "grad_norm": 0.0005557859549298882, |
| "learning_rate": 2.2677595628415304e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 810576.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 0.5558390915393829, |
| "epoch": 2.7049180327868854, |
| "grad_norm": 0.002176334150135517, |
| "learning_rate": 1.994535519125683e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 823056.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 0.5470618724822998, |
| "epoch": 2.7459016393442623, |
| "grad_norm": 0.0007859561592340469, |
| "learning_rate": 1.7213114754098362e-06, |
| "loss": 0.0001, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 835536.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 0.555149644613266, |
| "epoch": 2.7868852459016393, |
| "grad_norm": 0.0003778956306632608, |
| "learning_rate": 1.4480874316939891e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 848016.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 0.5577639102935791, |
| "epoch": 2.8278688524590163, |
| "grad_norm": 0.00018476726836524904, |
| "learning_rate": 1.1748633879781422e-06, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 860496.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 0.5558693587779999, |
| "epoch": 2.8688524590163933, |
| "grad_norm": 0.00044889526907354593, |
| "learning_rate": 9.016393442622952e-07, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 872976.0, |
| "step": 700 |
| }, |
| { |
| "entropy": 0.5639389038085938, |
| "epoch": 2.9098360655737707, |
| "grad_norm": 0.0005407112766988575, |
| "learning_rate": 6.284153005464482e-07, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 885456.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 0.5556601345539093, |
| "epoch": 2.9508196721311473, |
| "grad_norm": 0.0008689384558238089, |
| "learning_rate": 3.551912568306011e-07, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 897936.0, |
| "step": 720 |
| }, |
| { |
| "entropy": 0.5493379056453704, |
| "epoch": 2.9918032786885247, |
| "grad_norm": 0.0004895636229775846, |
| "learning_rate": 8.19672131147541e-08, |
| "loss": 0.0, |
| "mean_token_accuracy": 1.0, |
| "num_tokens": 910416.0, |
| "step": 730 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_entropy": 0.5459074311786227, |
| "eval_loss": 2.739655656114337e-06, |
| "eval_mean_token_accuracy": 1.0, |
| "eval_num_tokens": 912600.0, |
| "eval_runtime": 4.2464, |
| "eval_samples_per_second": 334.874, |
| "eval_steps_per_second": 10.597, |
| "step": 732 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 732, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 549345133209600.0, |
| "train_batch_size": 32, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|