| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 950, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 1.79375, |
| "epoch": 0.021052631578947368, |
| "grad_norm": 6.71875, |
| "learning_rate": 4.9526315789473685e-05, |
| "loss": 1.7312183380126953, |
| "mean_token_accuracy": 0.6867853045463562, |
| "num_tokens": 840.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 1.52421875, |
| "epoch": 0.042105263157894736, |
| "grad_norm": 3.90625, |
| "learning_rate": 4.9e-05, |
| "loss": 1.3260906219482422, |
| "mean_token_accuracy": 0.7451180815696716, |
| "num_tokens": 1573.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 1.69921875, |
| "epoch": 0.06315789473684211, |
| "grad_norm": 6.5625, |
| "learning_rate": 4.847368421052632e-05, |
| "loss": 1.6661336898803711, |
| "mean_token_accuracy": 0.7098163902759552, |
| "num_tokens": 2338.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 1.5484375, |
| "epoch": 0.08421052631578947, |
| "grad_norm": 7.4375, |
| "learning_rate": 4.794736842105264e-05, |
| "loss": 1.5569435119628907, |
| "mean_token_accuracy": 0.7198988318443298, |
| "num_tokens": 3038.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 1.59296875, |
| "epoch": 0.10526315789473684, |
| "grad_norm": 8.1875, |
| "learning_rate": 4.742105263157895e-05, |
| "loss": 1.5420040130615233, |
| "mean_token_accuracy": 0.7149245262145996, |
| "num_tokens": 3682.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 1.80390625, |
| "epoch": 0.12631578947368421, |
| "grad_norm": 9.9375, |
| "learning_rate": 4.6894736842105264e-05, |
| "loss": 1.6832901000976563, |
| "mean_token_accuracy": 0.7065998375415802, |
| "num_tokens": 4617.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 1.66796875, |
| "epoch": 0.14736842105263157, |
| "grad_norm": 8.375, |
| "learning_rate": 4.6368421052631584e-05, |
| "loss": 1.6378042221069335, |
| "mean_token_accuracy": 0.7060423612594604, |
| "num_tokens": 5349.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 1.5109375, |
| "epoch": 0.16842105263157894, |
| "grad_norm": 6.03125, |
| "learning_rate": 4.58421052631579e-05, |
| "loss": 1.5179196357727052, |
| "mean_token_accuracy": 0.7291716754436492, |
| "num_tokens": 6122.0, |
| "step": 80 |
| }, |
| { |
| "entropy": 1.721875, |
| "epoch": 0.18947368421052632, |
| "grad_norm": 4.0625, |
| "learning_rate": 4.531578947368421e-05, |
| "loss": 1.8176845550537108, |
| "mean_token_accuracy": 0.6842875778675079, |
| "num_tokens": 7197.0, |
| "step": 90 |
| }, |
| { |
| "entropy": 1.52265625, |
| "epoch": 0.21052631578947367, |
| "grad_norm": 6.46875, |
| "learning_rate": 4.478947368421053e-05, |
| "loss": 1.5252375602722168, |
| "mean_token_accuracy": 0.711300152540207, |
| "num_tokens": 8046.0, |
| "step": 100 |
| }, |
| { |
| "entropy": 1.6515625, |
| "epoch": 0.23157894736842105, |
| "grad_norm": 7.0, |
| "learning_rate": 4.426315789473684e-05, |
| "loss": 1.611851119995117, |
| "mean_token_accuracy": 0.7179319798946381, |
| "num_tokens": 8698.0, |
| "step": 110 |
| }, |
| { |
| "entropy": 1.54140625, |
| "epoch": 0.25263157894736843, |
| "grad_norm": 6.90625, |
| "learning_rate": 4.373684210526316e-05, |
| "loss": 1.5053813934326172, |
| "mean_token_accuracy": 0.7282716870307923, |
| "num_tokens": 9455.0, |
| "step": 120 |
| }, |
| { |
| "entropy": 1.5375, |
| "epoch": 0.2736842105263158, |
| "grad_norm": 7.25, |
| "learning_rate": 4.3210526315789475e-05, |
| "loss": 1.5904606819152831, |
| "mean_token_accuracy": 0.7234093546867371, |
| "num_tokens": 10191.0, |
| "step": 130 |
| }, |
| { |
| "entropy": 1.58984375, |
| "epoch": 0.29473684210526313, |
| "grad_norm": 9.375, |
| "learning_rate": 4.2684210526315795e-05, |
| "loss": 1.6070585250854492, |
| "mean_token_accuracy": 0.7228378415107727, |
| "num_tokens": 10859.0, |
| "step": 140 |
| }, |
| { |
| "entropy": 1.80234375, |
| "epoch": 0.3157894736842105, |
| "grad_norm": 8.3125, |
| "learning_rate": 4.215789473684211e-05, |
| "loss": 1.7423076629638672, |
| "mean_token_accuracy": 0.6761326909065246, |
| "num_tokens": 11846.0, |
| "step": 150 |
| }, |
| { |
| "entropy": 1.592578125, |
| "epoch": 0.3368421052631579, |
| "grad_norm": 5.1875, |
| "learning_rate": 4.163157894736842e-05, |
| "loss": 1.6300687789916992, |
| "mean_token_accuracy": 0.7145272672176362, |
| "num_tokens": 12722.0, |
| "step": 160 |
| }, |
| { |
| "entropy": 1.5953125, |
| "epoch": 0.35789473684210527, |
| "grad_norm": 6.125, |
| "learning_rate": 4.110526315789474e-05, |
| "loss": 1.44827880859375, |
| "mean_token_accuracy": 0.725699108839035, |
| "num_tokens": 13458.0, |
| "step": 170 |
| }, |
| { |
| "entropy": 1.8171875, |
| "epoch": 0.37894736842105264, |
| "grad_norm": 7.125, |
| "learning_rate": 4.0578947368421054e-05, |
| "loss": 1.7392475128173828, |
| "mean_token_accuracy": 0.6949776589870453, |
| "num_tokens": 14197.0, |
| "step": 180 |
| }, |
| { |
| "entropy": 1.78671875, |
| "epoch": 0.4, |
| "grad_norm": 7.8125, |
| "learning_rate": 4.0052631578947367e-05, |
| "loss": 1.8162834167480468, |
| "mean_token_accuracy": 0.6562099277973175, |
| "num_tokens": 15056.0, |
| "step": 190 |
| }, |
| { |
| "entropy": 1.77421875, |
| "epoch": 0.42105263157894735, |
| "grad_norm": 7.53125, |
| "learning_rate": 3.9526315789473686e-05, |
| "loss": 1.7153417587280273, |
| "mean_token_accuracy": 0.6948422849178314, |
| "num_tokens": 15807.0, |
| "step": 200 |
| }, |
| { |
| "entropy": 1.6875, |
| "epoch": 0.4421052631578947, |
| "grad_norm": 9.125, |
| "learning_rate": 3.9000000000000006e-05, |
| "loss": 1.6782726287841796, |
| "mean_token_accuracy": 0.7238920211791993, |
| "num_tokens": 16432.0, |
| "step": 210 |
| }, |
| { |
| "entropy": 1.446484375, |
| "epoch": 0.4631578947368421, |
| "grad_norm": 5.84375, |
| "learning_rate": 3.847368421052632e-05, |
| "loss": 1.4408910751342774, |
| "mean_token_accuracy": 0.7403214454650879, |
| "num_tokens": 17103.0, |
| "step": 220 |
| }, |
| { |
| "entropy": 1.76171875, |
| "epoch": 0.4842105263157895, |
| "grad_norm": 6.375, |
| "learning_rate": 3.794736842105263e-05, |
| "loss": 1.7578521728515626, |
| "mean_token_accuracy": 0.6848492741584777, |
| "num_tokens": 18038.0, |
| "step": 230 |
| }, |
| { |
| "entropy": 1.53515625, |
| "epoch": 0.5052631578947369, |
| "grad_norm": 6.71875, |
| "learning_rate": 3.742105263157895e-05, |
| "loss": 1.48400821685791, |
| "mean_token_accuracy": 0.7131125509738923, |
| "num_tokens": 18720.0, |
| "step": 240 |
| }, |
| { |
| "entropy": 1.49140625, |
| "epoch": 0.5263157894736842, |
| "grad_norm": 6.375, |
| "learning_rate": 3.6894736842105265e-05, |
| "loss": 1.5496106147766113, |
| "mean_token_accuracy": 0.7365909218788147, |
| "num_tokens": 19308.0, |
| "step": 250 |
| }, |
| { |
| "entropy": 1.46796875, |
| "epoch": 0.5473684210526316, |
| "grad_norm": 8.125, |
| "learning_rate": 3.636842105263158e-05, |
| "loss": 1.5240853309631348, |
| "mean_token_accuracy": 0.736497437953949, |
| "num_tokens": 19967.0, |
| "step": 260 |
| }, |
| { |
| "entropy": 1.578125, |
| "epoch": 0.5684210526315789, |
| "grad_norm": 8.3125, |
| "learning_rate": 3.58421052631579e-05, |
| "loss": 1.5680004119873048, |
| "mean_token_accuracy": 0.7332142323255539, |
| "num_tokens": 20769.0, |
| "step": 270 |
| }, |
| { |
| "entropy": 1.501953125, |
| "epoch": 0.5894736842105263, |
| "grad_norm": 8.375, |
| "learning_rate": 3.531578947368421e-05, |
| "loss": 1.5029385566711426, |
| "mean_token_accuracy": 0.7254173457622528, |
| "num_tokens": 22503.0, |
| "step": 280 |
| }, |
| { |
| "entropy": 1.5921875, |
| "epoch": 0.6105263157894737, |
| "grad_norm": 7.0, |
| "learning_rate": 3.478947368421053e-05, |
| "loss": 1.6090343475341797, |
| "mean_token_accuracy": 0.7082146763801574, |
| "num_tokens": 23439.0, |
| "step": 290 |
| }, |
| { |
| "entropy": 1.75078125, |
| "epoch": 0.631578947368421, |
| "grad_norm": 8.25, |
| "learning_rate": 3.426315789473684e-05, |
| "loss": 1.6250024795532227, |
| "mean_token_accuracy": 0.6883616149425507, |
| "num_tokens": 24161.0, |
| "step": 300 |
| }, |
| { |
| "entropy": 1.269140625, |
| "epoch": 0.6526315789473685, |
| "grad_norm": 7.46875, |
| "learning_rate": 3.373684210526316e-05, |
| "loss": 1.3148769378662108, |
| "mean_token_accuracy": 0.7618813216686249, |
| "num_tokens": 24716.0, |
| "step": 310 |
| }, |
| { |
| "entropy": 1.809375, |
| "epoch": 0.6736842105263158, |
| "grad_norm": 7.40625, |
| "learning_rate": 3.3210526315789476e-05, |
| "loss": 1.9381757736206056, |
| "mean_token_accuracy": 0.667089307308197, |
| "num_tokens": 25481.0, |
| "step": 320 |
| }, |
| { |
| "entropy": 1.73515625, |
| "epoch": 0.6947368421052632, |
| "grad_norm": 11.5625, |
| "learning_rate": 3.268421052631579e-05, |
| "loss": 1.76402587890625, |
| "mean_token_accuracy": 0.6992617845535278, |
| "num_tokens": 26159.0, |
| "step": 330 |
| }, |
| { |
| "entropy": 1.644140625, |
| "epoch": 0.7157894736842105, |
| "grad_norm": 7.625, |
| "learning_rate": 3.215789473684211e-05, |
| "loss": 1.6910707473754882, |
| "mean_token_accuracy": 0.6974358975887298, |
| "num_tokens": 26983.0, |
| "step": 340 |
| }, |
| { |
| "entropy": 1.44140625, |
| "epoch": 0.7368421052631579, |
| "grad_norm": 7.84375, |
| "learning_rate": 3.163157894736842e-05, |
| "loss": 1.3823105812072753, |
| "mean_token_accuracy": 0.7533604800701141, |
| "num_tokens": 27564.0, |
| "step": 350 |
| }, |
| { |
| "entropy": 1.41953125, |
| "epoch": 0.7578947368421053, |
| "grad_norm": 9.5, |
| "learning_rate": 3.1105263157894735e-05, |
| "loss": 1.488726806640625, |
| "mean_token_accuracy": 0.733670562505722, |
| "num_tokens": 28200.0, |
| "step": 360 |
| }, |
| { |
| "entropy": 1.753125, |
| "epoch": 0.7789473684210526, |
| "grad_norm": 5.9375, |
| "learning_rate": 3.0578947368421054e-05, |
| "loss": 1.8948373794555664, |
| "mean_token_accuracy": 0.675409197807312, |
| "num_tokens": 28919.0, |
| "step": 370 |
| }, |
| { |
| "entropy": 1.98828125, |
| "epoch": 0.8, |
| "grad_norm": 8.5, |
| "learning_rate": 3.005263157894737e-05, |
| "loss": 2.146737289428711, |
| "mean_token_accuracy": 0.6334406793117523, |
| "num_tokens": 29894.0, |
| "step": 380 |
| }, |
| { |
| "entropy": 1.85625, |
| "epoch": 0.8210526315789474, |
| "grad_norm": 6.3125, |
| "learning_rate": 2.9526315789473684e-05, |
| "loss": 1.858269500732422, |
| "mean_token_accuracy": 0.6439402669668197, |
| "num_tokens": 30652.0, |
| "step": 390 |
| }, |
| { |
| "entropy": 1.727734375, |
| "epoch": 0.8421052631578947, |
| "grad_norm": 7.96875, |
| "learning_rate": 2.9e-05, |
| "loss": 1.7343599319458007, |
| "mean_token_accuracy": 0.700012594461441, |
| "num_tokens": 31372.0, |
| "step": 400 |
| }, |
| { |
| "entropy": 1.65, |
| "epoch": 0.8631578947368421, |
| "grad_norm": 8.0625, |
| "learning_rate": 2.847368421052632e-05, |
| "loss": 1.6553001403808594, |
| "mean_token_accuracy": 0.7165175020694733, |
| "num_tokens": 32149.0, |
| "step": 410 |
| }, |
| { |
| "entropy": 1.596875, |
| "epoch": 0.8842105263157894, |
| "grad_norm": 6.59375, |
| "learning_rate": 2.7947368421052633e-05, |
| "loss": 1.5661128997802733, |
| "mean_token_accuracy": 0.7166097521781921, |
| "num_tokens": 33203.0, |
| "step": 420 |
| }, |
| { |
| "entropy": 1.6015625, |
| "epoch": 0.9052631578947369, |
| "grad_norm": 7.1875, |
| "learning_rate": 2.7421052631578946e-05, |
| "loss": 1.6179162979125976, |
| "mean_token_accuracy": 0.7093785464763641, |
| "num_tokens": 33889.0, |
| "step": 430 |
| }, |
| { |
| "entropy": 1.75546875, |
| "epoch": 0.9263157894736842, |
| "grad_norm": 7.21875, |
| "learning_rate": 2.6894736842105266e-05, |
| "loss": 1.7675729751586915, |
| "mean_token_accuracy": 0.6769322335720063, |
| "num_tokens": 34862.0, |
| "step": 440 |
| }, |
| { |
| "entropy": 1.92109375, |
| "epoch": 0.9473684210526315, |
| "grad_norm": 11.0625, |
| "learning_rate": 2.6368421052631582e-05, |
| "loss": 1.9548963546752929, |
| "mean_token_accuracy": 0.648491358757019, |
| "num_tokens": 35710.0, |
| "step": 450 |
| }, |
| { |
| "entropy": 1.699609375, |
| "epoch": 0.968421052631579, |
| "grad_norm": 6.03125, |
| "learning_rate": 2.5842105263157895e-05, |
| "loss": 1.6432403564453124, |
| "mean_token_accuracy": 0.6918732106685639, |
| "num_tokens": 36374.0, |
| "step": 460 |
| }, |
| { |
| "entropy": 1.84453125, |
| "epoch": 0.9894736842105263, |
| "grad_norm": 7.03125, |
| "learning_rate": 2.5315789473684208e-05, |
| "loss": 1.7591934204101562, |
| "mean_token_accuracy": 0.6581568241119384, |
| "num_tokens": 37139.0, |
| "step": 470 |
| }, |
| { |
| "entropy": 1.75859375, |
| "epoch": 1.0105263157894737, |
| "grad_norm": 7.15625, |
| "learning_rate": 2.4789473684210528e-05, |
| "loss": 1.7067642211914062, |
| "mean_token_accuracy": 0.7008034646511078, |
| "num_tokens": 38037.0, |
| "step": 480 |
| }, |
| { |
| "entropy": 1.4203125, |
| "epoch": 1.0315789473684212, |
| "grad_norm": 6.4375, |
| "learning_rate": 2.4263157894736844e-05, |
| "loss": 1.378176498413086, |
| "mean_token_accuracy": 0.7486959993839264, |
| "num_tokens": 38668.0, |
| "step": 490 |
| }, |
| { |
| "entropy": 1.56328125, |
| "epoch": 1.0526315789473684, |
| "grad_norm": 7.15625, |
| "learning_rate": 2.373684210526316e-05, |
| "loss": 1.4762983322143555, |
| "mean_token_accuracy": 0.7175065577030182, |
| "num_tokens": 39424.0, |
| "step": 500 |
| }, |
| { |
| "entropy": 1.630078125, |
| "epoch": 1.0736842105263158, |
| "grad_norm": 4.84375, |
| "learning_rate": 2.3210526315789473e-05, |
| "loss": 1.5906378746032714, |
| "mean_token_accuracy": 0.6876044690608978, |
| "num_tokens": 40317.0, |
| "step": 510 |
| }, |
| { |
| "entropy": 1.556640625, |
| "epoch": 1.0947368421052632, |
| "grad_norm": 6.59375, |
| "learning_rate": 2.268421052631579e-05, |
| "loss": 1.6246864318847656, |
| "mean_token_accuracy": 0.7021546125411987, |
| "num_tokens": 41101.0, |
| "step": 520 |
| }, |
| { |
| "entropy": 1.86015625, |
| "epoch": 1.1157894736842104, |
| "grad_norm": 6.3125, |
| "learning_rate": 2.2157894736842106e-05, |
| "loss": 1.832110595703125, |
| "mean_token_accuracy": 0.6659113824367523, |
| "num_tokens": 42080.0, |
| "step": 530 |
| }, |
| { |
| "entropy": 1.50859375, |
| "epoch": 1.1368421052631579, |
| "grad_norm": 4.75, |
| "learning_rate": 2.1631578947368423e-05, |
| "loss": 1.495261001586914, |
| "mean_token_accuracy": 0.7114485323429107, |
| "num_tokens": 43064.0, |
| "step": 540 |
| }, |
| { |
| "entropy": 1.379296875, |
| "epoch": 1.1578947368421053, |
| "grad_norm": 7.90625, |
| "learning_rate": 2.110526315789474e-05, |
| "loss": 1.3109845161437987, |
| "mean_token_accuracy": 0.7768619418144226, |
| "num_tokens": 43638.0, |
| "step": 550 |
| }, |
| { |
| "entropy": 1.671875, |
| "epoch": 1.1789473684210527, |
| "grad_norm": 8.1875, |
| "learning_rate": 2.0578947368421052e-05, |
| "loss": 1.7390716552734375, |
| "mean_token_accuracy": 0.6966245353221894, |
| "num_tokens": 44538.0, |
| "step": 560 |
| }, |
| { |
| "entropy": 1.45390625, |
| "epoch": 1.2, |
| "grad_norm": 8.5, |
| "learning_rate": 2.005263157894737e-05, |
| "loss": 1.3690235137939453, |
| "mean_token_accuracy": 0.742480456829071, |
| "num_tokens": 45159.0, |
| "step": 570 |
| }, |
| { |
| "entropy": 1.8234375, |
| "epoch": 1.2210526315789474, |
| "grad_norm": 6.65625, |
| "learning_rate": 1.9526315789473685e-05, |
| "loss": 1.890069580078125, |
| "mean_token_accuracy": 0.6900433540344239, |
| "num_tokens": 45918.0, |
| "step": 580 |
| }, |
| { |
| "entropy": 1.8640625, |
| "epoch": 1.2421052631578948, |
| "grad_norm": 8.125, |
| "learning_rate": 1.9e-05, |
| "loss": 1.7881799697875977, |
| "mean_token_accuracy": 0.6537608683109284, |
| "num_tokens": 46795.0, |
| "step": 590 |
| }, |
| { |
| "entropy": 1.59453125, |
| "epoch": 1.263157894736842, |
| "grad_norm": 6.5, |
| "learning_rate": 1.8473684210526317e-05, |
| "loss": 1.5872214317321778, |
| "mean_token_accuracy": 0.722487497329712, |
| "num_tokens": 47423.0, |
| "step": 600 |
| }, |
| { |
| "entropy": 1.78828125, |
| "epoch": 1.2842105263157895, |
| "grad_norm": 7.3125, |
| "learning_rate": 1.7947368421052634e-05, |
| "loss": 1.6725593566894532, |
| "mean_token_accuracy": 0.6980259001255036, |
| "num_tokens": 48158.0, |
| "step": 610 |
| }, |
| { |
| "entropy": 1.61015625, |
| "epoch": 1.305263157894737, |
| "grad_norm": 6.5, |
| "learning_rate": 1.742105263157895e-05, |
| "loss": 1.5390226364135742, |
| "mean_token_accuracy": 0.725999391078949, |
| "num_tokens": 48875.0, |
| "step": 620 |
| }, |
| { |
| "entropy": 1.71953125, |
| "epoch": 1.3263157894736843, |
| "grad_norm": 8.25, |
| "learning_rate": 1.6894736842105263e-05, |
| "loss": 1.6228282928466797, |
| "mean_token_accuracy": 0.7085169553756714, |
| "num_tokens": 49697.0, |
| "step": 630 |
| }, |
| { |
| "entropy": 1.437109375, |
| "epoch": 1.3473684210526315, |
| "grad_norm": 7.5625, |
| "learning_rate": 1.636842105263158e-05, |
| "loss": 1.4906560897827148, |
| "mean_token_accuracy": 0.7471937596797943, |
| "num_tokens": 50300.0, |
| "step": 640 |
| }, |
| { |
| "entropy": 1.73359375, |
| "epoch": 1.368421052631579, |
| "grad_norm": 4.375, |
| "learning_rate": 1.5842105263157896e-05, |
| "loss": 1.5986823081970214, |
| "mean_token_accuracy": 0.699000483751297, |
| "num_tokens": 51332.0, |
| "step": 650 |
| }, |
| { |
| "entropy": 1.2671875, |
| "epoch": 1.3894736842105262, |
| "grad_norm": 11.8125, |
| "learning_rate": 1.5315789473684212e-05, |
| "loss": 1.234378719329834, |
| "mean_token_accuracy": 0.7714344441890717, |
| "num_tokens": 51900.0, |
| "step": 660 |
| }, |
| { |
| "entropy": 1.90859375, |
| "epoch": 1.4105263157894736, |
| "grad_norm": 6.875, |
| "learning_rate": 1.4789473684210529e-05, |
| "loss": 1.815606689453125, |
| "mean_token_accuracy": 0.6656625211238861, |
| "num_tokens": 52883.0, |
| "step": 670 |
| }, |
| { |
| "entropy": 1.558203125, |
| "epoch": 1.431578947368421, |
| "grad_norm": 9.375, |
| "learning_rate": 1.4263157894736842e-05, |
| "loss": 1.4858011245727538, |
| "mean_token_accuracy": 0.7443289816379547, |
| "num_tokens": 53576.0, |
| "step": 680 |
| }, |
| { |
| "entropy": 1.64765625, |
| "epoch": 1.4526315789473685, |
| "grad_norm": 8.9375, |
| "learning_rate": 1.373684210526316e-05, |
| "loss": 1.534531307220459, |
| "mean_token_accuracy": 0.7204049170017243, |
| "num_tokens": 54450.0, |
| "step": 690 |
| }, |
| { |
| "entropy": 1.58359375, |
| "epoch": 1.4736842105263157, |
| "grad_norm": 6.46875, |
| "learning_rate": 1.3210526315789473e-05, |
| "loss": 1.5365165710449218, |
| "mean_token_accuracy": 0.7069519102573395, |
| "num_tokens": 55160.0, |
| "step": 700 |
| }, |
| { |
| "entropy": 1.4171875, |
| "epoch": 1.4947368421052631, |
| "grad_norm": 7.1875, |
| "learning_rate": 1.268421052631579e-05, |
| "loss": 1.307802391052246, |
| "mean_token_accuracy": 0.7627157270908356, |
| "num_tokens": 55929.0, |
| "step": 710 |
| }, |
| { |
| "entropy": 1.678125, |
| "epoch": 1.5157894736842106, |
| "grad_norm": 6.34375, |
| "learning_rate": 1.2157894736842105e-05, |
| "loss": 1.5420659065246582, |
| "mean_token_accuracy": 0.7097006201744079, |
| "num_tokens": 56725.0, |
| "step": 720 |
| }, |
| { |
| "entropy": 1.58125, |
| "epoch": 1.5368421052631578, |
| "grad_norm": 7.0625, |
| "learning_rate": 1.1631578947368422e-05, |
| "loss": 1.4044910430908204, |
| "mean_token_accuracy": 0.7465328335762024, |
| "num_tokens": 57398.0, |
| "step": 730 |
| }, |
| { |
| "entropy": 1.576171875, |
| "epoch": 1.5578947368421052, |
| "grad_norm": 6.03125, |
| "learning_rate": 1.1105263157894738e-05, |
| "loss": 1.605686569213867, |
| "mean_token_accuracy": 0.7333506286144257, |
| "num_tokens": 58072.0, |
| "step": 740 |
| }, |
| { |
| "entropy": 1.59140625, |
| "epoch": 1.5789473684210527, |
| "grad_norm": 6.03125, |
| "learning_rate": 1.0578947368421053e-05, |
| "loss": 1.4928099632263183, |
| "mean_token_accuracy": 0.7072650909423828, |
| "num_tokens": 58748.0, |
| "step": 750 |
| }, |
| { |
| "entropy": 1.63046875, |
| "epoch": 1.6, |
| "grad_norm": 7.1875, |
| "learning_rate": 1.005263157894737e-05, |
| "loss": 1.624325942993164, |
| "mean_token_accuracy": 0.690889635682106, |
| "num_tokens": 59658.0, |
| "step": 760 |
| }, |
| { |
| "entropy": 1.803125, |
| "epoch": 1.6210526315789475, |
| "grad_norm": 6.03125, |
| "learning_rate": 9.526315789473684e-06, |
| "loss": 1.7750425338745117, |
| "mean_token_accuracy": 0.6651369571685791, |
| "num_tokens": 60859.0, |
| "step": 770 |
| }, |
| { |
| "entropy": 1.4703125, |
| "epoch": 1.6421052631578947, |
| "grad_norm": 7.03125, |
| "learning_rate": 9e-06, |
| "loss": 1.4258437156677246, |
| "mean_token_accuracy": 0.7347829401493072, |
| "num_tokens": 62439.0, |
| "step": 780 |
| }, |
| { |
| "entropy": 1.671875, |
| "epoch": 1.663157894736842, |
| "grad_norm": 7.71875, |
| "learning_rate": 8.473684210526315e-06, |
| "loss": 1.721219825744629, |
| "mean_token_accuracy": 0.7035934925079346, |
| "num_tokens": 63290.0, |
| "step": 790 |
| }, |
| { |
| "entropy": 1.554296875, |
| "epoch": 1.6842105263157894, |
| "grad_norm": 9.0625, |
| "learning_rate": 7.947368421052633e-06, |
| "loss": 1.5237810134887695, |
| "mean_token_accuracy": 0.7325670003890992, |
| "num_tokens": 64010.0, |
| "step": 800 |
| }, |
| { |
| "entropy": 1.7859375, |
| "epoch": 1.7052631578947368, |
| "grad_norm": 8.5, |
| "learning_rate": 7.421052631578948e-06, |
| "loss": 1.7944671630859375, |
| "mean_token_accuracy": 0.683906614780426, |
| "num_tokens": 64966.0, |
| "step": 810 |
| }, |
| { |
| "entropy": 1.840625, |
| "epoch": 1.7263157894736842, |
| "grad_norm": 7.53125, |
| "learning_rate": 6.894736842105263e-06, |
| "loss": 1.787227249145508, |
| "mean_token_accuracy": 0.6670031368732452, |
| "num_tokens": 65703.0, |
| "step": 820 |
| }, |
| { |
| "entropy": 1.29765625, |
| "epoch": 1.7473684210526317, |
| "grad_norm": 9.0625, |
| "learning_rate": 6.368421052631579e-06, |
| "loss": 1.2813177108764648, |
| "mean_token_accuracy": 0.7610228896141052, |
| "num_tokens": 66421.0, |
| "step": 830 |
| }, |
| { |
| "entropy": 1.504296875, |
| "epoch": 1.768421052631579, |
| "grad_norm": 6.5, |
| "learning_rate": 5.842105263157895e-06, |
| "loss": 1.477138137817383, |
| "mean_token_accuracy": 0.7455608665943145, |
| "num_tokens": 67005.0, |
| "step": 840 |
| }, |
| { |
| "entropy": 1.60859375, |
| "epoch": 1.7894736842105263, |
| "grad_norm": 5.03125, |
| "learning_rate": 5.315789473684211e-06, |
| "loss": 1.6519662857055664, |
| "mean_token_accuracy": 0.7005816400051117, |
| "num_tokens": 67975.0, |
| "step": 850 |
| }, |
| { |
| "entropy": 1.6765625, |
| "epoch": 1.8105263157894735, |
| "grad_norm": 6.53125, |
| "learning_rate": 4.789473684210526e-06, |
| "loss": 1.6701608657836915, |
| "mean_token_accuracy": 0.7136963486671448, |
| "num_tokens": 68650.0, |
| "step": 860 |
| }, |
| { |
| "entropy": 1.44609375, |
| "epoch": 1.831578947368421, |
| "grad_norm": 7.1875, |
| "learning_rate": 4.2631578947368425e-06, |
| "loss": 1.3475713729858398, |
| "mean_token_accuracy": 0.7566476047039032, |
| "num_tokens": 69317.0, |
| "step": 870 |
| }, |
| { |
| "entropy": 1.71328125, |
| "epoch": 1.8526315789473684, |
| "grad_norm": 6.90625, |
| "learning_rate": 3.736842105263158e-06, |
| "loss": 1.7077770233154297, |
| "mean_token_accuracy": 0.6919207274913788, |
| "num_tokens": 70169.0, |
| "step": 880 |
| }, |
| { |
| "entropy": 1.646875, |
| "epoch": 1.8736842105263158, |
| "grad_norm": 7.8125, |
| "learning_rate": 3.2105263157894735e-06, |
| "loss": 1.5933343887329101, |
| "mean_token_accuracy": 0.708821702003479, |
| "num_tokens": 71138.0, |
| "step": 890 |
| }, |
| { |
| "entropy": 1.437109375, |
| "epoch": 1.8947368421052633, |
| "grad_norm": 8.625, |
| "learning_rate": 2.68421052631579e-06, |
| "loss": 1.4426955223083495, |
| "mean_token_accuracy": 0.7344056785106658, |
| "num_tokens": 71776.0, |
| "step": 900 |
| }, |
| { |
| "entropy": 1.591015625, |
| "epoch": 1.9157894736842105, |
| "grad_norm": 8.125, |
| "learning_rate": 2.1578947368421054e-06, |
| "loss": 1.4523811340332031, |
| "mean_token_accuracy": 0.7294364452362061, |
| "num_tokens": 72390.0, |
| "step": 910 |
| }, |
| { |
| "entropy": 1.48984375, |
| "epoch": 1.936842105263158, |
| "grad_norm": 8.375, |
| "learning_rate": 1.6315789473684212e-06, |
| "loss": 1.4202921867370606, |
| "mean_token_accuracy": 0.7283547043800354, |
| "num_tokens": 73109.0, |
| "step": 920 |
| }, |
| { |
| "entropy": 1.37578125, |
| "epoch": 1.9578947368421051, |
| "grad_norm": 7.34375, |
| "learning_rate": 1.1052631578947369e-06, |
| "loss": 1.3212156295776367, |
| "mean_token_accuracy": 0.765373581647873, |
| "num_tokens": 73665.0, |
| "step": 930 |
| }, |
| { |
| "entropy": 1.69140625, |
| "epoch": 1.9789473684210526, |
| "grad_norm": 7.5625, |
| "learning_rate": 5.789473684210527e-07, |
| "loss": 1.5747876167297363, |
| "mean_token_accuracy": 0.7353746354579925, |
| "num_tokens": 74320.0, |
| "step": 940 |
| }, |
| { |
| "entropy": 1.75390625, |
| "epoch": 2.0, |
| "grad_norm": 6.46875, |
| "learning_rate": 5.263157894736842e-08, |
| "loss": 1.7055414199829102, |
| "mean_token_accuracy": 0.6862038552761078, |
| "num_tokens": 75304.0, |
| "step": 950 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 950, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 207562104419328.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|