| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 30.914898651313194, | |
| "eval_steps": 1000, | |
| "global_step": 49000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.031548229355627413, | |
| "grad_norm": 32.0, | |
| "learning_rate": 4.9e-07, | |
| "loss": 17.4073, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.06309645871125483, | |
| "grad_norm": 30.625, | |
| "learning_rate": 9.9e-07, | |
| "loss": 17.6876, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.09464468806688224, | |
| "grad_norm": 32.25, | |
| "learning_rate": 1.4900000000000001e-06, | |
| "loss": 17.5782, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.12619291742250965, | |
| "grad_norm": 34.5, | |
| "learning_rate": 1.99e-06, | |
| "loss": 17.7185, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.15774114677813708, | |
| "grad_norm": 31.125, | |
| "learning_rate": 2.49e-06, | |
| "loss": 17.5788, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.18928937613376448, | |
| "grad_norm": 31.875, | |
| "learning_rate": 2.99e-06, | |
| "loss": 17.3918, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.2208376054893919, | |
| "grad_norm": 36.5, | |
| "learning_rate": 3.49e-06, | |
| "loss": 17.5019, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.2523858348450193, | |
| "grad_norm": 31.5, | |
| "learning_rate": 3.99e-06, | |
| "loss": 17.1858, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.28393406420064676, | |
| "grad_norm": 36.5, | |
| "learning_rate": 4.49e-06, | |
| "loss": 17.0212, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.31548229355627416, | |
| "grad_norm": 33.75, | |
| "learning_rate": 4.9900000000000005e-06, | |
| "loss": 16.7389, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.34703052291190156, | |
| "grad_norm": 35.5, | |
| "learning_rate": 5.49e-06, | |
| "loss": 16.4118, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.37857875226752896, | |
| "grad_norm": 31.625, | |
| "learning_rate": 5.99e-06, | |
| "loss": 16.4162, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.4101269816231564, | |
| "grad_norm": 63.5, | |
| "learning_rate": 6.4900000000000005e-06, | |
| "loss": 15.9327, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.4416752109787838, | |
| "grad_norm": 30.375, | |
| "learning_rate": 6.990000000000001e-06, | |
| "loss": 15.402, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.4732234403344112, | |
| "grad_norm": 27.625, | |
| "learning_rate": 7.4899999999999994e-06, | |
| "loss": 15.0267, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.5047716696900386, | |
| "grad_norm": 32.0, | |
| "learning_rate": 7.99e-06, | |
| "loss": 14.6909, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.536319899045666, | |
| "grad_norm": 34.0, | |
| "learning_rate": 8.49e-06, | |
| "loss": 14.1192, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.5678681284012935, | |
| "grad_norm": 30.25, | |
| "learning_rate": 8.99e-06, | |
| "loss": 13.2533, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.5994163577569209, | |
| "grad_norm": 30.625, | |
| "learning_rate": 9.49e-06, | |
| "loss": 12.4446, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.6309645871125483, | |
| "grad_norm": 28.5, | |
| "learning_rate": 9.990000000000001e-06, | |
| "loss": 11.4839, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.6309645871125483, | |
| "eval_loss": 12.30927848815918, | |
| "eval_runtime": 267.4428, | |
| "eval_samples_per_second": 94.813, | |
| "eval_steps_per_second": 5.927, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.6625128164681757, | |
| "grad_norm": 23.25, | |
| "learning_rate": 1.049e-05, | |
| "loss": 10.7284, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.6940610458238031, | |
| "grad_norm": 26.5, | |
| "learning_rate": 1.099e-05, | |
| "loss": 10.0224, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.7256092751794305, | |
| "grad_norm": 21.25, | |
| "learning_rate": 1.149e-05, | |
| "loss": 9.6291, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.7571575045350579, | |
| "grad_norm": 18.75, | |
| "learning_rate": 1.199e-05, | |
| "loss": 8.6819, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.7887057338906854, | |
| "grad_norm": 19.5, | |
| "learning_rate": 1.249e-05, | |
| "loss": 8.564, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.8202539632463128, | |
| "grad_norm": 19.125, | |
| "learning_rate": 1.299e-05, | |
| "loss": 7.7527, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.8518021926019402, | |
| "grad_norm": 15.9375, | |
| "learning_rate": 1.349e-05, | |
| "loss": 7.3225, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.8833504219575676, | |
| "grad_norm": 14.875, | |
| "learning_rate": 1.399e-05, | |
| "loss": 6.9508, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.914898651313195, | |
| "grad_norm": 15.0625, | |
| "learning_rate": 1.449e-05, | |
| "loss": 6.516, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.9464468806688224, | |
| "grad_norm": 13.6875, | |
| "learning_rate": 1.499e-05, | |
| "loss": 6.0257, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.9779951100244498, | |
| "grad_norm": 11.0, | |
| "learning_rate": 1.5490000000000002e-05, | |
| "loss": 5.6735, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 1.0094644688066883, | |
| "grad_norm": 15.75, | |
| "learning_rate": 1.599e-05, | |
| "loss": 5.0756, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.0410126981623156, | |
| "grad_norm": 8.125, | |
| "learning_rate": 1.649e-05, | |
| "loss": 5.0824, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 1.072560927517943, | |
| "grad_norm": 7.25, | |
| "learning_rate": 1.699e-05, | |
| "loss": 4.5511, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.1041091568735704, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 1.749e-05, | |
| "loss": 4.2476, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 1.1356573862291979, | |
| "grad_norm": 4.125, | |
| "learning_rate": 1.7990000000000002e-05, | |
| "loss": 3.9672, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.1672056155848254, | |
| "grad_norm": 5.25, | |
| "learning_rate": 1.849e-05, | |
| "loss": 3.8173, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 1.1987538449404527, | |
| "grad_norm": 4.4375, | |
| "learning_rate": 1.8990000000000003e-05, | |
| "loss": 3.5025, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 1.2303020742960802, | |
| "grad_norm": 30.375, | |
| "learning_rate": 1.949e-05, | |
| "loss": 3.3593, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 1.2618503036517075, | |
| "grad_norm": 4.46875, | |
| "learning_rate": 1.999e-05, | |
| "loss": 3.2297, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.2618503036517075, | |
| "eval_loss": 3.776749849319458, | |
| "eval_runtime": 267.0502, | |
| "eval_samples_per_second": 94.952, | |
| "eval_steps_per_second": 5.935, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 1.293398533007335, | |
| "grad_norm": 5.5625, | |
| "learning_rate": 2.0490000000000002e-05, | |
| "loss": 3.288, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 1.3249467623629623, | |
| "grad_norm": 7.5625, | |
| "learning_rate": 2.099e-05, | |
| "loss": 2.9833, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 1.3564949917185898, | |
| "grad_norm": 4.3125, | |
| "learning_rate": 2.1490000000000003e-05, | |
| "loss": 3.0733, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 1.3880432210742173, | |
| "grad_norm": 3.796875, | |
| "learning_rate": 2.199e-05, | |
| "loss": 3.0319, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 1.4195914504298446, | |
| "grad_norm": 3.78125, | |
| "learning_rate": 2.249e-05, | |
| "loss": 2.9367, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 1.451139679785472, | |
| "grad_norm": 3.0, | |
| "learning_rate": 2.2990000000000002e-05, | |
| "loss": 2.7882, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 1.4826879091410994, | |
| "grad_norm": 4.8125, | |
| "learning_rate": 2.349e-05, | |
| "loss": 2.7725, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 1.514236138496727, | |
| "grad_norm": 3.21875, | |
| "learning_rate": 2.3990000000000002e-05, | |
| "loss": 2.619, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 1.5457843678523542, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 2.449e-05, | |
| "loss": 2.6745, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 1.5773325972079817, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 2.4990000000000003e-05, | |
| "loss": 2.5296, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 1.6088808265636092, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 2.549e-05, | |
| "loss": 2.6223, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 1.6404290559192365, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 2.5990000000000004e-05, | |
| "loss": 2.4497, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 1.671977285274864, | |
| "grad_norm": 3.09375, | |
| "learning_rate": 2.6490000000000002e-05, | |
| "loss": 2.5528, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 1.7035255146304915, | |
| "grad_norm": 4.53125, | |
| "learning_rate": 2.6989999999999997e-05, | |
| "loss": 2.4371, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 1.7350737439861188, | |
| "grad_norm": 2.515625, | |
| "learning_rate": 2.749e-05, | |
| "loss": 2.3439, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 1.766621973341746, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 2.7989999999999998e-05, | |
| "loss": 2.3008, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 1.7981702026973736, | |
| "grad_norm": 3.5625, | |
| "learning_rate": 2.849e-05, | |
| "loss": 2.429, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 1.829718432053001, | |
| "grad_norm": 3.109375, | |
| "learning_rate": 2.8990000000000002e-05, | |
| "loss": 2.2902, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 1.8612666614086284, | |
| "grad_norm": 2.578125, | |
| "learning_rate": 2.949e-05, | |
| "loss": 2.3473, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 1.892814890764256, | |
| "grad_norm": 3.34375, | |
| "learning_rate": 2.9990000000000003e-05, | |
| "loss": 2.3278, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.892814890764256, | |
| "eval_loss": 2.846893787384033, | |
| "eval_runtime": 267.2695, | |
| "eval_samples_per_second": 94.874, | |
| "eval_steps_per_second": 5.93, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 1.9243631201198834, | |
| "grad_norm": 1.9140625, | |
| "learning_rate": 3.049e-05, | |
| "loss": 2.2279, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 1.9559113494755107, | |
| "grad_norm": 3.0625, | |
| "learning_rate": 3.099e-05, | |
| "loss": 2.1692, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 1.987459578831138, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 3.1490000000000005e-05, | |
| "loss": 2.1651, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 2.0189289376133766, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 3.1990000000000004e-05, | |
| "loss": 2.3521, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 2.050477166969004, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 3.249e-05, | |
| "loss": 2.1494, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 2.082025396324631, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 3.299e-05, | |
| "loss": 2.1754, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 2.113573625680259, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 3.349e-05, | |
| "loss": 2.1549, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 2.145121855035886, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 3.399e-05, | |
| "loss": 2.1574, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 2.1766700843915134, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 3.449e-05, | |
| "loss": 2.075, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 2.2082183137471407, | |
| "grad_norm": 3.140625, | |
| "learning_rate": 3.499e-05, | |
| "loss": 2.1074, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 2.2397665431027685, | |
| "grad_norm": 6.34375, | |
| "learning_rate": 3.549e-05, | |
| "loss": 2.083, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 2.2713147724583957, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 3.599e-05, | |
| "loss": 2.1259, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 2.302863001814023, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 3.6490000000000005e-05, | |
| "loss": 2.0767, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 2.3344112311696508, | |
| "grad_norm": 2.609375, | |
| "learning_rate": 3.699e-05, | |
| "loss": 2.027, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 2.365959460525278, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 3.749e-05, | |
| "loss": 2.1235, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 2.3975076898809053, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 3.799e-05, | |
| "loss": 1.9496, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 2.429055919236533, | |
| "grad_norm": 1.8828125, | |
| "learning_rate": 3.8490000000000006e-05, | |
| "loss": 1.8565, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 2.4606041485921604, | |
| "grad_norm": 2.46875, | |
| "learning_rate": 3.8990000000000004e-05, | |
| "loss": 1.9914, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 2.4921523779477877, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 3.9489999999999996e-05, | |
| "loss": 1.9249, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 2.523700607303415, | |
| "grad_norm": 2.671875, | |
| "learning_rate": 3.999e-05, | |
| "loss": 2.0286, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 2.523700607303415, | |
| "eval_loss": 2.5390400886535645, | |
| "eval_runtime": 268.2567, | |
| "eval_samples_per_second": 94.525, | |
| "eval_steps_per_second": 5.909, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 2.5552488366590427, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 4.049e-05, | |
| "loss": 1.895, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 2.58679706601467, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 4.099e-05, | |
| "loss": 2.037, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 2.6183452953702973, | |
| "grad_norm": 2.40625, | |
| "learning_rate": 4.1490000000000004e-05, | |
| "loss": 1.7669, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 2.6498935247259245, | |
| "grad_norm": 2.4375, | |
| "learning_rate": 4.199e-05, | |
| "loss": 1.8471, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 2.6814417540815523, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 4.249e-05, | |
| "loss": 2.0354, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 2.7129899834371796, | |
| "grad_norm": 2.390625, | |
| "learning_rate": 4.299e-05, | |
| "loss": 1.9273, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 2.744538212792807, | |
| "grad_norm": 1.75, | |
| "learning_rate": 4.3490000000000005e-05, | |
| "loss": 1.951, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 2.7760864421484346, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 4.3990000000000004e-05, | |
| "loss": 1.8838, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 2.807634671504062, | |
| "grad_norm": 1.625, | |
| "learning_rate": 4.449e-05, | |
| "loss": 1.9785, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 2.839182900859689, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 4.499e-05, | |
| "loss": 1.8728, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 2.870731130215317, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 4.549000000000001e-05, | |
| "loss": 1.914, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 2.902279359570944, | |
| "grad_norm": 2.8125, | |
| "learning_rate": 4.599e-05, | |
| "loss": 1.8725, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 2.9338275889265715, | |
| "grad_norm": 2.265625, | |
| "learning_rate": 4.649e-05, | |
| "loss": 1.9947, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 2.9653758182821988, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 4.699e-05, | |
| "loss": 1.7939, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 2.996924047637826, | |
| "grad_norm": 1.953125, | |
| "learning_rate": 4.749e-05, | |
| "loss": 1.7749, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 3.0283934064200646, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 4.799e-05, | |
| "loss": 1.9298, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 3.059941635775692, | |
| "grad_norm": 2.28125, | |
| "learning_rate": 4.8490000000000005e-05, | |
| "loss": 1.816, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 3.0914898651313196, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 4.8990000000000004e-05, | |
| "loss": 1.7261, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 3.123038094486947, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 4.949e-05, | |
| "loss": 1.7376, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 3.154586323842574, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 4.999e-05, | |
| "loss": 1.8473, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 3.154586323842574, | |
| "eval_loss": 2.3419270515441895, | |
| "eval_runtime": 267.4485, | |
| "eval_samples_per_second": 94.811, | |
| "eval_steps_per_second": 5.926, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 3.186134553198202, | |
| "grad_norm": 1.25, | |
| "learning_rate": 5.0490000000000006e-05, | |
| "loss": 1.7936, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 3.2176827825538292, | |
| "grad_norm": 2.171875, | |
| "learning_rate": 5.0990000000000005e-05, | |
| "loss": 1.8275, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 3.2492310119094565, | |
| "grad_norm": 2.90625, | |
| "learning_rate": 5.149e-05, | |
| "loss": 1.8632, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 3.2807792412650842, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 5.199000000000001e-05, | |
| "loss": 1.5972, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 3.3123274706207115, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 5.249000000000001e-05, | |
| "loss": 1.7239, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 3.343875699976339, | |
| "grad_norm": 2.84375, | |
| "learning_rate": 5.2990000000000006e-05, | |
| "loss": 1.8685, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 3.375423929331966, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 5.3490000000000005e-05, | |
| "loss": 1.7002, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 3.406972158687594, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 5.399000000000001e-05, | |
| "loss": 1.8572, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 3.438520388043221, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 5.449000000000001e-05, | |
| "loss": 1.6519, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 3.4700686173988484, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 5.499000000000001e-05, | |
| "loss": 1.7819, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 3.5016168467544757, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 5.549e-05, | |
| "loss": 1.6989, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 3.5331650761101034, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 5.599e-05, | |
| "loss": 1.6876, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 3.5647133054657307, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 5.6489999999999996e-05, | |
| "loss": 1.6419, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 3.596261534821358, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 5.699e-05, | |
| "loss": 1.7303, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 3.6278097641769858, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 5.749e-05, | |
| "loss": 1.6267, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 3.659357993532613, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 5.799e-05, | |
| "loss": 1.7364, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 3.6909062228882403, | |
| "grad_norm": 2.140625, | |
| "learning_rate": 5.849e-05, | |
| "loss": 1.8087, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 3.722454452243868, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 5.899e-05, | |
| "loss": 1.532, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 3.7540026815994954, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 5.949e-05, | |
| "loss": 1.7565, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 3.7855509109551226, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 5.999e-05, | |
| "loss": 1.5554, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 3.7855509109551226, | |
| "eval_loss": 2.193551778793335, | |
| "eval_runtime": 267.7225, | |
| "eval_samples_per_second": 94.714, | |
| "eval_steps_per_second": 5.92, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 3.81709914031075, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 6.0490000000000005e-05, | |
| "loss": 1.6697, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 3.848647369666377, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 6.0990000000000004e-05, | |
| "loss": 1.6499, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 3.880195599022005, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 6.149000000000001e-05, | |
| "loss": 1.6098, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 3.9117438283776322, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 6.199000000000001e-05, | |
| "loss": 1.6471, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 3.9432920577332595, | |
| "grad_norm": 1.9296875, | |
| "learning_rate": 6.249e-05, | |
| "loss": 1.6636, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 3.9748402870888873, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 6.299e-05, | |
| "loss": 1.68, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 4.006309645871125, | |
| "grad_norm": 1.453125, | |
| "learning_rate": 6.349e-05, | |
| "loss": 1.6271, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 4.037857875226753, | |
| "grad_norm": 1.3515625, | |
| "learning_rate": 6.399e-05, | |
| "loss": 1.708, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 4.06940610458238, | |
| "grad_norm": 1.625, | |
| "learning_rate": 6.449e-05, | |
| "loss": 1.5844, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 4.100954333938008, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 6.499000000000001e-05, | |
| "loss": 1.7148, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 4.132502563293635, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 6.549000000000001e-05, | |
| "loss": 1.657, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 4.164050792649262, | |
| "grad_norm": 7.09375, | |
| "learning_rate": 6.599000000000001e-05, | |
| "loss": 1.5734, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 4.19559902200489, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 6.649000000000001e-05, | |
| "loss": 1.6004, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 4.227147251360518, | |
| "grad_norm": 2.125, | |
| "learning_rate": 6.699000000000001e-05, | |
| "loss": 1.5555, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 4.258695480716145, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 6.749e-05, | |
| "loss": 1.5842, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 4.290243710071772, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 6.799e-05, | |
| "loss": 1.4458, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 4.3217919394274, | |
| "grad_norm": 1.8515625, | |
| "learning_rate": 6.849e-05, | |
| "loss": 1.5308, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 4.353340168783027, | |
| "grad_norm": 2.890625, | |
| "learning_rate": 6.899e-05, | |
| "loss": 1.6275, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 4.384888398138655, | |
| "grad_norm": 1.5, | |
| "learning_rate": 6.949e-05, | |
| "loss": 1.6239, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 4.4164366274942815, | |
| "grad_norm": 2.703125, | |
| "learning_rate": 6.999e-05, | |
| "loss": 1.6261, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 4.4164366274942815, | |
| "eval_loss": 2.089057207107544, | |
| "eval_runtime": 266.8117, | |
| "eval_samples_per_second": 95.037, | |
| "eval_steps_per_second": 5.941, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 4.447984856849909, | |
| "grad_norm": 1.25, | |
| "learning_rate": 7.049e-05, | |
| "loss": 1.6696, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 4.479533086205537, | |
| "grad_norm": 2.546875, | |
| "learning_rate": 7.099e-05, | |
| "loss": 1.5402, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 4.511081315561164, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 7.149e-05, | |
| "loss": 1.5104, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 4.5426295449167915, | |
| "grad_norm": 5.3125, | |
| "learning_rate": 7.199000000000001e-05, | |
| "loss": 1.5624, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 4.574177774272419, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 7.249e-05, | |
| "loss": 1.4863, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 4.605726003628046, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 7.299e-05, | |
| "loss": 1.5396, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 4.637274232983674, | |
| "grad_norm": 1.9765625, | |
| "learning_rate": 7.349e-05, | |
| "loss": 1.5752, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 4.6688224623393015, | |
| "grad_norm": 1.3203125, | |
| "learning_rate": 7.399e-05, | |
| "loss": 1.596, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 4.700370691694928, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 7.449e-05, | |
| "loss": 1.5409, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 4.731918921050556, | |
| "grad_norm": 1.75, | |
| "learning_rate": 7.499e-05, | |
| "loss": 1.4706, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 4.763467150406184, | |
| "grad_norm": 1.4921875, | |
| "learning_rate": 7.549000000000001e-05, | |
| "loss": 1.6109, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 4.795015379761811, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 7.599000000000001e-05, | |
| "loss": 1.4949, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 4.826563609117438, | |
| "grad_norm": 2.5, | |
| "learning_rate": 7.649000000000001e-05, | |
| "loss": 1.5716, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 4.858111838473066, | |
| "grad_norm": 2.21875, | |
| "learning_rate": 7.699e-05, | |
| "loss": 1.4256, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 4.889660067828693, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 7.749e-05, | |
| "loss": 1.4907, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 4.921208297184321, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 7.799e-05, | |
| "loss": 1.5904, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 4.952756526539948, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 7.849e-05, | |
| "loss": 1.3684, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 4.984304755895575, | |
| "grad_norm": 2.09375, | |
| "learning_rate": 7.899000000000001e-05, | |
| "loss": 1.4927, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 5.015774114677813, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 7.949000000000001e-05, | |
| "loss": 1.5083, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 5.047322344033441, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 7.999000000000001e-05, | |
| "loss": 1.5748, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 5.047322344033441, | |
| "eval_loss": 1.9995193481445312, | |
| "eval_runtime": 267.072, | |
| "eval_samples_per_second": 94.944, | |
| "eval_steps_per_second": 5.935, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 5.078870573389069, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 8.049e-05, | |
| "loss": 1.4866, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 5.110418802744696, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 8.099e-05, | |
| "loss": 1.4125, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 5.1419670321003235, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 8.149e-05, | |
| "loss": 1.5001, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 5.173515261455951, | |
| "grad_norm": 4.0625, | |
| "learning_rate": 8.199e-05, | |
| "loss": 1.4564, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 5.205063490811578, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 8.249e-05, | |
| "loss": 1.512, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 5.236611720167206, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 8.299e-05, | |
| "loss": 1.409, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 5.268159949522833, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 8.349e-05, | |
| "loss": 1.5431, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 5.29970817887846, | |
| "grad_norm": 3.640625, | |
| "learning_rate": 8.399e-05, | |
| "loss": 1.518, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 5.331256408234088, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 8.449e-05, | |
| "loss": 1.4523, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 5.362804637589715, | |
| "grad_norm": 2.3125, | |
| "learning_rate": 8.499e-05, | |
| "loss": 1.4272, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 5.394352866945343, | |
| "grad_norm": 1.453125, | |
| "learning_rate": 8.549000000000001e-05, | |
| "loss": 1.3483, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 5.42590109630097, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 8.599000000000001e-05, | |
| "loss": 1.528, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 5.457449325656597, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 8.649000000000001e-05, | |
| "loss": 1.3666, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 5.488997555012225, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 8.699e-05, | |
| "loss": 1.4751, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 5.520545784367853, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 8.749e-05, | |
| "loss": 1.503, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 5.5520940137234795, | |
| "grad_norm": 18.625, | |
| "learning_rate": 8.799e-05, | |
| "loss": 1.4122, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 5.583642243079107, | |
| "grad_norm": 79.5, | |
| "learning_rate": 8.849e-05, | |
| "loss": 1.4222, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 5.615190472434735, | |
| "grad_norm": 2.125, | |
| "learning_rate": 8.899e-05, | |
| "loss": 1.5393, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 5.646738701790362, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 8.949000000000001e-05, | |
| "loss": 1.3101, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 5.67828693114599, | |
| "grad_norm": 1.25, | |
| "learning_rate": 8.999000000000001e-05, | |
| "loss": 1.3791, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 5.67828693114599, | |
| "eval_loss": 1.9208731651306152, | |
| "eval_runtime": 266.9196, | |
| "eval_samples_per_second": 94.999, | |
| "eval_steps_per_second": 5.938, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 5.709835160501617, | |
| "grad_norm": 1.34375, | |
| "learning_rate": 9.049000000000001e-05, | |
| "loss": 1.3748, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 5.741383389857244, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 9.099000000000001e-05, | |
| "loss": 1.456, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 5.772931619212872, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 9.149e-05, | |
| "loss": 1.3566, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 5.804479848568499, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 9.199e-05, | |
| "loss": 1.4585, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 5.8360280779241265, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 9.249e-05, | |
| "loss": 1.4235, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 5.867576307279754, | |
| "grad_norm": 0.86328125, | |
| "learning_rate": 9.299e-05, | |
| "loss": 1.4555, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 5.899124536635381, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 9.349e-05, | |
| "loss": 1.365, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 5.930672765991009, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 9.399e-05, | |
| "loss": 1.4072, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 5.9622209953466365, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 9.449e-05, | |
| "loss": 1.3952, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 5.993769224702263, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 9.499e-05, | |
| "loss": 1.4411, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 6.025238583484502, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 9.549e-05, | |
| "loss": 1.4389, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 6.056786812840129, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 9.599000000000001e-05, | |
| "loss": 1.4392, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 6.088335042195757, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 9.649e-05, | |
| "loss": 1.3544, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 6.119883271551384, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 9.699e-05, | |
| "loss": 1.335, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 6.1514315009070115, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 9.749e-05, | |
| "loss": 1.3299, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 6.182979730262639, | |
| "grad_norm": 1.3515625, | |
| "learning_rate": 9.799e-05, | |
| "loss": 1.4326, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 6.214527959618266, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 9.849e-05, | |
| "loss": 1.3592, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 6.246076188973894, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 9.899e-05, | |
| "loss": 1.4231, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 6.277624418329522, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 9.949000000000001e-05, | |
| "loss": 1.4076, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 6.309172647685148, | |
| "grad_norm": 1.34375, | |
| "learning_rate": 9.999000000000001e-05, | |
| "loss": 1.3761, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 6.309172647685148, | |
| "eval_loss": 1.8519904613494873, | |
| "eval_runtime": 267.0769, | |
| "eval_samples_per_second": 94.943, | |
| "eval_steps_per_second": 5.935, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 6.340720877040776, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 9.99742105263158e-05, | |
| "loss": 1.4797, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 6.372269106396404, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 9.994789473684211e-05, | |
| "loss": 1.3483, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 6.403817335752031, | |
| "grad_norm": 1.34375, | |
| "learning_rate": 9.992157894736842e-05, | |
| "loss": 1.4807, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 6.4353655651076584, | |
| "grad_norm": 1.4921875, | |
| "learning_rate": 9.989526315789473e-05, | |
| "loss": 1.3523, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 6.466913794463286, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 9.986894736842106e-05, | |
| "loss": 1.3903, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 6.498462023818913, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 9.984263157894738e-05, | |
| "loss": 1.3053, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 6.530010253174541, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 9.98163157894737e-05, | |
| "loss": 1.3541, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 6.5615584825301685, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 9.979e-05, | |
| "loss": 1.316, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 6.593106711885795, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 9.976368421052632e-05, | |
| "loss": 1.3258, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 6.624654941241423, | |
| "grad_norm": 1.2734375, | |
| "learning_rate": 9.973736842105263e-05, | |
| "loss": 1.2155, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 6.65620317059705, | |
| "grad_norm": 1.140625, | |
| "learning_rate": 9.971105263157895e-05, | |
| "loss": 1.3149, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 6.687751399952678, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 9.968473684210526e-05, | |
| "loss": 1.4807, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 6.719299629308305, | |
| "grad_norm": 1.5, | |
| "learning_rate": 9.965842105263158e-05, | |
| "loss": 1.3725, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 6.750847858663932, | |
| "grad_norm": 2.15625, | |
| "learning_rate": 9.96321052631579e-05, | |
| "loss": 1.3688, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 6.78239608801956, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 9.960578947368421e-05, | |
| "loss": 1.4244, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 6.813944317375188, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 9.957947368421054e-05, | |
| "loss": 1.2727, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 6.8454925467308145, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 9.955315789473685e-05, | |
| "loss": 1.3025, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 6.877040776086442, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 9.952684210526316e-05, | |
| "loss": 1.3731, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 6.908589005442069, | |
| "grad_norm": 1.3515625, | |
| "learning_rate": 9.950052631578947e-05, | |
| "loss": 1.364, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 6.940137234797697, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 9.94742105263158e-05, | |
| "loss": 1.4627, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 6.940137234797697, | |
| "eval_loss": 1.7950236797332764, | |
| "eval_runtime": 267.3284, | |
| "eval_samples_per_second": 94.853, | |
| "eval_steps_per_second": 5.929, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 6.971685464153325, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 9.944789473684211e-05, | |
| "loss": 1.2709, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 7.003154822935563, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 9.942157894736842e-05, | |
| "loss": 1.2428, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 7.03470305229119, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 9.939526315789475e-05, | |
| "loss": 1.3802, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 7.066251281646817, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 9.936894736842106e-05, | |
| "loss": 1.416, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 7.097799511002445, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 9.934263157894737e-05, | |
| "loss": 1.2825, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 7.129347740358073, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 9.93163157894737e-05, | |
| "loss": 1.3154, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 7.1608959697137, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 9.929e-05, | |
| "loss": 1.271, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 7.192444199069327, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 9.926368421052632e-05, | |
| "loss": 1.3287, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 7.223992428424955, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 9.923736842105263e-05, | |
| "loss": 1.3063, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 7.255540657780582, | |
| "grad_norm": 1.2734375, | |
| "learning_rate": 9.921105263157895e-05, | |
| "loss": 1.2098, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 7.28708888713621, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 9.918473684210527e-05, | |
| "loss": 1.4312, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 7.318637116491837, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 9.915842105263158e-05, | |
| "loss": 1.3148, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 7.350185345847464, | |
| "grad_norm": 0.98828125, | |
| "learning_rate": 9.91321052631579e-05, | |
| "loss": 1.2584, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 7.381733575203092, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 9.910578947368421e-05, | |
| "loss": 1.3112, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 7.413281804558719, | |
| "grad_norm": 1.4609375, | |
| "learning_rate": 9.907947368421054e-05, | |
| "loss": 1.2949, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 7.4448300339143465, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 9.905315789473685e-05, | |
| "loss": 1.2393, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 7.476378263269974, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 9.902684210526316e-05, | |
| "loss": 1.4107, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 7.507926492625601, | |
| "grad_norm": 3.296875, | |
| "learning_rate": 9.900052631578947e-05, | |
| "loss": 1.1609, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 7.539474721981229, | |
| "grad_norm": 2.25, | |
| "learning_rate": 9.897421052631579e-05, | |
| "loss": 1.2993, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 7.5710229513368565, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 9.894789473684211e-05, | |
| "loss": 1.3573, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 7.5710229513368565, | |
| "eval_loss": 1.7552872896194458, | |
| "eval_runtime": 267.1568, | |
| "eval_samples_per_second": 94.914, | |
| "eval_steps_per_second": 5.933, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 7.602571180692483, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 9.892157894736842e-05, | |
| "loss": 1.1903, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 7.634119410048111, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 9.889526315789475e-05, | |
| "loss": 1.3354, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 7.665667639403739, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 9.886894736842106e-05, | |
| "loss": 1.298, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 7.697215868759366, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 9.884263157894737e-05, | |
| "loss": 1.3834, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 7.728764098114993, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 9.88163157894737e-05, | |
| "loss": 1.3978, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 7.76031232747062, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 9.879000000000001e-05, | |
| "loss": 1.2518, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 7.791860556826248, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 9.876368421052632e-05, | |
| "loss": 1.3163, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 7.823408786181876, | |
| "grad_norm": 1.3515625, | |
| "learning_rate": 9.873736842105263e-05, | |
| "loss": 1.3215, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 7.854957015537503, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 9.871105263157894e-05, | |
| "loss": 1.2938, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 7.88650524489313, | |
| "grad_norm": 0.875, | |
| "learning_rate": 9.868473684210527e-05, | |
| "loss": 1.2684, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 7.918053474248758, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 9.865842105263159e-05, | |
| "loss": 1.2393, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 7.949601703604385, | |
| "grad_norm": 1.3984375, | |
| "learning_rate": 9.86321052631579e-05, | |
| "loss": 1.3817, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 7.981149932960013, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 9.860578947368422e-05, | |
| "loss": 1.3158, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 8.01261929174225, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 9.857947368421053e-05, | |
| "loss": 1.3428, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 8.044167521097878, | |
| "grad_norm": 2.375, | |
| "learning_rate": 9.855315789473685e-05, | |
| "loss": 1.2863, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 8.075715750453506, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 9.852684210526316e-05, | |
| "loss": 1.2989, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 8.107263979809133, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 9.850052631578948e-05, | |
| "loss": 1.2974, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 8.13881220916476, | |
| "grad_norm": 1.25, | |
| "learning_rate": 9.847421052631579e-05, | |
| "loss": 1.2585, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 8.170360438520389, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 9.84478947368421e-05, | |
| "loss": 1.1535, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 8.201908667876015, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 9.842157894736842e-05, | |
| "loss": 1.2961, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 8.201908667876015, | |
| "eval_loss": 1.72089684009552, | |
| "eval_runtime": 267.2684, | |
| "eval_samples_per_second": 94.875, | |
| "eval_steps_per_second": 5.93, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 8.233456897231642, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 9.839526315789475e-05, | |
| "loss": 1.1665, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 8.26500512658727, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 9.836894736842106e-05, | |
| "loss": 1.3976, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 8.296553355942898, | |
| "grad_norm": 1.640625, | |
| "learning_rate": 9.834263157894737e-05, | |
| "loss": 1.2169, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 8.328101585298525, | |
| "grad_norm": 0.97265625, | |
| "learning_rate": 9.831631578947368e-05, | |
| "loss": 1.3242, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 8.359649814654153, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 9.829000000000001e-05, | |
| "loss": 1.2378, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 8.39119804400978, | |
| "grad_norm": 1.4453125, | |
| "learning_rate": 9.826368421052632e-05, | |
| "loss": 1.3139, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 8.422746273365407, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 9.823736842105263e-05, | |
| "loss": 1.2975, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 8.454294502721035, | |
| "grad_norm": 1.25, | |
| "learning_rate": 9.821105263157894e-05, | |
| "loss": 1.2578, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 8.485842732076662, | |
| "grad_norm": 2.796875, | |
| "learning_rate": 9.818473684210527e-05, | |
| "loss": 1.2879, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 8.51739096143229, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 9.81584210526316e-05, | |
| "loss": 1.34, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 8.548939190787918, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 9.81321052631579e-05, | |
| "loss": 1.248, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 8.580487420143545, | |
| "grad_norm": 4.40625, | |
| "learning_rate": 9.810578947368422e-05, | |
| "loss": 1.3071, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 8.612035649499171, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 9.807947368421053e-05, | |
| "loss": 1.2628, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 8.6435838788548, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 9.805315789473684e-05, | |
| "loss": 1.211, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 8.675132108210427, | |
| "grad_norm": 0.76953125, | |
| "learning_rate": 9.802684210526317e-05, | |
| "loss": 1.27, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 8.706680337566054, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 9.800052631578948e-05, | |
| "loss": 1.383, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 8.73822856692168, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 9.797421052631579e-05, | |
| "loss": 1.2795, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 8.76977679627731, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 9.79478947368421e-05, | |
| "loss": 1.278, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 8.801325025632936, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 9.792157894736843e-05, | |
| "loss": 1.1991, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 8.832873254988563, | |
| "grad_norm": 1.375, | |
| "learning_rate": 9.789526315789475e-05, | |
| "loss": 1.2975, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 8.832873254988563, | |
| "eval_loss": 1.693904995918274, | |
| "eval_runtime": 266.8978, | |
| "eval_samples_per_second": 95.006, | |
| "eval_steps_per_second": 5.939, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 8.864421484344192, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 9.786894736842106e-05, | |
| "loss": 1.2153, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 8.895969713699818, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 9.784263157894737e-05, | |
| "loss": 1.2027, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 8.927517943055445, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 9.781631578947369e-05, | |
| "loss": 1.3394, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 8.959066172411074, | |
| "grad_norm": 1.3984375, | |
| "learning_rate": 9.779e-05, | |
| "loss": 1.3347, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 8.9906144017667, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 9.776368421052632e-05, | |
| "loss": 1.2417, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 9.02208376054894, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 9.773736842105263e-05, | |
| "loss": 1.3005, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 9.053631989904567, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 9.771105263157895e-05, | |
| "loss": 1.141, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 9.085180219260193, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 9.768473684210527e-05, | |
| "loss": 1.3553, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 9.116728448615822, | |
| "grad_norm": 0.87890625, | |
| "learning_rate": 9.765842105263158e-05, | |
| "loss": 1.3469, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 9.148276677971449, | |
| "grad_norm": 0.81640625, | |
| "learning_rate": 9.763210526315791e-05, | |
| "loss": 1.3001, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 9.179824907327076, | |
| "grad_norm": 0.89453125, | |
| "learning_rate": 9.760578947368422e-05, | |
| "loss": 1.2651, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 9.211373136682704, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 9.757947368421053e-05, | |
| "loss": 1.2318, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 9.242921366038331, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 9.755315789473684e-05, | |
| "loss": 1.2397, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 9.274469595393958, | |
| "grad_norm": 2.234375, | |
| "learning_rate": 9.752684210526317e-05, | |
| "loss": 1.2617, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 9.306017824749587, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 9.750052631578948e-05, | |
| "loss": 1.308, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 9.337566054105213, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 9.747421052631579e-05, | |
| "loss": 1.2499, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 9.36911428346084, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 9.74478947368421e-05, | |
| "loss": 1.2713, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 9.400662512816469, | |
| "grad_norm": 1.0, | |
| "learning_rate": 9.742157894736843e-05, | |
| "loss": 1.16, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 9.432210742172096, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 9.739526315789474e-05, | |
| "loss": 1.2691, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 9.463758971527723, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 9.736894736842106e-05, | |
| "loss": 1.2895, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 9.463758971527723, | |
| "eval_loss": 1.6718602180480957, | |
| "eval_runtime": 267.6306, | |
| "eval_samples_per_second": 94.746, | |
| "eval_steps_per_second": 5.922, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 9.495307200883351, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 9.734263157894738e-05, | |
| "loss": 1.2325, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 9.526855430238978, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 9.731631578947369e-05, | |
| "loss": 1.211, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 9.558403659594605, | |
| "grad_norm": 3.078125, | |
| "learning_rate": 9.729e-05, | |
| "loss": 1.2128, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 9.589951888950232, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 9.726368421052632e-05, | |
| "loss": 1.3211, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 9.62150011830586, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 9.723736842105264e-05, | |
| "loss": 1.1707, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 9.653048347661487, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 9.721105263157895e-05, | |
| "loss": 1.1723, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 9.684596577017114, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 9.718473684210527e-05, | |
| "loss": 1.1736, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 9.716144806372743, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 9.715842105263158e-05, | |
| "loss": 1.2063, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 9.74769303572837, | |
| "grad_norm": 0.97265625, | |
| "learning_rate": 9.713210526315791e-05, | |
| "loss": 1.2337, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 9.779241265083996, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 9.710578947368422e-05, | |
| "loss": 1.1708, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 9.810789494439625, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 9.707947368421053e-05, | |
| "loss": 1.303, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 9.842337723795252, | |
| "grad_norm": 1.140625, | |
| "learning_rate": 9.705315789473684e-05, | |
| "loss": 1.1924, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 9.873885953150879, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 9.702684210526316e-05, | |
| "loss": 1.2857, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 9.905434182506507, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 9.700052631578948e-05, | |
| "loss": 1.2931, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 9.936982411862134, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 9.697421052631579e-05, | |
| "loss": 1.2534, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 9.968530641217761, | |
| "grad_norm": 1.0, | |
| "learning_rate": 9.694789473684212e-05, | |
| "loss": 1.2154, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 9.692157894736843e-05, | |
| "loss": 1.1951, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 10.031548229355627, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 9.689526315789474e-05, | |
| "loss": 1.1643, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 10.063096458711255, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 9.686894736842107e-05, | |
| "loss": 1.2679, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 10.094644688066882, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 9.684263157894738e-05, | |
| "loss": 1.2818, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 10.094644688066882, | |
| "eval_loss": 1.653988242149353, | |
| "eval_runtime": 267.0729, | |
| "eval_samples_per_second": 94.944, | |
| "eval_steps_per_second": 5.935, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 10.12619291742251, | |
| "grad_norm": 1.453125, | |
| "learning_rate": 9.681631578947369e-05, | |
| "loss": 1.0915, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 10.157741146778138, | |
| "grad_norm": 0.99609375, | |
| "learning_rate": 9.679e-05, | |
| "loss": 1.2581, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 10.189289376133765, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 9.676368421052631e-05, | |
| "loss": 1.3474, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 10.220837605489391, | |
| "grad_norm": 1.25, | |
| "learning_rate": 9.673736842105264e-05, | |
| "loss": 1.2432, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 10.25238583484502, | |
| "grad_norm": 0.921875, | |
| "learning_rate": 9.671105263157895e-05, | |
| "loss": 1.1038, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 10.283934064200647, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 9.668473684210527e-05, | |
| "loss": 1.2775, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 10.315482293556274, | |
| "grad_norm": 1.2890625, | |
| "learning_rate": 9.665842105263158e-05, | |
| "loss": 1.2662, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 10.347030522911902, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 9.66321052631579e-05, | |
| "loss": 1.1715, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 10.37857875226753, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 9.660578947368422e-05, | |
| "loss": 1.2537, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 10.410126981623156, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 9.657947368421053e-05, | |
| "loss": 1.3039, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 10.441675210978783, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 9.655315789473684e-05, | |
| "loss": 1.188, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 10.473223440334412, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 9.652684210526316e-05, | |
| "loss": 1.1287, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 10.504771669690038, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 9.650052631578947e-05, | |
| "loss": 1.2609, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 10.536319899045665, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 9.647421052631579e-05, | |
| "loss": 1.2599, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 10.567868128401294, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 9.644789473684212e-05, | |
| "loss": 1.184, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 10.59941635775692, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 9.642157894736843e-05, | |
| "loss": 1.207, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 10.630964587112548, | |
| "grad_norm": 1.3984375, | |
| "learning_rate": 9.639526315789474e-05, | |
| "loss": 1.2456, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 10.662512816468176, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 9.636894736842105e-05, | |
| "loss": 1.1748, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 10.694061045823803, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 9.634263157894738e-05, | |
| "loss": 1.2294, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 10.72560927517943, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 9.631631578947369e-05, | |
| "loss": 1.2622, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 10.72560927517943, | |
| "eval_loss": 1.6381371021270752, | |
| "eval_runtime": 266.9095, | |
| "eval_samples_per_second": 95.002, | |
| "eval_steps_per_second": 5.938, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 10.757157504535058, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 9.629e-05, | |
| "loss": 1.2294, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 10.788705733890685, | |
| "grad_norm": 1.2734375, | |
| "learning_rate": 9.626368421052631e-05, | |
| "loss": 1.1825, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 10.820253963246312, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 9.623736842105264e-05, | |
| "loss": 1.2099, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 10.85180219260194, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 9.621105263157896e-05, | |
| "loss": 1.2358, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 10.883350421957568, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 9.618473684210527e-05, | |
| "loss": 1.1026, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 10.914898651313194, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 9.615842105263159e-05, | |
| "loss": 1.1988, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 10.946446880668823, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 9.61321052631579e-05, | |
| "loss": 1.2846, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 10.97799511002445, | |
| "grad_norm": 0.91015625, | |
| "learning_rate": 9.610578947368421e-05, | |
| "loss": 1.1801, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 11.009464468806689, | |
| "grad_norm": 1.3984375, | |
| "learning_rate": 9.607947368421053e-05, | |
| "loss": 1.2068, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 11.041012698162316, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 9.605315789473685e-05, | |
| "loss": 1.2886, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 11.072560927517943, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 9.602684210526316e-05, | |
| "loss": 1.2498, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 11.104109156873571, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 9.600052631578947e-05, | |
| "loss": 1.2796, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 11.135657386229198, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 9.59742105263158e-05, | |
| "loss": 1.2457, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 11.167205615584825, | |
| "grad_norm": 1.453125, | |
| "learning_rate": 9.594789473684212e-05, | |
| "loss": 1.1583, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 11.198753844940454, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 9.592157894736843e-05, | |
| "loss": 1.1494, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 11.23030207429608, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 9.589526315789474e-05, | |
| "loss": 1.1675, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 11.261850303651707, | |
| "grad_norm": 1.0, | |
| "learning_rate": 9.586894736842105e-05, | |
| "loss": 1.1923, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 11.293398533007334, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 9.584263157894737e-05, | |
| "loss": 1.1461, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 11.324946762362963, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 9.581631578947369e-05, | |
| "loss": 1.1753, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 11.35649499171859, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 9.579e-05, | |
| "loss": 1.2912, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 11.35649499171859, | |
| "eval_loss": 1.6246815919876099, | |
| "eval_runtime": 266.8814, | |
| "eval_samples_per_second": 95.012, | |
| "eval_steps_per_second": 5.939, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 11.388043221074216, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 9.576368421052631e-05, | |
| "loss": 1.1746, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 11.419591450429845, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 9.573736842105263e-05, | |
| "loss": 1.2125, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 11.451139679785472, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 9.571105263157895e-05, | |
| "loss": 1.2383, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 11.482687909141099, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 9.568473684210528e-05, | |
| "loss": 1.1907, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 11.514236138496727, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 9.565842105263159e-05, | |
| "loss": 1.2293, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 11.545784367852354, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 9.56321052631579e-05, | |
| "loss": 1.2854, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 11.577332597207981, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 9.560578947368421e-05, | |
| "loss": 1.2259, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 11.60888082656361, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 9.557947368421054e-05, | |
| "loss": 1.1974, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 11.640429055919236, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 9.555315789473685e-05, | |
| "loss": 1.1581, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 11.671977285274863, | |
| "grad_norm": 1.3515625, | |
| "learning_rate": 9.552684210526316e-05, | |
| "loss": 1.3139, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 11.703525514630492, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 9.550052631578947e-05, | |
| "loss": 1.2093, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 11.735073743986119, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 9.54742105263158e-05, | |
| "loss": 1.1618, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 11.766621973341746, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 9.544789473684211e-05, | |
| "loss": 1.2333, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 11.798170202697374, | |
| "grad_norm": 1.34375, | |
| "learning_rate": 9.542157894736843e-05, | |
| "loss": 1.2568, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 11.829718432053001, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 9.539526315789474e-05, | |
| "loss": 1.2077, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 11.861266661408628, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 9.536894736842106e-05, | |
| "loss": 1.1648, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 11.892814890764257, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 9.534263157894737e-05, | |
| "loss": 1.201, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 11.924363120119883, | |
| "grad_norm": 0.97265625, | |
| "learning_rate": 9.531631578947369e-05, | |
| "loss": 1.1852, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 11.95591134947551, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 9.529e-05, | |
| "loss": 1.2017, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 11.987459578831139, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 9.526368421052632e-05, | |
| "loss": 1.24, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 11.987459578831139, | |
| "eval_loss": 1.6129719018936157, | |
| "eval_runtime": 267.2903, | |
| "eval_samples_per_second": 94.867, | |
| "eval_steps_per_second": 5.93, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 12.018928937613376, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 9.523736842105263e-05, | |
| "loss": 1.1555, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 12.050477166969005, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 9.521105263157895e-05, | |
| "loss": 1.2965, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 12.082025396324632, | |
| "grad_norm": 1.2734375, | |
| "learning_rate": 9.518473684210528e-05, | |
| "loss": 1.2043, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 12.113573625680258, | |
| "grad_norm": 1.3203125, | |
| "learning_rate": 9.515842105263159e-05, | |
| "loss": 1.1309, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 12.145121855035885, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 9.51321052631579e-05, | |
| "loss": 1.1215, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 12.176670084391514, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 9.510578947368421e-05, | |
| "loss": 1.1511, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 12.20821831374714, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 9.507947368421052e-05, | |
| "loss": 1.1652, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 12.239766543102768, | |
| "grad_norm": 0.73046875, | |
| "learning_rate": 9.505315789473685e-05, | |
| "loss": 1.2012, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 12.271314772458396, | |
| "grad_norm": 1.25, | |
| "learning_rate": 9.502684210526316e-05, | |
| "loss": 1.0918, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 12.302863001814023, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 9.500052631578947e-05, | |
| "loss": 1.249, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 12.33441123116965, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 9.49742105263158e-05, | |
| "loss": 1.1607, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 12.365959460525279, | |
| "grad_norm": 1.3984375, | |
| "learning_rate": 9.494789473684211e-05, | |
| "loss": 1.21, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 12.397507689880905, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 9.492157894736843e-05, | |
| "loss": 1.23, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 12.429055919236532, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 9.489526315789475e-05, | |
| "loss": 1.1942, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 12.46060414859216, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 9.486894736842106e-05, | |
| "loss": 1.1034, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 12.492152377947788, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 9.484263157894737e-05, | |
| "loss": 1.1593, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 12.523700607303415, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 9.481631578947368e-05, | |
| "loss": 1.2432, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 12.555248836659043, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 9.479e-05, | |
| "loss": 1.2159, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 12.58679706601467, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 9.476368421052632e-05, | |
| "loss": 1.2214, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 12.618345295370297, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 9.473736842105264e-05, | |
| "loss": 1.2311, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 12.618345295370297, | |
| "eval_loss": 1.6025601625442505, | |
| "eval_runtime": 267.1111, | |
| "eval_samples_per_second": 94.931, | |
| "eval_steps_per_second": 5.934, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 12.649893524725925, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 9.471105263157895e-05, | |
| "loss": 1.178, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 12.681441754081552, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 9.468473684210527e-05, | |
| "loss": 1.145, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 12.71298998343718, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 9.465842105263159e-05, | |
| "loss": 1.1935, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 12.744538212792808, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 9.46321052631579e-05, | |
| "loss": 1.2261, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 12.776086442148435, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 9.460578947368421e-05, | |
| "loss": 1.1438, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 12.807634671504061, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 9.457947368421053e-05, | |
| "loss": 1.1981, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 12.83918290085969, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 9.455315789473684e-05, | |
| "loss": 1.2022, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 12.870731130215317, | |
| "grad_norm": 1.2734375, | |
| "learning_rate": 9.452684210526316e-05, | |
| "loss": 1.1456, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 12.902279359570944, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 9.450052631578947e-05, | |
| "loss": 1.3042, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 12.933827588926572, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 9.44742105263158e-05, | |
| "loss": 1.1636, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 12.9653758182822, | |
| "grad_norm": 1.140625, | |
| "learning_rate": 9.444789473684211e-05, | |
| "loss": 1.1927, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 12.996924047637826, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 9.442157894736842e-05, | |
| "loss": 1.19, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 13.028393406420065, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 9.439526315789475e-05, | |
| "loss": 1.1314, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 13.059941635775692, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 9.436894736842106e-05, | |
| "loss": 1.1814, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 13.091489865131319, | |
| "grad_norm": 2.484375, | |
| "learning_rate": 9.434263157894737e-05, | |
| "loss": 1.2089, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 13.123038094486947, | |
| "grad_norm": 10.9375, | |
| "learning_rate": 9.431631578947368e-05, | |
| "loss": 1.1614, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 13.154586323842574, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 9.429000000000001e-05, | |
| "loss": 1.2293, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 13.186134553198201, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 9.426368421052632e-05, | |
| "loss": 1.1544, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 13.21768278255383, | |
| "grad_norm": 0.95703125, | |
| "learning_rate": 9.423736842105264e-05, | |
| "loss": 1.2051, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 13.249231011909457, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 9.421105263157896e-05, | |
| "loss": 1.1744, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 13.249231011909457, | |
| "eval_loss": 1.5931901931762695, | |
| "eval_runtime": 266.8272, | |
| "eval_samples_per_second": 95.032, | |
| "eval_steps_per_second": 5.94, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 13.280779241265083, | |
| "grad_norm": 1.2890625, | |
| "learning_rate": 9.418473684210527e-05, | |
| "loss": 1.2007, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 13.312327470620712, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 9.415842105263158e-05, | |
| "loss": 1.1093, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 13.343875699976339, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 9.41321052631579e-05, | |
| "loss": 1.2242, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 13.375423929331966, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 9.410578947368422e-05, | |
| "loss": 1.2096, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 13.406972158687594, | |
| "grad_norm": 3.234375, | |
| "learning_rate": 9.407947368421053e-05, | |
| "loss": 1.1943, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 13.438520388043221, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 9.405315789473684e-05, | |
| "loss": 1.1824, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 13.470068617398848, | |
| "grad_norm": 0.95703125, | |
| "learning_rate": 9.402684210526316e-05, | |
| "loss": 1.2037, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 13.501616846754477, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 9.400052631578949e-05, | |
| "loss": 1.1881, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 13.533165076110103, | |
| "grad_norm": 1.625, | |
| "learning_rate": 9.39742105263158e-05, | |
| "loss": 1.0935, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 13.56471330546573, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 9.394789473684211e-05, | |
| "loss": 1.1977, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 13.596261534821359, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 9.392157894736842e-05, | |
| "loss": 1.2146, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 13.627809764176986, | |
| "grad_norm": 1.625, | |
| "learning_rate": 9.389526315789475e-05, | |
| "loss": 1.1826, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 13.659357993532613, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 9.386894736842106e-05, | |
| "loss": 1.1387, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 13.690906222888241, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 9.384263157894737e-05, | |
| "loss": 1.1169, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 13.722454452243868, | |
| "grad_norm": 1.375, | |
| "learning_rate": 9.381631578947368e-05, | |
| "loss": 1.2465, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 13.754002681599495, | |
| "grad_norm": 1.3203125, | |
| "learning_rate": 9.379e-05, | |
| "loss": 1.2198, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 13.785550910955124, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 9.376368421052632e-05, | |
| "loss": 1.1828, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 13.81709914031075, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 9.373736842105265e-05, | |
| "loss": 1.1286, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 13.848647369666377, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 9.371105263157896e-05, | |
| "loss": 1.1864, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 13.880195599022004, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 9.368473684210527e-05, | |
| "loss": 1.1334, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 13.880195599022004, | |
| "eval_loss": 1.5843102931976318, | |
| "eval_runtime": 267.102, | |
| "eval_samples_per_second": 94.934, | |
| "eval_steps_per_second": 5.934, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 13.911743828377633, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 9.365842105263158e-05, | |
| "loss": 1.1686, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 13.94329205773326, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 9.36321052631579e-05, | |
| "loss": 1.2305, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 13.974840287088886, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 9.360578947368422e-05, | |
| "loss": 1.2467, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 14.006309645871125, | |
| "grad_norm": 0.8671875, | |
| "learning_rate": 9.357947368421053e-05, | |
| "loss": 1.1825, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 14.037857875226752, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 9.355315789473684e-05, | |
| "loss": 1.2244, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 14.06940610458238, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 9.352684210526315e-05, | |
| "loss": 1.2818, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 14.100954333938008, | |
| "grad_norm": 1.4296875, | |
| "learning_rate": 9.350052631578948e-05, | |
| "loss": 1.2653, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 14.132502563293635, | |
| "grad_norm": 1.734375, | |
| "learning_rate": 9.34742105263158e-05, | |
| "loss": 1.1813, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 14.164050792649263, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 9.344789473684211e-05, | |
| "loss": 1.1284, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 14.19559902200489, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 9.342157894736842e-05, | |
| "loss": 1.1927, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 14.227147251360517, | |
| "grad_norm": 0.98046875, | |
| "learning_rate": 9.339526315789474e-05, | |
| "loss": 1.2206, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 14.258695480716145, | |
| "grad_norm": 3.734375, | |
| "learning_rate": 9.336894736842106e-05, | |
| "loss": 1.2683, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 14.290243710071772, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 9.334263157894737e-05, | |
| "loss": 1.1436, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 14.3217919394274, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 9.331631578947368e-05, | |
| "loss": 1.1272, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 14.353340168783028, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 9.329e-05, | |
| "loss": 1.1771, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 14.384888398138655, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 9.326368421052632e-05, | |
| "loss": 1.2135, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 14.416436627494281, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 9.323736842105265e-05, | |
| "loss": 1.1247, | |
| "step": 22850 | |
| }, | |
| { | |
| "epoch": 14.44798485684991, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 9.321105263157896e-05, | |
| "loss": 1.0685, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 14.479533086205537, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 9.318473684210527e-05, | |
| "loss": 1.1276, | |
| "step": 22950 | |
| }, | |
| { | |
| "epoch": 14.511081315561164, | |
| "grad_norm": 13.0625, | |
| "learning_rate": 9.315842105263158e-05, | |
| "loss": 1.1941, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 14.511081315561164, | |
| "eval_loss": 1.5769537687301636, | |
| "eval_runtime": 266.8284, | |
| "eval_samples_per_second": 95.031, | |
| "eval_steps_per_second": 5.94, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 14.542629544916792, | |
| "grad_norm": 1.34375, | |
| "learning_rate": 9.313210526315789e-05, | |
| "loss": 1.1838, | |
| "step": 23050 | |
| }, | |
| { | |
| "epoch": 14.57417777427242, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 9.310578947368422e-05, | |
| "loss": 1.0749, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 14.605726003628046, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 9.307947368421053e-05, | |
| "loss": 1.1389, | |
| "step": 23150 | |
| }, | |
| { | |
| "epoch": 14.637274232983675, | |
| "grad_norm": 1.3203125, | |
| "learning_rate": 9.305315789473684e-05, | |
| "loss": 1.1979, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 14.668822462339302, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 9.302684210526317e-05, | |
| "loss": 1.1649, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 14.700370691694928, | |
| "grad_norm": 0.8671875, | |
| "learning_rate": 9.300052631578948e-05, | |
| "loss": 1.1631, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 14.731918921050555, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 9.29742105263158e-05, | |
| "loss": 1.2639, | |
| "step": 23350 | |
| }, | |
| { | |
| "epoch": 14.763467150406184, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 9.294789473684211e-05, | |
| "loss": 1.0786, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 14.79501537976181, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 9.292157894736843e-05, | |
| "loss": 1.1023, | |
| "step": 23450 | |
| }, | |
| { | |
| "epoch": 14.826563609117438, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 9.289526315789474e-05, | |
| "loss": 1.2058, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 14.858111838473066, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 9.286894736842105e-05, | |
| "loss": 1.2545, | |
| "step": 23550 | |
| }, | |
| { | |
| "epoch": 14.889660067828693, | |
| "grad_norm": 1.5, | |
| "learning_rate": 9.284263157894737e-05, | |
| "loss": 1.1352, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 14.92120829718432, | |
| "grad_norm": 1.4453125, | |
| "learning_rate": 9.281631578947369e-05, | |
| "loss": 1.1925, | |
| "step": 23650 | |
| }, | |
| { | |
| "epoch": 14.952756526539948, | |
| "grad_norm": 0.8359375, | |
| "learning_rate": 9.279e-05, | |
| "loss": 1.1737, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 14.984304755895575, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 9.276368421052632e-05, | |
| "loss": 1.2321, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 15.015774114677814, | |
| "grad_norm": 0.80078125, | |
| "learning_rate": 9.273736842105263e-05, | |
| "loss": 1.104, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 15.047322344033441, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 9.271105263157896e-05, | |
| "loss": 1.2197, | |
| "step": 23850 | |
| }, | |
| { | |
| "epoch": 15.078870573389068, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 9.268473684210527e-05, | |
| "loss": 1.1403, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 15.110418802744697, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 9.265842105263158e-05, | |
| "loss": 1.1551, | |
| "step": 23950 | |
| }, | |
| { | |
| "epoch": 15.141967032100323, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 9.26321052631579e-05, | |
| "loss": 1.2111, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 15.141967032100323, | |
| "eval_loss": 1.570060133934021, | |
| "eval_runtime": 266.8513, | |
| "eval_samples_per_second": 95.023, | |
| "eval_steps_per_second": 5.94, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 15.17351526145595, | |
| "grad_norm": 0.98828125, | |
| "learning_rate": 9.26057894736842e-05, | |
| "loss": 1.193, | |
| "step": 24050 | |
| }, | |
| { | |
| "epoch": 15.205063490811579, | |
| "grad_norm": 0.8515625, | |
| "learning_rate": 9.257947368421053e-05, | |
| "loss": 1.1775, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 15.236611720167206, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 9.255315789473684e-05, | |
| "loss": 1.0497, | |
| "step": 24150 | |
| }, | |
| { | |
| "epoch": 15.268159949522833, | |
| "grad_norm": 1.375, | |
| "learning_rate": 9.252684210526317e-05, | |
| "loss": 1.2102, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 15.299708178878461, | |
| "grad_norm": 0.87890625, | |
| "learning_rate": 9.250052631578948e-05, | |
| "loss": 1.2412, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 15.331256408234088, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 9.247421052631579e-05, | |
| "loss": 1.1529, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 15.362804637589715, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 9.244789473684212e-05, | |
| "loss": 1.1953, | |
| "step": 24350 | |
| }, | |
| { | |
| "epoch": 15.394352866945344, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 9.242157894736843e-05, | |
| "loss": 1.1939, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 15.42590109630097, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 9.239526315789474e-05, | |
| "loss": 1.0984, | |
| "step": 24450 | |
| }, | |
| { | |
| "epoch": 15.457449325656597, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 9.236894736842105e-05, | |
| "loss": 1.1652, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 15.488997555012224, | |
| "grad_norm": 1.34375, | |
| "learning_rate": 9.234263157894738e-05, | |
| "loss": 1.0468, | |
| "step": 24550 | |
| }, | |
| { | |
| "epoch": 15.520545784367853, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 9.231631578947369e-05, | |
| "loss": 1.2469, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 15.55209401372348, | |
| "grad_norm": 1.2734375, | |
| "learning_rate": 9.229000000000001e-05, | |
| "loss": 1.2097, | |
| "step": 24650 | |
| }, | |
| { | |
| "epoch": 15.583642243079106, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 9.226368421052632e-05, | |
| "loss": 1.1814, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 15.615190472434735, | |
| "grad_norm": 1.6796875, | |
| "learning_rate": 9.223736842105264e-05, | |
| "loss": 1.1004, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 15.646738701790362, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 9.221105263157895e-05, | |
| "loss": 1.0581, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 15.678286931145989, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 9.218473684210527e-05, | |
| "loss": 1.0528, | |
| "step": 24850 | |
| }, | |
| { | |
| "epoch": 15.709835160501617, | |
| "grad_norm": 1.453125, | |
| "learning_rate": 9.215842105263158e-05, | |
| "loss": 1.1313, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 15.741383389857244, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 9.21321052631579e-05, | |
| "loss": 1.132, | |
| "step": 24950 | |
| }, | |
| { | |
| "epoch": 15.772931619212871, | |
| "grad_norm": 82.0, | |
| "learning_rate": 9.210578947368421e-05, | |
| "loss": 1.2929, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 15.772931619212871, | |
| "eval_loss": 1.5642035007476807, | |
| "eval_runtime": 267.4044, | |
| "eval_samples_per_second": 94.826, | |
| "eval_steps_per_second": 5.927, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 15.8044798485685, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 9.207947368421053e-05, | |
| "loss": 1.1608, | |
| "step": 25050 | |
| }, | |
| { | |
| "epoch": 15.836028077924126, | |
| "grad_norm": 1.125, | |
| "learning_rate": 9.205315789473684e-05, | |
| "loss": 1.2047, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 15.867576307279753, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 9.202684210526317e-05, | |
| "loss": 1.1987, | |
| "step": 25150 | |
| }, | |
| { | |
| "epoch": 15.899124536635382, | |
| "grad_norm": 0.98046875, | |
| "learning_rate": 9.200052631578948e-05, | |
| "loss": 1.136, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 15.930672765991009, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 9.197421052631579e-05, | |
| "loss": 1.1869, | |
| "step": 25250 | |
| }, | |
| { | |
| "epoch": 15.962220995346636, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 9.194789473684212e-05, | |
| "loss": 1.1783, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 15.993769224702264, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 9.192157894736843e-05, | |
| "loss": 1.1213, | |
| "step": 25350 | |
| }, | |
| { | |
| "epoch": 16.0252385834845, | |
| "grad_norm": 1.375, | |
| "learning_rate": 9.189526315789474e-05, | |
| "loss": 1.1586, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 16.05678681284013, | |
| "grad_norm": 1.6484375, | |
| "learning_rate": 9.186894736842105e-05, | |
| "loss": 1.1799, | |
| "step": 25450 | |
| }, | |
| { | |
| "epoch": 16.088335042195755, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 9.184263157894736e-05, | |
| "loss": 1.1178, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 16.119883271551384, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 9.181631578947369e-05, | |
| "loss": 1.1504, | |
| "step": 25550 | |
| }, | |
| { | |
| "epoch": 16.151431500907012, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 9.179000000000001e-05, | |
| "loss": 1.2006, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 16.182979730262637, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 9.176368421052633e-05, | |
| "loss": 1.1127, | |
| "step": 25650 | |
| }, | |
| { | |
| "epoch": 16.214527959618266, | |
| "grad_norm": 1.3515625, | |
| "learning_rate": 9.173736842105264e-05, | |
| "loss": 1.1468, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 16.246076188973895, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 9.171105263157895e-05, | |
| "loss": 1.1406, | |
| "step": 25750 | |
| }, | |
| { | |
| "epoch": 16.27762441832952, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 9.168473684210527e-05, | |
| "loss": 1.0935, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 16.30917264768515, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 9.165842105263159e-05, | |
| "loss": 1.1796, | |
| "step": 25850 | |
| }, | |
| { | |
| "epoch": 16.340720877040777, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 9.16321052631579e-05, | |
| "loss": 1.1953, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 16.372269106396402, | |
| "grad_norm": 1.3203125, | |
| "learning_rate": 9.160578947368421e-05, | |
| "loss": 1.137, | |
| "step": 25950 | |
| }, | |
| { | |
| "epoch": 16.40381733575203, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 9.157947368421052e-05, | |
| "loss": 1.1988, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 16.40381733575203, | |
| "eval_loss": 1.5581732988357544, | |
| "eval_runtime": 267.3647, | |
| "eval_samples_per_second": 94.841, | |
| "eval_steps_per_second": 5.928, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 16.43536556510766, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 9.155315789473685e-05, | |
| "loss": 1.0596, | |
| "step": 26050 | |
| }, | |
| { | |
| "epoch": 16.466913794463284, | |
| "grad_norm": 1.6171875, | |
| "learning_rate": 9.152684210526317e-05, | |
| "loss": 1.0931, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 16.498462023818913, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 9.150052631578948e-05, | |
| "loss": 1.1419, | |
| "step": 26150 | |
| }, | |
| { | |
| "epoch": 16.53001025317454, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 9.14742105263158e-05, | |
| "loss": 1.1514, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 16.561558482530167, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 9.14478947368421e-05, | |
| "loss": 1.1669, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 16.593106711885795, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 9.142157894736843e-05, | |
| "loss": 1.1621, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 16.624654941241424, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 9.139526315789474e-05, | |
| "loss": 1.263, | |
| "step": 26350 | |
| }, | |
| { | |
| "epoch": 16.65620317059705, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 9.136894736842105e-05, | |
| "loss": 1.1713, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 16.687751399952678, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 9.134263157894737e-05, | |
| "loss": 1.2331, | |
| "step": 26450 | |
| }, | |
| { | |
| "epoch": 16.719299629308306, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 9.131631578947368e-05, | |
| "loss": 1.1634, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 16.75084785866393, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 9.129000000000002e-05, | |
| "loss": 1.0728, | |
| "step": 26550 | |
| }, | |
| { | |
| "epoch": 16.78239608801956, | |
| "grad_norm": 1.4453125, | |
| "learning_rate": 9.126368421052633e-05, | |
| "loss": 1.279, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 16.81394431737519, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 9.123736842105264e-05, | |
| "loss": 1.1402, | |
| "step": 26650 | |
| }, | |
| { | |
| "epoch": 16.845492546730814, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 9.121105263157895e-05, | |
| "loss": 1.1017, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 16.877040776086442, | |
| "grad_norm": 1.4296875, | |
| "learning_rate": 9.118473684210526e-05, | |
| "loss": 1.0707, | |
| "step": 26750 | |
| }, | |
| { | |
| "epoch": 16.90858900544207, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 9.115842105263159e-05, | |
| "loss": 1.2186, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 16.940137234797696, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 9.11321052631579e-05, | |
| "loss": 1.2004, | |
| "step": 26850 | |
| }, | |
| { | |
| "epoch": 16.971685464153325, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 9.110578947368421e-05, | |
| "loss": 1.2102, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 17.003154822935564, | |
| "grad_norm": 1.3203125, | |
| "learning_rate": 9.107947368421052e-05, | |
| "loss": 1.1377, | |
| "step": 26950 | |
| }, | |
| { | |
| "epoch": 17.03470305229119, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 9.105315789473685e-05, | |
| "loss": 1.1954, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 17.03470305229119, | |
| "eval_loss": 1.5533775091171265, | |
| "eval_runtime": 267.3193, | |
| "eval_samples_per_second": 94.857, | |
| "eval_steps_per_second": 5.929, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 17.066251281646817, | |
| "grad_norm": 1.3984375, | |
| "learning_rate": 9.102684210526317e-05, | |
| "loss": 1.1347, | |
| "step": 27050 | |
| }, | |
| { | |
| "epoch": 17.097799511002446, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 9.100052631578948e-05, | |
| "loss": 1.2264, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 17.12934774035807, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 9.09742105263158e-05, | |
| "loss": 1.1454, | |
| "step": 27150 | |
| }, | |
| { | |
| "epoch": 17.1608959697137, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 9.09478947368421e-05, | |
| "loss": 1.136, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 17.192444199069328, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 9.092157894736842e-05, | |
| "loss": 1.1995, | |
| "step": 27250 | |
| }, | |
| { | |
| "epoch": 17.223992428424953, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 9.089526315789474e-05, | |
| "loss": 1.1956, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 17.255540657780582, | |
| "grad_norm": 0.91015625, | |
| "learning_rate": 9.086894736842105e-05, | |
| "loss": 1.1551, | |
| "step": 27350 | |
| }, | |
| { | |
| "epoch": 17.28708888713621, | |
| "grad_norm": 2.765625, | |
| "learning_rate": 9.084263157894737e-05, | |
| "loss": 1.192, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 17.318637116491836, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 9.081631578947369e-05, | |
| "loss": 1.1508, | |
| "step": 27450 | |
| }, | |
| { | |
| "epoch": 17.350185345847464, | |
| "grad_norm": 1.3515625, | |
| "learning_rate": 9.079e-05, | |
| "loss": 1.105, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 17.381733575203093, | |
| "grad_norm": 1.25, | |
| "learning_rate": 9.076368421052633e-05, | |
| "loss": 1.2126, | |
| "step": 27550 | |
| }, | |
| { | |
| "epoch": 17.413281804558718, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 9.073736842105264e-05, | |
| "loss": 1.2555, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 17.444830033914347, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 9.071105263157895e-05, | |
| "loss": 1.1146, | |
| "step": 27650 | |
| }, | |
| { | |
| "epoch": 17.476378263269975, | |
| "grad_norm": 1.25, | |
| "learning_rate": 9.068473684210526e-05, | |
| "loss": 1.2614, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 17.5079264926256, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 9.065842105263157e-05, | |
| "loss": 1.0694, | |
| "step": 27750 | |
| }, | |
| { | |
| "epoch": 17.53947472198123, | |
| "grad_norm": 0.859375, | |
| "learning_rate": 9.06321052631579e-05, | |
| "loss": 1.0467, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 17.571022951336857, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 9.060578947368421e-05, | |
| "loss": 1.1542, | |
| "step": 27850 | |
| }, | |
| { | |
| "epoch": 17.602571180692482, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 9.057947368421052e-05, | |
| "loss": 1.1161, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 17.63411941004811, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 9.055315789473685e-05, | |
| "loss": 1.1102, | |
| "step": 27950 | |
| }, | |
| { | |
| "epoch": 17.66566763940374, | |
| "grad_norm": 1.125, | |
| "learning_rate": 9.052684210526316e-05, | |
| "loss": 1.1637, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 17.66566763940374, | |
| "eval_loss": 1.5486234426498413, | |
| "eval_runtime": 267.4326, | |
| "eval_samples_per_second": 94.816, | |
| "eval_steps_per_second": 5.927, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 17.697215868759365, | |
| "grad_norm": 1.34375, | |
| "learning_rate": 9.050052631578948e-05, | |
| "loss": 1.0892, | |
| "step": 28050 | |
| }, | |
| { | |
| "epoch": 17.728764098114993, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 9.04742105263158e-05, | |
| "loss": 1.1039, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 17.760312327470622, | |
| "grad_norm": 1.2890625, | |
| "learning_rate": 9.044789473684211e-05, | |
| "loss": 1.0568, | |
| "step": 28150 | |
| }, | |
| { | |
| "epoch": 17.791860556826247, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 9.042157894736842e-05, | |
| "loss": 1.2409, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 17.823408786181876, | |
| "grad_norm": 1.140625, | |
| "learning_rate": 9.039526315789474e-05, | |
| "loss": 1.1879, | |
| "step": 28250 | |
| }, | |
| { | |
| "epoch": 17.854957015537504, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 9.036894736842106e-05, | |
| "loss": 1.2804, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 17.88650524489313, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 9.034263157894737e-05, | |
| "loss": 1.1923, | |
| "step": 28350 | |
| }, | |
| { | |
| "epoch": 17.918053474248758, | |
| "grad_norm": 1.4296875, | |
| "learning_rate": 9.031631578947369e-05, | |
| "loss": 1.0509, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 17.949601703604387, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 9.029e-05, | |
| "loss": 1.2324, | |
| "step": 28450 | |
| }, | |
| { | |
| "epoch": 17.98114993296001, | |
| "grad_norm": 1.3203125, | |
| "learning_rate": 9.026368421052632e-05, | |
| "loss": 1.1335, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 18.01261929174225, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 9.023736842105264e-05, | |
| "loss": 1.1168, | |
| "step": 28550 | |
| }, | |
| { | |
| "epoch": 18.04416752109788, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 9.021105263157895e-05, | |
| "loss": 1.1836, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 18.075715750453504, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 9.018473684210526e-05, | |
| "loss": 1.2124, | |
| "step": 28650 | |
| }, | |
| { | |
| "epoch": 18.107263979809133, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 9.015842105263158e-05, | |
| "loss": 1.041, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 18.13881220916476, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 9.01321052631579e-05, | |
| "loss": 1.1059, | |
| "step": 28750 | |
| }, | |
| { | |
| "epoch": 18.170360438520387, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 9.010578947368421e-05, | |
| "loss": 1.1117, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 18.201908667876015, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 9.007947368421054e-05, | |
| "loss": 1.1449, | |
| "step": 28850 | |
| }, | |
| { | |
| "epoch": 18.233456897231644, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 9.005315789473685e-05, | |
| "loss": 1.0887, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 18.26500512658727, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 9.002684210526316e-05, | |
| "loss": 1.1533, | |
| "step": 28950 | |
| }, | |
| { | |
| "epoch": 18.296553355942898, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 9.000052631578949e-05, | |
| "loss": 1.114, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 18.296553355942898, | |
| "eval_loss": 1.5437445640563965, | |
| "eval_runtime": 266.5209, | |
| "eval_samples_per_second": 95.141, | |
| "eval_steps_per_second": 5.947, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 18.328101585298526, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 8.99742105263158e-05, | |
| "loss": 1.1433, | |
| "step": 29050 | |
| }, | |
| { | |
| "epoch": 18.35964981465415, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 8.994789473684211e-05, | |
| "loss": 1.2408, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 18.39119804400978, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 8.992157894736842e-05, | |
| "loss": 1.2331, | |
| "step": 29150 | |
| }, | |
| { | |
| "epoch": 18.42274627336541, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 8.989526315789473e-05, | |
| "loss": 1.1272, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 18.454294502721034, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 8.986894736842106e-05, | |
| "loss": 1.1593, | |
| "step": 29250 | |
| }, | |
| { | |
| "epoch": 18.485842732076662, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 8.984263157894738e-05, | |
| "loss": 1.1064, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 18.51739096143229, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 8.98163157894737e-05, | |
| "loss": 1.1021, | |
| "step": 29350 | |
| }, | |
| { | |
| "epoch": 18.548939190787916, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 8.979e-05, | |
| "loss": 1.1155, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 18.580487420143545, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 8.976368421052632e-05, | |
| "loss": 1.0826, | |
| "step": 29450 | |
| }, | |
| { | |
| "epoch": 18.612035649499173, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 8.973736842105264e-05, | |
| "loss": 1.1426, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 18.6435838788548, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 8.971105263157895e-05, | |
| "loss": 1.2192, | |
| "step": 29550 | |
| }, | |
| { | |
| "epoch": 18.675132108210427, | |
| "grad_norm": 3.21875, | |
| "learning_rate": 8.968473684210527e-05, | |
| "loss": 1.2204, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 18.706680337566056, | |
| "grad_norm": 1.515625, | |
| "learning_rate": 8.965842105263158e-05, | |
| "loss": 1.2113, | |
| "step": 29650 | |
| }, | |
| { | |
| "epoch": 18.73822856692168, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 8.963210526315789e-05, | |
| "loss": 1.0822, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 18.76977679627731, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 8.960578947368421e-05, | |
| "loss": 1.1272, | |
| "step": 29750 | |
| }, | |
| { | |
| "epoch": 18.801325025632938, | |
| "grad_norm": 1.3203125, | |
| "learning_rate": 8.957947368421054e-05, | |
| "loss": 1.0672, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 18.832873254988563, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 8.955315789473685e-05, | |
| "loss": 1.1568, | |
| "step": 29850 | |
| }, | |
| { | |
| "epoch": 18.86442148434419, | |
| "grad_norm": 0.8828125, | |
| "learning_rate": 8.952684210526316e-05, | |
| "loss": 1.1823, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 18.89596971369982, | |
| "grad_norm": 1.453125, | |
| "learning_rate": 8.950052631578947e-05, | |
| "loss": 1.2598, | |
| "step": 29950 | |
| }, | |
| { | |
| "epoch": 18.927517943055445, | |
| "grad_norm": 1.9375, | |
| "learning_rate": 8.94742105263158e-05, | |
| "loss": 1.2443, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 18.927517943055445, | |
| "eval_loss": 1.5400972366333008, | |
| "eval_runtime": 266.768, | |
| "eval_samples_per_second": 95.053, | |
| "eval_steps_per_second": 5.941, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 18.959066172411074, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 8.944789473684211e-05, | |
| "loss": 1.2057, | |
| "step": 30050 | |
| }, | |
| { | |
| "epoch": 18.990614401766702, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 8.942157894736842e-05, | |
| "loss": 1.2, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 19.022083760548938, | |
| "grad_norm": 1.125, | |
| "learning_rate": 8.939526315789473e-05, | |
| "loss": 1.231, | |
| "step": 30150 | |
| }, | |
| { | |
| "epoch": 19.053631989904567, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 8.936894736842105e-05, | |
| "loss": 1.133, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 19.085180219260195, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 8.934263157894738e-05, | |
| "loss": 1.2142, | |
| "step": 30250 | |
| }, | |
| { | |
| "epoch": 19.11672844861582, | |
| "grad_norm": 3.65625, | |
| "learning_rate": 8.93163157894737e-05, | |
| "loss": 1.1454, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 19.14827667797145, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 8.929000000000001e-05, | |
| "loss": 1.1329, | |
| "step": 30350 | |
| }, | |
| { | |
| "epoch": 19.179824907327077, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 8.926368421052632e-05, | |
| "loss": 1.1663, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 19.211373136682703, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 8.923736842105263e-05, | |
| "loss": 1.1911, | |
| "step": 30450 | |
| }, | |
| { | |
| "epoch": 19.24292136603833, | |
| "grad_norm": 0.75, | |
| "learning_rate": 8.921105263157896e-05, | |
| "loss": 1.1312, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 19.27446959539396, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 8.918473684210527e-05, | |
| "loss": 1.0483, | |
| "step": 30550 | |
| }, | |
| { | |
| "epoch": 19.306017824749585, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 8.915842105263158e-05, | |
| "loss": 1.2329, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 19.337566054105213, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 8.913210526315789e-05, | |
| "loss": 1.096, | |
| "step": 30650 | |
| }, | |
| { | |
| "epoch": 19.369114283460842, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 8.910578947368422e-05, | |
| "loss": 1.1441, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 19.400662512816467, | |
| "grad_norm": 1.375, | |
| "learning_rate": 8.907947368421054e-05, | |
| "loss": 1.1642, | |
| "step": 30750 | |
| }, | |
| { | |
| "epoch": 19.432210742172096, | |
| "grad_norm": 0.984375, | |
| "learning_rate": 8.905315789473685e-05, | |
| "loss": 1.1549, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 19.463758971527724, | |
| "grad_norm": 3.25, | |
| "learning_rate": 8.902684210526316e-05, | |
| "loss": 1.1997, | |
| "step": 30850 | |
| }, | |
| { | |
| "epoch": 19.49530720088335, | |
| "grad_norm": 1.2578125, | |
| "learning_rate": 8.900052631578948e-05, | |
| "loss": 1.1528, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 19.526855430238978, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 8.897421052631579e-05, | |
| "loss": 1.093, | |
| "step": 30950 | |
| }, | |
| { | |
| "epoch": 19.558403659594607, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 8.894789473684211e-05, | |
| "loss": 1.2028, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 19.558403659594607, | |
| "eval_loss": 1.5360363721847534, | |
| "eval_runtime": 267.2349, | |
| "eval_samples_per_second": 94.887, | |
| "eval_steps_per_second": 5.931, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 19.58995188895023, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 8.892157894736842e-05, | |
| "loss": 1.1147, | |
| "step": 31050 | |
| }, | |
| { | |
| "epoch": 19.62150011830586, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 8.889526315789474e-05, | |
| "loss": 1.1735, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 19.65304834766149, | |
| "grad_norm": 0.9921875, | |
| "learning_rate": 8.886894736842105e-05, | |
| "loss": 1.1778, | |
| "step": 31150 | |
| }, | |
| { | |
| "epoch": 19.684596577017114, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 8.884263157894737e-05, | |
| "loss": 1.1468, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 19.716144806372743, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 8.88163157894737e-05, | |
| "loss": 1.1041, | |
| "step": 31250 | |
| }, | |
| { | |
| "epoch": 19.74769303572837, | |
| "grad_norm": 0.9921875, | |
| "learning_rate": 8.879000000000001e-05, | |
| "loss": 1.144, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 19.779241265083996, | |
| "grad_norm": 0.9296875, | |
| "learning_rate": 8.876368421052632e-05, | |
| "loss": 1.1797, | |
| "step": 31350 | |
| }, | |
| { | |
| "epoch": 19.810789494439625, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 8.873736842105263e-05, | |
| "loss": 1.1098, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 19.842337723795254, | |
| "grad_norm": 9.75, | |
| "learning_rate": 8.871105263157894e-05, | |
| "loss": 1.1241, | |
| "step": 31450 | |
| }, | |
| { | |
| "epoch": 19.87388595315088, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 8.868473684210527e-05, | |
| "loss": 1.0782, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 19.905434182506507, | |
| "grad_norm": 1.828125, | |
| "learning_rate": 8.865842105263158e-05, | |
| "loss": 1.1342, | |
| "step": 31550 | |
| }, | |
| { | |
| "epoch": 19.936982411862136, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 8.863210526315789e-05, | |
| "loss": 1.0846, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 19.96853064121776, | |
| "grad_norm": 1.2890625, | |
| "learning_rate": 8.860578947368422e-05, | |
| "loss": 1.1876, | |
| "step": 31650 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 8.857947368421053e-05, | |
| "loss": 1.2064, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 20.03154822935563, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 8.855315789473685e-05, | |
| "loss": 1.2347, | |
| "step": 31750 | |
| }, | |
| { | |
| "epoch": 20.063096458711254, | |
| "grad_norm": 1.375, | |
| "learning_rate": 8.852684210526317e-05, | |
| "loss": 1.1243, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 20.094644688066882, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 8.850052631578948e-05, | |
| "loss": 1.1414, | |
| "step": 31850 | |
| }, | |
| { | |
| "epoch": 20.12619291742251, | |
| "grad_norm": 1.3203125, | |
| "learning_rate": 8.847421052631579e-05, | |
| "loss": 1.1736, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 20.157741146778136, | |
| "grad_norm": 1.125, | |
| "learning_rate": 8.844789473684211e-05, | |
| "loss": 1.1636, | |
| "step": 31950 | |
| }, | |
| { | |
| "epoch": 20.189289376133765, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 8.842157894736843e-05, | |
| "loss": 1.1606, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 20.189289376133765, | |
| "eval_loss": 1.5331028699874878, | |
| "eval_runtime": 266.9276, | |
| "eval_samples_per_second": 94.996, | |
| "eval_steps_per_second": 5.938, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 20.220837605489393, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 8.839526315789474e-05, | |
| "loss": 1.0825, | |
| "step": 32050 | |
| }, | |
| { | |
| "epoch": 20.25238583484502, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 8.836894736842106e-05, | |
| "loss": 1.1555, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 20.283934064200647, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 8.834263157894737e-05, | |
| "loss": 1.1658, | |
| "step": 32150 | |
| }, | |
| { | |
| "epoch": 20.315482293556276, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 8.831631578947369e-05, | |
| "loss": 1.1245, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 20.3470305229119, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 8.829000000000001e-05, | |
| "loss": 1.2383, | |
| "step": 32250 | |
| }, | |
| { | |
| "epoch": 20.37857875226753, | |
| "grad_norm": 0.90625, | |
| "learning_rate": 8.826368421052632e-05, | |
| "loss": 1.1005, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 20.410126981623158, | |
| "grad_norm": 0.984375, | |
| "learning_rate": 8.823736842105263e-05, | |
| "loss": 1.1315, | |
| "step": 32350 | |
| }, | |
| { | |
| "epoch": 20.441675210978783, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 8.821105263157894e-05, | |
| "loss": 1.1512, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 20.47322344033441, | |
| "grad_norm": 1.25, | |
| "learning_rate": 8.818473684210527e-05, | |
| "loss": 1.1167, | |
| "step": 32450 | |
| }, | |
| { | |
| "epoch": 20.50477166969004, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 8.815842105263158e-05, | |
| "loss": 1.1824, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 20.536319899045665, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 8.81321052631579e-05, | |
| "loss": 1.1381, | |
| "step": 32550 | |
| }, | |
| { | |
| "epoch": 20.567868128401294, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 8.810578947368422e-05, | |
| "loss": 1.1448, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 20.599416357756922, | |
| "grad_norm": 0.72265625, | |
| "learning_rate": 8.807947368421053e-05, | |
| "loss": 1.1, | |
| "step": 32650 | |
| }, | |
| { | |
| "epoch": 20.630964587112548, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 8.805315789473686e-05, | |
| "loss": 1.1527, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 20.662512816468176, | |
| "grad_norm": 1.3984375, | |
| "learning_rate": 8.802684210526317e-05, | |
| "loss": 1.1162, | |
| "step": 32750 | |
| }, | |
| { | |
| "epoch": 20.694061045823805, | |
| "grad_norm": 0.984375, | |
| "learning_rate": 8.800052631578948e-05, | |
| "loss": 1.127, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 20.72560927517943, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 8.797421052631579e-05, | |
| "loss": 1.1265, | |
| "step": 32850 | |
| }, | |
| { | |
| "epoch": 20.75715750453506, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 8.79478947368421e-05, | |
| "loss": 1.1595, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 20.788705733890687, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 8.792157894736843e-05, | |
| "loss": 1.1433, | |
| "step": 32950 | |
| }, | |
| { | |
| "epoch": 20.820253963246312, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 8.789526315789474e-05, | |
| "loss": 1.1601, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 20.820253963246312, | |
| "eval_loss": 1.5301166772842407, | |
| "eval_runtime": 267.0084, | |
| "eval_samples_per_second": 94.967, | |
| "eval_steps_per_second": 5.936, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 20.85180219260194, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 8.786894736842106e-05, | |
| "loss": 1.154, | |
| "step": 33050 | |
| }, | |
| { | |
| "epoch": 20.883350421957566, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 8.784263157894737e-05, | |
| "loss": 1.276, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 20.914898651313194, | |
| "grad_norm": 1.75, | |
| "learning_rate": 8.781631578947369e-05, | |
| "loss": 1.1272, | |
| "step": 33150 | |
| }, | |
| { | |
| "epoch": 20.946446880668823, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 8.779000000000001e-05, | |
| "loss": 1.1104, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 20.977995110024448, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 8.776368421052632e-05, | |
| "loss": 1.0886, | |
| "step": 33250 | |
| }, | |
| { | |
| "epoch": 21.009464468806687, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 8.773736842105263e-05, | |
| "loss": 1.0329, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 21.041012698162316, | |
| "grad_norm": 8.75, | |
| "learning_rate": 8.771105263157895e-05, | |
| "loss": 1.1335, | |
| "step": 33350 | |
| }, | |
| { | |
| "epoch": 21.072560927517944, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 8.768473684210526e-05, | |
| "loss": 1.2062, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 21.10410915687357, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 8.765842105263158e-05, | |
| "loss": 1.1686, | |
| "step": 33450 | |
| }, | |
| { | |
| "epoch": 21.135657386229198, | |
| "grad_norm": 1.625, | |
| "learning_rate": 8.763210526315791e-05, | |
| "loss": 1.1086, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 21.167205615584827, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 8.760578947368422e-05, | |
| "loss": 1.1313, | |
| "step": 33550 | |
| }, | |
| { | |
| "epoch": 21.198753844940452, | |
| "grad_norm": 1.25, | |
| "learning_rate": 8.757947368421053e-05, | |
| "loss": 1.1474, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 21.23030207429608, | |
| "grad_norm": 0.9921875, | |
| "learning_rate": 8.755315789473684e-05, | |
| "loss": 1.2645, | |
| "step": 33650 | |
| }, | |
| { | |
| "epoch": 21.26185030365171, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 8.752684210526317e-05, | |
| "loss": 1.107, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 21.293398533007334, | |
| "grad_norm": 1.8046875, | |
| "learning_rate": 8.750052631578948e-05, | |
| "loss": 1.083, | |
| "step": 33750 | |
| }, | |
| { | |
| "epoch": 21.324946762362963, | |
| "grad_norm": 0.97265625, | |
| "learning_rate": 8.747421052631579e-05, | |
| "loss": 1.1479, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 21.35649499171859, | |
| "grad_norm": 1.453125, | |
| "learning_rate": 8.74478947368421e-05, | |
| "loss": 1.1635, | |
| "step": 33850 | |
| }, | |
| { | |
| "epoch": 21.388043221074216, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 8.742157894736841e-05, | |
| "loss": 1.1679, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 21.419591450429845, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 8.739526315789474e-05, | |
| "loss": 1.093, | |
| "step": 33950 | |
| }, | |
| { | |
| "epoch": 21.451139679785474, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 8.736894736842106e-05, | |
| "loss": 1.0564, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 21.451139679785474, | |
| "eval_loss": 1.5272431373596191, | |
| "eval_runtime": 266.9593, | |
| "eval_samples_per_second": 94.985, | |
| "eval_steps_per_second": 5.937, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 21.4826879091411, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 8.734263157894738e-05, | |
| "loss": 1.1702, | |
| "step": 34050 | |
| }, | |
| { | |
| "epoch": 21.514236138496727, | |
| "grad_norm": 1.4453125, | |
| "learning_rate": 8.731631578947369e-05, | |
| "loss": 1.0621, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 21.545784367852356, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 8.729e-05, | |
| "loss": 1.2028, | |
| "step": 34150 | |
| }, | |
| { | |
| "epoch": 21.57733259720798, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 8.726368421052632e-05, | |
| "loss": 1.1803, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 21.60888082656361, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 8.723736842105264e-05, | |
| "loss": 1.1601, | |
| "step": 34250 | |
| }, | |
| { | |
| "epoch": 21.640429055919235, | |
| "grad_norm": 1.140625, | |
| "learning_rate": 8.721105263157895e-05, | |
| "loss": 1.1953, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 21.671977285274863, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 8.718473684210526e-05, | |
| "loss": 1.1671, | |
| "step": 34350 | |
| }, | |
| { | |
| "epoch": 21.703525514630492, | |
| "grad_norm": 1.0078125, | |
| "learning_rate": 8.715842105263158e-05, | |
| "loss": 1.0822, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 21.73507374398612, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 8.713210526315791e-05, | |
| "loss": 1.1072, | |
| "step": 34450 | |
| }, | |
| { | |
| "epoch": 21.766621973341746, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 8.710578947368422e-05, | |
| "loss": 1.1747, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 21.798170202697374, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 8.707947368421053e-05, | |
| "loss": 1.1389, | |
| "step": 34550 | |
| }, | |
| { | |
| "epoch": 21.829718432053, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 8.705315789473684e-05, | |
| "loss": 1.1176, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 21.861266661408628, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 8.702684210526316e-05, | |
| "loss": 1.2013, | |
| "step": 34650 | |
| }, | |
| { | |
| "epoch": 21.892814890764257, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 8.700052631578948e-05, | |
| "loss": 1.1344, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 21.92436312011988, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 8.697421052631579e-05, | |
| "loss": 1.1601, | |
| "step": 34750 | |
| }, | |
| { | |
| "epoch": 21.95591134947551, | |
| "grad_norm": 1.96875, | |
| "learning_rate": 8.69478947368421e-05, | |
| "loss": 1.11, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 21.98745957883114, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 8.692157894736842e-05, | |
| "loss": 1.1303, | |
| "step": 34850 | |
| }, | |
| { | |
| "epoch": 22.018928937613378, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 8.689526315789474e-05, | |
| "loss": 1.1803, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 22.050477166969003, | |
| "grad_norm": 1.4609375, | |
| "learning_rate": 8.686894736842107e-05, | |
| "loss": 1.0775, | |
| "step": 34950 | |
| }, | |
| { | |
| "epoch": 22.08202539632463, | |
| "grad_norm": 1.4375, | |
| "learning_rate": 8.684263157894738e-05, | |
| "loss": 1.1902, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 22.08202539632463, | |
| "eval_loss": 1.524708867073059, | |
| "eval_runtime": 267.2268, | |
| "eval_samples_per_second": 94.889, | |
| "eval_steps_per_second": 5.931, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 22.11357362568026, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 8.681631578947369e-05, | |
| "loss": 1.1983, | |
| "step": 35050 | |
| }, | |
| { | |
| "epoch": 22.145121855035885, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 8.679e-05, | |
| "loss": 1.1345, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 22.176670084391514, | |
| "grad_norm": 1.7265625, | |
| "learning_rate": 8.676368421052633e-05, | |
| "loss": 1.1564, | |
| "step": 35150 | |
| }, | |
| { | |
| "epoch": 22.208218313747143, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 8.673736842105264e-05, | |
| "loss": 1.1506, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 22.239766543102768, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 8.671105263157895e-05, | |
| "loss": 1.2614, | |
| "step": 35250 | |
| }, | |
| { | |
| "epoch": 22.271314772458396, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 8.668473684210526e-05, | |
| "loss": 1.1369, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 22.302863001814025, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 8.665842105263157e-05, | |
| "loss": 1.1186, | |
| "step": 35350 | |
| }, | |
| { | |
| "epoch": 22.33441123116965, | |
| "grad_norm": 1.0546875, | |
| "learning_rate": 8.66321052631579e-05, | |
| "loss": 1.1106, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 22.36595946052528, | |
| "grad_norm": 1.75, | |
| "learning_rate": 8.660578947368422e-05, | |
| "loss": 1.1323, | |
| "step": 35450 | |
| }, | |
| { | |
| "epoch": 22.397507689880907, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 8.657947368421053e-05, | |
| "loss": 1.1145, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 22.429055919236532, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 8.655315789473685e-05, | |
| "loss": 1.2457, | |
| "step": 35550 | |
| }, | |
| { | |
| "epoch": 22.46060414859216, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 8.652684210526316e-05, | |
| "loss": 1.1645, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 22.49215237794779, | |
| "grad_norm": 1.578125, | |
| "learning_rate": 8.650052631578948e-05, | |
| "loss": 1.0613, | |
| "step": 35650 | |
| }, | |
| { | |
| "epoch": 22.523700607303415, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 8.64742105263158e-05, | |
| "loss": 1.1264, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 22.555248836659043, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 8.64478947368421e-05, | |
| "loss": 1.0699, | |
| "step": 35750 | |
| }, | |
| { | |
| "epoch": 22.586797066014668, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 8.642157894736842e-05, | |
| "loss": 1.1516, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 22.618345295370297, | |
| "grad_norm": 0.94921875, | |
| "learning_rate": 8.639526315789474e-05, | |
| "loss": 1.2029, | |
| "step": 35850 | |
| }, | |
| { | |
| "epoch": 22.649893524725925, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 8.636894736842105e-05, | |
| "loss": 1.1092, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 22.68144175408155, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 8.634263157894738e-05, | |
| "loss": 1.1379, | |
| "step": 35950 | |
| }, | |
| { | |
| "epoch": 22.71298998343718, | |
| "grad_norm": 1.671875, | |
| "learning_rate": 8.631631578947369e-05, | |
| "loss": 1.1758, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 22.71298998343718, | |
| "eval_loss": 1.5223606824874878, | |
| "eval_runtime": 267.1866, | |
| "eval_samples_per_second": 94.904, | |
| "eval_steps_per_second": 5.932, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 22.744538212792808, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 8.629e-05, | |
| "loss": 1.1285, | |
| "step": 36050 | |
| }, | |
| { | |
| "epoch": 22.776086442148433, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 8.626368421052631e-05, | |
| "loss": 1.094, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 22.80763467150406, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 8.623736842105264e-05, | |
| "loss": 1.1294, | |
| "step": 36150 | |
| }, | |
| { | |
| "epoch": 22.83918290085969, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 8.621105263157895e-05, | |
| "loss": 1.0608, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 22.870731130215315, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 8.618473684210526e-05, | |
| "loss": 1.1035, | |
| "step": 36250 | |
| }, | |
| { | |
| "epoch": 22.902279359570944, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 8.615842105263159e-05, | |
| "loss": 1.1301, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 22.933827588926572, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 8.61321052631579e-05, | |
| "loss": 1.1169, | |
| "step": 36350 | |
| }, | |
| { | |
| "epoch": 22.965375818282197, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 8.610578947368422e-05, | |
| "loss": 1.1624, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 22.996924047637826, | |
| "grad_norm": 0.8984375, | |
| "learning_rate": 8.607947368421054e-05, | |
| "loss": 1.0599, | |
| "step": 36450 | |
| }, | |
| { | |
| "epoch": 23.028393406420065, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 8.605315789473685e-05, | |
| "loss": 1.0562, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 23.059941635775694, | |
| "grad_norm": 2.015625, | |
| "learning_rate": 8.602684210526316e-05, | |
| "loss": 1.1849, | |
| "step": 36550 | |
| }, | |
| { | |
| "epoch": 23.09148986513132, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 8.600052631578947e-05, | |
| "loss": 1.1411, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 23.123038094486947, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 8.59742105263158e-05, | |
| "loss": 1.1676, | |
| "step": 36650 | |
| }, | |
| { | |
| "epoch": 23.154586323842576, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 8.594789473684211e-05, | |
| "loss": 1.1663, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 23.1861345531982, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 8.592157894736842e-05, | |
| "loss": 1.1226, | |
| "step": 36750 | |
| }, | |
| { | |
| "epoch": 23.21768278255383, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 8.589526315789474e-05, | |
| "loss": 1.1417, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 23.24923101190946, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 8.586894736842106e-05, | |
| "loss": 1.1659, | |
| "step": 36850 | |
| }, | |
| { | |
| "epoch": 23.280779241265083, | |
| "grad_norm": 1.0, | |
| "learning_rate": 8.584263157894738e-05, | |
| "loss": 1.137, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 23.312327470620712, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 8.581631578947369e-05, | |
| "loss": 1.2469, | |
| "step": 36950 | |
| }, | |
| { | |
| "epoch": 23.343875699976337, | |
| "grad_norm": 1.3984375, | |
| "learning_rate": 8.579e-05, | |
| "loss": 1.2458, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 23.343875699976337, | |
| "eval_loss": 1.5205219984054565, | |
| "eval_runtime": 267.0201, | |
| "eval_samples_per_second": 94.963, | |
| "eval_steps_per_second": 5.936, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 23.375423929331966, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 8.576368421052632e-05, | |
| "loss": 1.1369, | |
| "step": 37050 | |
| }, | |
| { | |
| "epoch": 23.406972158687594, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 8.573736842105263e-05, | |
| "loss": 1.1991, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 23.43852038804322, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 8.571105263157895e-05, | |
| "loss": 1.185, | |
| "step": 37150 | |
| }, | |
| { | |
| "epoch": 23.470068617398848, | |
| "grad_norm": 0.87109375, | |
| "learning_rate": 8.568473684210526e-05, | |
| "loss": 1.1244, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 23.501616846754477, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 8.565842105263159e-05, | |
| "loss": 1.1446, | |
| "step": 37250 | |
| }, | |
| { | |
| "epoch": 23.5331650761101, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 8.56321052631579e-05, | |
| "loss": 1.0467, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 23.56471330546573, | |
| "grad_norm": 0.91015625, | |
| "learning_rate": 8.560578947368421e-05, | |
| "loss": 1.0834, | |
| "step": 37350 | |
| }, | |
| { | |
| "epoch": 23.59626153482136, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 8.557947368421054e-05, | |
| "loss": 1.0935, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 23.627809764176984, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 8.555315789473685e-05, | |
| "loss": 1.1686, | |
| "step": 37450 | |
| }, | |
| { | |
| "epoch": 23.659357993532613, | |
| "grad_norm": 1.1875, | |
| "learning_rate": 8.552684210526316e-05, | |
| "loss": 1.0235, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 23.69090622288824, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 8.550052631578947e-05, | |
| "loss": 1.0734, | |
| "step": 37550 | |
| }, | |
| { | |
| "epoch": 23.722454452243866, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 8.547421052631578e-05, | |
| "loss": 1.1121, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 23.754002681599495, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 8.544789473684211e-05, | |
| "loss": 1.1587, | |
| "step": 37650 | |
| }, | |
| { | |
| "epoch": 23.785550910955124, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 8.542157894736843e-05, | |
| "loss": 1.0876, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 23.81709914031075, | |
| "grad_norm": 1.2734375, | |
| "learning_rate": 8.539526315789475e-05, | |
| "loss": 1.1147, | |
| "step": 37750 | |
| }, | |
| { | |
| "epoch": 23.848647369666377, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 8.536894736842106e-05, | |
| "loss": 1.1459, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 23.880195599022006, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 8.534263157894737e-05, | |
| "loss": 1.0868, | |
| "step": 37850 | |
| }, | |
| { | |
| "epoch": 23.91174382837763, | |
| "grad_norm": 1.3203125, | |
| "learning_rate": 8.53163157894737e-05, | |
| "loss": 1.1512, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 23.94329205773326, | |
| "grad_norm": 1.2890625, | |
| "learning_rate": 8.529e-05, | |
| "loss": 1.0586, | |
| "step": 37950 | |
| }, | |
| { | |
| "epoch": 23.974840287088888, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 8.526368421052632e-05, | |
| "loss": 1.1604, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 23.974840287088888, | |
| "eval_loss": 1.5178626775741577, | |
| "eval_runtime": 267.4107, | |
| "eval_samples_per_second": 94.824, | |
| "eval_steps_per_second": 5.927, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 24.006309645871127, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 8.523736842105263e-05, | |
| "loss": 1.1604, | |
| "step": 38050 | |
| }, | |
| { | |
| "epoch": 24.037857875226752, | |
| "grad_norm": 1.4609375, | |
| "learning_rate": 8.521105263157895e-05, | |
| "loss": 1.1294, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 24.06940610458238, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 8.518473684210528e-05, | |
| "loss": 1.2385, | |
| "step": 38150 | |
| }, | |
| { | |
| "epoch": 24.10095433393801, | |
| "grad_norm": 1.3203125, | |
| "learning_rate": 8.515842105263159e-05, | |
| "loss": 1.2034, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 24.132502563293635, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 8.51321052631579e-05, | |
| "loss": 1.1935, | |
| "step": 38250 | |
| }, | |
| { | |
| "epoch": 24.164050792649263, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 8.510578947368421e-05, | |
| "loss": 1.1127, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 24.19559902200489, | |
| "grad_norm": 1.2890625, | |
| "learning_rate": 8.507947368421052e-05, | |
| "loss": 1.0757, | |
| "step": 38350 | |
| }, | |
| { | |
| "epoch": 24.227147251360517, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 8.505315789473685e-05, | |
| "loss": 1.1407, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 24.258695480716145, | |
| "grad_norm": 2.359375, | |
| "learning_rate": 8.502684210526316e-05, | |
| "loss": 1.1599, | |
| "step": 38450 | |
| }, | |
| { | |
| "epoch": 24.29024371007177, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 8.500052631578947e-05, | |
| "loss": 1.0929, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 24.3217919394274, | |
| "grad_norm": 1.25, | |
| "learning_rate": 8.497421052631578e-05, | |
| "loss": 1.0718, | |
| "step": 38550 | |
| }, | |
| { | |
| "epoch": 24.353340168783028, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 8.494789473684211e-05, | |
| "loss": 1.0182, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 24.384888398138653, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 8.492157894736843e-05, | |
| "loss": 1.1052, | |
| "step": 38650 | |
| }, | |
| { | |
| "epoch": 24.41643662749428, | |
| "grad_norm": 3.0, | |
| "learning_rate": 8.489526315789475e-05, | |
| "loss": 1.059, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 24.44798485684991, | |
| "grad_norm": 1.2734375, | |
| "learning_rate": 8.486894736842106e-05, | |
| "loss": 1.1979, | |
| "step": 38750 | |
| }, | |
| { | |
| "epoch": 24.479533086205535, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 8.484263157894737e-05, | |
| "loss": 1.0467, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 24.511081315561164, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 8.48163157894737e-05, | |
| "loss": 1.2318, | |
| "step": 38850 | |
| }, | |
| { | |
| "epoch": 24.542629544916792, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 8.479e-05, | |
| "loss": 1.1296, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 24.574177774272417, | |
| "grad_norm": 0.828125, | |
| "learning_rate": 8.476368421052632e-05, | |
| "loss": 1.1352, | |
| "step": 38950 | |
| }, | |
| { | |
| "epoch": 24.605726003628046, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 8.473736842105263e-05, | |
| "loss": 1.1311, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 24.605726003628046, | |
| "eval_loss": 1.5162802934646606, | |
| "eval_runtime": 266.8244, | |
| "eval_samples_per_second": 95.033, | |
| "eval_steps_per_second": 5.94, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 24.637274232983675, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 8.471105263157894e-05, | |
| "loss": 1.1194, | |
| "step": 39050 | |
| }, | |
| { | |
| "epoch": 24.6688224623393, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 8.468473684210527e-05, | |
| "loss": 1.1584, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 24.70037069169493, | |
| "grad_norm": 1.546875, | |
| "learning_rate": 8.465842105263159e-05, | |
| "loss": 1.1765, | |
| "step": 39150 | |
| }, | |
| { | |
| "epoch": 24.731918921050557, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 8.46321052631579e-05, | |
| "loss": 1.1289, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 24.763467150406182, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 8.460578947368421e-05, | |
| "loss": 1.129, | |
| "step": 39250 | |
| }, | |
| { | |
| "epoch": 24.79501537976181, | |
| "grad_norm": 1.4609375, | |
| "learning_rate": 8.457947368421053e-05, | |
| "loss": 1.1591, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 24.82656360911744, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 8.455315789473685e-05, | |
| "loss": 1.2381, | |
| "step": 39350 | |
| }, | |
| { | |
| "epoch": 24.858111838473064, | |
| "grad_norm": 0.94140625, | |
| "learning_rate": 8.452684210526316e-05, | |
| "loss": 1.1744, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 24.889660067828693, | |
| "grad_norm": 2.046875, | |
| "learning_rate": 8.450052631578947e-05, | |
| "loss": 1.1799, | |
| "step": 39450 | |
| }, | |
| { | |
| "epoch": 24.92120829718432, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 8.447421052631579e-05, | |
| "loss": 1.0675, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 24.952756526539947, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 8.444789473684211e-05, | |
| "loss": 1.1453, | |
| "step": 39550 | |
| }, | |
| { | |
| "epoch": 24.984304755895575, | |
| "grad_norm": 1.765625, | |
| "learning_rate": 8.442157894736842e-05, | |
| "loss": 1.086, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 25.015774114677814, | |
| "grad_norm": 2.421875, | |
| "learning_rate": 8.439526315789475e-05, | |
| "loss": 1.1468, | |
| "step": 39650 | |
| }, | |
| { | |
| "epoch": 25.04732234403344, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 8.436894736842106e-05, | |
| "loss": 1.0776, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 25.078870573389068, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 8.434263157894737e-05, | |
| "loss": 0.9669, | |
| "step": 39750 | |
| }, | |
| { | |
| "epoch": 25.110418802744697, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 8.431631578947368e-05, | |
| "loss": 1.1222, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 25.14196703210032, | |
| "grad_norm": 1.65625, | |
| "learning_rate": 8.429000000000001e-05, | |
| "loss": 1.1037, | |
| "step": 39850 | |
| }, | |
| { | |
| "epoch": 25.17351526145595, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 8.426368421052632e-05, | |
| "loss": 1.1482, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 25.20506349081158, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 8.423736842105263e-05, | |
| "loss": 1.0434, | |
| "step": 39950 | |
| }, | |
| { | |
| "epoch": 25.236611720167204, | |
| "grad_norm": 1.390625, | |
| "learning_rate": 8.421105263157894e-05, | |
| "loss": 1.112, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 25.236611720167204, | |
| "eval_loss": 1.5145944356918335, | |
| "eval_runtime": 267.137, | |
| "eval_samples_per_second": 94.921, | |
| "eval_steps_per_second": 5.933, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 25.268159949522833, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 8.418473684210527e-05, | |
| "loss": 1.0981, | |
| "step": 40050 | |
| }, | |
| { | |
| "epoch": 25.29970817887846, | |
| "grad_norm": 1.7890625, | |
| "learning_rate": 8.415842105263159e-05, | |
| "loss": 1.156, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 25.331256408234086, | |
| "grad_norm": 0.8828125, | |
| "learning_rate": 8.41321052631579e-05, | |
| "loss": 1.2156, | |
| "step": 40150 | |
| }, | |
| { | |
| "epoch": 25.362804637589715, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 8.410578947368422e-05, | |
| "loss": 1.1541, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 25.394352866945344, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 8.407947368421053e-05, | |
| "loss": 1.0822, | |
| "step": 40250 | |
| }, | |
| { | |
| "epoch": 25.42590109630097, | |
| "grad_norm": 1.171875, | |
| "learning_rate": 8.405315789473684e-05, | |
| "loss": 1.1099, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 25.457449325656597, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 8.402684210526316e-05, | |
| "loss": 1.1289, | |
| "step": 40350 | |
| }, | |
| { | |
| "epoch": 25.488997555012226, | |
| "grad_norm": 1.703125, | |
| "learning_rate": 8.400052631578948e-05, | |
| "loss": 1.1307, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 25.52054578436785, | |
| "grad_norm": 1.78125, | |
| "learning_rate": 8.397421052631579e-05, | |
| "loss": 1.1502, | |
| "step": 40450 | |
| }, | |
| { | |
| "epoch": 25.55209401372348, | |
| "grad_norm": 1.3984375, | |
| "learning_rate": 8.394789473684211e-05, | |
| "loss": 1.1708, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 25.583642243079108, | |
| "grad_norm": 1.140625, | |
| "learning_rate": 8.392157894736842e-05, | |
| "loss": 1.1826, | |
| "step": 40550 | |
| }, | |
| { | |
| "epoch": 25.615190472434733, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 8.389526315789475e-05, | |
| "loss": 1.1273, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 25.646738701790362, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 8.386894736842106e-05, | |
| "loss": 1.1207, | |
| "step": 40650 | |
| }, | |
| { | |
| "epoch": 25.67828693114599, | |
| "grad_norm": 0.9453125, | |
| "learning_rate": 8.384263157894737e-05, | |
| "loss": 1.0559, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 25.709835160501616, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 8.381631578947368e-05, | |
| "loss": 1.097, | |
| "step": 40750 | |
| }, | |
| { | |
| "epoch": 25.741383389857244, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 8.379e-05, | |
| "loss": 1.1353, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 25.772931619212873, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 8.376368421052632e-05, | |
| "loss": 1.2467, | |
| "step": 40850 | |
| }, | |
| { | |
| "epoch": 25.804479848568498, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 8.373736842105263e-05, | |
| "loss": 1.2264, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 25.836028077924126, | |
| "grad_norm": 1.5859375, | |
| "learning_rate": 8.371105263157896e-05, | |
| "loss": 1.1003, | |
| "step": 40950 | |
| }, | |
| { | |
| "epoch": 25.867576307279755, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 8.368473684210527e-05, | |
| "loss": 1.0826, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 25.867576307279755, | |
| "eval_loss": 1.512583613395691, | |
| "eval_runtime": 267.7473, | |
| "eval_samples_per_second": 94.705, | |
| "eval_steps_per_second": 5.92, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 25.89912453663538, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 8.365842105263158e-05, | |
| "loss": 1.2477, | |
| "step": 41050 | |
| }, | |
| { | |
| "epoch": 25.93067276599101, | |
| "grad_norm": 0.93359375, | |
| "learning_rate": 8.36321052631579e-05, | |
| "loss": 1.16, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 25.962220995346637, | |
| "grad_norm": 6.53125, | |
| "learning_rate": 8.360578947368422e-05, | |
| "loss": 1.1243, | |
| "step": 41150 | |
| }, | |
| { | |
| "epoch": 25.993769224702262, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 8.357947368421053e-05, | |
| "loss": 1.0781, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 26.0252385834845, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 8.355315789473684e-05, | |
| "loss": 1.1761, | |
| "step": 41250 | |
| }, | |
| { | |
| "epoch": 26.05678681284013, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 8.352684210526315e-05, | |
| "loss": 1.1105, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 26.088335042195755, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 8.350052631578948e-05, | |
| "loss": 1.1172, | |
| "step": 41350 | |
| }, | |
| { | |
| "epoch": 26.119883271551384, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 8.347421052631579e-05, | |
| "loss": 1.0579, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 26.151431500907012, | |
| "grad_norm": 1.7578125, | |
| "learning_rate": 8.344789473684211e-05, | |
| "loss": 1.1108, | |
| "step": 41450 | |
| }, | |
| { | |
| "epoch": 26.182979730262637, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 8.342157894736843e-05, | |
| "loss": 1.1698, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 26.214527959618266, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 8.339526315789474e-05, | |
| "loss": 1.1217, | |
| "step": 41550 | |
| }, | |
| { | |
| "epoch": 26.246076188973895, | |
| "grad_norm": 2.109375, | |
| "learning_rate": 8.336894736842106e-05, | |
| "loss": 1.1791, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 26.27762441832952, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 8.334263157894737e-05, | |
| "loss": 1.1931, | |
| "step": 41650 | |
| }, | |
| { | |
| "epoch": 26.30917264768515, | |
| "grad_norm": 1.34375, | |
| "learning_rate": 8.331631578947369e-05, | |
| "loss": 1.1063, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 26.340720877040777, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 8.329e-05, | |
| "loss": 1.1386, | |
| "step": 41750 | |
| }, | |
| { | |
| "epoch": 26.372269106396402, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 8.326368421052632e-05, | |
| "loss": 1.1654, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 26.40381733575203, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 8.323736842105263e-05, | |
| "loss": 1.1421, | |
| "step": 41850 | |
| }, | |
| { | |
| "epoch": 26.43536556510766, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 8.321105263157896e-05, | |
| "loss": 1.1093, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 26.466913794463284, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 8.318473684210527e-05, | |
| "loss": 1.1201, | |
| "step": 41950 | |
| }, | |
| { | |
| "epoch": 26.498462023818913, | |
| "grad_norm": 1.5, | |
| "learning_rate": 8.315842105263158e-05, | |
| "loss": 1.24, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 26.498462023818913, | |
| "eval_loss": 1.5111093521118164, | |
| "eval_runtime": 267.2484, | |
| "eval_samples_per_second": 94.882, | |
| "eval_steps_per_second": 5.931, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 26.53001025317454, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 8.31321052631579e-05, | |
| "loss": 1.1231, | |
| "step": 42050 | |
| }, | |
| { | |
| "epoch": 26.561558482530167, | |
| "grad_norm": 1.2890625, | |
| "learning_rate": 8.310578947368422e-05, | |
| "loss": 1.1488, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 26.593106711885795, | |
| "grad_norm": 1.71875, | |
| "learning_rate": 8.307947368421053e-05, | |
| "loss": 1.1427, | |
| "step": 42150 | |
| }, | |
| { | |
| "epoch": 26.624654941241424, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 8.305315789473684e-05, | |
| "loss": 1.1537, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 26.65620317059705, | |
| "grad_norm": 0.953125, | |
| "learning_rate": 8.302684210526315e-05, | |
| "loss": 1.116, | |
| "step": 42250 | |
| }, | |
| { | |
| "epoch": 26.687751399952678, | |
| "grad_norm": 0.99609375, | |
| "learning_rate": 8.300052631578948e-05, | |
| "loss": 1.0836, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 26.719299629308306, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 8.29742105263158e-05, | |
| "loss": 1.1648, | |
| "step": 42350 | |
| }, | |
| { | |
| "epoch": 26.75084785866393, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 8.294789473684212e-05, | |
| "loss": 1.0502, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 26.78239608801956, | |
| "grad_norm": 2.34375, | |
| "learning_rate": 8.292157894736843e-05, | |
| "loss": 1.2082, | |
| "step": 42450 | |
| }, | |
| { | |
| "epoch": 26.81394431737519, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 8.289526315789474e-05, | |
| "loss": 1.1388, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 26.845492546730814, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 8.286894736842106e-05, | |
| "loss": 1.1777, | |
| "step": 42550 | |
| }, | |
| { | |
| "epoch": 26.877040776086442, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 8.284263157894738e-05, | |
| "loss": 1.1671, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 26.90858900544207, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 8.281631578947369e-05, | |
| "loss": 1.1348, | |
| "step": 42650 | |
| }, | |
| { | |
| "epoch": 26.940137234797696, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 8.279e-05, | |
| "loss": 1.0797, | |
| "step": 42700 | |
| }, | |
| { | |
| "epoch": 26.971685464153325, | |
| "grad_norm": 1.5, | |
| "learning_rate": 8.276368421052631e-05, | |
| "loss": 1.0572, | |
| "step": 42750 | |
| }, | |
| { | |
| "epoch": 27.003154822935564, | |
| "grad_norm": 1.3359375, | |
| "learning_rate": 8.273736842105264e-05, | |
| "loss": 1.1384, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 27.03470305229119, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 8.271105263157896e-05, | |
| "loss": 1.0789, | |
| "step": 42850 | |
| }, | |
| { | |
| "epoch": 27.066251281646817, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 8.268473684210527e-05, | |
| "loss": 1.0927, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 27.097799511002446, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 8.265842105263158e-05, | |
| "loss": 1.0928, | |
| "step": 42950 | |
| }, | |
| { | |
| "epoch": 27.12934774035807, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 8.26321052631579e-05, | |
| "loss": 1.1963, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 27.12934774035807, | |
| "eval_loss": 1.5096408128738403, | |
| "eval_runtime": 268.4062, | |
| "eval_samples_per_second": 94.472, | |
| "eval_steps_per_second": 5.905, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 27.1608959697137, | |
| "grad_norm": 1.2734375, | |
| "learning_rate": 8.260578947368422e-05, | |
| "loss": 1.0704, | |
| "step": 43050 | |
| }, | |
| { | |
| "epoch": 27.192444199069328, | |
| "grad_norm": 1.2734375, | |
| "learning_rate": 8.257947368421053e-05, | |
| "loss": 1.208, | |
| "step": 43100 | |
| }, | |
| { | |
| "epoch": 27.223992428424953, | |
| "grad_norm": 1.4921875, | |
| "learning_rate": 8.255315789473684e-05, | |
| "loss": 1.0895, | |
| "step": 43150 | |
| }, | |
| { | |
| "epoch": 27.255540657780582, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 8.252684210526315e-05, | |
| "loss": 1.1455, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 27.28708888713621, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 8.250052631578947e-05, | |
| "loss": 1.2282, | |
| "step": 43250 | |
| }, | |
| { | |
| "epoch": 27.318637116491836, | |
| "grad_norm": 1.6953125, | |
| "learning_rate": 8.247421052631579e-05, | |
| "loss": 1.0985, | |
| "step": 43300 | |
| }, | |
| { | |
| "epoch": 27.350185345847464, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 8.244789473684212e-05, | |
| "loss": 1.1631, | |
| "step": 43350 | |
| }, | |
| { | |
| "epoch": 27.381733575203093, | |
| "grad_norm": 1.4453125, | |
| "learning_rate": 8.242157894736843e-05, | |
| "loss": 1.105, | |
| "step": 43400 | |
| }, | |
| { | |
| "epoch": 27.413281804558718, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 8.239526315789474e-05, | |
| "loss": 1.1341, | |
| "step": 43450 | |
| }, | |
| { | |
| "epoch": 27.444830033914347, | |
| "grad_norm": 0.9921875, | |
| "learning_rate": 8.236894736842105e-05, | |
| "loss": 1.0494, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 27.476378263269975, | |
| "grad_norm": 0.7890625, | |
| "learning_rate": 8.234263157894738e-05, | |
| "loss": 1.1368, | |
| "step": 43550 | |
| }, | |
| { | |
| "epoch": 27.5079264926256, | |
| "grad_norm": 0.91796875, | |
| "learning_rate": 8.231631578947369e-05, | |
| "loss": 1.1349, | |
| "step": 43600 | |
| }, | |
| { | |
| "epoch": 27.53947472198123, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 8.229e-05, | |
| "loss": 1.1768, | |
| "step": 43650 | |
| }, | |
| { | |
| "epoch": 27.571022951336857, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 8.226368421052631e-05, | |
| "loss": 1.0864, | |
| "step": 43700 | |
| }, | |
| { | |
| "epoch": 27.602571180692482, | |
| "grad_norm": 1.0, | |
| "learning_rate": 8.223736842105264e-05, | |
| "loss": 1.0461, | |
| "step": 43750 | |
| }, | |
| { | |
| "epoch": 27.63411941004811, | |
| "grad_norm": 2.640625, | |
| "learning_rate": 8.221105263157896e-05, | |
| "loss": 1.203, | |
| "step": 43800 | |
| }, | |
| { | |
| "epoch": 27.66566763940374, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 8.218473684210527e-05, | |
| "loss": 1.0647, | |
| "step": 43850 | |
| }, | |
| { | |
| "epoch": 27.697215868759365, | |
| "grad_norm": 1.5390625, | |
| "learning_rate": 8.215842105263158e-05, | |
| "loss": 1.0393, | |
| "step": 43900 | |
| }, | |
| { | |
| "epoch": 27.728764098114993, | |
| "grad_norm": 2.453125, | |
| "learning_rate": 8.21321052631579e-05, | |
| "loss": 1.0748, | |
| "step": 43950 | |
| }, | |
| { | |
| "epoch": 27.760312327470622, | |
| "grad_norm": 2.71875, | |
| "learning_rate": 8.210578947368421e-05, | |
| "loss": 1.1257, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 27.760312327470622, | |
| "eval_loss": 1.508254885673523, | |
| "eval_runtime": 267.4817, | |
| "eval_samples_per_second": 94.799, | |
| "eval_steps_per_second": 5.926, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 27.791860556826247, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 8.207947368421053e-05, | |
| "loss": 1.1718, | |
| "step": 44050 | |
| }, | |
| { | |
| "epoch": 27.823408786181876, | |
| "grad_norm": 0.9921875, | |
| "learning_rate": 8.205315789473684e-05, | |
| "loss": 1.1457, | |
| "step": 44100 | |
| }, | |
| { | |
| "epoch": 27.854957015537504, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 8.202684210526316e-05, | |
| "loss": 1.0853, | |
| "step": 44150 | |
| }, | |
| { | |
| "epoch": 27.88650524489313, | |
| "grad_norm": 1.140625, | |
| "learning_rate": 8.200052631578947e-05, | |
| "loss": 1.1679, | |
| "step": 44200 | |
| }, | |
| { | |
| "epoch": 27.918053474248758, | |
| "grad_norm": 1.2734375, | |
| "learning_rate": 8.197421052631579e-05, | |
| "loss": 1.1635, | |
| "step": 44250 | |
| }, | |
| { | |
| "epoch": 27.949601703604387, | |
| "grad_norm": 0.84765625, | |
| "learning_rate": 8.194789473684212e-05, | |
| "loss": 1.1341, | |
| "step": 44300 | |
| }, | |
| { | |
| "epoch": 27.98114993296001, | |
| "grad_norm": 1.0703125, | |
| "learning_rate": 8.192157894736843e-05, | |
| "loss": 1.0777, | |
| "step": 44350 | |
| }, | |
| { | |
| "epoch": 28.01261929174225, | |
| "grad_norm": 1.2421875, | |
| "learning_rate": 8.189526315789474e-05, | |
| "loss": 1.1823, | |
| "step": 44400 | |
| }, | |
| { | |
| "epoch": 28.04416752109788, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 8.186894736842105e-05, | |
| "loss": 1.0569, | |
| "step": 44450 | |
| }, | |
| { | |
| "epoch": 28.075715750453504, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 8.184263157894736e-05, | |
| "loss": 1.0733, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 28.107263979809133, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 8.181631578947369e-05, | |
| "loss": 1.1116, | |
| "step": 44550 | |
| }, | |
| { | |
| "epoch": 28.13881220916476, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 8.179e-05, | |
| "loss": 1.1406, | |
| "step": 44600 | |
| }, | |
| { | |
| "epoch": 28.170360438520387, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 8.176368421052631e-05, | |
| "loss": 1.07, | |
| "step": 44650 | |
| }, | |
| { | |
| "epoch": 28.201908667876015, | |
| "grad_norm": 1.375, | |
| "learning_rate": 8.173736842105264e-05, | |
| "loss": 1.0451, | |
| "step": 44700 | |
| }, | |
| { | |
| "epoch": 28.233456897231644, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 8.171105263157895e-05, | |
| "loss": 1.0857, | |
| "step": 44750 | |
| }, | |
| { | |
| "epoch": 28.26500512658727, | |
| "grad_norm": 1.2734375, | |
| "learning_rate": 8.168473684210527e-05, | |
| "loss": 1.1488, | |
| "step": 44800 | |
| }, | |
| { | |
| "epoch": 28.296553355942898, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 8.165842105263159e-05, | |
| "loss": 1.0355, | |
| "step": 44850 | |
| }, | |
| { | |
| "epoch": 28.328101585298526, | |
| "grad_norm": 1.1484375, | |
| "learning_rate": 8.16321052631579e-05, | |
| "loss": 1.1336, | |
| "step": 44900 | |
| }, | |
| { | |
| "epoch": 28.35964981465415, | |
| "grad_norm": 1.09375, | |
| "learning_rate": 8.160578947368421e-05, | |
| "loss": 1.1152, | |
| "step": 44950 | |
| }, | |
| { | |
| "epoch": 28.39119804400978, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 8.157947368421053e-05, | |
| "loss": 1.1292, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 28.39119804400978, | |
| "eval_loss": 1.5078208446502686, | |
| "eval_runtime": 267.4752, | |
| "eval_samples_per_second": 94.801, | |
| "eval_steps_per_second": 5.926, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 28.42274627336541, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 8.155315789473685e-05, | |
| "loss": 1.2702, | |
| "step": 45050 | |
| }, | |
| { | |
| "epoch": 28.454294502721034, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 8.152684210526316e-05, | |
| "loss": 1.2059, | |
| "step": 45100 | |
| }, | |
| { | |
| "epoch": 28.485842732076662, | |
| "grad_norm": 1.3125, | |
| "learning_rate": 8.150052631578948e-05, | |
| "loss": 1.2433, | |
| "step": 45150 | |
| }, | |
| { | |
| "epoch": 28.51739096143229, | |
| "grad_norm": 1.21875, | |
| "learning_rate": 8.14742105263158e-05, | |
| "loss": 1.1294, | |
| "step": 45200 | |
| }, | |
| { | |
| "epoch": 28.548939190787916, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 8.14478947368421e-05, | |
| "loss": 1.1268, | |
| "step": 45250 | |
| }, | |
| { | |
| "epoch": 28.580487420143545, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 8.142157894736843e-05, | |
| "loss": 1.134, | |
| "step": 45300 | |
| }, | |
| { | |
| "epoch": 28.612035649499173, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 8.139526315789474e-05, | |
| "loss": 1.1095, | |
| "step": 45350 | |
| }, | |
| { | |
| "epoch": 28.6435838788548, | |
| "grad_norm": 0.9375, | |
| "learning_rate": 8.136894736842105e-05, | |
| "loss": 1.1825, | |
| "step": 45400 | |
| }, | |
| { | |
| "epoch": 28.675132108210427, | |
| "grad_norm": 1.15625, | |
| "learning_rate": 8.134263157894737e-05, | |
| "loss": 1.1514, | |
| "step": 45450 | |
| }, | |
| { | |
| "epoch": 28.706680337566056, | |
| "grad_norm": 1.46875, | |
| "learning_rate": 8.131631578947369e-05, | |
| "loss": 1.0768, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 28.73822856692168, | |
| "grad_norm": 0.91796875, | |
| "learning_rate": 8.129e-05, | |
| "loss": 1.1478, | |
| "step": 45550 | |
| }, | |
| { | |
| "epoch": 28.76977679627731, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 8.126368421052633e-05, | |
| "loss": 1.1336, | |
| "step": 45600 | |
| }, | |
| { | |
| "epoch": 28.801325025632938, | |
| "grad_norm": 1.015625, | |
| "learning_rate": 8.123736842105264e-05, | |
| "loss": 1.0757, | |
| "step": 45650 | |
| }, | |
| { | |
| "epoch": 28.832873254988563, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 8.121105263157895e-05, | |
| "loss": 1.1605, | |
| "step": 45700 | |
| }, | |
| { | |
| "epoch": 28.86442148434419, | |
| "grad_norm": 1.4921875, | |
| "learning_rate": 8.118473684210526e-05, | |
| "loss": 1.1447, | |
| "step": 45750 | |
| }, | |
| { | |
| "epoch": 28.89596971369982, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 8.115842105263159e-05, | |
| "loss": 1.0994, | |
| "step": 45800 | |
| }, | |
| { | |
| "epoch": 28.927517943055445, | |
| "grad_norm": 1.53125, | |
| "learning_rate": 8.11321052631579e-05, | |
| "loss": 1.146, | |
| "step": 45850 | |
| }, | |
| { | |
| "epoch": 28.959066172411074, | |
| "grad_norm": 1.1171875, | |
| "learning_rate": 8.110578947368421e-05, | |
| "loss": 1.1763, | |
| "step": 45900 | |
| }, | |
| { | |
| "epoch": 28.990614401766702, | |
| "grad_norm": 1.25, | |
| "learning_rate": 8.107947368421052e-05, | |
| "loss": 1.1102, | |
| "step": 45950 | |
| }, | |
| { | |
| "epoch": 29.022083760548938, | |
| "grad_norm": 1.5, | |
| "learning_rate": 8.105315789473685e-05, | |
| "loss": 1.0882, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 29.022083760548938, | |
| "eval_loss": 1.5057809352874756, | |
| "eval_runtime": 267.5234, | |
| "eval_samples_per_second": 94.784, | |
| "eval_steps_per_second": 5.925, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 29.053631989904567, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 8.102684210526316e-05, | |
| "loss": 1.0811, | |
| "step": 46050 | |
| }, | |
| { | |
| "epoch": 29.085180219260195, | |
| "grad_norm": 1.2734375, | |
| "learning_rate": 8.100052631578948e-05, | |
| "loss": 1.1022, | |
| "step": 46100 | |
| }, | |
| { | |
| "epoch": 29.11672844861582, | |
| "grad_norm": 1.34375, | |
| "learning_rate": 8.09742105263158e-05, | |
| "loss": 1.1502, | |
| "step": 46150 | |
| }, | |
| { | |
| "epoch": 29.14827667797145, | |
| "grad_norm": 1.609375, | |
| "learning_rate": 8.094789473684211e-05, | |
| "loss": 1.0814, | |
| "step": 46200 | |
| }, | |
| { | |
| "epoch": 29.179824907327077, | |
| "grad_norm": 1.3203125, | |
| "learning_rate": 8.092157894736843e-05, | |
| "loss": 1.1044, | |
| "step": 46250 | |
| }, | |
| { | |
| "epoch": 29.211373136682703, | |
| "grad_norm": 1.5546875, | |
| "learning_rate": 8.089526315789474e-05, | |
| "loss": 1.0989, | |
| "step": 46300 | |
| }, | |
| { | |
| "epoch": 29.24292136603833, | |
| "grad_norm": 1.4453125, | |
| "learning_rate": 8.086894736842106e-05, | |
| "loss": 1.1061, | |
| "step": 46350 | |
| }, | |
| { | |
| "epoch": 29.27446959539396, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 8.084263157894737e-05, | |
| "loss": 1.0923, | |
| "step": 46400 | |
| }, | |
| { | |
| "epoch": 29.306017824749585, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 8.081631578947368e-05, | |
| "loss": 1.1236, | |
| "step": 46450 | |
| }, | |
| { | |
| "epoch": 29.337566054105213, | |
| "grad_norm": 0.98828125, | |
| "learning_rate": 8.079e-05, | |
| "loss": 1.1432, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 29.369114283460842, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 8.076368421052633e-05, | |
| "loss": 1.1098, | |
| "step": 46550 | |
| }, | |
| { | |
| "epoch": 29.400662512816467, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 8.073736842105264e-05, | |
| "loss": 1.1515, | |
| "step": 46600 | |
| }, | |
| { | |
| "epoch": 29.432210742172096, | |
| "grad_norm": 1.0234375, | |
| "learning_rate": 8.071105263157895e-05, | |
| "loss": 1.1382, | |
| "step": 46650 | |
| }, | |
| { | |
| "epoch": 29.463758971527724, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 8.068473684210526e-05, | |
| "loss": 1.1341, | |
| "step": 46700 | |
| }, | |
| { | |
| "epoch": 29.49530720088335, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 8.065842105263159e-05, | |
| "loss": 1.0388, | |
| "step": 46750 | |
| }, | |
| { | |
| "epoch": 29.526855430238978, | |
| "grad_norm": 1.296875, | |
| "learning_rate": 8.06321052631579e-05, | |
| "loss": 1.1351, | |
| "step": 46800 | |
| }, | |
| { | |
| "epoch": 29.558403659594607, | |
| "grad_norm": 1.265625, | |
| "learning_rate": 8.060578947368421e-05, | |
| "loss": 1.1116, | |
| "step": 46850 | |
| }, | |
| { | |
| "epoch": 29.58995188895023, | |
| "grad_norm": 1.3828125, | |
| "learning_rate": 8.057947368421052e-05, | |
| "loss": 1.1026, | |
| "step": 46900 | |
| }, | |
| { | |
| "epoch": 29.62150011830586, | |
| "grad_norm": 1.5625, | |
| "learning_rate": 8.055315789473684e-05, | |
| "loss": 1.1015, | |
| "step": 46950 | |
| }, | |
| { | |
| "epoch": 29.65304834766149, | |
| "grad_norm": 1.453125, | |
| "learning_rate": 8.052684210526316e-05, | |
| "loss": 1.2103, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 29.65304834766149, | |
| "eval_loss": 1.504706859588623, | |
| "eval_runtime": 267.8989, | |
| "eval_samples_per_second": 94.651, | |
| "eval_steps_per_second": 5.916, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 29.684596577017114, | |
| "grad_norm": 1.2890625, | |
| "learning_rate": 8.050052631578949e-05, | |
| "loss": 1.0734, | |
| "step": 47050 | |
| }, | |
| { | |
| "epoch": 29.716144806372743, | |
| "grad_norm": 1.25, | |
| "learning_rate": 8.04742105263158e-05, | |
| "loss": 1.1404, | |
| "step": 47100 | |
| }, | |
| { | |
| "epoch": 29.74769303572837, | |
| "grad_norm": 0.8125, | |
| "learning_rate": 8.044789473684211e-05, | |
| "loss": 1.1454, | |
| "step": 47150 | |
| }, | |
| { | |
| "epoch": 29.779241265083996, | |
| "grad_norm": 1.1015625, | |
| "learning_rate": 8.042157894736842e-05, | |
| "loss": 1.1346, | |
| "step": 47200 | |
| }, | |
| { | |
| "epoch": 29.810789494439625, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 8.039526315789475e-05, | |
| "loss": 1.202, | |
| "step": 47250 | |
| }, | |
| { | |
| "epoch": 29.842337723795254, | |
| "grad_norm": 1.234375, | |
| "learning_rate": 8.036894736842106e-05, | |
| "loss": 1.1173, | |
| "step": 47300 | |
| }, | |
| { | |
| "epoch": 29.87388595315088, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 8.034263157894737e-05, | |
| "loss": 1.1577, | |
| "step": 47350 | |
| }, | |
| { | |
| "epoch": 29.905434182506507, | |
| "grad_norm": 2.828125, | |
| "learning_rate": 8.031631578947368e-05, | |
| "loss": 1.088, | |
| "step": 47400 | |
| }, | |
| { | |
| "epoch": 29.936982411862136, | |
| "grad_norm": 1.6640625, | |
| "learning_rate": 8.028999999999999e-05, | |
| "loss": 1.1395, | |
| "step": 47450 | |
| }, | |
| { | |
| "epoch": 29.96853064121776, | |
| "grad_norm": 1.0390625, | |
| "learning_rate": 8.026368421052633e-05, | |
| "loss": 1.1112, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 8.023736842105264e-05, | |
| "loss": 1.0993, | |
| "step": 47550 | |
| }, | |
| { | |
| "epoch": 30.03154822935563, | |
| "grad_norm": 0.87890625, | |
| "learning_rate": 8.021105263157895e-05, | |
| "loss": 1.1962, | |
| "step": 47600 | |
| }, | |
| { | |
| "epoch": 30.063096458711254, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 8.018473684210527e-05, | |
| "loss": 1.0634, | |
| "step": 47650 | |
| }, | |
| { | |
| "epoch": 30.094644688066882, | |
| "grad_norm": 1.2890625, | |
| "learning_rate": 8.015842105263158e-05, | |
| "loss": 1.1137, | |
| "step": 47700 | |
| }, | |
| { | |
| "epoch": 30.12619291742251, | |
| "grad_norm": 1.109375, | |
| "learning_rate": 8.01321052631579e-05, | |
| "loss": 1.1681, | |
| "step": 47750 | |
| }, | |
| { | |
| "epoch": 30.157741146778136, | |
| "grad_norm": 1.3984375, | |
| "learning_rate": 8.010578947368421e-05, | |
| "loss": 1.0424, | |
| "step": 47800 | |
| }, | |
| { | |
| "epoch": 30.189289376133765, | |
| "grad_norm": 1.3046875, | |
| "learning_rate": 8.007947368421053e-05, | |
| "loss": 1.1927, | |
| "step": 47850 | |
| }, | |
| { | |
| "epoch": 30.220837605489393, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 8.005315789473684e-05, | |
| "loss": 1.0464, | |
| "step": 47900 | |
| }, | |
| { | |
| "epoch": 30.25238583484502, | |
| "grad_norm": 1.2109375, | |
| "learning_rate": 8.002684210526316e-05, | |
| "loss": 1.0849, | |
| "step": 47950 | |
| }, | |
| { | |
| "epoch": 30.283934064200647, | |
| "grad_norm": 1.484375, | |
| "learning_rate": 8.000052631578949e-05, | |
| "loss": 1.1061, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 30.283934064200647, | |
| "eval_loss": 1.5036768913269043, | |
| "eval_runtime": 267.7531, | |
| "eval_samples_per_second": 94.703, | |
| "eval_steps_per_second": 5.92, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 30.315482293556276, | |
| "grad_norm": 1.4765625, | |
| "learning_rate": 7.99742105263158e-05, | |
| "loss": 1.1553, | |
| "step": 48050 | |
| }, | |
| { | |
| "epoch": 30.3470305229119, | |
| "grad_norm": 1.375, | |
| "learning_rate": 7.994789473684211e-05, | |
| "loss": 1.0855, | |
| "step": 48100 | |
| }, | |
| { | |
| "epoch": 30.37857875226753, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 7.992157894736842e-05, | |
| "loss": 1.1316, | |
| "step": 48150 | |
| }, | |
| { | |
| "epoch": 30.410126981623158, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 7.989526315789473e-05, | |
| "loss": 1.1295, | |
| "step": 48200 | |
| }, | |
| { | |
| "epoch": 30.441675210978783, | |
| "grad_norm": 1.7109375, | |
| "learning_rate": 7.986894736842106e-05, | |
| "loss": 1.1605, | |
| "step": 48250 | |
| }, | |
| { | |
| "epoch": 30.47322344033441, | |
| "grad_norm": 1.421875, | |
| "learning_rate": 7.984263157894737e-05, | |
| "loss": 1.1485, | |
| "step": 48300 | |
| }, | |
| { | |
| "epoch": 30.50477166969004, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 7.981631578947368e-05, | |
| "loss": 1.1767, | |
| "step": 48350 | |
| }, | |
| { | |
| "epoch": 30.536319899045665, | |
| "grad_norm": 1.046875, | |
| "learning_rate": 7.979000000000001e-05, | |
| "loss": 1.1085, | |
| "step": 48400 | |
| }, | |
| { | |
| "epoch": 30.567868128401294, | |
| "grad_norm": 1.0859375, | |
| "learning_rate": 7.976368421052632e-05, | |
| "loss": 1.081, | |
| "step": 48450 | |
| }, | |
| { | |
| "epoch": 30.599416357756922, | |
| "grad_norm": 2.25, | |
| "learning_rate": 7.973736842105264e-05, | |
| "loss": 1.1187, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 30.630964587112548, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 7.971105263157896e-05, | |
| "loss": 1.1455, | |
| "step": 48550 | |
| }, | |
| { | |
| "epoch": 30.662512816468176, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 7.968473684210527e-05, | |
| "loss": 1.0459, | |
| "step": 48600 | |
| }, | |
| { | |
| "epoch": 30.694061045823805, | |
| "grad_norm": 1.078125, | |
| "learning_rate": 7.965842105263158e-05, | |
| "loss": 1.1228, | |
| "step": 48650 | |
| }, | |
| { | |
| "epoch": 30.72560927517943, | |
| "grad_norm": 1.6328125, | |
| "learning_rate": 7.96321052631579e-05, | |
| "loss": 1.1881, | |
| "step": 48700 | |
| }, | |
| { | |
| "epoch": 30.75715750453506, | |
| "grad_norm": 1.0625, | |
| "learning_rate": 7.960578947368422e-05, | |
| "loss": 1.1357, | |
| "step": 48750 | |
| }, | |
| { | |
| "epoch": 30.788705733890687, | |
| "grad_norm": 1.328125, | |
| "learning_rate": 7.957947368421053e-05, | |
| "loss": 1.1449, | |
| "step": 48800 | |
| }, | |
| { | |
| "epoch": 30.820253963246312, | |
| "grad_norm": 1.2265625, | |
| "learning_rate": 7.955315789473684e-05, | |
| "loss": 1.0769, | |
| "step": 48850 | |
| }, | |
| { | |
| "epoch": 30.85180219260194, | |
| "grad_norm": 1.3203125, | |
| "learning_rate": 7.952684210526316e-05, | |
| "loss": 1.1382, | |
| "step": 48900 | |
| }, | |
| { | |
| "epoch": 30.883350421957566, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 7.950052631578947e-05, | |
| "loss": 1.1921, | |
| "step": 48950 | |
| }, | |
| { | |
| "epoch": 30.914898651313194, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 7.94742105263158e-05, | |
| "loss": 1.1503, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 30.914898651313194, | |
| "eval_loss": 1.5025678873062134, | |
| "eval_runtime": 268.5667, | |
| "eval_samples_per_second": 94.416, | |
| "eval_steps_per_second": 5.902, | |
| "step": 49000 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 200000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 127, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.634955028608e+17, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |