| { |
| "best_global_step": 72000, |
| "best_metric": 3.534351110458374, |
| "best_model_checkpoint": "/scratch/cl5625/exceptions/models/swap_last_to_push_5039/checkpoint-40000", |
| "epoch": 26.814387314911976, |
| "eval_steps": 1000, |
| "global_step": 92000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.014573860324122653, |
| "grad_norm": 0.9907153248786926, |
| "learning_rate": 0.000294, |
| "loss": 8.4934, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.029147720648245307, |
| "grad_norm": 0.7095437049865723, |
| "learning_rate": 0.0005939999999999999, |
| "loss": 6.7575, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.04372158097236796, |
| "grad_norm": 0.5371580719947815, |
| "learning_rate": 0.0005998285214348206, |
| "loss": 6.3802, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.05829544129649061, |
| "grad_norm": 0.4288189113140106, |
| "learning_rate": 0.0005996535433070866, |
| "loss": 6.1462, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.07286930162061327, |
| "grad_norm": 0.69915372133255, |
| "learning_rate": 0.0005994785651793525, |
| "loss": 5.9982, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.08744316194473592, |
| "grad_norm": 0.5082014203071594, |
| "learning_rate": 0.0005993035870516185, |
| "loss": 5.8844, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.10201702226885857, |
| "grad_norm": 0.5007938742637634, |
| "learning_rate": 0.0005991286089238845, |
| "loss": 5.7567, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.11659088259298123, |
| "grad_norm": 0.4490969181060791, |
| "learning_rate": 0.0005989536307961504, |
| "loss": 5.6409, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.1311647429171039, |
| "grad_norm": 0.4547346532344818, |
| "learning_rate": 0.0005987786526684164, |
| "loss": 5.5206, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.14573860324122653, |
| "grad_norm": 0.473257452249527, |
| "learning_rate": 0.0005986036745406824, |
| "loss": 5.4221, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.1603124635653492, |
| "grad_norm": 0.47390425205230713, |
| "learning_rate": 0.0005984286964129484, |
| "loss": 5.3534, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.17488632388947184, |
| "grad_norm": 0.4946690499782562, |
| "learning_rate": 0.0005982537182852143, |
| "loss": 5.2798, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.1894601842135945, |
| "grad_norm": 0.478254109621048, |
| "learning_rate": 0.0005980787401574803, |
| "loss": 5.2198, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.20403404453771715, |
| "grad_norm": 0.4964013993740082, |
| "learning_rate": 0.0005979037620297463, |
| "loss": 5.1363, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.2186079048618398, |
| "grad_norm": 0.4532807171344757, |
| "learning_rate": 0.0005977287839020123, |
| "loss": 5.0752, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.23318176518596245, |
| "grad_norm": 0.49224039912223816, |
| "learning_rate": 0.0005975538057742782, |
| "loss": 5.0214, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.24775562551008512, |
| "grad_norm": 0.4381696283817291, |
| "learning_rate": 0.0005973788276465442, |
| "loss": 4.9831, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.2623294858342078, |
| "grad_norm": 0.455869197845459, |
| "learning_rate": 0.0005972038495188102, |
| "loss": 4.9287, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.2769033461583304, |
| "grad_norm": 0.5022788047790527, |
| "learning_rate": 0.000597028871391076, |
| "loss": 4.8776, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.29147720648245307, |
| "grad_norm": 0.41615989804267883, |
| "learning_rate": 0.000596853893263342, |
| "loss": 4.827, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.29147720648245307, |
| "eval_accuracy": 0.25474598232639084, |
| "eval_loss": 4.756115913391113, |
| "eval_runtime": 53.1827, |
| "eval_samples_per_second": 312.639, |
| "eval_steps_per_second": 19.555, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.30605106680657573, |
| "grad_norm": 0.47037482261657715, |
| "learning_rate": 0.000596678915135608, |
| "loss": 4.7837, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.3206249271306984, |
| "grad_norm": 0.40905576944351196, |
| "learning_rate": 0.0005965039370078739, |
| "loss": 4.7417, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.335198787454821, |
| "grad_norm": 0.6451007127761841, |
| "learning_rate": 0.0005963289588801399, |
| "loss": 4.7029, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.3497726477789437, |
| "grad_norm": 0.45241931080818176, |
| "learning_rate": 0.0005961539807524059, |
| "loss": 4.6664, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.36434650810306635, |
| "grad_norm": 0.4666035771369934, |
| "learning_rate": 0.0005959790026246719, |
| "loss": 4.6486, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.378920368427189, |
| "grad_norm": 0.49886658787727356, |
| "learning_rate": 0.0005958040244969378, |
| "loss": 4.6071, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.3934942287513116, |
| "grad_norm": 0.4707571864128113, |
| "learning_rate": 0.0005956290463692038, |
| "loss": 4.566, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.4080680890754343, |
| "grad_norm": 0.40836653113365173, |
| "learning_rate": 0.0005954540682414698, |
| "loss": 4.5522, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.42264194939955696, |
| "grad_norm": 0.4472939074039459, |
| "learning_rate": 0.0005952790901137357, |
| "loss": 4.5238, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.4372158097236796, |
| "grad_norm": 0.4636169970035553, |
| "learning_rate": 0.0005951041119860017, |
| "loss": 4.5108, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.45178967004780224, |
| "grad_norm": 0.4297902584075928, |
| "learning_rate": 0.0005949291338582677, |
| "loss": 4.4998, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.4663635303719249, |
| "grad_norm": 0.45430973172187805, |
| "learning_rate": 0.0005947541557305336, |
| "loss": 4.4607, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.4809373906960476, |
| "grad_norm": 0.4021083116531372, |
| "learning_rate": 0.0005945791776027996, |
| "loss": 4.4399, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.49551125102017024, |
| "grad_norm": 0.45110058784484863, |
| "learning_rate": 0.0005944041994750656, |
| "loss": 4.4175, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.5100851113442929, |
| "grad_norm": 0.44936510920524597, |
| "learning_rate": 0.0005942292213473315, |
| "loss": 4.4125, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.5246589716684156, |
| "grad_norm": 0.38193148374557495, |
| "learning_rate": 0.0005940542432195975, |
| "loss": 4.4001, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.5392328319925381, |
| "grad_norm": 0.3981386125087738, |
| "learning_rate": 0.0005938792650918635, |
| "loss": 4.3892, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.5538066923166608, |
| "grad_norm": 0.41019147634506226, |
| "learning_rate": 0.0005937042869641295, |
| "loss": 4.3717, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.5683805526407835, |
| "grad_norm": 0.3775465190410614, |
| "learning_rate": 0.0005935293088363953, |
| "loss": 4.3588, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.5829544129649061, |
| "grad_norm": 0.3853199779987335, |
| "learning_rate": 0.0005933543307086613, |
| "loss": 4.3473, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.5829544129649061, |
| "eval_accuracy": 0.2976993659783082, |
| "eval_loss": 4.288848876953125, |
| "eval_runtime": 53.1983, |
| "eval_samples_per_second": 312.548, |
| "eval_steps_per_second": 19.55, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.5975282732890288, |
| "grad_norm": 0.3746494650840759, |
| "learning_rate": 0.0005931793525809273, |
| "loss": 4.3314, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.6121021336131515, |
| "grad_norm": 0.5510130524635315, |
| "learning_rate": 0.0005930043744531933, |
| "loss": 4.3333, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.6266759939372741, |
| "grad_norm": 0.39102843403816223, |
| "learning_rate": 0.0005928293963254592, |
| "loss": 4.3264, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.6412498542613968, |
| "grad_norm": 0.5034298300743103, |
| "learning_rate": 0.0005926544181977252, |
| "loss": 4.2971, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.6558237145855194, |
| "grad_norm": 0.4380589723587036, |
| "learning_rate": 0.0005924794400699912, |
| "loss": 4.2766, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.670397574909642, |
| "grad_norm": 0.36597833037376404, |
| "learning_rate": 0.0005923044619422571, |
| "loss": 4.2774, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.6849714352337647, |
| "grad_norm": 0.4457620680332184, |
| "learning_rate": 0.0005921294838145231, |
| "loss": 4.2622, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.6995452955578874, |
| "grad_norm": 0.35867607593536377, |
| "learning_rate": 0.0005919545056867891, |
| "loss": 4.2556, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.71411915588201, |
| "grad_norm": 0.36599287390708923, |
| "learning_rate": 0.0005917795275590551, |
| "loss": 4.2527, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.7286930162061327, |
| "grad_norm": 0.3873817026615143, |
| "learning_rate": 0.000591604549431321, |
| "loss": 4.2245, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.7432668765302554, |
| "grad_norm": 0.4228278696537018, |
| "learning_rate": 0.000591429571303587, |
| "loss": 4.2314, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.757840736854378, |
| "grad_norm": 0.37973201274871826, |
| "learning_rate": 0.000591254593175853, |
| "loss": 4.2146, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.7724145971785007, |
| "grad_norm": 0.35704922676086426, |
| "learning_rate": 0.000591079615048119, |
| "loss": 4.1993, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.7869884575026233, |
| "grad_norm": 0.3590843379497528, |
| "learning_rate": 0.0005909046369203849, |
| "loss": 4.2082, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.8015623178267459, |
| "grad_norm": 0.36054208874702454, |
| "learning_rate": 0.0005907296587926509, |
| "loss": 4.1991, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.8161361781508686, |
| "grad_norm": 0.3614412844181061, |
| "learning_rate": 0.0005905546806649169, |
| "loss": 4.1991, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.8307100384749913, |
| "grad_norm": 0.4009391665458679, |
| "learning_rate": 0.0005903797025371829, |
| "loss": 4.1866, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.8452838987991139, |
| "grad_norm": 0.40568235516548157, |
| "learning_rate": 0.0005902047244094488, |
| "loss": 4.1652, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.8598577591232366, |
| "grad_norm": 0.3657606542110443, |
| "learning_rate": 0.0005900297462817148, |
| "loss": 4.1592, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.8744316194473593, |
| "grad_norm": 0.36263778805732727, |
| "learning_rate": 0.0005898547681539808, |
| "loss": 4.1507, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.8744316194473593, |
| "eval_accuracy": 0.31444622938405536, |
| "eval_loss": 4.105124473571777, |
| "eval_runtime": 53.364, |
| "eval_samples_per_second": 311.577, |
| "eval_steps_per_second": 19.489, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.8890054797714819, |
| "grad_norm": 0.34934473037719727, |
| "learning_rate": 0.0005896797900262466, |
| "loss": 4.1475, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.9035793400956045, |
| "grad_norm": 0.37282851338386536, |
| "learning_rate": 0.0005895048118985126, |
| "loss": 4.14, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.9181532004197271, |
| "grad_norm": 0.33918800950050354, |
| "learning_rate": 0.0005893298337707786, |
| "loss": 4.145, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.9327270607438498, |
| "grad_norm": 0.36305153369903564, |
| "learning_rate": 0.0005891548556430446, |
| "loss": 4.1251, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.9473009210679725, |
| "grad_norm": 0.3614549934864044, |
| "learning_rate": 0.0005889798775153105, |
| "loss": 4.1294, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.9618747813920951, |
| "grad_norm": 0.3925406038761139, |
| "learning_rate": 0.0005888048993875765, |
| "loss": 4.1132, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.9764486417162178, |
| "grad_norm": 0.37444689869880676, |
| "learning_rate": 0.0005886299212598425, |
| "loss": 4.1134, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.9910225020403405, |
| "grad_norm": 0.3604774475097656, |
| "learning_rate": 0.0005884549431321084, |
| "loss": 4.1033, |
| "step": 3400 |
| }, |
| { |
| "epoch": 1.0055380669231666, |
| "grad_norm": 0.3345955014228821, |
| "learning_rate": 0.0005882799650043744, |
| "loss": 4.0831, |
| "step": 3450 |
| }, |
| { |
| "epoch": 1.0201119272472892, |
| "grad_norm": 0.336396723985672, |
| "learning_rate": 0.0005881049868766404, |
| "loss": 4.0266, |
| "step": 3500 |
| }, |
| { |
| "epoch": 1.034685787571412, |
| "grad_norm": 0.34870588779449463, |
| "learning_rate": 0.0005879300087489063, |
| "loss": 4.0253, |
| "step": 3550 |
| }, |
| { |
| "epoch": 1.0492596478955345, |
| "grad_norm": 0.35148200392723083, |
| "learning_rate": 0.0005877550306211723, |
| "loss": 4.0038, |
| "step": 3600 |
| }, |
| { |
| "epoch": 1.0638335082196573, |
| "grad_norm": 0.35439014434814453, |
| "learning_rate": 0.0005875800524934383, |
| "loss": 4.0146, |
| "step": 3650 |
| }, |
| { |
| "epoch": 1.0784073685437798, |
| "grad_norm": 0.34169119596481323, |
| "learning_rate": 0.0005874050743657042, |
| "loss": 4.026, |
| "step": 3700 |
| }, |
| { |
| "epoch": 1.0929812288679026, |
| "grad_norm": 0.3638543486595154, |
| "learning_rate": 0.0005872300962379702, |
| "loss": 4.0098, |
| "step": 3750 |
| }, |
| { |
| "epoch": 1.1075550891920252, |
| "grad_norm": 0.35680437088012695, |
| "learning_rate": 0.0005870551181102362, |
| "loss": 4.0094, |
| "step": 3800 |
| }, |
| { |
| "epoch": 1.122128949516148, |
| "grad_norm": 0.38842830061912537, |
| "learning_rate": 0.0005868801399825022, |
| "loss": 3.9804, |
| "step": 3850 |
| }, |
| { |
| "epoch": 1.1367028098402705, |
| "grad_norm": 0.35586678981781006, |
| "learning_rate": 0.0005867051618547681, |
| "loss": 3.9883, |
| "step": 3900 |
| }, |
| { |
| "epoch": 1.151276670164393, |
| "grad_norm": 0.3357126712799072, |
| "learning_rate": 0.0005865301837270341, |
| "loss": 4.0055, |
| "step": 3950 |
| }, |
| { |
| "epoch": 1.1658505304885158, |
| "grad_norm": 0.36053982377052307, |
| "learning_rate": 0.0005863552055993001, |
| "loss": 4.0024, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.1658505304885158, |
| "eval_accuracy": 0.3242646265234546, |
| "eval_loss": 3.9950461387634277, |
| "eval_runtime": 53.4878, |
| "eval_samples_per_second": 310.856, |
| "eval_steps_per_second": 19.444, |
| "step": 4000 |
| }, |
| { |
| "epoch": 1.1804243908126384, |
| "grad_norm": 0.3483583629131317, |
| "learning_rate": 0.0005861802274715659, |
| "loss": 4.0074, |
| "step": 4050 |
| }, |
| { |
| "epoch": 1.1949982511367612, |
| "grad_norm": 0.33784136176109314, |
| "learning_rate": 0.0005860052493438319, |
| "loss": 3.9894, |
| "step": 4100 |
| }, |
| { |
| "epoch": 1.2095721114608837, |
| "grad_norm": 0.3274170160293579, |
| "learning_rate": 0.0005858302712160979, |
| "loss": 4.0094, |
| "step": 4150 |
| }, |
| { |
| "epoch": 1.2241459717850065, |
| "grad_norm": 0.36927348375320435, |
| "learning_rate": 0.0005856552930883638, |
| "loss": 3.9823, |
| "step": 4200 |
| }, |
| { |
| "epoch": 1.238719832109129, |
| "grad_norm": 0.3337404131889343, |
| "learning_rate": 0.0005854803149606298, |
| "loss": 3.9831, |
| "step": 4250 |
| }, |
| { |
| "epoch": 1.2532936924332518, |
| "grad_norm": 0.334148108959198, |
| "learning_rate": 0.0005853053368328958, |
| "loss": 3.9801, |
| "step": 4300 |
| }, |
| { |
| "epoch": 1.2678675527573744, |
| "grad_norm": 0.3403607904911041, |
| "learning_rate": 0.0005851303587051618, |
| "loss": 3.9726, |
| "step": 4350 |
| }, |
| { |
| "epoch": 1.282441413081497, |
| "grad_norm": 0.3903663158416748, |
| "learning_rate": 0.0005849553805774277, |
| "loss": 3.9824, |
| "step": 4400 |
| }, |
| { |
| "epoch": 1.2970152734056197, |
| "grad_norm": 0.3315300941467285, |
| "learning_rate": 0.0005847804024496937, |
| "loss": 3.9742, |
| "step": 4450 |
| }, |
| { |
| "epoch": 1.3115891337297423, |
| "grad_norm": 0.3257898688316345, |
| "learning_rate": 0.0005846054243219597, |
| "loss": 3.9761, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.326162994053865, |
| "grad_norm": 0.33835312724113464, |
| "learning_rate": 0.0005844304461942257, |
| "loss": 3.9561, |
| "step": 4550 |
| }, |
| { |
| "epoch": 1.3407368543779876, |
| "grad_norm": 0.35677939653396606, |
| "learning_rate": 0.0005842554680664916, |
| "loss": 3.9435, |
| "step": 4600 |
| }, |
| { |
| "epoch": 1.3553107147021102, |
| "grad_norm": 0.35002270340919495, |
| "learning_rate": 0.0005840804899387576, |
| "loss": 3.958, |
| "step": 4650 |
| }, |
| { |
| "epoch": 1.369884575026233, |
| "grad_norm": 0.3446170687675476, |
| "learning_rate": 0.0005839055118110236, |
| "loss": 3.961, |
| "step": 4700 |
| }, |
| { |
| "epoch": 1.3844584353503557, |
| "grad_norm": 0.3279706835746765, |
| "learning_rate": 0.0005837305336832896, |
| "loss": 3.9471, |
| "step": 4750 |
| }, |
| { |
| "epoch": 1.3990322956744783, |
| "grad_norm": 0.34252220392227173, |
| "learning_rate": 0.0005835555555555555, |
| "loss": 3.9414, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.4136061559986008, |
| "grad_norm": 0.3297818899154663, |
| "learning_rate": 0.0005833805774278215, |
| "loss": 3.9481, |
| "step": 4850 |
| }, |
| { |
| "epoch": 1.4281800163227236, |
| "grad_norm": 0.3407423496246338, |
| "learning_rate": 0.0005832055993000875, |
| "loss": 3.9385, |
| "step": 4900 |
| }, |
| { |
| "epoch": 1.4427538766468462, |
| "grad_norm": 0.32950732111930847, |
| "learning_rate": 0.0005830306211723534, |
| "loss": 3.9409, |
| "step": 4950 |
| }, |
| { |
| "epoch": 1.457327736970969, |
| "grad_norm": 0.33204489946365356, |
| "learning_rate": 0.0005828556430446194, |
| "loss": 3.9542, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.457327736970969, |
| "eval_accuracy": 0.3311323611643853, |
| "eval_loss": 3.919184923171997, |
| "eval_runtime": 53.3507, |
| "eval_samples_per_second": 311.655, |
| "eval_steps_per_second": 19.494, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.4719015972950915, |
| "grad_norm": 0.33288151025772095, |
| "learning_rate": 0.0005826806649168854, |
| "loss": 3.9432, |
| "step": 5050 |
| }, |
| { |
| "epoch": 1.486475457619214, |
| "grad_norm": 0.3878774642944336, |
| "learning_rate": 0.0005825056867891514, |
| "loss": 3.9275, |
| "step": 5100 |
| }, |
| { |
| "epoch": 1.5010493179433368, |
| "grad_norm": 0.34014689922332764, |
| "learning_rate": 0.0005823307086614172, |
| "loss": 3.9246, |
| "step": 5150 |
| }, |
| { |
| "epoch": 1.5156231782674596, |
| "grad_norm": 0.339933305978775, |
| "learning_rate": 0.0005821557305336832, |
| "loss": 3.9304, |
| "step": 5200 |
| }, |
| { |
| "epoch": 1.5301970385915822, |
| "grad_norm": 0.34320276975631714, |
| "learning_rate": 0.0005819807524059492, |
| "loss": 3.9175, |
| "step": 5250 |
| }, |
| { |
| "epoch": 1.5447708989157047, |
| "grad_norm": 0.35694506764411926, |
| "learning_rate": 0.0005818057742782152, |
| "loss": 3.9222, |
| "step": 5300 |
| }, |
| { |
| "epoch": 1.5593447592398273, |
| "grad_norm": 0.31538674235343933, |
| "learning_rate": 0.0005816307961504811, |
| "loss": 3.9163, |
| "step": 5350 |
| }, |
| { |
| "epoch": 1.57391861956395, |
| "grad_norm": 0.3151552379131317, |
| "learning_rate": 0.0005814558180227471, |
| "loss": 3.9166, |
| "step": 5400 |
| }, |
| { |
| "epoch": 1.5884924798880729, |
| "grad_norm": 0.3546421229839325, |
| "learning_rate": 0.0005812808398950131, |
| "loss": 3.9212, |
| "step": 5450 |
| }, |
| { |
| "epoch": 1.6030663402121954, |
| "grad_norm": 0.3432241678237915, |
| "learning_rate": 0.0005811058617672791, |
| "loss": 3.9089, |
| "step": 5500 |
| }, |
| { |
| "epoch": 1.617640200536318, |
| "grad_norm": 0.36986014246940613, |
| "learning_rate": 0.000580930883639545, |
| "loss": 3.9162, |
| "step": 5550 |
| }, |
| { |
| "epoch": 1.6322140608604407, |
| "grad_norm": 0.3325847387313843, |
| "learning_rate": 0.000580755905511811, |
| "loss": 3.899, |
| "step": 5600 |
| }, |
| { |
| "epoch": 1.6467879211845635, |
| "grad_norm": 0.3691854774951935, |
| "learning_rate": 0.000580580927384077, |
| "loss": 3.9085, |
| "step": 5650 |
| }, |
| { |
| "epoch": 1.661361781508686, |
| "grad_norm": 0.31372612714767456, |
| "learning_rate": 0.0005804059492563429, |
| "loss": 3.9036, |
| "step": 5700 |
| }, |
| { |
| "epoch": 1.6759356418328086, |
| "grad_norm": 0.3524612784385681, |
| "learning_rate": 0.0005802309711286089, |
| "loss": 3.8883, |
| "step": 5750 |
| }, |
| { |
| "epoch": 1.6905095021569312, |
| "grad_norm": 0.3460583984851837, |
| "learning_rate": 0.0005800559930008749, |
| "loss": 3.8901, |
| "step": 5800 |
| }, |
| { |
| "epoch": 1.705083362481054, |
| "grad_norm": 0.34100937843322754, |
| "learning_rate": 0.0005798810148731408, |
| "loss": 3.8784, |
| "step": 5850 |
| }, |
| { |
| "epoch": 1.7196572228051767, |
| "grad_norm": 0.3111191391944885, |
| "learning_rate": 0.0005797060367454068, |
| "loss": 3.8952, |
| "step": 5900 |
| }, |
| { |
| "epoch": 1.7342310831292993, |
| "grad_norm": 0.33718207478523254, |
| "learning_rate": 0.0005795310586176728, |
| "loss": 3.8867, |
| "step": 5950 |
| }, |
| { |
| "epoch": 1.7488049434534219, |
| "grad_norm": 0.32707616686820984, |
| "learning_rate": 0.0005793560804899387, |
| "loss": 3.8828, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.7488049434534219, |
| "eval_accuracy": 0.33619380073694766, |
| "eval_loss": 3.8606996536254883, |
| "eval_runtime": 53.3541, |
| "eval_samples_per_second": 311.635, |
| "eval_steps_per_second": 19.492, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.7633788037775446, |
| "grad_norm": 0.3523644804954529, |
| "learning_rate": 0.0005791811023622047, |
| "loss": 3.879, |
| "step": 6050 |
| }, |
| { |
| "epoch": 1.7779526641016674, |
| "grad_norm": 0.33810436725616455, |
| "learning_rate": 0.0005790061242344707, |
| "loss": 3.881, |
| "step": 6100 |
| }, |
| { |
| "epoch": 1.79252652442579, |
| "grad_norm": 0.32578548789024353, |
| "learning_rate": 0.0005788311461067365, |
| "loss": 3.8926, |
| "step": 6150 |
| }, |
| { |
| "epoch": 1.8071003847499125, |
| "grad_norm": 0.32520464062690735, |
| "learning_rate": 0.0005786561679790025, |
| "loss": 3.8743, |
| "step": 6200 |
| }, |
| { |
| "epoch": 1.821674245074035, |
| "grad_norm": 0.329898476600647, |
| "learning_rate": 0.0005784811898512685, |
| "loss": 3.8676, |
| "step": 6250 |
| }, |
| { |
| "epoch": 1.8362481053981579, |
| "grad_norm": 0.3443533778190613, |
| "learning_rate": 0.0005783062117235344, |
| "loss": 3.867, |
| "step": 6300 |
| }, |
| { |
| "epoch": 1.8508219657222806, |
| "grad_norm": 0.32963231205940247, |
| "learning_rate": 0.0005781312335958004, |
| "loss": 3.8756, |
| "step": 6350 |
| }, |
| { |
| "epoch": 1.8653958260464032, |
| "grad_norm": 0.3340328633785248, |
| "learning_rate": 0.0005779562554680664, |
| "loss": 3.8615, |
| "step": 6400 |
| }, |
| { |
| "epoch": 1.8799696863705257, |
| "grad_norm": 0.33938348293304443, |
| "learning_rate": 0.0005777812773403324, |
| "loss": 3.8719, |
| "step": 6450 |
| }, |
| { |
| "epoch": 1.8945435466946485, |
| "grad_norm": 0.3234139680862427, |
| "learning_rate": 0.0005776062992125983, |
| "loss": 3.8626, |
| "step": 6500 |
| }, |
| { |
| "epoch": 1.909117407018771, |
| "grad_norm": 0.32839807868003845, |
| "learning_rate": 0.0005774313210848643, |
| "loss": 3.8653, |
| "step": 6550 |
| }, |
| { |
| "epoch": 1.9236912673428939, |
| "grad_norm": 0.32402539253234863, |
| "learning_rate": 0.0005772563429571303, |
| "loss": 3.8552, |
| "step": 6600 |
| }, |
| { |
| "epoch": 1.9382651276670164, |
| "grad_norm": 0.3139670193195343, |
| "learning_rate": 0.0005770813648293962, |
| "loss": 3.8605, |
| "step": 6650 |
| }, |
| { |
| "epoch": 1.952838987991139, |
| "grad_norm": 0.3228314518928528, |
| "learning_rate": 0.0005769063867016622, |
| "loss": 3.8639, |
| "step": 6700 |
| }, |
| { |
| "epoch": 1.9674128483152618, |
| "grad_norm": 0.3443976044654846, |
| "learning_rate": 0.0005767314085739282, |
| "loss": 3.8482, |
| "step": 6750 |
| }, |
| { |
| "epoch": 1.9819867086393845, |
| "grad_norm": 0.33928486704826355, |
| "learning_rate": 0.0005765564304461942, |
| "loss": 3.8488, |
| "step": 6800 |
| }, |
| { |
| "epoch": 1.996560568963507, |
| "grad_norm": 0.33698633313179016, |
| "learning_rate": 0.0005763814523184601, |
| "loss": 3.8495, |
| "step": 6850 |
| }, |
| { |
| "epoch": 2.011076133846333, |
| "grad_norm": 0.3378838002681732, |
| "learning_rate": 0.0005762064741907261, |
| "loss": 3.7769, |
| "step": 6900 |
| }, |
| { |
| "epoch": 2.0256499941704558, |
| "grad_norm": 0.3321957290172577, |
| "learning_rate": 0.0005760314960629921, |
| "loss": 3.7508, |
| "step": 6950 |
| }, |
| { |
| "epoch": 2.0402238544945783, |
| "grad_norm": 0.3420303165912628, |
| "learning_rate": 0.0005758565179352581, |
| "loss": 3.7609, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.0402238544945783, |
| "eval_accuracy": 0.34050150905142496, |
| "eval_loss": 3.8192999362945557, |
| "eval_runtime": 53.4903, |
| "eval_samples_per_second": 310.842, |
| "eval_steps_per_second": 19.443, |
| "step": 7000 |
| }, |
| { |
| "epoch": 2.0547977148187013, |
| "grad_norm": 0.3463504910469055, |
| "learning_rate": 0.000575681539807524, |
| "loss": 3.7501, |
| "step": 7050 |
| }, |
| { |
| "epoch": 2.069371575142824, |
| "grad_norm": 0.33339545130729675, |
| "learning_rate": 0.00057550656167979, |
| "loss": 3.7616, |
| "step": 7100 |
| }, |
| { |
| "epoch": 2.0839454354669464, |
| "grad_norm": 0.3201228380203247, |
| "learning_rate": 0.000575331583552056, |
| "loss": 3.7462, |
| "step": 7150 |
| }, |
| { |
| "epoch": 2.098519295791069, |
| "grad_norm": 0.31870558857917786, |
| "learning_rate": 0.000575156605424322, |
| "loss": 3.7598, |
| "step": 7200 |
| }, |
| { |
| "epoch": 2.113093156115192, |
| "grad_norm": 0.33072277903556824, |
| "learning_rate": 0.0005749816272965878, |
| "loss": 3.7593, |
| "step": 7250 |
| }, |
| { |
| "epoch": 2.1276670164393146, |
| "grad_norm": 0.3490276038646698, |
| "learning_rate": 0.0005748066491688538, |
| "loss": 3.7572, |
| "step": 7300 |
| }, |
| { |
| "epoch": 2.142240876763437, |
| "grad_norm": 0.3266213834285736, |
| "learning_rate": 0.0005746316710411198, |
| "loss": 3.7672, |
| "step": 7350 |
| }, |
| { |
| "epoch": 2.1568147370875597, |
| "grad_norm": 0.30979838967323303, |
| "learning_rate": 0.0005744566929133858, |
| "loss": 3.7579, |
| "step": 7400 |
| }, |
| { |
| "epoch": 2.171388597411682, |
| "grad_norm": 0.3439822494983673, |
| "learning_rate": 0.0005742817147856517, |
| "loss": 3.7583, |
| "step": 7450 |
| }, |
| { |
| "epoch": 2.185962457735805, |
| "grad_norm": 0.3368069529533386, |
| "learning_rate": 0.0005741067366579177, |
| "loss": 3.7749, |
| "step": 7500 |
| }, |
| { |
| "epoch": 2.2005363180599278, |
| "grad_norm": 0.34231144189834595, |
| "learning_rate": 0.0005739317585301837, |
| "loss": 3.7605, |
| "step": 7550 |
| }, |
| { |
| "epoch": 2.2151101783840503, |
| "grad_norm": 0.30864593386650085, |
| "learning_rate": 0.0005737567804024496, |
| "loss": 3.7697, |
| "step": 7600 |
| }, |
| { |
| "epoch": 2.229684038708173, |
| "grad_norm": 0.3351763188838959, |
| "learning_rate": 0.0005735818022747156, |
| "loss": 3.7557, |
| "step": 7650 |
| }, |
| { |
| "epoch": 2.244257899032296, |
| "grad_norm": 0.31327828764915466, |
| "learning_rate": 0.0005734068241469816, |
| "loss": 3.756, |
| "step": 7700 |
| }, |
| { |
| "epoch": 2.2588317593564184, |
| "grad_norm": 0.33311957120895386, |
| "learning_rate": 0.0005732318460192476, |
| "loss": 3.7595, |
| "step": 7750 |
| }, |
| { |
| "epoch": 2.273405619680541, |
| "grad_norm": 0.32785460352897644, |
| "learning_rate": 0.0005730568678915135, |
| "loss": 3.7652, |
| "step": 7800 |
| }, |
| { |
| "epoch": 2.2879794800046636, |
| "grad_norm": 0.3556518852710724, |
| "learning_rate": 0.0005728818897637795, |
| "loss": 3.7798, |
| "step": 7850 |
| }, |
| { |
| "epoch": 2.302553340328786, |
| "grad_norm": 0.3210819661617279, |
| "learning_rate": 0.0005727069116360455, |
| "loss": 3.7643, |
| "step": 7900 |
| }, |
| { |
| "epoch": 2.317127200652909, |
| "grad_norm": 0.3378058969974518, |
| "learning_rate": 0.0005725319335083115, |
| "loss": 3.7612, |
| "step": 7950 |
| }, |
| { |
| "epoch": 2.3317010609770317, |
| "grad_norm": 0.33185309171676636, |
| "learning_rate": 0.0005723569553805774, |
| "loss": 3.758, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.3317010609770317, |
| "eval_accuracy": 0.3433224695126652, |
| "eval_loss": 3.789189577102661, |
| "eval_runtime": 53.4245, |
| "eval_samples_per_second": 311.224, |
| "eval_steps_per_second": 19.467, |
| "step": 8000 |
| }, |
| { |
| "epoch": 2.3462749213011542, |
| "grad_norm": 0.3262949585914612, |
| "learning_rate": 0.0005721819772528434, |
| "loss": 3.7594, |
| "step": 8050 |
| }, |
| { |
| "epoch": 2.360848781625277, |
| "grad_norm": 0.3377199172973633, |
| "learning_rate": 0.0005720069991251094, |
| "loss": 3.7597, |
| "step": 8100 |
| }, |
| { |
| "epoch": 2.3754226419494, |
| "grad_norm": 0.3243359923362732, |
| "learning_rate": 0.0005718320209973753, |
| "loss": 3.7479, |
| "step": 8150 |
| }, |
| { |
| "epoch": 2.3899965022735223, |
| "grad_norm": 0.32717669010162354, |
| "learning_rate": 0.0005716570428696413, |
| "loss": 3.7524, |
| "step": 8200 |
| }, |
| { |
| "epoch": 2.404570362597645, |
| "grad_norm": 0.3221069872379303, |
| "learning_rate": 0.0005714820647419073, |
| "loss": 3.7565, |
| "step": 8250 |
| }, |
| { |
| "epoch": 2.4191442229217675, |
| "grad_norm": 0.3426828682422638, |
| "learning_rate": 0.0005713070866141731, |
| "loss": 3.7599, |
| "step": 8300 |
| }, |
| { |
| "epoch": 2.43371808324589, |
| "grad_norm": 0.33828017115592957, |
| "learning_rate": 0.0005711321084864391, |
| "loss": 3.7608, |
| "step": 8350 |
| }, |
| { |
| "epoch": 2.448291943570013, |
| "grad_norm": 0.35512813925743103, |
| "learning_rate": 0.0005709571303587051, |
| "loss": 3.7473, |
| "step": 8400 |
| }, |
| { |
| "epoch": 2.4628658038941356, |
| "grad_norm": 0.3175588846206665, |
| "learning_rate": 0.000570782152230971, |
| "loss": 3.7579, |
| "step": 8450 |
| }, |
| { |
| "epoch": 2.477439664218258, |
| "grad_norm": 0.3399229347705841, |
| "learning_rate": 0.000570607174103237, |
| "loss": 3.7446, |
| "step": 8500 |
| }, |
| { |
| "epoch": 2.4920135245423807, |
| "grad_norm": 0.32327115535736084, |
| "learning_rate": 0.000570432195975503, |
| "loss": 3.7421, |
| "step": 8550 |
| }, |
| { |
| "epoch": 2.5065873848665037, |
| "grad_norm": 0.33032727241516113, |
| "learning_rate": 0.0005702572178477689, |
| "loss": 3.7491, |
| "step": 8600 |
| }, |
| { |
| "epoch": 2.5211612451906262, |
| "grad_norm": 0.35663163661956787, |
| "learning_rate": 0.0005700822397200349, |
| "loss": 3.7538, |
| "step": 8650 |
| }, |
| { |
| "epoch": 2.535735105514749, |
| "grad_norm": 0.31177642941474915, |
| "learning_rate": 0.0005699072615923009, |
| "loss": 3.7524, |
| "step": 8700 |
| }, |
| { |
| "epoch": 2.5503089658388713, |
| "grad_norm": 0.3071121275424957, |
| "learning_rate": 0.0005697322834645668, |
| "loss": 3.7478, |
| "step": 8750 |
| }, |
| { |
| "epoch": 2.564882826162994, |
| "grad_norm": 0.32533368468284607, |
| "learning_rate": 0.0005695573053368328, |
| "loss": 3.7501, |
| "step": 8800 |
| }, |
| { |
| "epoch": 2.5794566864871165, |
| "grad_norm": 0.3158879280090332, |
| "learning_rate": 0.0005693823272090988, |
| "loss": 3.7423, |
| "step": 8850 |
| }, |
| { |
| "epoch": 2.5940305468112395, |
| "grad_norm": 0.3276424705982208, |
| "learning_rate": 0.0005692073490813648, |
| "loss": 3.7559, |
| "step": 8900 |
| }, |
| { |
| "epoch": 2.608604407135362, |
| "grad_norm": 0.3426973819732666, |
| "learning_rate": 0.0005690323709536307, |
| "loss": 3.7507, |
| "step": 8950 |
| }, |
| { |
| "epoch": 2.6231782674594846, |
| "grad_norm": 0.32447370886802673, |
| "learning_rate": 0.0005688573928258967, |
| "loss": 3.7488, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.6231782674594846, |
| "eval_accuracy": 0.3460520971418826, |
| "eval_loss": 3.757739305496216, |
| "eval_runtime": 53.5003, |
| "eval_samples_per_second": 310.783, |
| "eval_steps_per_second": 19.439, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.6377521277836076, |
| "grad_norm": 0.3436560034751892, |
| "learning_rate": 0.0005686824146981627, |
| "loss": 3.7321, |
| "step": 9050 |
| }, |
| { |
| "epoch": 2.65232598810773, |
| "grad_norm": 0.3185674548149109, |
| "learning_rate": 0.0005685074365704287, |
| "loss": 3.7383, |
| "step": 9100 |
| }, |
| { |
| "epoch": 2.6668998484318527, |
| "grad_norm": 0.3252021074295044, |
| "learning_rate": 0.0005683324584426946, |
| "loss": 3.7329, |
| "step": 9150 |
| }, |
| { |
| "epoch": 2.6814737087559752, |
| "grad_norm": 0.34753361344337463, |
| "learning_rate": 0.0005681574803149606, |
| "loss": 3.7418, |
| "step": 9200 |
| }, |
| { |
| "epoch": 2.696047569080098, |
| "grad_norm": 0.3484492003917694, |
| "learning_rate": 0.0005679825021872266, |
| "loss": 3.7335, |
| "step": 9250 |
| }, |
| { |
| "epoch": 2.7106214294042204, |
| "grad_norm": 0.3338894844055176, |
| "learning_rate": 0.0005678075240594926, |
| "loss": 3.7371, |
| "step": 9300 |
| }, |
| { |
| "epoch": 2.7251952897283434, |
| "grad_norm": 0.3270379602909088, |
| "learning_rate": 0.0005676325459317584, |
| "loss": 3.7252, |
| "step": 9350 |
| }, |
| { |
| "epoch": 2.739769150052466, |
| "grad_norm": 0.3320631682872772, |
| "learning_rate": 0.0005674575678040244, |
| "loss": 3.7412, |
| "step": 9400 |
| }, |
| { |
| "epoch": 2.7543430103765885, |
| "grad_norm": 0.3204483091831207, |
| "learning_rate": 0.0005672825896762904, |
| "loss": 3.7289, |
| "step": 9450 |
| }, |
| { |
| "epoch": 2.7689168707007115, |
| "grad_norm": 0.3120351731777191, |
| "learning_rate": 0.0005671076115485563, |
| "loss": 3.7382, |
| "step": 9500 |
| }, |
| { |
| "epoch": 2.783490731024834, |
| "grad_norm": 0.32004034519195557, |
| "learning_rate": 0.0005669326334208223, |
| "loss": 3.7376, |
| "step": 9550 |
| }, |
| { |
| "epoch": 2.7980645913489566, |
| "grad_norm": 0.3711410164833069, |
| "learning_rate": 0.0005667576552930883, |
| "loss": 3.7472, |
| "step": 9600 |
| }, |
| { |
| "epoch": 2.812638451673079, |
| "grad_norm": 0.32650843262672424, |
| "learning_rate": 0.0005665826771653543, |
| "loss": 3.7301, |
| "step": 9650 |
| }, |
| { |
| "epoch": 2.8272123119972017, |
| "grad_norm": 0.3227387070655823, |
| "learning_rate": 0.0005664076990376202, |
| "loss": 3.7348, |
| "step": 9700 |
| }, |
| { |
| "epoch": 2.8417861723213242, |
| "grad_norm": 0.34051623940467834, |
| "learning_rate": 0.0005662327209098862, |
| "loss": 3.7171, |
| "step": 9750 |
| }, |
| { |
| "epoch": 2.8563600326454472, |
| "grad_norm": 0.3245267868041992, |
| "learning_rate": 0.0005660577427821522, |
| "loss": 3.7254, |
| "step": 9800 |
| }, |
| { |
| "epoch": 2.87093389296957, |
| "grad_norm": 0.32455405592918396, |
| "learning_rate": 0.0005658827646544182, |
| "loss": 3.7409, |
| "step": 9850 |
| }, |
| { |
| "epoch": 2.8855077532936924, |
| "grad_norm": 0.32807657122612, |
| "learning_rate": 0.0005657077865266841, |
| "loss": 3.7419, |
| "step": 9900 |
| }, |
| { |
| "epoch": 2.9000816136178154, |
| "grad_norm": 0.33293142914772034, |
| "learning_rate": 0.0005655328083989501, |
| "loss": 3.7359, |
| "step": 9950 |
| }, |
| { |
| "epoch": 2.914655473941938, |
| "grad_norm": 0.3291257619857788, |
| "learning_rate": 0.0005653578302712161, |
| "loss": 3.7284, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.914655473941938, |
| "eval_accuracy": 0.34856280844692167, |
| "eval_loss": 3.7323226928710938, |
| "eval_runtime": 53.6003, |
| "eval_samples_per_second": 310.204, |
| "eval_steps_per_second": 19.403, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.9292293342660605, |
| "grad_norm": 0.31554660201072693, |
| "learning_rate": 0.0005651828521434821, |
| "loss": 3.7178, |
| "step": 10050 |
| }, |
| { |
| "epoch": 2.943803194590183, |
| "grad_norm": 0.3339604437351227, |
| "learning_rate": 0.000565007874015748, |
| "loss": 3.7461, |
| "step": 10100 |
| }, |
| { |
| "epoch": 2.9583770549143056, |
| "grad_norm": 0.31859248876571655, |
| "learning_rate": 0.000564832895888014, |
| "loss": 3.7223, |
| "step": 10150 |
| }, |
| { |
| "epoch": 2.972950915238428, |
| "grad_norm": 0.3081834316253662, |
| "learning_rate": 0.00056465791776028, |
| "loss": 3.7252, |
| "step": 10200 |
| }, |
| { |
| "epoch": 2.987524775562551, |
| "grad_norm": 0.3068216145038605, |
| "learning_rate": 0.0005644829396325459, |
| "loss": 3.7286, |
| "step": 10250 |
| }, |
| { |
| "epoch": 3.0020403404453773, |
| "grad_norm": 0.33173108100891113, |
| "learning_rate": 0.0005643079615048119, |
| "loss": 3.7107, |
| "step": 10300 |
| }, |
| { |
| "epoch": 3.0166142007695, |
| "grad_norm": 0.3206230700016022, |
| "learning_rate": 0.0005641329833770779, |
| "loss": 3.6135, |
| "step": 10350 |
| }, |
| { |
| "epoch": 3.0311880610936224, |
| "grad_norm": 0.3366788327693939, |
| "learning_rate": 0.0005639580052493437, |
| "loss": 3.6257, |
| "step": 10400 |
| }, |
| { |
| "epoch": 3.045761921417745, |
| "grad_norm": 0.31224706768989563, |
| "learning_rate": 0.0005637830271216097, |
| "loss": 3.6076, |
| "step": 10450 |
| }, |
| { |
| "epoch": 3.060335781741868, |
| "grad_norm": 0.345214307308197, |
| "learning_rate": 0.0005636080489938757, |
| "loss": 3.6282, |
| "step": 10500 |
| }, |
| { |
| "epoch": 3.0749096420659905, |
| "grad_norm": 0.3326195478439331, |
| "learning_rate": 0.0005634330708661417, |
| "loss": 3.6249, |
| "step": 10550 |
| }, |
| { |
| "epoch": 3.089483502390113, |
| "grad_norm": 0.3294229507446289, |
| "learning_rate": 0.0005632580927384076, |
| "loss": 3.6408, |
| "step": 10600 |
| }, |
| { |
| "epoch": 3.1040573627142356, |
| "grad_norm": 0.32834747433662415, |
| "learning_rate": 0.0005630831146106736, |
| "loss": 3.6255, |
| "step": 10650 |
| }, |
| { |
| "epoch": 3.1186312230383586, |
| "grad_norm": 0.34953683614730835, |
| "learning_rate": 0.0005629081364829396, |
| "loss": 3.6312, |
| "step": 10700 |
| }, |
| { |
| "epoch": 3.133205083362481, |
| "grad_norm": 0.3384673595428467, |
| "learning_rate": 0.0005627331583552055, |
| "loss": 3.6308, |
| "step": 10750 |
| }, |
| { |
| "epoch": 3.1477789436866037, |
| "grad_norm": 0.3406563699245453, |
| "learning_rate": 0.0005625581802274715, |
| "loss": 3.6487, |
| "step": 10800 |
| }, |
| { |
| "epoch": 3.1623528040107263, |
| "grad_norm": 0.325390100479126, |
| "learning_rate": 0.0005623832020997375, |
| "loss": 3.6447, |
| "step": 10850 |
| }, |
| { |
| "epoch": 3.176926664334849, |
| "grad_norm": 0.32227933406829834, |
| "learning_rate": 0.0005622082239720034, |
| "loss": 3.6264, |
| "step": 10900 |
| }, |
| { |
| "epoch": 3.191500524658972, |
| "grad_norm": 0.3505759835243225, |
| "learning_rate": 0.0005620332458442694, |
| "loss": 3.6376, |
| "step": 10950 |
| }, |
| { |
| "epoch": 3.2060743849830944, |
| "grad_norm": 0.3448062241077423, |
| "learning_rate": 0.0005618582677165354, |
| "loss": 3.6425, |
| "step": 11000 |
| }, |
| { |
| "epoch": 3.2060743849830944, |
| "eval_accuracy": 0.3505417649387146, |
| "eval_loss": 3.7195873260498047, |
| "eval_runtime": 53.6091, |
| "eval_samples_per_second": 310.152, |
| "eval_steps_per_second": 19.4, |
| "step": 11000 |
| }, |
| { |
| "epoch": 3.220648245307217, |
| "grad_norm": 0.34422972798347473, |
| "learning_rate": 0.0005616832895888013, |
| "loss": 3.6545, |
| "step": 11050 |
| }, |
| { |
| "epoch": 3.2352221056313395, |
| "grad_norm": 0.3293432295322418, |
| "learning_rate": 0.0005615083114610673, |
| "loss": 3.6583, |
| "step": 11100 |
| }, |
| { |
| "epoch": 3.249795965955462, |
| "grad_norm": 0.30929455161094666, |
| "learning_rate": 0.0005613333333333333, |
| "loss": 3.6267, |
| "step": 11150 |
| }, |
| { |
| "epoch": 3.264369826279585, |
| "grad_norm": 0.3245292603969574, |
| "learning_rate": 0.0005611583552055992, |
| "loss": 3.6452, |
| "step": 11200 |
| }, |
| { |
| "epoch": 3.2789436866037076, |
| "grad_norm": 0.3093254268169403, |
| "learning_rate": 0.0005609833770778652, |
| "loss": 3.6332, |
| "step": 11250 |
| }, |
| { |
| "epoch": 3.29351754692783, |
| "grad_norm": 0.31387147307395935, |
| "learning_rate": 0.0005608083989501312, |
| "loss": 3.6445, |
| "step": 11300 |
| }, |
| { |
| "epoch": 3.3080914072519527, |
| "grad_norm": 0.3269018828868866, |
| "learning_rate": 0.0005606334208223972, |
| "loss": 3.6374, |
| "step": 11350 |
| }, |
| { |
| "epoch": 3.3226652675760757, |
| "grad_norm": 0.3166932463645935, |
| "learning_rate": 0.000560458442694663, |
| "loss": 3.6408, |
| "step": 11400 |
| }, |
| { |
| "epoch": 3.3372391279001983, |
| "grad_norm": 0.32048463821411133, |
| "learning_rate": 0.000560283464566929, |
| "loss": 3.6529, |
| "step": 11450 |
| }, |
| { |
| "epoch": 3.351812988224321, |
| "grad_norm": 0.3116978108882904, |
| "learning_rate": 0.000560108486439195, |
| "loss": 3.6401, |
| "step": 11500 |
| }, |
| { |
| "epoch": 3.3663868485484434, |
| "grad_norm": 0.32269567251205444, |
| "learning_rate": 0.000559933508311461, |
| "loss": 3.6411, |
| "step": 11550 |
| }, |
| { |
| "epoch": 3.380960708872566, |
| "grad_norm": 0.3213954269886017, |
| "learning_rate": 0.0005597585301837269, |
| "loss": 3.6531, |
| "step": 11600 |
| }, |
| { |
| "epoch": 3.395534569196689, |
| "grad_norm": 0.3335840404033661, |
| "learning_rate": 0.0005595835520559929, |
| "loss": 3.656, |
| "step": 11650 |
| }, |
| { |
| "epoch": 3.4101084295208115, |
| "grad_norm": 0.31643956899642944, |
| "learning_rate": 0.0005594085739282589, |
| "loss": 3.6471, |
| "step": 11700 |
| }, |
| { |
| "epoch": 3.424682289844934, |
| "grad_norm": 0.33030158281326294, |
| "learning_rate": 0.0005592335958005249, |
| "loss": 3.6499, |
| "step": 11750 |
| }, |
| { |
| "epoch": 3.4392561501690566, |
| "grad_norm": 0.32754048705101013, |
| "learning_rate": 0.0005590586176727908, |
| "loss": 3.6535, |
| "step": 11800 |
| }, |
| { |
| "epoch": 3.4538300104931796, |
| "grad_norm": 0.3218769133090973, |
| "learning_rate": 0.0005588836395450568, |
| "loss": 3.6421, |
| "step": 11850 |
| }, |
| { |
| "epoch": 3.468403870817302, |
| "grad_norm": 0.30800119042396545, |
| "learning_rate": 0.0005587086614173228, |
| "loss": 3.653, |
| "step": 11900 |
| }, |
| { |
| "epoch": 3.4829777311414247, |
| "grad_norm": 0.3221539556980133, |
| "learning_rate": 0.0005585336832895888, |
| "loss": 3.6455, |
| "step": 11950 |
| }, |
| { |
| "epoch": 3.4975515914655473, |
| "grad_norm": 0.3269602656364441, |
| "learning_rate": 0.0005583587051618547, |
| "loss": 3.6524, |
| "step": 12000 |
| }, |
| { |
| "epoch": 3.4975515914655473, |
| "eval_accuracy": 0.3524415113841785, |
| "eval_loss": 3.7039685249328613, |
| "eval_runtime": 53.6734, |
| "eval_samples_per_second": 309.781, |
| "eval_steps_per_second": 19.376, |
| "step": 12000 |
| }, |
| { |
| "epoch": 3.51212545178967, |
| "grad_norm": 0.33083227276802063, |
| "learning_rate": 0.0005581837270341207, |
| "loss": 3.6433, |
| "step": 12050 |
| }, |
| { |
| "epoch": 3.526699312113793, |
| "grad_norm": 0.3274780213832855, |
| "learning_rate": 0.0005580087489063867, |
| "loss": 3.6548, |
| "step": 12100 |
| }, |
| { |
| "epoch": 3.5412731724379154, |
| "grad_norm": 0.3086439073085785, |
| "learning_rate": 0.0005578337707786526, |
| "loss": 3.6417, |
| "step": 12150 |
| }, |
| { |
| "epoch": 3.555847032762038, |
| "grad_norm": 0.313618928194046, |
| "learning_rate": 0.0005576587926509186, |
| "loss": 3.6569, |
| "step": 12200 |
| }, |
| { |
| "epoch": 3.5704208930861605, |
| "grad_norm": 0.32694700360298157, |
| "learning_rate": 0.0005574838145231846, |
| "loss": 3.6413, |
| "step": 12250 |
| }, |
| { |
| "epoch": 3.5849947534102835, |
| "grad_norm": 0.33732131123542786, |
| "learning_rate": 0.0005573088363954506, |
| "loss": 3.6538, |
| "step": 12300 |
| }, |
| { |
| "epoch": 3.599568613734406, |
| "grad_norm": 0.3250352740287781, |
| "learning_rate": 0.0005571338582677165, |
| "loss": 3.6577, |
| "step": 12350 |
| }, |
| { |
| "epoch": 3.6141424740585286, |
| "grad_norm": 0.32333865761756897, |
| "learning_rate": 0.0005569588801399825, |
| "loss": 3.6445, |
| "step": 12400 |
| }, |
| { |
| "epoch": 3.628716334382651, |
| "grad_norm": 0.34342822432518005, |
| "learning_rate": 0.0005567839020122485, |
| "loss": 3.644, |
| "step": 12450 |
| }, |
| { |
| "epoch": 3.6432901947067737, |
| "grad_norm": 0.3159576654434204, |
| "learning_rate": 0.0005566089238845145, |
| "loss": 3.6499, |
| "step": 12500 |
| }, |
| { |
| "epoch": 3.6578640550308967, |
| "grad_norm": 0.3476894795894623, |
| "learning_rate": 0.0005564339457567803, |
| "loss": 3.6544, |
| "step": 12550 |
| }, |
| { |
| "epoch": 3.6724379153550193, |
| "grad_norm": 0.3275102972984314, |
| "learning_rate": 0.0005562589676290463, |
| "loss": 3.6575, |
| "step": 12600 |
| }, |
| { |
| "epoch": 3.687011775679142, |
| "grad_norm": 0.3174804151058197, |
| "learning_rate": 0.0005560839895013123, |
| "loss": 3.6549, |
| "step": 12650 |
| }, |
| { |
| "epoch": 3.7015856360032644, |
| "grad_norm": 0.316368043422699, |
| "learning_rate": 0.0005559090113735782, |
| "loss": 3.6366, |
| "step": 12700 |
| }, |
| { |
| "epoch": 3.7161594963273874, |
| "grad_norm": 0.32830533385276794, |
| "learning_rate": 0.0005557340332458442, |
| "loss": 3.6528, |
| "step": 12750 |
| }, |
| { |
| "epoch": 3.73073335665151, |
| "grad_norm": 0.31506970524787903, |
| "learning_rate": 0.0005555590551181102, |
| "loss": 3.6619, |
| "step": 12800 |
| }, |
| { |
| "epoch": 3.7453072169756325, |
| "grad_norm": 0.32358768582344055, |
| "learning_rate": 0.0005553840769903761, |
| "loss": 3.6497, |
| "step": 12850 |
| }, |
| { |
| "epoch": 3.759881077299755, |
| "grad_norm": 0.31227579712867737, |
| "learning_rate": 0.0005552090988626421, |
| "loss": 3.6473, |
| "step": 12900 |
| }, |
| { |
| "epoch": 3.7744549376238776, |
| "grad_norm": 0.3337423801422119, |
| "learning_rate": 0.0005550341207349081, |
| "loss": 3.6468, |
| "step": 12950 |
| }, |
| { |
| "epoch": 3.7890287979480006, |
| "grad_norm": 0.3123199939727783, |
| "learning_rate": 0.000554859142607174, |
| "loss": 3.6604, |
| "step": 13000 |
| }, |
| { |
| "epoch": 3.7890287979480006, |
| "eval_accuracy": 0.3538631728249045, |
| "eval_loss": 3.6831274032592773, |
| "eval_runtime": 53.4292, |
| "eval_samples_per_second": 311.197, |
| "eval_steps_per_second": 19.465, |
| "step": 13000 |
| }, |
| { |
| "epoch": 3.803602658272123, |
| "grad_norm": 0.3376936912536621, |
| "learning_rate": 0.00055468416447944, |
| "loss": 3.6393, |
| "step": 13050 |
| }, |
| { |
| "epoch": 3.8181765185962457, |
| "grad_norm": 0.3237517476081848, |
| "learning_rate": 0.000554509186351706, |
| "loss": 3.6467, |
| "step": 13100 |
| }, |
| { |
| "epoch": 3.8327503789203683, |
| "grad_norm": 0.30755048990249634, |
| "learning_rate": 0.000554334208223972, |
| "loss": 3.6435, |
| "step": 13150 |
| }, |
| { |
| "epoch": 3.8473242392444913, |
| "grad_norm": 0.31253567337989807, |
| "learning_rate": 0.0005541592300962379, |
| "loss": 3.6334, |
| "step": 13200 |
| }, |
| { |
| "epoch": 3.861898099568614, |
| "grad_norm": 0.32961559295654297, |
| "learning_rate": 0.0005539842519685039, |
| "loss": 3.6452, |
| "step": 13250 |
| }, |
| { |
| "epoch": 3.8764719598927364, |
| "grad_norm": 0.30204135179519653, |
| "learning_rate": 0.0005538092738407699, |
| "loss": 3.6256, |
| "step": 13300 |
| }, |
| { |
| "epoch": 3.891045820216859, |
| "grad_norm": 0.342984676361084, |
| "learning_rate": 0.0005536342957130358, |
| "loss": 3.6601, |
| "step": 13350 |
| }, |
| { |
| "epoch": 3.9056196805409815, |
| "grad_norm": 0.3146904706954956, |
| "learning_rate": 0.0005534593175853018, |
| "loss": 3.6473, |
| "step": 13400 |
| }, |
| { |
| "epoch": 3.920193540865104, |
| "grad_norm": 0.331033319234848, |
| "learning_rate": 0.0005532843394575678, |
| "loss": 3.6499, |
| "step": 13450 |
| }, |
| { |
| "epoch": 3.934767401189227, |
| "grad_norm": 0.32355043292045593, |
| "learning_rate": 0.0005531093613298337, |
| "loss": 3.6355, |
| "step": 13500 |
| }, |
| { |
| "epoch": 3.9493412615133496, |
| "grad_norm": 0.3244037330150604, |
| "learning_rate": 0.0005529343832020997, |
| "loss": 3.6424, |
| "step": 13550 |
| }, |
| { |
| "epoch": 3.963915121837472, |
| "grad_norm": 0.33598390221595764, |
| "learning_rate": 0.0005527594050743656, |
| "loss": 3.6394, |
| "step": 13600 |
| }, |
| { |
| "epoch": 3.978488982161595, |
| "grad_norm": 0.3128249943256378, |
| "learning_rate": 0.0005525844269466316, |
| "loss": 3.6561, |
| "step": 13650 |
| }, |
| { |
| "epoch": 3.9930628424857177, |
| "grad_norm": 0.3237094581127167, |
| "learning_rate": 0.0005524094488188975, |
| "loss": 3.6546, |
| "step": 13700 |
| }, |
| { |
| "epoch": 4.007578407368544, |
| "grad_norm": 0.3438703417778015, |
| "learning_rate": 0.0005522344706911635, |
| "loss": 3.5822, |
| "step": 13750 |
| }, |
| { |
| "epoch": 4.022152267692666, |
| "grad_norm": 0.3437328338623047, |
| "learning_rate": 0.0005520594925634295, |
| "loss": 3.5292, |
| "step": 13800 |
| }, |
| { |
| "epoch": 4.036726128016789, |
| "grad_norm": 0.32742053270339966, |
| "learning_rate": 0.0005518845144356954, |
| "loss": 3.5379, |
| "step": 13850 |
| }, |
| { |
| "epoch": 4.0512999883409115, |
| "grad_norm": 0.3464297652244568, |
| "learning_rate": 0.0005517095363079614, |
| "loss": 3.5425, |
| "step": 13900 |
| }, |
| { |
| "epoch": 4.065873848665034, |
| "grad_norm": 0.3182748854160309, |
| "learning_rate": 0.0005515345581802274, |
| "loss": 3.5653, |
| "step": 13950 |
| }, |
| { |
| "epoch": 4.080447708989157, |
| "grad_norm": 0.33514368534088135, |
| "learning_rate": 0.0005513595800524934, |
| "loss": 3.5523, |
| "step": 14000 |
| }, |
| { |
| "epoch": 4.080447708989157, |
| "eval_accuracy": 0.35556601227555634, |
| "eval_loss": 3.676593542098999, |
| "eval_runtime": 53.8951, |
| "eval_samples_per_second": 308.507, |
| "eval_steps_per_second": 19.297, |
| "step": 14000 |
| }, |
| { |
| "epoch": 4.09502156931328, |
| "grad_norm": 0.3363269865512848, |
| "learning_rate": 0.0005511846019247593, |
| "loss": 3.5483, |
| "step": 14050 |
| }, |
| { |
| "epoch": 4.109595429637403, |
| "grad_norm": 0.3380267322063446, |
| "learning_rate": 0.0005510096237970253, |
| "loss": 3.5424, |
| "step": 14100 |
| }, |
| { |
| "epoch": 4.124169289961525, |
| "grad_norm": 0.3331514894962311, |
| "learning_rate": 0.0005508346456692913, |
| "loss": 3.5427, |
| "step": 14150 |
| }, |
| { |
| "epoch": 4.138743150285648, |
| "grad_norm": 0.3334326148033142, |
| "learning_rate": 0.0005506596675415573, |
| "loss": 3.5377, |
| "step": 14200 |
| }, |
| { |
| "epoch": 4.15331701060977, |
| "grad_norm": 0.3300160765647888, |
| "learning_rate": 0.0005504846894138232, |
| "loss": 3.5603, |
| "step": 14250 |
| }, |
| { |
| "epoch": 4.167890870933893, |
| "grad_norm": 0.31219443678855896, |
| "learning_rate": 0.0005503097112860892, |
| "loss": 3.5599, |
| "step": 14300 |
| }, |
| { |
| "epoch": 4.182464731258015, |
| "grad_norm": 0.33152374625205994, |
| "learning_rate": 0.0005501347331583552, |
| "loss": 3.5656, |
| "step": 14350 |
| }, |
| { |
| "epoch": 4.197038591582138, |
| "grad_norm": 0.36081743240356445, |
| "learning_rate": 0.0005499597550306212, |
| "loss": 3.5581, |
| "step": 14400 |
| }, |
| { |
| "epoch": 4.2116124519062605, |
| "grad_norm": 0.3012588620185852, |
| "learning_rate": 0.0005497847769028871, |
| "loss": 3.5553, |
| "step": 14450 |
| }, |
| { |
| "epoch": 4.226186312230384, |
| "grad_norm": 0.33108580112457275, |
| "learning_rate": 0.0005496097987751531, |
| "loss": 3.5687, |
| "step": 14500 |
| }, |
| { |
| "epoch": 4.2407601725545065, |
| "grad_norm": 0.33463039994239807, |
| "learning_rate": 0.0005494348206474191, |
| "loss": 3.5635, |
| "step": 14550 |
| }, |
| { |
| "epoch": 4.255334032878629, |
| "grad_norm": 0.34705042839050293, |
| "learning_rate": 0.0005492598425196851, |
| "loss": 3.5607, |
| "step": 14600 |
| }, |
| { |
| "epoch": 4.269907893202752, |
| "grad_norm": 0.34977859258651733, |
| "learning_rate": 0.000549084864391951, |
| "loss": 3.5714, |
| "step": 14650 |
| }, |
| { |
| "epoch": 4.284481753526874, |
| "grad_norm": 0.3136276602745056, |
| "learning_rate": 0.000548909886264217, |
| "loss": 3.5731, |
| "step": 14700 |
| }, |
| { |
| "epoch": 4.299055613850997, |
| "grad_norm": 0.3151850700378418, |
| "learning_rate": 0.000548734908136483, |
| "loss": 3.5722, |
| "step": 14750 |
| }, |
| { |
| "epoch": 4.313629474175119, |
| "grad_norm": 0.3240025043487549, |
| "learning_rate": 0.0005485599300087488, |
| "loss": 3.5809, |
| "step": 14800 |
| }, |
| { |
| "epoch": 4.328203334499242, |
| "grad_norm": 0.3266666531562805, |
| "learning_rate": 0.0005483849518810148, |
| "loss": 3.5757, |
| "step": 14850 |
| }, |
| { |
| "epoch": 4.342777194823364, |
| "grad_norm": 0.3137483596801758, |
| "learning_rate": 0.0005482099737532808, |
| "loss": 3.5655, |
| "step": 14900 |
| }, |
| { |
| "epoch": 4.357351055147488, |
| "grad_norm": 0.32489004731178284, |
| "learning_rate": 0.0005480349956255468, |
| "loss": 3.582, |
| "step": 14950 |
| }, |
| { |
| "epoch": 4.37192491547161, |
| "grad_norm": 0.3228713572025299, |
| "learning_rate": 0.0005478600174978127, |
| "loss": 3.565, |
| "step": 15000 |
| }, |
| { |
| "epoch": 4.37192491547161, |
| "eval_accuracy": 0.3562637197861635, |
| "eval_loss": 3.6654460430145264, |
| "eval_runtime": 53.5507, |
| "eval_samples_per_second": 310.491, |
| "eval_steps_per_second": 19.421, |
| "step": 15000 |
| }, |
| { |
| "epoch": 4.386498775795733, |
| "grad_norm": 0.3213447332382202, |
| "learning_rate": 0.0005476850393700787, |
| "loss": 3.5723, |
| "step": 15050 |
| }, |
| { |
| "epoch": 4.4010726361198556, |
| "grad_norm": 0.3222188949584961, |
| "learning_rate": 0.0005475100612423447, |
| "loss": 3.579, |
| "step": 15100 |
| }, |
| { |
| "epoch": 4.415646496443978, |
| "grad_norm": 0.3166174590587616, |
| "learning_rate": 0.0005473350831146106, |
| "loss": 3.5841, |
| "step": 15150 |
| }, |
| { |
| "epoch": 4.430220356768101, |
| "grad_norm": 0.33479446172714233, |
| "learning_rate": 0.0005471601049868766, |
| "loss": 3.5844, |
| "step": 15200 |
| }, |
| { |
| "epoch": 4.444794217092223, |
| "grad_norm": 0.33594754338264465, |
| "learning_rate": 0.0005469851268591426, |
| "loss": 3.5648, |
| "step": 15250 |
| }, |
| { |
| "epoch": 4.459368077416346, |
| "grad_norm": 0.33211374282836914, |
| "learning_rate": 0.0005468101487314085, |
| "loss": 3.5795, |
| "step": 15300 |
| }, |
| { |
| "epoch": 4.473941937740468, |
| "grad_norm": 0.34345540404319763, |
| "learning_rate": 0.0005466351706036745, |
| "loss": 3.5807, |
| "step": 15350 |
| }, |
| { |
| "epoch": 4.488515798064592, |
| "grad_norm": 0.3405556380748749, |
| "learning_rate": 0.0005464601924759405, |
| "loss": 3.5865, |
| "step": 15400 |
| }, |
| { |
| "epoch": 4.503089658388714, |
| "grad_norm": 0.320850133895874, |
| "learning_rate": 0.0005462852143482064, |
| "loss": 3.5825, |
| "step": 15450 |
| }, |
| { |
| "epoch": 4.517663518712837, |
| "grad_norm": 0.32576000690460205, |
| "learning_rate": 0.0005461102362204724, |
| "loss": 3.5739, |
| "step": 15500 |
| }, |
| { |
| "epoch": 4.5322373790369594, |
| "grad_norm": 0.312652051448822, |
| "learning_rate": 0.0005459352580927384, |
| "loss": 3.5855, |
| "step": 15550 |
| }, |
| { |
| "epoch": 4.546811239361082, |
| "grad_norm": 0.32264983654022217, |
| "learning_rate": 0.0005457602799650043, |
| "loss": 3.5852, |
| "step": 15600 |
| }, |
| { |
| "epoch": 4.561385099685205, |
| "grad_norm": 0.3088845908641815, |
| "learning_rate": 0.0005455853018372703, |
| "loss": 3.5819, |
| "step": 15650 |
| }, |
| { |
| "epoch": 4.575958960009327, |
| "grad_norm": 0.3305530548095703, |
| "learning_rate": 0.0005454103237095363, |
| "loss": 3.5874, |
| "step": 15700 |
| }, |
| { |
| "epoch": 4.59053282033345, |
| "grad_norm": 0.32807108759880066, |
| "learning_rate": 0.0005452353455818022, |
| "loss": 3.5757, |
| "step": 15750 |
| }, |
| { |
| "epoch": 4.605106680657572, |
| "grad_norm": 0.32480913400650024, |
| "learning_rate": 0.0005450603674540681, |
| "loss": 3.587, |
| "step": 15800 |
| }, |
| { |
| "epoch": 4.619680540981696, |
| "grad_norm": 0.3335857093334198, |
| "learning_rate": 0.0005448853893263341, |
| "loss": 3.581, |
| "step": 15850 |
| }, |
| { |
| "epoch": 4.634254401305818, |
| "grad_norm": 0.33189520239830017, |
| "learning_rate": 0.0005447104111986001, |
| "loss": 3.5908, |
| "step": 15900 |
| }, |
| { |
| "epoch": 4.648828261629941, |
| "grad_norm": 0.3243214190006256, |
| "learning_rate": 0.000544535433070866, |
| "loss": 3.5852, |
| "step": 15950 |
| }, |
| { |
| "epoch": 4.663402121954063, |
| "grad_norm": 0.32132813334465027, |
| "learning_rate": 0.000544360454943132, |
| "loss": 3.5805, |
| "step": 16000 |
| }, |
| { |
| "epoch": 4.663402121954063, |
| "eval_accuracy": 0.3573717188591823, |
| "eval_loss": 3.65263032913208, |
| "eval_runtime": 53.5114, |
| "eval_samples_per_second": 310.719, |
| "eval_steps_per_second": 19.435, |
| "step": 16000 |
| }, |
| { |
| "epoch": 4.677975982278186, |
| "grad_norm": 0.3119511604309082, |
| "learning_rate": 0.000544185476815398, |
| "loss": 3.598, |
| "step": 16050 |
| }, |
| { |
| "epoch": 4.6925498426023085, |
| "grad_norm": 0.31116464734077454, |
| "learning_rate": 0.000544010498687664, |
| "loss": 3.5789, |
| "step": 16100 |
| }, |
| { |
| "epoch": 4.707123702926431, |
| "grad_norm": 0.3112083673477173, |
| "learning_rate": 0.0005438355205599299, |
| "loss": 3.5819, |
| "step": 16150 |
| }, |
| { |
| "epoch": 4.721697563250554, |
| "grad_norm": 0.3235861361026764, |
| "learning_rate": 0.0005436605424321959, |
| "loss": 3.5741, |
| "step": 16200 |
| }, |
| { |
| "epoch": 4.736271423574676, |
| "grad_norm": 0.3252529203891754, |
| "learning_rate": 0.0005434855643044619, |
| "loss": 3.5808, |
| "step": 16250 |
| }, |
| { |
| "epoch": 4.7508452838988, |
| "grad_norm": 0.31969958543777466, |
| "learning_rate": 0.0005433105861767279, |
| "loss": 3.5818, |
| "step": 16300 |
| }, |
| { |
| "epoch": 4.765419144222922, |
| "grad_norm": 0.303388774394989, |
| "learning_rate": 0.0005431356080489938, |
| "loss": 3.5758, |
| "step": 16350 |
| }, |
| { |
| "epoch": 4.779993004547045, |
| "grad_norm": 0.3366004228591919, |
| "learning_rate": 0.0005429606299212598, |
| "loss": 3.5768, |
| "step": 16400 |
| }, |
| { |
| "epoch": 4.794566864871167, |
| "grad_norm": 0.3353453278541565, |
| "learning_rate": 0.0005427856517935258, |
| "loss": 3.5856, |
| "step": 16450 |
| }, |
| { |
| "epoch": 4.80914072519529, |
| "grad_norm": 0.33559808135032654, |
| "learning_rate": 0.0005426106736657917, |
| "loss": 3.5909, |
| "step": 16500 |
| }, |
| { |
| "epoch": 4.823714585519412, |
| "grad_norm": 0.3218340575695038, |
| "learning_rate": 0.0005424356955380577, |
| "loss": 3.5981, |
| "step": 16550 |
| }, |
| { |
| "epoch": 4.838288445843535, |
| "grad_norm": 0.31812185049057007, |
| "learning_rate": 0.0005422607174103237, |
| "loss": 3.5841, |
| "step": 16600 |
| }, |
| { |
| "epoch": 4.8528623061676575, |
| "grad_norm": 0.31284356117248535, |
| "learning_rate": 0.0005420857392825897, |
| "loss": 3.5834, |
| "step": 16650 |
| }, |
| { |
| "epoch": 4.86743616649178, |
| "grad_norm": 0.30119526386260986, |
| "learning_rate": 0.0005419107611548556, |
| "loss": 3.5854, |
| "step": 16700 |
| }, |
| { |
| "epoch": 4.8820100268159035, |
| "grad_norm": 0.32048872113227844, |
| "learning_rate": 0.0005417357830271216, |
| "loss": 3.5856, |
| "step": 16750 |
| }, |
| { |
| "epoch": 4.896583887140026, |
| "grad_norm": 0.32042795419692993, |
| "learning_rate": 0.0005415608048993876, |
| "loss": 3.5818, |
| "step": 16800 |
| }, |
| { |
| "epoch": 4.911157747464149, |
| "grad_norm": 0.33472687005996704, |
| "learning_rate": 0.0005413858267716535, |
| "loss": 3.5788, |
| "step": 16850 |
| }, |
| { |
| "epoch": 4.925731607788271, |
| "grad_norm": 0.320378839969635, |
| "learning_rate": 0.0005412108486439194, |
| "loss": 3.5993, |
| "step": 16900 |
| }, |
| { |
| "epoch": 4.940305468112394, |
| "grad_norm": 0.35995224118232727, |
| "learning_rate": 0.0005410358705161854, |
| "loss": 3.5888, |
| "step": 16950 |
| }, |
| { |
| "epoch": 4.954879328436516, |
| "grad_norm": 0.3255212604999542, |
| "learning_rate": 0.0005408608923884514, |
| "loss": 3.5797, |
| "step": 17000 |
| }, |
| { |
| "epoch": 4.954879328436516, |
| "eval_accuracy": 0.35837108364875137, |
| "eval_loss": 3.639648199081421, |
| "eval_runtime": 53.7249, |
| "eval_samples_per_second": 309.484, |
| "eval_steps_per_second": 19.358, |
| "step": 17000 |
| }, |
| { |
| "epoch": 4.969453188760639, |
| "grad_norm": 0.3346654176712036, |
| "learning_rate": 0.0005406859142607174, |
| "loss": 3.5722, |
| "step": 17050 |
| }, |
| { |
| "epoch": 4.984027049084761, |
| "grad_norm": 0.3161713778972626, |
| "learning_rate": 0.0005405109361329833, |
| "loss": 3.5805, |
| "step": 17100 |
| }, |
| { |
| "epoch": 4.998600909408884, |
| "grad_norm": 0.30994942784309387, |
| "learning_rate": 0.0005403359580052493, |
| "loss": 3.5784, |
| "step": 17150 |
| }, |
| { |
| "epoch": 5.01311647429171, |
| "grad_norm": 0.32207217812538147, |
| "learning_rate": 0.0005401609798775153, |
| "loss": 3.4704, |
| "step": 17200 |
| }, |
| { |
| "epoch": 5.0276903346158335, |
| "grad_norm": 0.3239103853702545, |
| "learning_rate": 0.0005399860017497813, |
| "loss": 3.477, |
| "step": 17250 |
| }, |
| { |
| "epoch": 5.042264194939956, |
| "grad_norm": 0.31806662678718567, |
| "learning_rate": 0.0005398110236220472, |
| "loss": 3.4855, |
| "step": 17300 |
| }, |
| { |
| "epoch": 5.056838055264079, |
| "grad_norm": 0.32169944047927856, |
| "learning_rate": 0.0005396360454943132, |
| "loss": 3.4785, |
| "step": 17350 |
| }, |
| { |
| "epoch": 5.071411915588201, |
| "grad_norm": 0.35187849402427673, |
| "learning_rate": 0.0005394610673665792, |
| "loss": 3.4871, |
| "step": 17400 |
| }, |
| { |
| "epoch": 5.085985775912324, |
| "grad_norm": 0.3193110227584839, |
| "learning_rate": 0.0005392860892388451, |
| "loss": 3.484, |
| "step": 17450 |
| }, |
| { |
| "epoch": 5.100559636236446, |
| "grad_norm": 0.31753817200660706, |
| "learning_rate": 0.0005391111111111111, |
| "loss": 3.4912, |
| "step": 17500 |
| }, |
| { |
| "epoch": 5.115133496560569, |
| "grad_norm": 0.31478115916252136, |
| "learning_rate": 0.0005389361329833771, |
| "loss": 3.4876, |
| "step": 17550 |
| }, |
| { |
| "epoch": 5.129707356884691, |
| "grad_norm": 0.32835716009140015, |
| "learning_rate": 0.000538761154855643, |
| "loss": 3.4917, |
| "step": 17600 |
| }, |
| { |
| "epoch": 5.144281217208814, |
| "grad_norm": 0.31704792380332947, |
| "learning_rate": 0.000538586176727909, |
| "loss": 3.498, |
| "step": 17650 |
| }, |
| { |
| "epoch": 5.158855077532937, |
| "grad_norm": 0.3236917555332184, |
| "learning_rate": 0.000538411198600175, |
| "loss": 3.5003, |
| "step": 17700 |
| }, |
| { |
| "epoch": 5.17342893785706, |
| "grad_norm": 0.34724608063697815, |
| "learning_rate": 0.0005382362204724409, |
| "loss": 3.4988, |
| "step": 17750 |
| }, |
| { |
| "epoch": 5.1880027981811825, |
| "grad_norm": 0.32653898000717163, |
| "learning_rate": 0.0005380612423447069, |
| "loss": 3.5061, |
| "step": 17800 |
| }, |
| { |
| "epoch": 5.202576658505305, |
| "grad_norm": 0.31315353512763977, |
| "learning_rate": 0.0005378862642169729, |
| "loss": 3.5036, |
| "step": 17850 |
| }, |
| { |
| "epoch": 5.217150518829428, |
| "grad_norm": 0.324532151222229, |
| "learning_rate": 0.0005377112860892387, |
| "loss": 3.5129, |
| "step": 17900 |
| }, |
| { |
| "epoch": 5.23172437915355, |
| "grad_norm": 0.3222633898258209, |
| "learning_rate": 0.0005375363079615047, |
| "loss": 3.5064, |
| "step": 17950 |
| }, |
| { |
| "epoch": 5.246298239477673, |
| "grad_norm": 0.3194296956062317, |
| "learning_rate": 0.0005373613298337707, |
| "loss": 3.5128, |
| "step": 18000 |
| }, |
| { |
| "epoch": 5.246298239477673, |
| "eval_accuracy": 0.35955617422302655, |
| "eval_loss": 3.6400327682495117, |
| "eval_runtime": 53.481, |
| "eval_samples_per_second": 310.896, |
| "eval_steps_per_second": 19.446, |
| "step": 18000 |
| }, |
| { |
| "epoch": 5.260872099801795, |
| "grad_norm": 0.32165059447288513, |
| "learning_rate": 0.0005371863517060366, |
| "loss": 3.5079, |
| "step": 18050 |
| }, |
| { |
| "epoch": 5.275445960125918, |
| "grad_norm": 0.35680216550827026, |
| "learning_rate": 0.0005370113735783026, |
| "loss": 3.5205, |
| "step": 18100 |
| }, |
| { |
| "epoch": 5.290019820450041, |
| "grad_norm": 0.33753862977027893, |
| "learning_rate": 0.0005368363954505686, |
| "loss": 3.5086, |
| "step": 18150 |
| }, |
| { |
| "epoch": 5.304593680774164, |
| "grad_norm": 0.34423062205314636, |
| "learning_rate": 0.0005366614173228346, |
| "loss": 3.5164, |
| "step": 18200 |
| }, |
| { |
| "epoch": 5.319167541098286, |
| "grad_norm": 0.3393954038619995, |
| "learning_rate": 0.0005364864391951005, |
| "loss": 3.5058, |
| "step": 18250 |
| }, |
| { |
| "epoch": 5.333741401422409, |
| "grad_norm": 0.3219560980796814, |
| "learning_rate": 0.0005363114610673665, |
| "loss": 3.5228, |
| "step": 18300 |
| }, |
| { |
| "epoch": 5.3483152617465315, |
| "grad_norm": 0.3105454742908478, |
| "learning_rate": 0.0005361364829396325, |
| "loss": 3.5133, |
| "step": 18350 |
| }, |
| { |
| "epoch": 5.362889122070654, |
| "grad_norm": 0.32755526900291443, |
| "learning_rate": 0.0005359615048118984, |
| "loss": 3.5169, |
| "step": 18400 |
| }, |
| { |
| "epoch": 5.377462982394777, |
| "grad_norm": 0.3297589123249054, |
| "learning_rate": 0.0005357865266841644, |
| "loss": 3.5232, |
| "step": 18450 |
| }, |
| { |
| "epoch": 5.392036842718899, |
| "grad_norm": 0.3151703178882599, |
| "learning_rate": 0.0005356115485564304, |
| "loss": 3.5214, |
| "step": 18500 |
| }, |
| { |
| "epoch": 5.406610703043022, |
| "grad_norm": 0.3276863694190979, |
| "learning_rate": 0.0005354365704286964, |
| "loss": 3.5306, |
| "step": 18550 |
| }, |
| { |
| "epoch": 5.421184563367145, |
| "grad_norm": 0.3235160708427429, |
| "learning_rate": 0.0005352615923009623, |
| "loss": 3.5263, |
| "step": 18600 |
| }, |
| { |
| "epoch": 5.435758423691268, |
| "grad_norm": 0.3278557062149048, |
| "learning_rate": 0.0005350866141732283, |
| "loss": 3.5119, |
| "step": 18650 |
| }, |
| { |
| "epoch": 5.45033228401539, |
| "grad_norm": 0.3406359553337097, |
| "learning_rate": 0.0005349116360454943, |
| "loss": 3.5305, |
| "step": 18700 |
| }, |
| { |
| "epoch": 5.464906144339513, |
| "grad_norm": 0.3346172273159027, |
| "learning_rate": 0.0005347366579177603, |
| "loss": 3.5328, |
| "step": 18750 |
| }, |
| { |
| "epoch": 5.479480004663635, |
| "grad_norm": 0.32716649770736694, |
| "learning_rate": 0.0005345616797900262, |
| "loss": 3.5297, |
| "step": 18800 |
| }, |
| { |
| "epoch": 5.494053864987758, |
| "grad_norm": 0.35766148567199707, |
| "learning_rate": 0.0005343867016622922, |
| "loss": 3.5228, |
| "step": 18850 |
| }, |
| { |
| "epoch": 5.5086277253118805, |
| "grad_norm": 0.3449589014053345, |
| "learning_rate": 0.0005342117235345582, |
| "loss": 3.53, |
| "step": 18900 |
| }, |
| { |
| "epoch": 5.523201585636003, |
| "grad_norm": 0.3134307563304901, |
| "learning_rate": 0.0005340367454068242, |
| "loss": 3.5254, |
| "step": 18950 |
| }, |
| { |
| "epoch": 5.537775445960126, |
| "grad_norm": 0.30853351950645447, |
| "learning_rate": 0.00053386176727909, |
| "loss": 3.5289, |
| "step": 19000 |
| }, |
| { |
| "epoch": 5.537775445960126, |
| "eval_accuracy": 0.3601132338801965, |
| "eval_loss": 3.63059663772583, |
| "eval_runtime": 53.6889, |
| "eval_samples_per_second": 309.692, |
| "eval_steps_per_second": 19.371, |
| "step": 19000 |
| }, |
| { |
| "epoch": 5.552349306284249, |
| "grad_norm": 0.3088659346103668, |
| "learning_rate": 0.000533686789151356, |
| "loss": 3.5352, |
| "step": 19050 |
| }, |
| { |
| "epoch": 5.566923166608372, |
| "grad_norm": 0.3493700325489044, |
| "learning_rate": 0.000533511811023622, |
| "loss": 3.5208, |
| "step": 19100 |
| }, |
| { |
| "epoch": 5.581497026932494, |
| "grad_norm": 0.3162291646003723, |
| "learning_rate": 0.000533336832895888, |
| "loss": 3.528, |
| "step": 19150 |
| }, |
| { |
| "epoch": 5.596070887256617, |
| "grad_norm": 0.32827121019363403, |
| "learning_rate": 0.0005331618547681539, |
| "loss": 3.5213, |
| "step": 19200 |
| }, |
| { |
| "epoch": 5.610644747580739, |
| "grad_norm": 0.3129236400127411, |
| "learning_rate": 0.0005329868766404199, |
| "loss": 3.53, |
| "step": 19250 |
| }, |
| { |
| "epoch": 5.625218607904862, |
| "grad_norm": 0.3162083327770233, |
| "learning_rate": 0.0005328118985126859, |
| "loss": 3.5285, |
| "step": 19300 |
| }, |
| { |
| "epoch": 5.639792468228984, |
| "grad_norm": 0.30967944860458374, |
| "learning_rate": 0.0005326369203849518, |
| "loss": 3.5365, |
| "step": 19350 |
| }, |
| { |
| "epoch": 5.654366328553107, |
| "grad_norm": 0.3239143192768097, |
| "learning_rate": 0.0005324619422572178, |
| "loss": 3.5307, |
| "step": 19400 |
| }, |
| { |
| "epoch": 5.6689401888772295, |
| "grad_norm": 0.33831366896629333, |
| "learning_rate": 0.0005322869641294838, |
| "loss": 3.53, |
| "step": 19450 |
| }, |
| { |
| "epoch": 5.683514049201353, |
| "grad_norm": 0.29488807916641235, |
| "learning_rate": 0.0005321119860017498, |
| "loss": 3.5295, |
| "step": 19500 |
| }, |
| { |
| "epoch": 5.698087909525475, |
| "grad_norm": 0.3035849332809448, |
| "learning_rate": 0.0005319370078740157, |
| "loss": 3.5397, |
| "step": 19550 |
| }, |
| { |
| "epoch": 5.712661769849598, |
| "grad_norm": 0.31908565759658813, |
| "learning_rate": 0.0005317620297462817, |
| "loss": 3.538, |
| "step": 19600 |
| }, |
| { |
| "epoch": 5.727235630173721, |
| "grad_norm": 0.33275604248046875, |
| "learning_rate": 0.0005315870516185477, |
| "loss": 3.5357, |
| "step": 19650 |
| }, |
| { |
| "epoch": 5.741809490497843, |
| "grad_norm": 0.34495431184768677, |
| "learning_rate": 0.0005314120734908137, |
| "loss": 3.5395, |
| "step": 19700 |
| }, |
| { |
| "epoch": 5.756383350821966, |
| "grad_norm": 0.3354114890098572, |
| "learning_rate": 0.0005312370953630796, |
| "loss": 3.5347, |
| "step": 19750 |
| }, |
| { |
| "epoch": 5.770957211146088, |
| "grad_norm": 0.3229857385158539, |
| "learning_rate": 0.0005310621172353456, |
| "loss": 3.5481, |
| "step": 19800 |
| }, |
| { |
| "epoch": 5.785531071470211, |
| "grad_norm": 0.3382706344127655, |
| "learning_rate": 0.0005308871391076116, |
| "loss": 3.5434, |
| "step": 19850 |
| }, |
| { |
| "epoch": 5.800104931794333, |
| "grad_norm": 0.3282252848148346, |
| "learning_rate": 0.0005307121609798775, |
| "loss": 3.5255, |
| "step": 19900 |
| }, |
| { |
| "epoch": 5.814678792118456, |
| "grad_norm": 0.3156355321407318, |
| "learning_rate": 0.0005305371828521435, |
| "loss": 3.5483, |
| "step": 19950 |
| }, |
| { |
| "epoch": 5.8292526524425785, |
| "grad_norm": 0.3342091739177704, |
| "learning_rate": 0.0005303622047244095, |
| "loss": 3.5303, |
| "step": 20000 |
| }, |
| { |
| "epoch": 5.8292526524425785, |
| "eval_accuracy": 0.36105304401383315, |
| "eval_loss": 3.6181437969207764, |
| "eval_runtime": 53.6796, |
| "eval_samples_per_second": 309.745, |
| "eval_steps_per_second": 19.374, |
| "step": 20000 |
| }, |
| { |
| "epoch": 5.843826512766702, |
| "grad_norm": 0.3538828194141388, |
| "learning_rate": 0.0005301872265966753, |
| "loss": 3.53, |
| "step": 20050 |
| }, |
| { |
| "epoch": 5.8584003730908245, |
| "grad_norm": 0.32005009055137634, |
| "learning_rate": 0.0005300122484689413, |
| "loss": 3.5313, |
| "step": 20100 |
| }, |
| { |
| "epoch": 5.872974233414947, |
| "grad_norm": 0.34840235114097595, |
| "learning_rate": 0.0005298372703412073, |
| "loss": 3.5429, |
| "step": 20150 |
| }, |
| { |
| "epoch": 5.88754809373907, |
| "grad_norm": 0.34065404534339905, |
| "learning_rate": 0.0005296622922134732, |
| "loss": 3.5272, |
| "step": 20200 |
| }, |
| { |
| "epoch": 5.902121954063192, |
| "grad_norm": 0.3250705301761627, |
| "learning_rate": 0.0005294873140857392, |
| "loss": 3.5373, |
| "step": 20250 |
| }, |
| { |
| "epoch": 5.916695814387315, |
| "grad_norm": 0.3528779149055481, |
| "learning_rate": 0.0005293123359580052, |
| "loss": 3.5229, |
| "step": 20300 |
| }, |
| { |
| "epoch": 5.931269674711437, |
| "grad_norm": 0.33068379759788513, |
| "learning_rate": 0.0005291373578302711, |
| "loss": 3.5493, |
| "step": 20350 |
| }, |
| { |
| "epoch": 5.94584353503556, |
| "grad_norm": 0.30970242619514465, |
| "learning_rate": 0.0005289623797025371, |
| "loss": 3.5388, |
| "step": 20400 |
| }, |
| { |
| "epoch": 5.960417395359682, |
| "grad_norm": 0.34459182620048523, |
| "learning_rate": 0.0005287874015748031, |
| "loss": 3.5404, |
| "step": 20450 |
| }, |
| { |
| "epoch": 5.974991255683806, |
| "grad_norm": 0.31457436084747314, |
| "learning_rate": 0.000528612423447069, |
| "loss": 3.5411, |
| "step": 20500 |
| }, |
| { |
| "epoch": 5.989565116007928, |
| "grad_norm": 0.3335103392601013, |
| "learning_rate": 0.000528437445319335, |
| "loss": 3.5547, |
| "step": 20550 |
| }, |
| { |
| "epoch": 6.0040806808907545, |
| "grad_norm": 0.3316030502319336, |
| "learning_rate": 0.000528262467191601, |
| "loss": 3.5073, |
| "step": 20600 |
| }, |
| { |
| "epoch": 6.018654541214877, |
| "grad_norm": 0.33155548572540283, |
| "learning_rate": 0.000528087489063867, |
| "loss": 3.4268, |
| "step": 20650 |
| }, |
| { |
| "epoch": 6.033228401539, |
| "grad_norm": 0.36727064847946167, |
| "learning_rate": 0.0005279125109361329, |
| "loss": 3.4253, |
| "step": 20700 |
| }, |
| { |
| "epoch": 6.047802261863122, |
| "grad_norm": 0.34363842010498047, |
| "learning_rate": 0.0005277375328083989, |
| "loss": 3.4321, |
| "step": 20750 |
| }, |
| { |
| "epoch": 6.062376122187245, |
| "grad_norm": 0.34471696615219116, |
| "learning_rate": 0.0005275625546806649, |
| "loss": 3.4213, |
| "step": 20800 |
| }, |
| { |
| "epoch": 6.076949982511367, |
| "grad_norm": 0.3431618809700012, |
| "learning_rate": 0.0005273875765529309, |
| "loss": 3.4343, |
| "step": 20850 |
| }, |
| { |
| "epoch": 6.09152384283549, |
| "grad_norm": 0.32377782464027405, |
| "learning_rate": 0.0005272125984251968, |
| "loss": 3.4436, |
| "step": 20900 |
| }, |
| { |
| "epoch": 6.106097703159613, |
| "grad_norm": 0.3368135392665863, |
| "learning_rate": 0.0005270376202974628, |
| "loss": 3.4414, |
| "step": 20950 |
| }, |
| { |
| "epoch": 6.120671563483736, |
| "grad_norm": 0.3313571810722351, |
| "learning_rate": 0.0005268626421697288, |
| "loss": 3.4462, |
| "step": 21000 |
| }, |
| { |
| "epoch": 6.120671563483736, |
| "eval_accuracy": 0.36132586554041674, |
| "eval_loss": 3.6233930587768555, |
| "eval_runtime": 53.4098, |
| "eval_samples_per_second": 311.31, |
| "eval_steps_per_second": 19.472, |
| "step": 21000 |
| }, |
| { |
| "epoch": 6.135245423807858, |
| "grad_norm": 0.34603914618492126, |
| "learning_rate": 0.0005266876640419946, |
| "loss": 3.4571, |
| "step": 21050 |
| }, |
| { |
| "epoch": 6.149819284131981, |
| "grad_norm": 0.3259107172489166, |
| "learning_rate": 0.0005265126859142606, |
| "loss": 3.4641, |
| "step": 21100 |
| }, |
| { |
| "epoch": 6.1643931444561035, |
| "grad_norm": 0.3470149040222168, |
| "learning_rate": 0.0005263377077865266, |
| "loss": 3.4489, |
| "step": 21150 |
| }, |
| { |
| "epoch": 6.178967004780226, |
| "grad_norm": 0.36092814803123474, |
| "learning_rate": 0.0005261627296587926, |
| "loss": 3.4532, |
| "step": 21200 |
| }, |
| { |
| "epoch": 6.193540865104349, |
| "grad_norm": 0.3365027606487274, |
| "learning_rate": 0.0005259877515310585, |
| "loss": 3.4637, |
| "step": 21250 |
| }, |
| { |
| "epoch": 6.208114725428471, |
| "grad_norm": 0.33958983421325684, |
| "learning_rate": 0.0005258127734033245, |
| "loss": 3.4564, |
| "step": 21300 |
| }, |
| { |
| "epoch": 6.222688585752594, |
| "grad_norm": 0.3248773217201233, |
| "learning_rate": 0.0005256377952755905, |
| "loss": 3.4734, |
| "step": 21350 |
| }, |
| { |
| "epoch": 6.237262446076717, |
| "grad_norm": 0.34088876843452454, |
| "learning_rate": 0.0005254628171478565, |
| "loss": 3.4608, |
| "step": 21400 |
| }, |
| { |
| "epoch": 6.25183630640084, |
| "grad_norm": 0.3255520164966583, |
| "learning_rate": 0.0005252878390201224, |
| "loss": 3.4514, |
| "step": 21450 |
| }, |
| { |
| "epoch": 6.266410166724962, |
| "grad_norm": 0.32107624411582947, |
| "learning_rate": 0.0005251128608923884, |
| "loss": 3.4664, |
| "step": 21500 |
| }, |
| { |
| "epoch": 6.280984027049085, |
| "grad_norm": 0.3083617687225342, |
| "learning_rate": 0.0005249378827646544, |
| "loss": 3.4791, |
| "step": 21550 |
| }, |
| { |
| "epoch": 6.295557887373207, |
| "grad_norm": 0.3297259509563446, |
| "learning_rate": 0.0005247629046369204, |
| "loss": 3.4716, |
| "step": 21600 |
| }, |
| { |
| "epoch": 6.31013174769733, |
| "grad_norm": 0.328360378742218, |
| "learning_rate": 0.0005245879265091863, |
| "loss": 3.4762, |
| "step": 21650 |
| }, |
| { |
| "epoch": 6.3247056080214525, |
| "grad_norm": 0.3250866234302521, |
| "learning_rate": 0.0005244129483814523, |
| "loss": 3.4935, |
| "step": 21700 |
| }, |
| { |
| "epoch": 6.339279468345575, |
| "grad_norm": 0.3311171531677246, |
| "learning_rate": 0.0005242379702537183, |
| "loss": 3.4735, |
| "step": 21750 |
| }, |
| { |
| "epoch": 6.353853328669698, |
| "grad_norm": 0.3201950490474701, |
| "learning_rate": 0.0005240629921259843, |
| "loss": 3.4703, |
| "step": 21800 |
| }, |
| { |
| "epoch": 6.368427188993821, |
| "grad_norm": 0.32387447357177734, |
| "learning_rate": 0.0005238880139982502, |
| "loss": 3.4773, |
| "step": 21850 |
| }, |
| { |
| "epoch": 6.383001049317944, |
| "grad_norm": 0.32138365507125854, |
| "learning_rate": 0.0005237130358705162, |
| "loss": 3.4841, |
| "step": 21900 |
| }, |
| { |
| "epoch": 6.397574909642066, |
| "grad_norm": 0.3251798152923584, |
| "learning_rate": 0.0005235380577427822, |
| "loss": 3.4965, |
| "step": 21950 |
| }, |
| { |
| "epoch": 6.412148769966189, |
| "grad_norm": 0.35269978642463684, |
| "learning_rate": 0.0005233630796150481, |
| "loss": 3.473, |
| "step": 22000 |
| }, |
| { |
| "epoch": 6.412148769966189, |
| "eval_accuracy": 0.36237042595820323, |
| "eval_loss": 3.615170478820801, |
| "eval_runtime": 53.5334, |
| "eval_samples_per_second": 310.591, |
| "eval_steps_per_second": 19.427, |
| "step": 22000 |
| }, |
| { |
| "epoch": 6.426722630290311, |
| "grad_norm": 0.3353267014026642, |
| "learning_rate": 0.0005231881014873141, |
| "loss": 3.4795, |
| "step": 22050 |
| }, |
| { |
| "epoch": 6.441296490614434, |
| "grad_norm": 0.333974152803421, |
| "learning_rate": 0.00052301312335958, |
| "loss": 3.4844, |
| "step": 22100 |
| }, |
| { |
| "epoch": 6.455870350938556, |
| "grad_norm": 0.3244946002960205, |
| "learning_rate": 0.0005228381452318459, |
| "loss": 3.4783, |
| "step": 22150 |
| }, |
| { |
| "epoch": 6.470444211262679, |
| "grad_norm": 0.33467456698417664, |
| "learning_rate": 0.0005226631671041119, |
| "loss": 3.4826, |
| "step": 22200 |
| }, |
| { |
| "epoch": 6.4850180715868015, |
| "grad_norm": 0.3200705647468567, |
| "learning_rate": 0.0005224881889763779, |
| "loss": 3.4892, |
| "step": 22250 |
| }, |
| { |
| "epoch": 6.499591931910924, |
| "grad_norm": 0.33912044763565063, |
| "learning_rate": 0.0005223132108486439, |
| "loss": 3.4791, |
| "step": 22300 |
| }, |
| { |
| "epoch": 6.5141657922350475, |
| "grad_norm": 0.3464968800544739, |
| "learning_rate": 0.0005221382327209098, |
| "loss": 3.4894, |
| "step": 22350 |
| }, |
| { |
| "epoch": 6.52873965255917, |
| "grad_norm": 0.3422609865665436, |
| "learning_rate": 0.0005219632545931758, |
| "loss": 3.4994, |
| "step": 22400 |
| }, |
| { |
| "epoch": 6.543313512883293, |
| "grad_norm": 0.3351075351238251, |
| "learning_rate": 0.0005217882764654418, |
| "loss": 3.4762, |
| "step": 22450 |
| }, |
| { |
| "epoch": 6.557887373207415, |
| "grad_norm": 0.31506097316741943, |
| "learning_rate": 0.0005216132983377077, |
| "loss": 3.4807, |
| "step": 22500 |
| }, |
| { |
| "epoch": 6.572461233531538, |
| "grad_norm": 0.3468664288520813, |
| "learning_rate": 0.0005214383202099737, |
| "loss": 3.4996, |
| "step": 22550 |
| }, |
| { |
| "epoch": 6.58703509385566, |
| "grad_norm": 0.3405575454235077, |
| "learning_rate": 0.0005212633420822397, |
| "loss": 3.4814, |
| "step": 22600 |
| }, |
| { |
| "epoch": 6.601608954179783, |
| "grad_norm": 0.3330761790275574, |
| "learning_rate": 0.0005210883639545056, |
| "loss": 3.486, |
| "step": 22650 |
| }, |
| { |
| "epoch": 6.616182814503905, |
| "grad_norm": 0.3251539468765259, |
| "learning_rate": 0.0005209133858267716, |
| "loss": 3.4845, |
| "step": 22700 |
| }, |
| { |
| "epoch": 6.630756674828028, |
| "grad_norm": 0.32927173376083374, |
| "learning_rate": 0.0005207384076990376, |
| "loss": 3.494, |
| "step": 22750 |
| }, |
| { |
| "epoch": 6.645330535152151, |
| "grad_norm": 0.33050647377967834, |
| "learning_rate": 0.0005205634295713035, |
| "loss": 3.5028, |
| "step": 22800 |
| }, |
| { |
| "epoch": 6.659904395476274, |
| "grad_norm": 0.3605846166610718, |
| "learning_rate": 0.0005203884514435695, |
| "loss": 3.4799, |
| "step": 22850 |
| }, |
| { |
| "epoch": 6.6744782558003966, |
| "grad_norm": 0.3219446539878845, |
| "learning_rate": 0.0005202134733158355, |
| "loss": 3.4831, |
| "step": 22900 |
| }, |
| { |
| "epoch": 6.689052116124519, |
| "grad_norm": 0.31571391224861145, |
| "learning_rate": 0.0005200384951881014, |
| "loss": 3.4939, |
| "step": 22950 |
| }, |
| { |
| "epoch": 6.703625976448642, |
| "grad_norm": 0.33340510725975037, |
| "learning_rate": 0.0005198635170603674, |
| "loss": 3.4919, |
| "step": 23000 |
| }, |
| { |
| "epoch": 6.703625976448642, |
| "eval_accuracy": 0.3629433747034184, |
| "eval_loss": 3.6044983863830566, |
| "eval_runtime": 53.5271, |
| "eval_samples_per_second": 310.628, |
| "eval_steps_per_second": 19.429, |
| "step": 23000 |
| }, |
| { |
| "epoch": 6.718199836772764, |
| "grad_norm": 0.35659676790237427, |
| "learning_rate": 0.0005196885389326334, |
| "loss": 3.484, |
| "step": 23050 |
| }, |
| { |
| "epoch": 6.732773697096887, |
| "grad_norm": 0.32627058029174805, |
| "learning_rate": 0.0005195135608048994, |
| "loss": 3.4845, |
| "step": 23100 |
| }, |
| { |
| "epoch": 6.747347557421009, |
| "grad_norm": 0.35046520829200745, |
| "learning_rate": 0.0005193385826771652, |
| "loss": 3.4954, |
| "step": 23150 |
| }, |
| { |
| "epoch": 6.761921417745132, |
| "grad_norm": 0.36109864711761475, |
| "learning_rate": 0.0005191636045494312, |
| "loss": 3.4911, |
| "step": 23200 |
| }, |
| { |
| "epoch": 6.776495278069255, |
| "grad_norm": 0.3392275273799896, |
| "learning_rate": 0.0005189886264216972, |
| "loss": 3.5055, |
| "step": 23250 |
| }, |
| { |
| "epoch": 6.791069138393378, |
| "grad_norm": 0.3231343626976013, |
| "learning_rate": 0.0005188136482939632, |
| "loss": 3.5025, |
| "step": 23300 |
| }, |
| { |
| "epoch": 6.8056429987175004, |
| "grad_norm": 0.3419477343559265, |
| "learning_rate": 0.0005186386701662291, |
| "loss": 3.5093, |
| "step": 23350 |
| }, |
| { |
| "epoch": 6.820216859041623, |
| "grad_norm": 0.33013713359832764, |
| "learning_rate": 0.0005184636920384951, |
| "loss": 3.4955, |
| "step": 23400 |
| }, |
| { |
| "epoch": 6.834790719365746, |
| "grad_norm": 0.3481701910495758, |
| "learning_rate": 0.0005182887139107611, |
| "loss": 3.5006, |
| "step": 23450 |
| }, |
| { |
| "epoch": 6.849364579689868, |
| "grad_norm": 0.3264204263687134, |
| "learning_rate": 0.0005181137357830271, |
| "loss": 3.5001, |
| "step": 23500 |
| }, |
| { |
| "epoch": 6.863938440013991, |
| "grad_norm": 0.32489365339279175, |
| "learning_rate": 0.000517938757655293, |
| "loss": 3.4975, |
| "step": 23550 |
| }, |
| { |
| "epoch": 6.878512300338113, |
| "grad_norm": 0.3244480788707733, |
| "learning_rate": 0.000517763779527559, |
| "loss": 3.4865, |
| "step": 23600 |
| }, |
| { |
| "epoch": 6.893086160662236, |
| "grad_norm": 0.34287551045417786, |
| "learning_rate": 0.000517588801399825, |
| "loss": 3.5094, |
| "step": 23650 |
| }, |
| { |
| "epoch": 6.907660020986359, |
| "grad_norm": 0.346699982881546, |
| "learning_rate": 0.0005174138232720909, |
| "loss": 3.4957, |
| "step": 23700 |
| }, |
| { |
| "epoch": 6.922233881310482, |
| "grad_norm": 0.339921236038208, |
| "learning_rate": 0.0005172388451443569, |
| "loss": 3.4928, |
| "step": 23750 |
| }, |
| { |
| "epoch": 6.936807741634604, |
| "grad_norm": 0.3370542824268341, |
| "learning_rate": 0.0005170638670166229, |
| "loss": 3.5116, |
| "step": 23800 |
| }, |
| { |
| "epoch": 6.951381601958727, |
| "grad_norm": 0.31165871024131775, |
| "learning_rate": 0.0005168888888888889, |
| "loss": 3.502, |
| "step": 23850 |
| }, |
| { |
| "epoch": 6.9659554622828495, |
| "grad_norm": 0.3403184711933136, |
| "learning_rate": 0.0005167139107611548, |
| "loss": 3.5032, |
| "step": 23900 |
| }, |
| { |
| "epoch": 6.980529322606972, |
| "grad_norm": 0.3310294449329376, |
| "learning_rate": 0.0005165389326334208, |
| "loss": 3.4935, |
| "step": 23950 |
| }, |
| { |
| "epoch": 6.995103182931095, |
| "grad_norm": 0.32134073972702026, |
| "learning_rate": 0.0005163639545056868, |
| "loss": 3.5037, |
| "step": 24000 |
| }, |
| { |
| "epoch": 6.995103182931095, |
| "eval_accuracy": 0.36358505846654765, |
| "eval_loss": 3.596416711807251, |
| "eval_runtime": 53.4511, |
| "eval_samples_per_second": 311.069, |
| "eval_steps_per_second": 19.457, |
| "step": 24000 |
| }, |
| { |
| "epoch": 7.009618747813921, |
| "grad_norm": 0.36255356669425964, |
| "learning_rate": 0.0005161889763779528, |
| "loss": 3.4252, |
| "step": 24050 |
| }, |
| { |
| "epoch": 7.024192608138043, |
| "grad_norm": 0.33867883682250977, |
| "learning_rate": 0.0005160139982502187, |
| "loss": 3.3897, |
| "step": 24100 |
| }, |
| { |
| "epoch": 7.038766468462167, |
| "grad_norm": 0.3251667618751526, |
| "learning_rate": 0.0005158390201224847, |
| "loss": 3.385, |
| "step": 24150 |
| }, |
| { |
| "epoch": 7.053340328786289, |
| "grad_norm": 0.3292306363582611, |
| "learning_rate": 0.0005156640419947507, |
| "loss": 3.3792, |
| "step": 24200 |
| }, |
| { |
| "epoch": 7.067914189110412, |
| "grad_norm": 0.3216950595378876, |
| "learning_rate": 0.0005154890638670167, |
| "loss": 3.405, |
| "step": 24250 |
| }, |
| { |
| "epoch": 7.082488049434534, |
| "grad_norm": 0.33355313539505005, |
| "learning_rate": 0.0005153140857392825, |
| "loss": 3.412, |
| "step": 24300 |
| }, |
| { |
| "epoch": 7.097061909758657, |
| "grad_norm": 0.3156958222389221, |
| "learning_rate": 0.0005151391076115485, |
| "loss": 3.4045, |
| "step": 24350 |
| }, |
| { |
| "epoch": 7.1116357700827795, |
| "grad_norm": 0.3113691210746765, |
| "learning_rate": 0.0005149641294838145, |
| "loss": 3.4162, |
| "step": 24400 |
| }, |
| { |
| "epoch": 7.126209630406902, |
| "grad_norm": 0.3176027834415436, |
| "learning_rate": 0.0005147891513560804, |
| "loss": 3.4165, |
| "step": 24450 |
| }, |
| { |
| "epoch": 7.140783490731025, |
| "grad_norm": 0.3334745764732361, |
| "learning_rate": 0.0005146141732283464, |
| "loss": 3.4222, |
| "step": 24500 |
| }, |
| { |
| "epoch": 7.155357351055147, |
| "grad_norm": 0.3818000555038452, |
| "learning_rate": 0.0005144391951006124, |
| "loss": 3.4186, |
| "step": 24550 |
| }, |
| { |
| "epoch": 7.16993121137927, |
| "grad_norm": 0.3112426698207855, |
| "learning_rate": 0.0005142642169728783, |
| "loss": 3.4187, |
| "step": 24600 |
| }, |
| { |
| "epoch": 7.184505071703393, |
| "grad_norm": 0.33514121174812317, |
| "learning_rate": 0.0005140892388451443, |
| "loss": 3.4247, |
| "step": 24650 |
| }, |
| { |
| "epoch": 7.199078932027516, |
| "grad_norm": 0.3549887239933014, |
| "learning_rate": 0.0005139142607174103, |
| "loss": 3.4347, |
| "step": 24700 |
| }, |
| { |
| "epoch": 7.213652792351638, |
| "grad_norm": 0.3485502600669861, |
| "learning_rate": 0.0005137392825896762, |
| "loss": 3.4245, |
| "step": 24750 |
| }, |
| { |
| "epoch": 7.228226652675761, |
| "grad_norm": 0.3879625201225281, |
| "learning_rate": 0.0005135643044619422, |
| "loss": 3.4285, |
| "step": 24800 |
| }, |
| { |
| "epoch": 7.242800512999883, |
| "grad_norm": 0.32558587193489075, |
| "learning_rate": 0.0005133893263342082, |
| "loss": 3.4324, |
| "step": 24850 |
| }, |
| { |
| "epoch": 7.257374373324006, |
| "grad_norm": 0.33529454469680786, |
| "learning_rate": 0.0005132143482064742, |
| "loss": 3.434, |
| "step": 24900 |
| }, |
| { |
| "epoch": 7.2719482336481285, |
| "grad_norm": 0.3609262704849243, |
| "learning_rate": 0.0005130393700787401, |
| "loss": 3.4394, |
| "step": 24950 |
| }, |
| { |
| "epoch": 7.286522093972251, |
| "grad_norm": 0.33341631293296814, |
| "learning_rate": 0.0005128643919510061, |
| "loss": 3.4324, |
| "step": 25000 |
| }, |
| { |
| "epoch": 7.286522093972251, |
| "eval_accuracy": 0.36367062414809476, |
| "eval_loss": 3.6029181480407715, |
| "eval_runtime": 53.5346, |
| "eval_samples_per_second": 310.584, |
| "eval_steps_per_second": 19.427, |
| "step": 25000 |
| }, |
| { |
| "epoch": 7.301095954296374, |
| "grad_norm": 0.32108888030052185, |
| "learning_rate": 0.0005126894138232721, |
| "loss": 3.4364, |
| "step": 25050 |
| }, |
| { |
| "epoch": 7.315669814620497, |
| "grad_norm": 0.3594111502170563, |
| "learning_rate": 0.000512514435695538, |
| "loss": 3.4409, |
| "step": 25100 |
| }, |
| { |
| "epoch": 7.33024367494462, |
| "grad_norm": 0.3587343096733093, |
| "learning_rate": 0.000512339457567804, |
| "loss": 3.4384, |
| "step": 25150 |
| }, |
| { |
| "epoch": 7.344817535268742, |
| "grad_norm": 0.34162479639053345, |
| "learning_rate": 0.00051216447944007, |
| "loss": 3.4444, |
| "step": 25200 |
| }, |
| { |
| "epoch": 7.359391395592865, |
| "grad_norm": 0.3396339416503906, |
| "learning_rate": 0.0005119895013123358, |
| "loss": 3.4521, |
| "step": 25250 |
| }, |
| { |
| "epoch": 7.373965255916987, |
| "grad_norm": 0.3595559298992157, |
| "learning_rate": 0.0005118145231846018, |
| "loss": 3.4508, |
| "step": 25300 |
| }, |
| { |
| "epoch": 7.38853911624111, |
| "grad_norm": 0.32329264283180237, |
| "learning_rate": 0.0005116395450568678, |
| "loss": 3.4462, |
| "step": 25350 |
| }, |
| { |
| "epoch": 7.403112976565232, |
| "grad_norm": 0.34269917011260986, |
| "learning_rate": 0.0005114645669291338, |
| "loss": 3.4371, |
| "step": 25400 |
| }, |
| { |
| "epoch": 7.417686836889355, |
| "grad_norm": 0.35610729455947876, |
| "learning_rate": 0.0005112895888013997, |
| "loss": 3.4456, |
| "step": 25450 |
| }, |
| { |
| "epoch": 7.4322606972134775, |
| "grad_norm": 0.33263474702835083, |
| "learning_rate": 0.0005111146106736657, |
| "loss": 3.4487, |
| "step": 25500 |
| }, |
| { |
| "epoch": 7.446834557537601, |
| "grad_norm": 0.3399927616119385, |
| "learning_rate": 0.0005109396325459317, |
| "loss": 3.4493, |
| "step": 25550 |
| }, |
| { |
| "epoch": 7.4614084178617235, |
| "grad_norm": 0.31125643849372864, |
| "learning_rate": 0.0005107646544181976, |
| "loss": 3.4456, |
| "step": 25600 |
| }, |
| { |
| "epoch": 7.475982278185846, |
| "grad_norm": 0.3542948067188263, |
| "learning_rate": 0.0005105896762904636, |
| "loss": 3.4626, |
| "step": 25650 |
| }, |
| { |
| "epoch": 7.490556138509969, |
| "grad_norm": 0.3267478048801422, |
| "learning_rate": 0.0005104146981627296, |
| "loss": 3.4477, |
| "step": 25700 |
| }, |
| { |
| "epoch": 7.505129998834091, |
| "grad_norm": 0.36132851243019104, |
| "learning_rate": 0.0005102397200349956, |
| "loss": 3.4576, |
| "step": 25750 |
| }, |
| { |
| "epoch": 7.519703859158214, |
| "grad_norm": 0.3309080898761749, |
| "learning_rate": 0.0005100647419072615, |
| "loss": 3.4465, |
| "step": 25800 |
| }, |
| { |
| "epoch": 7.534277719482336, |
| "grad_norm": 0.3140285909175873, |
| "learning_rate": 0.0005098897637795275, |
| "loss": 3.44, |
| "step": 25850 |
| }, |
| { |
| "epoch": 7.548851579806459, |
| "grad_norm": 0.3533434271812439, |
| "learning_rate": 0.0005097147856517935, |
| "loss": 3.4445, |
| "step": 25900 |
| }, |
| { |
| "epoch": 7.563425440130581, |
| "grad_norm": 0.31856703758239746, |
| "learning_rate": 0.0005095398075240595, |
| "loss": 3.4566, |
| "step": 25950 |
| }, |
| { |
| "epoch": 7.577999300454705, |
| "grad_norm": 0.34421059489250183, |
| "learning_rate": 0.0005093648293963254, |
| "loss": 3.4514, |
| "step": 26000 |
| }, |
| { |
| "epoch": 7.577999300454705, |
| "eval_accuracy": 0.36456100156336857, |
| "eval_loss": 3.5964791774749756, |
| "eval_runtime": 53.5496, |
| "eval_samples_per_second": 310.497, |
| "eval_steps_per_second": 19.421, |
| "step": 26000 |
| }, |
| { |
| "epoch": 7.592573160778827, |
| "grad_norm": 0.3079422414302826, |
| "learning_rate": 0.0005091898512685914, |
| "loss": 3.4527, |
| "step": 26050 |
| }, |
| { |
| "epoch": 7.60714702110295, |
| "grad_norm": 0.3323257565498352, |
| "learning_rate": 0.0005090148731408574, |
| "loss": 3.4484, |
| "step": 26100 |
| }, |
| { |
| "epoch": 7.6217208814270725, |
| "grad_norm": 0.3399113118648529, |
| "learning_rate": 0.0005088398950131234, |
| "loss": 3.4613, |
| "step": 26150 |
| }, |
| { |
| "epoch": 7.636294741751195, |
| "grad_norm": 0.32449769973754883, |
| "learning_rate": 0.0005086649168853893, |
| "loss": 3.4563, |
| "step": 26200 |
| }, |
| { |
| "epoch": 7.650868602075318, |
| "grad_norm": 0.32541415095329285, |
| "learning_rate": 0.0005084899387576553, |
| "loss": 3.4578, |
| "step": 26250 |
| }, |
| { |
| "epoch": 7.66544246239944, |
| "grad_norm": 0.3436015546321869, |
| "learning_rate": 0.0005083149606299213, |
| "loss": 3.4684, |
| "step": 26300 |
| }, |
| { |
| "epoch": 7.680016322723563, |
| "grad_norm": 0.33585062623023987, |
| "learning_rate": 0.0005081399825021873, |
| "loss": 3.4625, |
| "step": 26350 |
| }, |
| { |
| "epoch": 7.694590183047685, |
| "grad_norm": 0.3081617057323456, |
| "learning_rate": 0.0005079650043744531, |
| "loss": 3.4584, |
| "step": 26400 |
| }, |
| { |
| "epoch": 7.709164043371809, |
| "grad_norm": 0.33808818459510803, |
| "learning_rate": 0.0005077900262467191, |
| "loss": 3.4487, |
| "step": 26450 |
| }, |
| { |
| "epoch": 7.723737903695931, |
| "grad_norm": 0.3737938404083252, |
| "learning_rate": 0.0005076150481189851, |
| "loss": 3.447, |
| "step": 26500 |
| }, |
| { |
| "epoch": 7.738311764020054, |
| "grad_norm": 0.35393649339675903, |
| "learning_rate": 0.000507440069991251, |
| "loss": 3.4709, |
| "step": 26550 |
| }, |
| { |
| "epoch": 7.752885624344176, |
| "grad_norm": 0.3017658591270447, |
| "learning_rate": 0.000507265091863517, |
| "loss": 3.4624, |
| "step": 26600 |
| }, |
| { |
| "epoch": 7.767459484668299, |
| "grad_norm": 0.33267614245414734, |
| "learning_rate": 0.000507090113735783, |
| "loss": 3.4616, |
| "step": 26650 |
| }, |
| { |
| "epoch": 7.7820333449924215, |
| "grad_norm": 0.33807429671287537, |
| "learning_rate": 0.000506915135608049, |
| "loss": 3.4679, |
| "step": 26700 |
| }, |
| { |
| "epoch": 7.796607205316544, |
| "grad_norm": 0.33967483043670654, |
| "learning_rate": 0.0005067401574803149, |
| "loss": 3.4568, |
| "step": 26750 |
| }, |
| { |
| "epoch": 7.811181065640667, |
| "grad_norm": 0.34301498532295227, |
| "learning_rate": 0.0005065651793525809, |
| "loss": 3.4668, |
| "step": 26800 |
| }, |
| { |
| "epoch": 7.825754925964789, |
| "grad_norm": 0.33017200231552124, |
| "learning_rate": 0.0005063902012248469, |
| "loss": 3.4515, |
| "step": 26850 |
| }, |
| { |
| "epoch": 7.840328786288913, |
| "grad_norm": 0.3316206932067871, |
| "learning_rate": 0.0005062152230971128, |
| "loss": 3.4665, |
| "step": 26900 |
| }, |
| { |
| "epoch": 7.854902646613035, |
| "grad_norm": 0.36610499024391174, |
| "learning_rate": 0.0005060402449693788, |
| "loss": 3.4697, |
| "step": 26950 |
| }, |
| { |
| "epoch": 7.869476506937158, |
| "grad_norm": 0.31481724977493286, |
| "learning_rate": 0.0005058652668416448, |
| "loss": 3.4773, |
| "step": 27000 |
| }, |
| { |
| "epoch": 7.869476506937158, |
| "eval_accuracy": 0.3649848282748558, |
| "eval_loss": 3.5873711109161377, |
| "eval_runtime": 53.4911, |
| "eval_samples_per_second": 310.837, |
| "eval_steps_per_second": 19.442, |
| "step": 27000 |
| }, |
| { |
| "epoch": 7.88405036726128, |
| "grad_norm": 0.32205820083618164, |
| "learning_rate": 0.0005056902887139107, |
| "loss": 3.4667, |
| "step": 27050 |
| }, |
| { |
| "epoch": 7.898624227585403, |
| "grad_norm": 0.3444332480430603, |
| "learning_rate": 0.0005055153105861767, |
| "loss": 3.4622, |
| "step": 27100 |
| }, |
| { |
| "epoch": 7.913198087909525, |
| "grad_norm": 0.3325342833995819, |
| "learning_rate": 0.0005053403324584427, |
| "loss": 3.4625, |
| "step": 27150 |
| }, |
| { |
| "epoch": 7.927771948233648, |
| "grad_norm": 0.31255239248275757, |
| "learning_rate": 0.0005051653543307086, |
| "loss": 3.4711, |
| "step": 27200 |
| }, |
| { |
| "epoch": 7.9423458085577705, |
| "grad_norm": 0.3473881185054779, |
| "learning_rate": 0.0005049903762029746, |
| "loss": 3.4655, |
| "step": 27250 |
| }, |
| { |
| "epoch": 7.956919668881893, |
| "grad_norm": 0.3250695765018463, |
| "learning_rate": 0.0005048153980752406, |
| "loss": 3.4686, |
| "step": 27300 |
| }, |
| { |
| "epoch": 7.9714935292060165, |
| "grad_norm": 0.322086364030838, |
| "learning_rate": 0.0005046404199475064, |
| "loss": 3.4763, |
| "step": 27350 |
| }, |
| { |
| "epoch": 7.986067389530139, |
| "grad_norm": 0.3338831961154938, |
| "learning_rate": 0.0005044654418197724, |
| "loss": 3.4721, |
| "step": 27400 |
| }, |
| { |
| "epoch": 8.000582954412964, |
| "grad_norm": 0.3455122709274292, |
| "learning_rate": 0.0005042904636920384, |
| "loss": 3.4573, |
| "step": 27450 |
| }, |
| { |
| "epoch": 8.015156814737088, |
| "grad_norm": 0.3366275727748871, |
| "learning_rate": 0.0005041154855643044, |
| "loss": 3.3555, |
| "step": 27500 |
| }, |
| { |
| "epoch": 8.02973067506121, |
| "grad_norm": 0.3565216064453125, |
| "learning_rate": 0.0005039405074365703, |
| "loss": 3.36, |
| "step": 27550 |
| }, |
| { |
| "epoch": 8.044304535385333, |
| "grad_norm": 0.32767388224601746, |
| "learning_rate": 0.0005037655293088363, |
| "loss": 3.3617, |
| "step": 27600 |
| }, |
| { |
| "epoch": 8.058878395709456, |
| "grad_norm": 0.3338020145893097, |
| "learning_rate": 0.0005035905511811023, |
| "loss": 3.3589, |
| "step": 27650 |
| }, |
| { |
| "epoch": 8.073452256033578, |
| "grad_norm": 0.3493526875972748, |
| "learning_rate": 0.0005034155730533682, |
| "loss": 3.3802, |
| "step": 27700 |
| }, |
| { |
| "epoch": 8.088026116357701, |
| "grad_norm": 0.38675856590270996, |
| "learning_rate": 0.0005032405949256342, |
| "loss": 3.3874, |
| "step": 27750 |
| }, |
| { |
| "epoch": 8.102599976681823, |
| "grad_norm": 0.32882654666900635, |
| "learning_rate": 0.0005030656167979002, |
| "loss": 3.3766, |
| "step": 27800 |
| }, |
| { |
| "epoch": 8.117173837005947, |
| "grad_norm": 0.3182843327522278, |
| "learning_rate": 0.0005028906386701662, |
| "loss": 3.373, |
| "step": 27850 |
| }, |
| { |
| "epoch": 8.131747697330068, |
| "grad_norm": 0.3248377740383148, |
| "learning_rate": 0.0005027156605424321, |
| "loss": 3.3894, |
| "step": 27900 |
| }, |
| { |
| "epoch": 8.146321557654192, |
| "grad_norm": 0.37339022755622864, |
| "learning_rate": 0.0005025406824146981, |
| "loss": 3.3882, |
| "step": 27950 |
| }, |
| { |
| "epoch": 8.160895417978313, |
| "grad_norm": 0.3370717167854309, |
| "learning_rate": 0.0005023657042869641, |
| "loss": 3.3931, |
| "step": 28000 |
| }, |
| { |
| "epoch": 8.160895417978313, |
| "eval_accuracy": 0.365049208505676, |
| "eval_loss": 3.5939173698425293, |
| "eval_runtime": 53.5571, |
| "eval_samples_per_second": 310.454, |
| "eval_steps_per_second": 19.419, |
| "step": 28000 |
| }, |
| { |
| "epoch": 8.175469278302437, |
| "grad_norm": 0.3199685513973236, |
| "learning_rate": 0.0005021907261592301, |
| "loss": 3.3841, |
| "step": 28050 |
| }, |
| { |
| "epoch": 8.19004313862656, |
| "grad_norm": 0.34581318497657776, |
| "learning_rate": 0.000502015748031496, |
| "loss": 3.396, |
| "step": 28100 |
| }, |
| { |
| "epoch": 8.204616998950682, |
| "grad_norm": 0.33265337347984314, |
| "learning_rate": 0.000501840769903762, |
| "loss": 3.3869, |
| "step": 28150 |
| }, |
| { |
| "epoch": 8.219190859274805, |
| "grad_norm": 0.34901463985443115, |
| "learning_rate": 0.000501665791776028, |
| "loss": 3.399, |
| "step": 28200 |
| }, |
| { |
| "epoch": 8.233764719598927, |
| "grad_norm": 0.36483561992645264, |
| "learning_rate": 0.0005014908136482939, |
| "loss": 3.3957, |
| "step": 28250 |
| }, |
| { |
| "epoch": 8.24833857992305, |
| "grad_norm": 0.3788173496723175, |
| "learning_rate": 0.0005013158355205599, |
| "loss": 3.3948, |
| "step": 28300 |
| }, |
| { |
| "epoch": 8.262912440247172, |
| "grad_norm": 0.3419286906719208, |
| "learning_rate": 0.0005011408573928259, |
| "loss": 3.4104, |
| "step": 28350 |
| }, |
| { |
| "epoch": 8.277486300571296, |
| "grad_norm": 0.34480592608451843, |
| "learning_rate": 0.0005009658792650919, |
| "loss": 3.4063, |
| "step": 28400 |
| }, |
| { |
| "epoch": 8.292060160895417, |
| "grad_norm": 0.34333592653274536, |
| "learning_rate": 0.0005007909011373577, |
| "loss": 3.4079, |
| "step": 28450 |
| }, |
| { |
| "epoch": 8.30663402121954, |
| "grad_norm": 0.3285306990146637, |
| "learning_rate": 0.0005006159230096237, |
| "loss": 3.4079, |
| "step": 28500 |
| }, |
| { |
| "epoch": 8.321207881543664, |
| "grad_norm": 0.31494301557540894, |
| "learning_rate": 0.0005004409448818897, |
| "loss": 3.4046, |
| "step": 28550 |
| }, |
| { |
| "epoch": 8.335781741867786, |
| "grad_norm": 0.32112932205200195, |
| "learning_rate": 0.0005002659667541557, |
| "loss": 3.4026, |
| "step": 28600 |
| }, |
| { |
| "epoch": 8.35035560219191, |
| "grad_norm": 0.34374773502349854, |
| "learning_rate": 0.0005000909886264216, |
| "loss": 3.418, |
| "step": 28650 |
| }, |
| { |
| "epoch": 8.36492946251603, |
| "grad_norm": 0.37203073501586914, |
| "learning_rate": 0.0004999160104986876, |
| "loss": 3.409, |
| "step": 28700 |
| }, |
| { |
| "epoch": 8.379503322840154, |
| "grad_norm": 0.3256841003894806, |
| "learning_rate": 0.0004997410323709536, |
| "loss": 3.4212, |
| "step": 28750 |
| }, |
| { |
| "epoch": 8.394077183164276, |
| "grad_norm": 0.3450561463832855, |
| "learning_rate": 0.0004995660542432196, |
| "loss": 3.4215, |
| "step": 28800 |
| }, |
| { |
| "epoch": 8.4086510434884, |
| "grad_norm": 0.3316645920276642, |
| "learning_rate": 0.0004993910761154855, |
| "loss": 3.4092, |
| "step": 28850 |
| }, |
| { |
| "epoch": 8.423224903812521, |
| "grad_norm": 0.33119675517082214, |
| "learning_rate": 0.0004992160979877515, |
| "loss": 3.407, |
| "step": 28900 |
| }, |
| { |
| "epoch": 8.437798764136645, |
| "grad_norm": 0.3265671133995056, |
| "learning_rate": 0.0004990411198600175, |
| "loss": 3.4178, |
| "step": 28950 |
| }, |
| { |
| "epoch": 8.452372624460768, |
| "grad_norm": 0.3504529893398285, |
| "learning_rate": 0.0004988661417322835, |
| "loss": 3.4055, |
| "step": 29000 |
| }, |
| { |
| "epoch": 8.452372624460768, |
| "eval_accuracy": 0.3656461674283817, |
| "eval_loss": 3.5862491130828857, |
| "eval_runtime": 53.6363, |
| "eval_samples_per_second": 309.995, |
| "eval_steps_per_second": 19.39, |
| "step": 29000 |
| }, |
| { |
| "epoch": 8.46694648478489, |
| "grad_norm": 0.33847320079803467, |
| "learning_rate": 0.0004986911636045494, |
| "loss": 3.4063, |
| "step": 29050 |
| }, |
| { |
| "epoch": 8.481520345109013, |
| "grad_norm": 0.38739728927612305, |
| "learning_rate": 0.0004985161854768154, |
| "loss": 3.4147, |
| "step": 29100 |
| }, |
| { |
| "epoch": 8.496094205433135, |
| "grad_norm": 0.35963594913482666, |
| "learning_rate": 0.0004983412073490814, |
| "loss": 3.4287, |
| "step": 29150 |
| }, |
| { |
| "epoch": 8.510668065757258, |
| "grad_norm": 0.32501381635665894, |
| "learning_rate": 0.0004981662292213473, |
| "loss": 3.4133, |
| "step": 29200 |
| }, |
| { |
| "epoch": 8.52524192608138, |
| "grad_norm": 0.3212740421295166, |
| "learning_rate": 0.0004979912510936133, |
| "loss": 3.4244, |
| "step": 29250 |
| }, |
| { |
| "epoch": 8.539815786405503, |
| "grad_norm": 0.35725831985473633, |
| "learning_rate": 0.0004978162729658793, |
| "loss": 3.4357, |
| "step": 29300 |
| }, |
| { |
| "epoch": 8.554389646729625, |
| "grad_norm": 0.3251437246799469, |
| "learning_rate": 0.0004976412948381452, |
| "loss": 3.4176, |
| "step": 29350 |
| }, |
| { |
| "epoch": 8.568963507053748, |
| "grad_norm": 0.3582191467285156, |
| "learning_rate": 0.0004974663167104112, |
| "loss": 3.4264, |
| "step": 29400 |
| }, |
| { |
| "epoch": 8.583537367377872, |
| "grad_norm": 0.33836236596107483, |
| "learning_rate": 0.0004972913385826772, |
| "loss": 3.4312, |
| "step": 29450 |
| }, |
| { |
| "epoch": 8.598111227701994, |
| "grad_norm": 0.3506194055080414, |
| "learning_rate": 0.000497116360454943, |
| "loss": 3.4195, |
| "step": 29500 |
| }, |
| { |
| "epoch": 8.612685088026117, |
| "grad_norm": 0.3574237525463104, |
| "learning_rate": 0.000496941382327209, |
| "loss": 3.4292, |
| "step": 29550 |
| }, |
| { |
| "epoch": 8.627258948350239, |
| "grad_norm": 0.3134848177433014, |
| "learning_rate": 0.000496766404199475, |
| "loss": 3.4214, |
| "step": 29600 |
| }, |
| { |
| "epoch": 8.641832808674362, |
| "grad_norm": 0.32016828656196594, |
| "learning_rate": 0.0004965914260717409, |
| "loss": 3.4344, |
| "step": 29650 |
| }, |
| { |
| "epoch": 8.656406668998484, |
| "grad_norm": 0.3254229724407196, |
| "learning_rate": 0.0004964164479440069, |
| "loss": 3.4381, |
| "step": 29700 |
| }, |
| { |
| "epoch": 8.670980529322607, |
| "grad_norm": 0.3310669958591461, |
| "learning_rate": 0.0004962414698162729, |
| "loss": 3.4182, |
| "step": 29750 |
| }, |
| { |
| "epoch": 8.685554389646729, |
| "grad_norm": 0.309650719165802, |
| "learning_rate": 0.0004960664916885388, |
| "loss": 3.4345, |
| "step": 29800 |
| }, |
| { |
| "epoch": 8.700128249970852, |
| "grad_norm": 0.37001681327819824, |
| "learning_rate": 0.0004958915135608048, |
| "loss": 3.4347, |
| "step": 29850 |
| }, |
| { |
| "epoch": 8.714702110294976, |
| "grad_norm": 0.3444061875343323, |
| "learning_rate": 0.0004957165354330708, |
| "loss": 3.4358, |
| "step": 29900 |
| }, |
| { |
| "epoch": 8.729275970619097, |
| "grad_norm": 0.34317103028297424, |
| "learning_rate": 0.0004955415573053368, |
| "loss": 3.4445, |
| "step": 29950 |
| }, |
| { |
| "epoch": 8.74384983094322, |
| "grad_norm": 0.333115816116333, |
| "learning_rate": 0.0004953665791776027, |
| "loss": 3.432, |
| "step": 30000 |
| }, |
| { |
| "epoch": 8.74384983094322, |
| "eval_accuracy": 0.3662541898642448, |
| "eval_loss": 3.58001708984375, |
| "eval_runtime": 53.6482, |
| "eval_samples_per_second": 309.926, |
| "eval_steps_per_second": 19.386, |
| "step": 30000 |
| }, |
| { |
| "epoch": 8.758423691267343, |
| "grad_norm": 0.3124735951423645, |
| "learning_rate": 0.0004951916010498687, |
| "loss": 3.4423, |
| "step": 30050 |
| }, |
| { |
| "epoch": 8.772997551591466, |
| "grad_norm": 0.32269686460494995, |
| "learning_rate": 0.0004950166229221347, |
| "loss": 3.4423, |
| "step": 30100 |
| }, |
| { |
| "epoch": 8.787571411915588, |
| "grad_norm": 0.3414989113807678, |
| "learning_rate": 0.0004948416447944006, |
| "loss": 3.4283, |
| "step": 30150 |
| }, |
| { |
| "epoch": 8.802145272239711, |
| "grad_norm": 0.34327372908592224, |
| "learning_rate": 0.0004946666666666666, |
| "loss": 3.4533, |
| "step": 30200 |
| }, |
| { |
| "epoch": 8.816719132563833, |
| "grad_norm": 0.3256373107433319, |
| "learning_rate": 0.0004944916885389326, |
| "loss": 3.4469, |
| "step": 30250 |
| }, |
| { |
| "epoch": 8.831292992887956, |
| "grad_norm": 0.3294476270675659, |
| "learning_rate": 0.0004943167104111986, |
| "loss": 3.4369, |
| "step": 30300 |
| }, |
| { |
| "epoch": 8.84586685321208, |
| "grad_norm": 0.3473595082759857, |
| "learning_rate": 0.0004941417322834645, |
| "loss": 3.4296, |
| "step": 30350 |
| }, |
| { |
| "epoch": 8.860440713536201, |
| "grad_norm": 0.32299938797950745, |
| "learning_rate": 0.0004939667541557305, |
| "loss": 3.4371, |
| "step": 30400 |
| }, |
| { |
| "epoch": 8.875014573860325, |
| "grad_norm": 0.33664125204086304, |
| "learning_rate": 0.0004937917760279965, |
| "loss": 3.4336, |
| "step": 30450 |
| }, |
| { |
| "epoch": 8.889588434184446, |
| "grad_norm": 0.3254507780075073, |
| "learning_rate": 0.0004936167979002625, |
| "loss": 3.434, |
| "step": 30500 |
| }, |
| { |
| "epoch": 8.90416229450857, |
| "grad_norm": 0.32773420214653015, |
| "learning_rate": 0.0004934418197725284, |
| "loss": 3.4356, |
| "step": 30550 |
| }, |
| { |
| "epoch": 8.918736154832692, |
| "grad_norm": 0.33789709210395813, |
| "learning_rate": 0.0004932668416447943, |
| "loss": 3.4407, |
| "step": 30600 |
| }, |
| { |
| "epoch": 8.933310015156815, |
| "grad_norm": 0.3574780821800232, |
| "learning_rate": 0.0004930918635170603, |
| "loss": 3.4383, |
| "step": 30650 |
| }, |
| { |
| "epoch": 8.947883875480937, |
| "grad_norm": 0.34196579456329346, |
| "learning_rate": 0.0004929168853893263, |
| "loss": 3.4421, |
| "step": 30700 |
| }, |
| { |
| "epoch": 8.96245773580506, |
| "grad_norm": 0.31978747248649597, |
| "learning_rate": 0.0004927419072615922, |
| "loss": 3.4478, |
| "step": 30750 |
| }, |
| { |
| "epoch": 8.977031596129184, |
| "grad_norm": 0.3313916027545929, |
| "learning_rate": 0.0004925669291338582, |
| "loss": 3.4452, |
| "step": 30800 |
| }, |
| { |
| "epoch": 8.991605456453305, |
| "grad_norm": 0.31331366300582886, |
| "learning_rate": 0.0004923919510061242, |
| "loss": 3.4391, |
| "step": 30850 |
| }, |
| { |
| "epoch": 9.006121021336131, |
| "grad_norm": 0.34389740228652954, |
| "learning_rate": 0.0004922169728783901, |
| "loss": 3.3886, |
| "step": 30900 |
| }, |
| { |
| "epoch": 9.020694881660255, |
| "grad_norm": 0.3569186329841614, |
| "learning_rate": 0.0004920419947506561, |
| "loss": 3.3252, |
| "step": 30950 |
| }, |
| { |
| "epoch": 9.035268741984376, |
| "grad_norm": 0.32477042078971863, |
| "learning_rate": 0.0004918670166229221, |
| "loss": 3.3373, |
| "step": 31000 |
| }, |
| { |
| "epoch": 9.035268741984376, |
| "eval_accuracy": 0.36603056566212716, |
| "eval_loss": 3.584533929824829, |
| "eval_runtime": 53.4957, |
| "eval_samples_per_second": 310.81, |
| "eval_steps_per_second": 19.441, |
| "step": 31000 |
| }, |
| { |
| "epoch": 9.0498426023085, |
| "grad_norm": 0.3547387421131134, |
| "learning_rate": 0.0004916920384951881, |
| "loss": 3.3302, |
| "step": 31050 |
| }, |
| { |
| "epoch": 9.064416462632622, |
| "grad_norm": 0.3375994563102722, |
| "learning_rate": 0.000491517060367454, |
| "loss": 3.349, |
| "step": 31100 |
| }, |
| { |
| "epoch": 9.078990322956745, |
| "grad_norm": 0.34682807326316833, |
| "learning_rate": 0.00049134208223972, |
| "loss": 3.3495, |
| "step": 31150 |
| }, |
| { |
| "epoch": 9.093564183280867, |
| "grad_norm": 0.34465840458869934, |
| "learning_rate": 0.000491167104111986, |
| "loss": 3.341, |
| "step": 31200 |
| }, |
| { |
| "epoch": 9.10813804360499, |
| "grad_norm": 0.31737080216407776, |
| "learning_rate": 0.000490992125984252, |
| "loss": 3.3392, |
| "step": 31250 |
| }, |
| { |
| "epoch": 9.122711903929114, |
| "grad_norm": 0.34440186619758606, |
| "learning_rate": 0.0004908171478565179, |
| "loss": 3.3524, |
| "step": 31300 |
| }, |
| { |
| "epoch": 9.137285764253235, |
| "grad_norm": 0.33889293670654297, |
| "learning_rate": 0.0004906421697287839, |
| "loss": 3.3492, |
| "step": 31350 |
| }, |
| { |
| "epoch": 9.151859624577359, |
| "grad_norm": 0.33665016293525696, |
| "learning_rate": 0.0004904671916010499, |
| "loss": 3.3546, |
| "step": 31400 |
| }, |
| { |
| "epoch": 9.16643348490148, |
| "grad_norm": 0.34208089113235474, |
| "learning_rate": 0.0004902922134733158, |
| "loss": 3.3617, |
| "step": 31450 |
| }, |
| { |
| "epoch": 9.181007345225604, |
| "grad_norm": 0.3269473612308502, |
| "learning_rate": 0.0004901172353455818, |
| "loss": 3.3576, |
| "step": 31500 |
| }, |
| { |
| "epoch": 9.195581205549725, |
| "grad_norm": 0.3334795832633972, |
| "learning_rate": 0.0004899422572178478, |
| "loss": 3.3587, |
| "step": 31550 |
| }, |
| { |
| "epoch": 9.210155065873849, |
| "grad_norm": 0.33613160252571106, |
| "learning_rate": 0.0004897672790901138, |
| "loss": 3.3765, |
| "step": 31600 |
| }, |
| { |
| "epoch": 9.22472892619797, |
| "grad_norm": 0.3446190357208252, |
| "learning_rate": 0.0004895923009623796, |
| "loss": 3.3663, |
| "step": 31650 |
| }, |
| { |
| "epoch": 9.239302786522094, |
| "grad_norm": 0.3384830057621002, |
| "learning_rate": 0.0004894173228346456, |
| "loss": 3.3831, |
| "step": 31700 |
| }, |
| { |
| "epoch": 9.253876646846217, |
| "grad_norm": 0.35532593727111816, |
| "learning_rate": 0.0004892423447069116, |
| "loss": 3.3837, |
| "step": 31750 |
| }, |
| { |
| "epoch": 9.26845050717034, |
| "grad_norm": 0.3461291790008545, |
| "learning_rate": 0.0004890673665791775, |
| "loss": 3.3761, |
| "step": 31800 |
| }, |
| { |
| "epoch": 9.283024367494463, |
| "grad_norm": 0.3272002339363098, |
| "learning_rate": 0.0004888923884514435, |
| "loss": 3.3877, |
| "step": 31850 |
| }, |
| { |
| "epoch": 9.297598227818584, |
| "grad_norm": 0.34134840965270996, |
| "learning_rate": 0.0004887174103237095, |
| "loss": 3.3749, |
| "step": 31900 |
| }, |
| { |
| "epoch": 9.312172088142708, |
| "grad_norm": 0.319865882396698, |
| "learning_rate": 0.0004885424321959754, |
| "loss": 3.3842, |
| "step": 31950 |
| }, |
| { |
| "epoch": 9.32674594846683, |
| "grad_norm": 0.33463987708091736, |
| "learning_rate": 0.0004883674540682414, |
| "loss": 3.382, |
| "step": 32000 |
| }, |
| { |
| "epoch": 9.32674594846683, |
| "eval_accuracy": 0.3662880865854079, |
| "eval_loss": 3.5837769508361816, |
| "eval_runtime": 53.6851, |
| "eval_samples_per_second": 309.713, |
| "eval_steps_per_second": 19.372, |
| "step": 32000 |
| }, |
| { |
| "epoch": 9.341319808790953, |
| "grad_norm": 0.3650956451892853, |
| "learning_rate": 0.00048819247594050736, |
| "loss": 3.3956, |
| "step": 32050 |
| }, |
| { |
| "epoch": 9.355893669115074, |
| "grad_norm": 0.3498532474040985, |
| "learning_rate": 0.00048801749781277336, |
| "loss": 3.3831, |
| "step": 32100 |
| }, |
| { |
| "epoch": 9.370467529439198, |
| "grad_norm": 0.33984145522117615, |
| "learning_rate": 0.00048784251968503936, |
| "loss": 3.3976, |
| "step": 32150 |
| }, |
| { |
| "epoch": 9.385041389763321, |
| "grad_norm": 0.3673052191734314, |
| "learning_rate": 0.0004876675415573053, |
| "loss": 3.374, |
| "step": 32200 |
| }, |
| { |
| "epoch": 9.399615250087443, |
| "grad_norm": 0.3448958694934845, |
| "learning_rate": 0.00048749256342957124, |
| "loss": 3.3875, |
| "step": 32250 |
| }, |
| { |
| "epoch": 9.414189110411566, |
| "grad_norm": 0.3689291775226593, |
| "learning_rate": 0.00048731758530183724, |
| "loss": 3.3878, |
| "step": 32300 |
| }, |
| { |
| "epoch": 9.428762970735688, |
| "grad_norm": 0.31993743777275085, |
| "learning_rate": 0.0004871426071741032, |
| "loss": 3.4137, |
| "step": 32350 |
| }, |
| { |
| "epoch": 9.443336831059812, |
| "grad_norm": 0.38846081495285034, |
| "learning_rate": 0.0004869676290463692, |
| "loss": 3.3882, |
| "step": 32400 |
| }, |
| { |
| "epoch": 9.457910691383933, |
| "grad_norm": 0.326425701379776, |
| "learning_rate": 0.0004867926509186351, |
| "loss": 3.3948, |
| "step": 32450 |
| }, |
| { |
| "epoch": 9.472484551708057, |
| "grad_norm": 0.35158270597457886, |
| "learning_rate": 0.00048661767279090107, |
| "loss": 3.4017, |
| "step": 32500 |
| }, |
| { |
| "epoch": 9.487058412032178, |
| "grad_norm": 0.36797675490379333, |
| "learning_rate": 0.00048644269466316707, |
| "loss": 3.3926, |
| "step": 32550 |
| }, |
| { |
| "epoch": 9.501632272356302, |
| "grad_norm": 0.36031150817871094, |
| "learning_rate": 0.00048626771653543306, |
| "loss": 3.4006, |
| "step": 32600 |
| }, |
| { |
| "epoch": 9.516206132680423, |
| "grad_norm": 0.3411952257156372, |
| "learning_rate": 0.00048609273840769895, |
| "loss": 3.404, |
| "step": 32650 |
| }, |
| { |
| "epoch": 9.530779993004547, |
| "grad_norm": 0.3394002616405487, |
| "learning_rate": 0.00048591776027996495, |
| "loss": 3.4056, |
| "step": 32700 |
| }, |
| { |
| "epoch": 9.54535385332867, |
| "grad_norm": 0.3555849492549896, |
| "learning_rate": 0.00048574278215223095, |
| "loss": 3.3915, |
| "step": 32750 |
| }, |
| { |
| "epoch": 9.559927713652792, |
| "grad_norm": 0.3551422953605652, |
| "learning_rate": 0.0004855678040244969, |
| "loss": 3.3977, |
| "step": 32800 |
| }, |
| { |
| "epoch": 9.574501573976915, |
| "grad_norm": 0.3270210921764374, |
| "learning_rate": 0.00048539282589676283, |
| "loss": 3.3919, |
| "step": 32850 |
| }, |
| { |
| "epoch": 9.589075434301037, |
| "grad_norm": 0.3263692855834961, |
| "learning_rate": 0.00048521784776902883, |
| "loss": 3.4081, |
| "step": 32900 |
| }, |
| { |
| "epoch": 9.60364929462516, |
| "grad_norm": 0.354744553565979, |
| "learning_rate": 0.00048504286964129483, |
| "loss": 3.4122, |
| "step": 32950 |
| }, |
| { |
| "epoch": 9.618223154949282, |
| "grad_norm": 0.32990899682044983, |
| "learning_rate": 0.0004848678915135607, |
| "loss": 3.4042, |
| "step": 33000 |
| }, |
| { |
| "epoch": 9.618223154949282, |
| "eval_accuracy": 0.3671097289827676, |
| "eval_loss": 3.5730035305023193, |
| "eval_runtime": 53.4769, |
| "eval_samples_per_second": 310.92, |
| "eval_steps_per_second": 19.448, |
| "step": 33000 |
| }, |
| { |
| "epoch": 9.632797015273406, |
| "grad_norm": 0.3566998243331909, |
| "learning_rate": 0.0004846929133858267, |
| "loss": 3.3925, |
| "step": 33050 |
| }, |
| { |
| "epoch": 9.647370875597527, |
| "grad_norm": 0.3311980366706848, |
| "learning_rate": 0.0004845179352580927, |
| "loss": 3.4096, |
| "step": 33100 |
| }, |
| { |
| "epoch": 9.66194473592165, |
| "grad_norm": 0.3281240165233612, |
| "learning_rate": 0.0004843429571303587, |
| "loss": 3.4183, |
| "step": 33150 |
| }, |
| { |
| "epoch": 9.676518596245774, |
| "grad_norm": 0.34353914856910706, |
| "learning_rate": 0.0004841679790026246, |
| "loss": 3.4089, |
| "step": 33200 |
| }, |
| { |
| "epoch": 9.691092456569896, |
| "grad_norm": 0.3408343195915222, |
| "learning_rate": 0.0004839930008748906, |
| "loss": 3.4063, |
| "step": 33250 |
| }, |
| { |
| "epoch": 9.70566631689402, |
| "grad_norm": 0.33646687865257263, |
| "learning_rate": 0.0004838180227471566, |
| "loss": 3.4033, |
| "step": 33300 |
| }, |
| { |
| "epoch": 9.720240177218141, |
| "grad_norm": 0.3331036865711212, |
| "learning_rate": 0.00048364304461942254, |
| "loss": 3.4071, |
| "step": 33350 |
| }, |
| { |
| "epoch": 9.734814037542264, |
| "grad_norm": 0.3450242280960083, |
| "learning_rate": 0.0004834680664916885, |
| "loss": 3.4055, |
| "step": 33400 |
| }, |
| { |
| "epoch": 9.749387897866386, |
| "grad_norm": 0.3468281626701355, |
| "learning_rate": 0.0004832930883639545, |
| "loss": 3.4073, |
| "step": 33450 |
| }, |
| { |
| "epoch": 9.76396175819051, |
| "grad_norm": 0.3567153215408325, |
| "learning_rate": 0.0004831181102362204, |
| "loss": 3.4164, |
| "step": 33500 |
| }, |
| { |
| "epoch": 9.778535618514631, |
| "grad_norm": 0.3392173647880554, |
| "learning_rate": 0.00048294313210848637, |
| "loss": 3.4109, |
| "step": 33550 |
| }, |
| { |
| "epoch": 9.793109478838755, |
| "grad_norm": 0.3409736156463623, |
| "learning_rate": 0.00048276815398075237, |
| "loss": 3.4057, |
| "step": 33600 |
| }, |
| { |
| "epoch": 9.807683339162878, |
| "grad_norm": 0.3547792136669159, |
| "learning_rate": 0.0004825931758530183, |
| "loss": 3.4028, |
| "step": 33650 |
| }, |
| { |
| "epoch": 9.822257199487, |
| "grad_norm": 0.3348761200904846, |
| "learning_rate": 0.0004824181977252843, |
| "loss": 3.4184, |
| "step": 33700 |
| }, |
| { |
| "epoch": 9.836831059811123, |
| "grad_norm": 0.310391366481781, |
| "learning_rate": 0.00048224321959755025, |
| "loss": 3.4075, |
| "step": 33750 |
| }, |
| { |
| "epoch": 9.851404920135245, |
| "grad_norm": 0.34502100944519043, |
| "learning_rate": 0.0004820682414698162, |
| "loss": 3.406, |
| "step": 33800 |
| }, |
| { |
| "epoch": 9.865978780459368, |
| "grad_norm": 0.3541765809059143, |
| "learning_rate": 0.0004818932633420822, |
| "loss": 3.4125, |
| "step": 33850 |
| }, |
| { |
| "epoch": 9.88055264078349, |
| "grad_norm": 0.3402947187423706, |
| "learning_rate": 0.0004817182852143482, |
| "loss": 3.4251, |
| "step": 33900 |
| }, |
| { |
| "epoch": 9.895126501107613, |
| "grad_norm": 0.3501048684120178, |
| "learning_rate": 0.0004815433070866141, |
| "loss": 3.4192, |
| "step": 33950 |
| }, |
| { |
| "epoch": 9.909700361431735, |
| "grad_norm": 0.33060869574546814, |
| "learning_rate": 0.0004813683289588801, |
| "loss": 3.4188, |
| "step": 34000 |
| }, |
| { |
| "epoch": 9.909700361431735, |
| "eval_accuracy": 0.3675928749562903, |
| "eval_loss": 3.5656988620758057, |
| "eval_runtime": 53.7371, |
| "eval_samples_per_second": 309.414, |
| "eval_steps_per_second": 19.353, |
| "step": 34000 |
| }, |
| { |
| "epoch": 9.924274221755859, |
| "grad_norm": 0.3279787600040436, |
| "learning_rate": 0.0004811933508311461, |
| "loss": 3.4191, |
| "step": 34050 |
| }, |
| { |
| "epoch": 9.938848082079982, |
| "grad_norm": 0.33069542050361633, |
| "learning_rate": 0.00048101837270341207, |
| "loss": 3.4223, |
| "step": 34100 |
| }, |
| { |
| "epoch": 9.953421942404104, |
| "grad_norm": 0.32148513197898865, |
| "learning_rate": 0.00048084339457567796, |
| "loss": 3.4304, |
| "step": 34150 |
| }, |
| { |
| "epoch": 9.967995802728227, |
| "grad_norm": 0.337943971157074, |
| "learning_rate": 0.00048066841644794396, |
| "loss": 3.4164, |
| "step": 34200 |
| }, |
| { |
| "epoch": 9.982569663052349, |
| "grad_norm": 0.32066062092781067, |
| "learning_rate": 0.00048049343832020996, |
| "loss": 3.4176, |
| "step": 34250 |
| }, |
| { |
| "epoch": 9.997143523376472, |
| "grad_norm": 0.32504358887672424, |
| "learning_rate": 0.00048031846019247595, |
| "loss": 3.422, |
| "step": 34300 |
| }, |
| { |
| "epoch": 10.011659088259298, |
| "grad_norm": 0.32396450638771057, |
| "learning_rate": 0.00048014348206474184, |
| "loss": 3.3196, |
| "step": 34350 |
| }, |
| { |
| "epoch": 10.02623294858342, |
| "grad_norm": 0.3505806028842926, |
| "learning_rate": 0.00047996850393700784, |
| "loss": 3.3125, |
| "step": 34400 |
| }, |
| { |
| "epoch": 10.040806808907544, |
| "grad_norm": 0.31957384943962097, |
| "learning_rate": 0.00047979352580927384, |
| "loss": 3.2967, |
| "step": 34450 |
| }, |
| { |
| "epoch": 10.055380669231667, |
| "grad_norm": 0.34067434072494507, |
| "learning_rate": 0.00047961854768153973, |
| "loss": 3.3204, |
| "step": 34500 |
| }, |
| { |
| "epoch": 10.069954529555789, |
| "grad_norm": 0.36348363757133484, |
| "learning_rate": 0.0004794435695538057, |
| "loss": 3.3163, |
| "step": 34550 |
| }, |
| { |
| "epoch": 10.084528389879912, |
| "grad_norm": 0.34538960456848145, |
| "learning_rate": 0.0004792685914260717, |
| "loss": 3.3176, |
| "step": 34600 |
| }, |
| { |
| "epoch": 10.099102250204034, |
| "grad_norm": 0.35785046219825745, |
| "learning_rate": 0.00047909361329833767, |
| "loss": 3.3298, |
| "step": 34650 |
| }, |
| { |
| "epoch": 10.113676110528157, |
| "grad_norm": 0.35128745436668396, |
| "learning_rate": 0.0004789186351706036, |
| "loss": 3.3251, |
| "step": 34700 |
| }, |
| { |
| "epoch": 10.128249970852279, |
| "grad_norm": 0.33623847365379333, |
| "learning_rate": 0.0004787436570428696, |
| "loss": 3.3255, |
| "step": 34750 |
| }, |
| { |
| "epoch": 10.142823831176402, |
| "grad_norm": 0.3497258722782135, |
| "learning_rate": 0.00047856867891513555, |
| "loss": 3.3251, |
| "step": 34800 |
| }, |
| { |
| "epoch": 10.157397691500524, |
| "grad_norm": 0.3594476878643036, |
| "learning_rate": 0.00047839370078740155, |
| "loss": 3.3389, |
| "step": 34850 |
| }, |
| { |
| "epoch": 10.171971551824647, |
| "grad_norm": 0.3446819484233856, |
| "learning_rate": 0.0004782187226596675, |
| "loss": 3.3343, |
| "step": 34900 |
| }, |
| { |
| "epoch": 10.18654541214877, |
| "grad_norm": 0.36314183473587036, |
| "learning_rate": 0.00047804374453193344, |
| "loss": 3.3541, |
| "step": 34950 |
| }, |
| { |
| "epoch": 10.201119272472893, |
| "grad_norm": 0.35615265369415283, |
| "learning_rate": 0.00047786876640419943, |
| "loss": 3.3418, |
| "step": 35000 |
| }, |
| { |
| "epoch": 10.201119272472893, |
| "eval_accuracy": 0.36732770373135815, |
| "eval_loss": 3.5762977600097656, |
| "eval_runtime": 53.5644, |
| "eval_samples_per_second": 310.411, |
| "eval_steps_per_second": 19.416, |
| "step": 35000 |
| }, |
| { |
| "epoch": 10.215693132797016, |
| "grad_norm": 0.32317477464675903, |
| "learning_rate": 0.00047769378827646543, |
| "loss": 3.3544, |
| "step": 35050 |
| }, |
| { |
| "epoch": 10.230266993121138, |
| "grad_norm": 0.33711734414100647, |
| "learning_rate": 0.0004775188101487313, |
| "loss": 3.3419, |
| "step": 35100 |
| }, |
| { |
| "epoch": 10.244840853445261, |
| "grad_norm": 0.3401689827442169, |
| "learning_rate": 0.0004773438320209973, |
| "loss": 3.344, |
| "step": 35150 |
| }, |
| { |
| "epoch": 10.259414713769383, |
| "grad_norm": 0.34233129024505615, |
| "learning_rate": 0.0004771688538932633, |
| "loss": 3.3536, |
| "step": 35200 |
| }, |
| { |
| "epoch": 10.273988574093506, |
| "grad_norm": 0.3696705102920532, |
| "learning_rate": 0.0004769938757655293, |
| "loss": 3.3555, |
| "step": 35250 |
| }, |
| { |
| "epoch": 10.288562434417628, |
| "grad_norm": 0.33182162046432495, |
| "learning_rate": 0.0004768188976377952, |
| "loss": 3.3531, |
| "step": 35300 |
| }, |
| { |
| "epoch": 10.303136294741751, |
| "grad_norm": 0.34486010670661926, |
| "learning_rate": 0.0004766439195100612, |
| "loss": 3.362, |
| "step": 35350 |
| }, |
| { |
| "epoch": 10.317710155065875, |
| "grad_norm": 0.34664061665534973, |
| "learning_rate": 0.0004764689413823272, |
| "loss": 3.3657, |
| "step": 35400 |
| }, |
| { |
| "epoch": 10.332284015389996, |
| "grad_norm": 0.3353883624076843, |
| "learning_rate": 0.0004762939632545931, |
| "loss": 3.344, |
| "step": 35450 |
| }, |
| { |
| "epoch": 10.34685787571412, |
| "grad_norm": 0.3351586163043976, |
| "learning_rate": 0.0004761189851268591, |
| "loss": 3.3485, |
| "step": 35500 |
| }, |
| { |
| "epoch": 10.361431736038242, |
| "grad_norm": 0.3284741938114166, |
| "learning_rate": 0.0004759440069991251, |
| "loss": 3.369, |
| "step": 35550 |
| }, |
| { |
| "epoch": 10.376005596362365, |
| "grad_norm": 0.33612391352653503, |
| "learning_rate": 0.0004757690288713911, |
| "loss": 3.3685, |
| "step": 35600 |
| }, |
| { |
| "epoch": 10.390579456686487, |
| "grad_norm": 0.3895581364631653, |
| "learning_rate": 0.00047559405074365697, |
| "loss": 3.361, |
| "step": 35650 |
| }, |
| { |
| "epoch": 10.40515331701061, |
| "grad_norm": 0.3500942885875702, |
| "learning_rate": 0.00047541907261592297, |
| "loss": 3.3694, |
| "step": 35700 |
| }, |
| { |
| "epoch": 10.419727177334732, |
| "grad_norm": 0.3412453830242157, |
| "learning_rate": 0.00047524409448818897, |
| "loss": 3.3642, |
| "step": 35750 |
| }, |
| { |
| "epoch": 10.434301037658855, |
| "grad_norm": 0.35120752453804016, |
| "learning_rate": 0.0004750691163604549, |
| "loss": 3.3689, |
| "step": 35800 |
| }, |
| { |
| "epoch": 10.448874897982979, |
| "grad_norm": 0.33640050888061523, |
| "learning_rate": 0.00047489413823272085, |
| "loss": 3.3804, |
| "step": 35850 |
| }, |
| { |
| "epoch": 10.4634487583071, |
| "grad_norm": 0.33409053087234497, |
| "learning_rate": 0.00047471916010498685, |
| "loss": 3.3694, |
| "step": 35900 |
| }, |
| { |
| "epoch": 10.478022618631224, |
| "grad_norm": 0.37299105525016785, |
| "learning_rate": 0.0004745441819772528, |
| "loss": 3.3716, |
| "step": 35950 |
| }, |
| { |
| "epoch": 10.492596478955345, |
| "grad_norm": 0.35533326864242554, |
| "learning_rate": 0.0004743692038495188, |
| "loss": 3.3722, |
| "step": 36000 |
| }, |
| { |
| "epoch": 10.492596478955345, |
| "eval_accuracy": 0.36726367659138337, |
| "eval_loss": 3.571775436401367, |
| "eval_runtime": 53.7055, |
| "eval_samples_per_second": 309.596, |
| "eval_steps_per_second": 19.365, |
| "step": 36000 |
| }, |
| { |
| "epoch": 10.507170339279469, |
| "grad_norm": 0.318449467420578, |
| "learning_rate": 0.00047419422572178474, |
| "loss": 3.3661, |
| "step": 36050 |
| }, |
| { |
| "epoch": 10.52174419960359, |
| "grad_norm": 0.3540860116481781, |
| "learning_rate": 0.0004740192475940507, |
| "loss": 3.3702, |
| "step": 36100 |
| }, |
| { |
| "epoch": 10.536318059927714, |
| "grad_norm": 0.3569958209991455, |
| "learning_rate": 0.0004738442694663167, |
| "loss": 3.384, |
| "step": 36150 |
| }, |
| { |
| "epoch": 10.550891920251836, |
| "grad_norm": 0.3365253508090973, |
| "learning_rate": 0.0004736692913385827, |
| "loss": 3.3755, |
| "step": 36200 |
| }, |
| { |
| "epoch": 10.565465780575959, |
| "grad_norm": 0.3488047420978546, |
| "learning_rate": 0.00047349431321084856, |
| "loss": 3.3857, |
| "step": 36250 |
| }, |
| { |
| "epoch": 10.580039640900083, |
| "grad_norm": 0.37860432267189026, |
| "learning_rate": 0.00047331933508311456, |
| "loss": 3.3859, |
| "step": 36300 |
| }, |
| { |
| "epoch": 10.594613501224204, |
| "grad_norm": 0.3671683967113495, |
| "learning_rate": 0.00047314435695538056, |
| "loss": 3.3862, |
| "step": 36350 |
| }, |
| { |
| "epoch": 10.609187361548328, |
| "grad_norm": 0.3138248026371002, |
| "learning_rate": 0.00047296937882764645, |
| "loss": 3.3724, |
| "step": 36400 |
| }, |
| { |
| "epoch": 10.62376122187245, |
| "grad_norm": 0.36282312870025635, |
| "learning_rate": 0.00047279440069991245, |
| "loss": 3.4016, |
| "step": 36450 |
| }, |
| { |
| "epoch": 10.638335082196573, |
| "grad_norm": 0.3552148938179016, |
| "learning_rate": 0.00047261942257217844, |
| "loss": 3.3774, |
| "step": 36500 |
| }, |
| { |
| "epoch": 10.652908942520694, |
| "grad_norm": 0.339053213596344, |
| "learning_rate": 0.00047244444444444444, |
| "loss": 3.3964, |
| "step": 36550 |
| }, |
| { |
| "epoch": 10.667482802844818, |
| "grad_norm": 0.3407762050628662, |
| "learning_rate": 0.00047226946631671033, |
| "loss": 3.3886, |
| "step": 36600 |
| }, |
| { |
| "epoch": 10.68205666316894, |
| "grad_norm": 0.34318679571151733, |
| "learning_rate": 0.00047209448818897633, |
| "loss": 3.406, |
| "step": 36650 |
| }, |
| { |
| "epoch": 10.696630523493063, |
| "grad_norm": 0.3250613510608673, |
| "learning_rate": 0.0004719195100612423, |
| "loss": 3.3906, |
| "step": 36700 |
| }, |
| { |
| "epoch": 10.711204383817186, |
| "grad_norm": 0.32701337337493896, |
| "learning_rate": 0.00047174453193350827, |
| "loss": 3.3891, |
| "step": 36750 |
| }, |
| { |
| "epoch": 10.725778244141308, |
| "grad_norm": 0.34809330105781555, |
| "learning_rate": 0.0004715695538057742, |
| "loss": 3.3955, |
| "step": 36800 |
| }, |
| { |
| "epoch": 10.740352104465432, |
| "grad_norm": 0.38022422790527344, |
| "learning_rate": 0.0004713945756780402, |
| "loss": 3.3967, |
| "step": 36850 |
| }, |
| { |
| "epoch": 10.754925964789553, |
| "grad_norm": 0.3442574441432953, |
| "learning_rate": 0.0004712195975503062, |
| "loss": 3.3681, |
| "step": 36900 |
| }, |
| { |
| "epoch": 10.769499825113677, |
| "grad_norm": 0.3335849940776825, |
| "learning_rate": 0.00047104461942257215, |
| "loss": 3.3883, |
| "step": 36950 |
| }, |
| { |
| "epoch": 10.784073685437798, |
| "grad_norm": 0.34586572647094727, |
| "learning_rate": 0.0004708696412948381, |
| "loss": 3.3859, |
| "step": 37000 |
| }, |
| { |
| "epoch": 10.784073685437798, |
| "eval_accuracy": 0.3684486494687101, |
| "eval_loss": 3.5641283988952637, |
| "eval_runtime": 53.7119, |
| "eval_samples_per_second": 309.559, |
| "eval_steps_per_second": 19.363, |
| "step": 37000 |
| }, |
| { |
| "epoch": 10.798647545761922, |
| "grad_norm": 0.3282599151134491, |
| "learning_rate": 0.0004706946631671041, |
| "loss": 3.3869, |
| "step": 37050 |
| }, |
| { |
| "epoch": 10.813221406086043, |
| "grad_norm": 0.3208552300930023, |
| "learning_rate": 0.00047051968503937004, |
| "loss": 3.3761, |
| "step": 37100 |
| }, |
| { |
| "epoch": 10.827795266410167, |
| "grad_norm": 0.33274680376052856, |
| "learning_rate": 0.000470344706911636, |
| "loss": 3.3924, |
| "step": 37150 |
| }, |
| { |
| "epoch": 10.84236912673429, |
| "grad_norm": 0.34555304050445557, |
| "learning_rate": 0.000470169728783902, |
| "loss": 3.3937, |
| "step": 37200 |
| }, |
| { |
| "epoch": 10.856942987058412, |
| "grad_norm": 0.34741854667663574, |
| "learning_rate": 0.0004699947506561679, |
| "loss": 3.3877, |
| "step": 37250 |
| }, |
| { |
| "epoch": 10.871516847382535, |
| "grad_norm": 0.3210920989513397, |
| "learning_rate": 0.0004698197725284339, |
| "loss": 3.4016, |
| "step": 37300 |
| }, |
| { |
| "epoch": 10.886090707706657, |
| "grad_norm": 0.35668283700942993, |
| "learning_rate": 0.00046964479440069986, |
| "loss": 3.4034, |
| "step": 37350 |
| }, |
| { |
| "epoch": 10.90066456803078, |
| "grad_norm": 0.330739289522171, |
| "learning_rate": 0.0004694698162729658, |
| "loss": 3.3983, |
| "step": 37400 |
| }, |
| { |
| "epoch": 10.915238428354902, |
| "grad_norm": 0.3247140347957611, |
| "learning_rate": 0.0004692948381452318, |
| "loss": 3.3936, |
| "step": 37450 |
| }, |
| { |
| "epoch": 10.929812288679026, |
| "grad_norm": 0.34280630946159363, |
| "learning_rate": 0.0004691198600174978, |
| "loss": 3.4012, |
| "step": 37500 |
| }, |
| { |
| "epoch": 10.944386149003147, |
| "grad_norm": 0.3623274564743042, |
| "learning_rate": 0.0004689448818897637, |
| "loss": 3.3912, |
| "step": 37550 |
| }, |
| { |
| "epoch": 10.95896000932727, |
| "grad_norm": 0.3515354096889496, |
| "learning_rate": 0.0004687699037620297, |
| "loss": 3.3979, |
| "step": 37600 |
| }, |
| { |
| "epoch": 10.973533869651392, |
| "grad_norm": 0.3107682168483734, |
| "learning_rate": 0.0004685949256342957, |
| "loss": 3.4063, |
| "step": 37650 |
| }, |
| { |
| "epoch": 10.988107729975516, |
| "grad_norm": 0.33212000131607056, |
| "learning_rate": 0.0004684199475065617, |
| "loss": 3.3897, |
| "step": 37700 |
| }, |
| { |
| "epoch": 11.002623294858342, |
| "grad_norm": 0.34243836998939514, |
| "learning_rate": 0.0004682449693788276, |
| "loss": 3.3857, |
| "step": 37750 |
| }, |
| { |
| "epoch": 11.017197155182465, |
| "grad_norm": 0.32494038343429565, |
| "learning_rate": 0.00046806999125109357, |
| "loss": 3.2879, |
| "step": 37800 |
| }, |
| { |
| "epoch": 11.031771015506587, |
| "grad_norm": 0.3623080253601074, |
| "learning_rate": 0.00046789501312335957, |
| "loss": 3.2829, |
| "step": 37850 |
| }, |
| { |
| "epoch": 11.04634487583071, |
| "grad_norm": 0.4331565797328949, |
| "learning_rate": 0.0004677200349956255, |
| "loss": 3.2984, |
| "step": 37900 |
| }, |
| { |
| "epoch": 11.060918736154832, |
| "grad_norm": 0.345201313495636, |
| "learning_rate": 0.00046754505686789146, |
| "loss": 3.2901, |
| "step": 37950 |
| }, |
| { |
| "epoch": 11.075492596478956, |
| "grad_norm": 0.36016756296157837, |
| "learning_rate": 0.00046737007874015745, |
| "loss": 3.2956, |
| "step": 38000 |
| }, |
| { |
| "epoch": 11.075492596478956, |
| "eval_accuracy": 0.36816947230690844, |
| "eval_loss": 3.572314500808716, |
| "eval_runtime": 53.4893, |
| "eval_samples_per_second": 310.847, |
| "eval_steps_per_second": 19.443, |
| "step": 38000 |
| }, |
| { |
| "epoch": 11.090066456803077, |
| "grad_norm": 0.33840474486351013, |
| "learning_rate": 0.0004671951006124234, |
| "loss": 3.2932, |
| "step": 38050 |
| }, |
| { |
| "epoch": 11.1046403171272, |
| "grad_norm": 0.35402387380599976, |
| "learning_rate": 0.00046702012248468934, |
| "loss": 3.3106, |
| "step": 38100 |
| }, |
| { |
| "epoch": 11.119214177451322, |
| "grad_norm": 0.34707942605018616, |
| "learning_rate": 0.00046684514435695534, |
| "loss": 3.3193, |
| "step": 38150 |
| }, |
| { |
| "epoch": 11.133788037775446, |
| "grad_norm": 0.34295645356178284, |
| "learning_rate": 0.00046667016622922134, |
| "loss": 3.3037, |
| "step": 38200 |
| }, |
| { |
| "epoch": 11.14836189809957, |
| "grad_norm": 0.3585542142391205, |
| "learning_rate": 0.0004664951881014873, |
| "loss": 3.3092, |
| "step": 38250 |
| }, |
| { |
| "epoch": 11.162935758423691, |
| "grad_norm": 0.33070051670074463, |
| "learning_rate": 0.0004663202099737532, |
| "loss": 3.3223, |
| "step": 38300 |
| }, |
| { |
| "epoch": 11.177509618747814, |
| "grad_norm": 0.3767431676387787, |
| "learning_rate": 0.0004661452318460192, |
| "loss": 3.3158, |
| "step": 38350 |
| }, |
| { |
| "epoch": 11.192083479071936, |
| "grad_norm": 0.342354953289032, |
| "learning_rate": 0.00046597025371828516, |
| "loss": 3.3312, |
| "step": 38400 |
| }, |
| { |
| "epoch": 11.20665733939606, |
| "grad_norm": 0.3583531975746155, |
| "learning_rate": 0.00046579527559055116, |
| "loss": 3.3151, |
| "step": 38450 |
| }, |
| { |
| "epoch": 11.221231199720181, |
| "grad_norm": 0.34673362970352173, |
| "learning_rate": 0.0004656202974628171, |
| "loss": 3.3251, |
| "step": 38500 |
| }, |
| { |
| "epoch": 11.235805060044305, |
| "grad_norm": 0.3563697636127472, |
| "learning_rate": 0.00046544531933508305, |
| "loss": 3.3311, |
| "step": 38550 |
| }, |
| { |
| "epoch": 11.250378920368426, |
| "grad_norm": 0.347130686044693, |
| "learning_rate": 0.00046527034120734905, |
| "loss": 3.3333, |
| "step": 38600 |
| }, |
| { |
| "epoch": 11.26495278069255, |
| "grad_norm": 0.35779157280921936, |
| "learning_rate": 0.00046509536307961504, |
| "loss": 3.3241, |
| "step": 38650 |
| }, |
| { |
| "epoch": 11.279526641016673, |
| "grad_norm": 0.3590455949306488, |
| "learning_rate": 0.00046492038495188093, |
| "loss": 3.329, |
| "step": 38700 |
| }, |
| { |
| "epoch": 11.294100501340795, |
| "grad_norm": 0.37478992342948914, |
| "learning_rate": 0.00046474540682414693, |
| "loss": 3.3282, |
| "step": 38750 |
| }, |
| { |
| "epoch": 11.308674361664918, |
| "grad_norm": 0.3612726628780365, |
| "learning_rate": 0.00046457042869641293, |
| "loss": 3.3343, |
| "step": 38800 |
| }, |
| { |
| "epoch": 11.32324822198904, |
| "grad_norm": 0.33900731801986694, |
| "learning_rate": 0.0004643954505686789, |
| "loss": 3.3377, |
| "step": 38850 |
| }, |
| { |
| "epoch": 11.337822082313163, |
| "grad_norm": 0.37971436977386475, |
| "learning_rate": 0.0004642204724409448, |
| "loss": 3.3379, |
| "step": 38900 |
| }, |
| { |
| "epoch": 11.352395942637285, |
| "grad_norm": 0.3361247777938843, |
| "learning_rate": 0.0004640454943132108, |
| "loss": 3.3493, |
| "step": 38950 |
| }, |
| { |
| "epoch": 11.366969802961409, |
| "grad_norm": 0.3497176170349121, |
| "learning_rate": 0.0004638705161854768, |
| "loss": 3.3474, |
| "step": 39000 |
| }, |
| { |
| "epoch": 11.366969802961409, |
| "eval_accuracy": 0.3682512716861041, |
| "eval_loss": 3.570511817932129, |
| "eval_runtime": 53.493, |
| "eval_samples_per_second": 310.826, |
| "eval_steps_per_second": 19.442, |
| "step": 39000 |
| }, |
| { |
| "epoch": 11.38154366328553, |
| "grad_norm": 0.3290112614631653, |
| "learning_rate": 0.0004636955380577427, |
| "loss": 3.3496, |
| "step": 39050 |
| }, |
| { |
| "epoch": 11.396117523609654, |
| "grad_norm": 0.3501684069633484, |
| "learning_rate": 0.0004635205599300087, |
| "loss": 3.3368, |
| "step": 39100 |
| }, |
| { |
| "epoch": 11.410691383933777, |
| "grad_norm": 0.38452014327049255, |
| "learning_rate": 0.0004633455818022747, |
| "loss": 3.3386, |
| "step": 39150 |
| }, |
| { |
| "epoch": 11.425265244257899, |
| "grad_norm": 0.33995065093040466, |
| "learning_rate": 0.00046317060367454064, |
| "loss": 3.3576, |
| "step": 39200 |
| }, |
| { |
| "epoch": 11.439839104582022, |
| "grad_norm": 0.35238489508628845, |
| "learning_rate": 0.0004629956255468066, |
| "loss": 3.3576, |
| "step": 39250 |
| }, |
| { |
| "epoch": 11.454412964906144, |
| "grad_norm": 0.32981160283088684, |
| "learning_rate": 0.0004628206474190726, |
| "loss": 3.3364, |
| "step": 39300 |
| }, |
| { |
| "epoch": 11.468986825230267, |
| "grad_norm": 0.3768675923347473, |
| "learning_rate": 0.0004626456692913385, |
| "loss": 3.3575, |
| "step": 39350 |
| }, |
| { |
| "epoch": 11.483560685554389, |
| "grad_norm": 0.38474708795547485, |
| "learning_rate": 0.0004624706911636045, |
| "loss": 3.3563, |
| "step": 39400 |
| }, |
| { |
| "epoch": 11.498134545878512, |
| "grad_norm": 0.331777423620224, |
| "learning_rate": 0.00046229571303587046, |
| "loss": 3.3579, |
| "step": 39450 |
| }, |
| { |
| "epoch": 11.512708406202634, |
| "grad_norm": 0.34093236923217773, |
| "learning_rate": 0.00046212073490813646, |
| "loss": 3.3583, |
| "step": 39500 |
| }, |
| { |
| "epoch": 11.527282266526758, |
| "grad_norm": 0.34337228536605835, |
| "learning_rate": 0.0004619457567804024, |
| "loss": 3.363, |
| "step": 39550 |
| }, |
| { |
| "epoch": 11.541856126850881, |
| "grad_norm": 0.3778719902038574, |
| "learning_rate": 0.0004617707786526684, |
| "loss": 3.3631, |
| "step": 39600 |
| }, |
| { |
| "epoch": 11.556429987175003, |
| "grad_norm": 0.36301884055137634, |
| "learning_rate": 0.00046159580052493435, |
| "loss": 3.3479, |
| "step": 39650 |
| }, |
| { |
| "epoch": 11.571003847499126, |
| "grad_norm": 0.3672805428504944, |
| "learning_rate": 0.0004614208223972003, |
| "loss": 3.3568, |
| "step": 39700 |
| }, |
| { |
| "epoch": 11.585577707823248, |
| "grad_norm": 0.33438730239868164, |
| "learning_rate": 0.0004612458442694663, |
| "loss": 3.3497, |
| "step": 39750 |
| }, |
| { |
| "epoch": 11.600151568147371, |
| "grad_norm": 0.35404863953590393, |
| "learning_rate": 0.0004610708661417323, |
| "loss": 3.349, |
| "step": 39800 |
| }, |
| { |
| "epoch": 11.614725428471493, |
| "grad_norm": 0.36011001467704773, |
| "learning_rate": 0.0004608958880139982, |
| "loss": 3.3656, |
| "step": 39850 |
| }, |
| { |
| "epoch": 11.629299288795616, |
| "grad_norm": 0.3489465117454529, |
| "learning_rate": 0.00046072090988626417, |
| "loss": 3.3575, |
| "step": 39900 |
| }, |
| { |
| "epoch": 11.643873149119738, |
| "grad_norm": 0.323932409286499, |
| "learning_rate": 0.00046054593175853017, |
| "loss": 3.3624, |
| "step": 39950 |
| }, |
| { |
| "epoch": 11.658447009443861, |
| "grad_norm": 0.35972705483436584, |
| "learning_rate": 0.00046037095363079606, |
| "loss": 3.3665, |
| "step": 40000 |
| }, |
| { |
| "epoch": 11.658447009443861, |
| "eval_accuracy": 0.3689142585969088, |
| "eval_loss": 3.5612871646881104, |
| "eval_runtime": 53.4892, |
| "eval_samples_per_second": 310.848, |
| "eval_steps_per_second": 19.443, |
| "step": 40000 |
| }, |
| { |
| "epoch": 11.673020869767985, |
| "grad_norm": 0.34827083349227905, |
| "learning_rate": 0.00046019597550306206, |
| "loss": 3.366, |
| "step": 40050 |
| }, |
| { |
| "epoch": 11.687594730092107, |
| "grad_norm": 0.3732535243034363, |
| "learning_rate": 0.00046002099737532806, |
| "loss": 3.3773, |
| "step": 40100 |
| }, |
| { |
| "epoch": 11.70216859041623, |
| "grad_norm": 0.3575330972671509, |
| "learning_rate": 0.00045984601924759405, |
| "loss": 3.3686, |
| "step": 40150 |
| }, |
| { |
| "epoch": 11.716742450740352, |
| "grad_norm": 0.3412376046180725, |
| "learning_rate": 0.00045967104111985994, |
| "loss": 3.3683, |
| "step": 40200 |
| }, |
| { |
| "epoch": 11.731316311064475, |
| "grad_norm": 0.3610732853412628, |
| "learning_rate": 0.00045949606299212594, |
| "loss": 3.3844, |
| "step": 40250 |
| }, |
| { |
| "epoch": 11.745890171388597, |
| "grad_norm": 0.3689727187156677, |
| "learning_rate": 0.00045932108486439194, |
| "loss": 3.3658, |
| "step": 40300 |
| }, |
| { |
| "epoch": 11.76046403171272, |
| "grad_norm": 0.3771913945674896, |
| "learning_rate": 0.0004591461067366579, |
| "loss": 3.3818, |
| "step": 40350 |
| }, |
| { |
| "epoch": 11.775037892036842, |
| "grad_norm": 0.3590978682041168, |
| "learning_rate": 0.0004589711286089238, |
| "loss": 3.3736, |
| "step": 40400 |
| }, |
| { |
| "epoch": 11.789611752360965, |
| "grad_norm": 0.3486863076686859, |
| "learning_rate": 0.0004587961504811898, |
| "loss": 3.3657, |
| "step": 40450 |
| }, |
| { |
| "epoch": 11.804185612685089, |
| "grad_norm": 0.3921540379524231, |
| "learning_rate": 0.00045862117235345577, |
| "loss": 3.3685, |
| "step": 40500 |
| }, |
| { |
| "epoch": 11.81875947300921, |
| "grad_norm": 0.350303590297699, |
| "learning_rate": 0.00045844619422572176, |
| "loss": 3.3825, |
| "step": 40550 |
| }, |
| { |
| "epoch": 11.833333333333334, |
| "grad_norm": 0.3501862585544586, |
| "learning_rate": 0.0004582712160979877, |
| "loss": 3.3793, |
| "step": 40600 |
| }, |
| { |
| "epoch": 11.847907193657456, |
| "grad_norm": 0.3574955463409424, |
| "learning_rate": 0.00045809623797025365, |
| "loss": 3.358, |
| "step": 40650 |
| }, |
| { |
| "epoch": 11.862481053981579, |
| "grad_norm": 0.36220476031303406, |
| "learning_rate": 0.00045792125984251965, |
| "loss": 3.3806, |
| "step": 40700 |
| }, |
| { |
| "epoch": 11.8770549143057, |
| "grad_norm": 0.3598235249519348, |
| "learning_rate": 0.0004577462817147856, |
| "loss": 3.3678, |
| "step": 40750 |
| }, |
| { |
| "epoch": 11.891628774629824, |
| "grad_norm": 0.36743849515914917, |
| "learning_rate": 0.0004575713035870516, |
| "loss": 3.3763, |
| "step": 40800 |
| }, |
| { |
| "epoch": 11.906202634953946, |
| "grad_norm": 0.36621856689453125, |
| "learning_rate": 0.00045739632545931753, |
| "loss": 3.3798, |
| "step": 40850 |
| }, |
| { |
| "epoch": 11.92077649527807, |
| "grad_norm": 0.36214226484298706, |
| "learning_rate": 0.00045722134733158353, |
| "loss": 3.3725, |
| "step": 40900 |
| }, |
| { |
| "epoch": 11.935350355602193, |
| "grad_norm": 0.34808284044265747, |
| "learning_rate": 0.0004570463692038495, |
| "loss": 3.3819, |
| "step": 40950 |
| }, |
| { |
| "epoch": 11.949924215926314, |
| "grad_norm": 0.3413713276386261, |
| "learning_rate": 0.0004568713910761154, |
| "loss": 3.3872, |
| "step": 41000 |
| }, |
| { |
| "epoch": 11.949924215926314, |
| "eval_accuracy": 0.3696621050075697, |
| "eval_loss": 3.5527102947235107, |
| "eval_runtime": 53.5899, |
| "eval_samples_per_second": 310.264, |
| "eval_steps_per_second": 19.407, |
| "step": 41000 |
| }, |
| { |
| "epoch": 11.964498076250438, |
| "grad_norm": 0.3228790760040283, |
| "learning_rate": 0.0004566964129483814, |
| "loss": 3.3829, |
| "step": 41050 |
| }, |
| { |
| "epoch": 11.97907193657456, |
| "grad_norm": 0.3723450005054474, |
| "learning_rate": 0.0004565214348206474, |
| "loss": 3.3849, |
| "step": 41100 |
| }, |
| { |
| "epoch": 11.993645796898683, |
| "grad_norm": 0.3735453188419342, |
| "learning_rate": 0.0004563464566929133, |
| "loss": 3.3929, |
| "step": 41150 |
| }, |
| { |
| "epoch": 12.008161361781509, |
| "grad_norm": 0.3837350606918335, |
| "learning_rate": 0.0004561714785651793, |
| "loss": 3.3206, |
| "step": 41200 |
| }, |
| { |
| "epoch": 12.02273522210563, |
| "grad_norm": 0.3676198124885559, |
| "learning_rate": 0.0004559965004374453, |
| "loss": 3.2703, |
| "step": 41250 |
| }, |
| { |
| "epoch": 12.037309082429754, |
| "grad_norm": 0.3204810321331024, |
| "learning_rate": 0.0004558215223097113, |
| "loss": 3.2777, |
| "step": 41300 |
| }, |
| { |
| "epoch": 12.051882942753876, |
| "grad_norm": 0.34587329626083374, |
| "learning_rate": 0.0004556465441819772, |
| "loss": 3.2701, |
| "step": 41350 |
| }, |
| { |
| "epoch": 12.066456803078, |
| "grad_norm": 0.3423779010772705, |
| "learning_rate": 0.0004554715660542432, |
| "loss": 3.2759, |
| "step": 41400 |
| }, |
| { |
| "epoch": 12.081030663402123, |
| "grad_norm": 0.3461553454399109, |
| "learning_rate": 0.0004552965879265092, |
| "loss": 3.2888, |
| "step": 41450 |
| }, |
| { |
| "epoch": 12.095604523726244, |
| "grad_norm": 0.3680129051208496, |
| "learning_rate": 0.0004551216097987751, |
| "loss": 3.2922, |
| "step": 41500 |
| }, |
| { |
| "epoch": 12.110178384050368, |
| "grad_norm": 0.34745341539382935, |
| "learning_rate": 0.00045494663167104107, |
| "loss": 3.2875, |
| "step": 41550 |
| }, |
| { |
| "epoch": 12.12475224437449, |
| "grad_norm": 0.3380335569381714, |
| "learning_rate": 0.00045477165354330706, |
| "loss": 3.2938, |
| "step": 41600 |
| }, |
| { |
| "epoch": 12.139326104698613, |
| "grad_norm": 0.3920893967151642, |
| "learning_rate": 0.000454596675415573, |
| "loss": 3.3019, |
| "step": 41650 |
| }, |
| { |
| "epoch": 12.153899965022735, |
| "grad_norm": 0.34419387578964233, |
| "learning_rate": 0.00045442169728783895, |
| "loss": 3.294, |
| "step": 41700 |
| }, |
| { |
| "epoch": 12.168473825346858, |
| "grad_norm": 0.3551051616668701, |
| "learning_rate": 0.00045424671916010495, |
| "loss": 3.3003, |
| "step": 41750 |
| }, |
| { |
| "epoch": 12.18304768567098, |
| "grad_norm": 0.34203040599823, |
| "learning_rate": 0.0004540717410323709, |
| "loss": 3.3046, |
| "step": 41800 |
| }, |
| { |
| "epoch": 12.197621545995103, |
| "grad_norm": 0.36551839113235474, |
| "learning_rate": 0.0004538967629046369, |
| "loss": 3.3137, |
| "step": 41850 |
| }, |
| { |
| "epoch": 12.212195406319227, |
| "grad_norm": 0.3976987302303314, |
| "learning_rate": 0.00045372178477690283, |
| "loss": 3.3056, |
| "step": 41900 |
| }, |
| { |
| "epoch": 12.226769266643348, |
| "grad_norm": 0.3605201542377472, |
| "learning_rate": 0.0004535468066491688, |
| "loss": 3.308, |
| "step": 41950 |
| }, |
| { |
| "epoch": 12.241343126967472, |
| "grad_norm": 0.3401413559913635, |
| "learning_rate": 0.0004533718285214348, |
| "loss": 3.3103, |
| "step": 42000 |
| }, |
| { |
| "epoch": 12.241343126967472, |
| "eval_accuracy": 0.3690327794240312, |
| "eval_loss": 3.5694046020507812, |
| "eval_runtime": 53.4562, |
| "eval_samples_per_second": 311.04, |
| "eval_steps_per_second": 19.455, |
| "step": 42000 |
| }, |
| { |
| "epoch": 12.255916987291593, |
| "grad_norm": 0.3535427749156952, |
| "learning_rate": 0.00045319685039370077, |
| "loss": 3.3055, |
| "step": 42050 |
| }, |
| { |
| "epoch": 12.270490847615717, |
| "grad_norm": 0.3288796544075012, |
| "learning_rate": 0.0004530218722659667, |
| "loss": 3.3139, |
| "step": 42100 |
| }, |
| { |
| "epoch": 12.285064707939839, |
| "grad_norm": 0.36437782645225525, |
| "learning_rate": 0.00045284689413823266, |
| "loss": 3.3208, |
| "step": 42150 |
| }, |
| { |
| "epoch": 12.299638568263962, |
| "grad_norm": 0.37901076674461365, |
| "learning_rate": 0.00045267191601049866, |
| "loss": 3.3208, |
| "step": 42200 |
| }, |
| { |
| "epoch": 12.314212428588084, |
| "grad_norm": 0.35146012902259827, |
| "learning_rate": 0.00045249693788276465, |
| "loss": 3.3119, |
| "step": 42250 |
| }, |
| { |
| "epoch": 12.328786288912207, |
| "grad_norm": 0.35691481828689575, |
| "learning_rate": 0.00045232195975503054, |
| "loss": 3.3241, |
| "step": 42300 |
| }, |
| { |
| "epoch": 12.34336014923633, |
| "grad_norm": 0.38521066308021545, |
| "learning_rate": 0.00045214698162729654, |
| "loss": 3.3149, |
| "step": 42350 |
| }, |
| { |
| "epoch": 12.357934009560452, |
| "grad_norm": 0.34150147438049316, |
| "learning_rate": 0.00045197200349956254, |
| "loss": 3.3346, |
| "step": 42400 |
| }, |
| { |
| "epoch": 12.372507869884576, |
| "grad_norm": 0.3610418438911438, |
| "learning_rate": 0.00045179702537182854, |
| "loss": 3.328, |
| "step": 42450 |
| }, |
| { |
| "epoch": 12.387081730208697, |
| "grad_norm": 0.36372873187065125, |
| "learning_rate": 0.0004516220472440944, |
| "loss": 3.3186, |
| "step": 42500 |
| }, |
| { |
| "epoch": 12.40165559053282, |
| "grad_norm": 0.36496156454086304, |
| "learning_rate": 0.0004514470691163604, |
| "loss": 3.3356, |
| "step": 42550 |
| }, |
| { |
| "epoch": 12.416229450856942, |
| "grad_norm": 0.33488455414772034, |
| "learning_rate": 0.0004512720909886264, |
| "loss": 3.3282, |
| "step": 42600 |
| }, |
| { |
| "epoch": 12.430803311181066, |
| "grad_norm": 0.3839046061038971, |
| "learning_rate": 0.0004510971128608923, |
| "loss": 3.3297, |
| "step": 42650 |
| }, |
| { |
| "epoch": 12.445377171505188, |
| "grad_norm": 0.39402374625205994, |
| "learning_rate": 0.0004509221347331583, |
| "loss": 3.3403, |
| "step": 42700 |
| }, |
| { |
| "epoch": 12.459951031829311, |
| "grad_norm": 0.3618732690811157, |
| "learning_rate": 0.0004507471566054243, |
| "loss": 3.3394, |
| "step": 42750 |
| }, |
| { |
| "epoch": 12.474524892153434, |
| "grad_norm": 0.3591196537017822, |
| "learning_rate": 0.00045057217847769025, |
| "loss": 3.3286, |
| "step": 42800 |
| }, |
| { |
| "epoch": 12.489098752477556, |
| "grad_norm": 0.34960296750068665, |
| "learning_rate": 0.0004503972003499562, |
| "loss": 3.3374, |
| "step": 42850 |
| }, |
| { |
| "epoch": 12.50367261280168, |
| "grad_norm": 0.334379643201828, |
| "learning_rate": 0.0004502222222222222, |
| "loss": 3.3285, |
| "step": 42900 |
| }, |
| { |
| "epoch": 12.518246473125801, |
| "grad_norm": 0.35966038703918457, |
| "learning_rate": 0.00045004724409448813, |
| "loss": 3.3383, |
| "step": 42950 |
| }, |
| { |
| "epoch": 12.532820333449925, |
| "grad_norm": 0.36193299293518066, |
| "learning_rate": 0.00044987226596675413, |
| "loss": 3.3378, |
| "step": 43000 |
| }, |
| { |
| "epoch": 12.532820333449925, |
| "eval_accuracy": 0.3695185147304204, |
| "eval_loss": 3.5594754219055176, |
| "eval_runtime": 53.596, |
| "eval_samples_per_second": 310.228, |
| "eval_steps_per_second": 19.404, |
| "step": 43000 |
| }, |
| { |
| "epoch": 12.547394193774046, |
| "grad_norm": 0.37258338928222656, |
| "learning_rate": 0.0004496972878390201, |
| "loss": 3.3357, |
| "step": 43050 |
| }, |
| { |
| "epoch": 12.56196805409817, |
| "grad_norm": 0.39255374670028687, |
| "learning_rate": 0.000449522309711286, |
| "loss": 3.3511, |
| "step": 43100 |
| }, |
| { |
| "epoch": 12.576541914422291, |
| "grad_norm": 0.3664344847202301, |
| "learning_rate": 0.000449347331583552, |
| "loss": 3.3358, |
| "step": 43150 |
| }, |
| { |
| "epoch": 12.591115774746415, |
| "grad_norm": 0.32657238841056824, |
| "learning_rate": 0.000449172353455818, |
| "loss": 3.3427, |
| "step": 43200 |
| }, |
| { |
| "epoch": 12.605689635070538, |
| "grad_norm": 0.3511047065258026, |
| "learning_rate": 0.0004489973753280839, |
| "loss": 3.3391, |
| "step": 43250 |
| }, |
| { |
| "epoch": 12.62026349539466, |
| "grad_norm": 0.4074779748916626, |
| "learning_rate": 0.0004488223972003499, |
| "loss": 3.3435, |
| "step": 43300 |
| }, |
| { |
| "epoch": 12.634837355718783, |
| "grad_norm": 0.3460705876350403, |
| "learning_rate": 0.0004486474190726159, |
| "loss": 3.3386, |
| "step": 43350 |
| }, |
| { |
| "epoch": 12.649411216042905, |
| "grad_norm": 0.34168577194213867, |
| "learning_rate": 0.0004484724409448819, |
| "loss": 3.3456, |
| "step": 43400 |
| }, |
| { |
| "epoch": 12.663985076367029, |
| "grad_norm": 0.3800879716873169, |
| "learning_rate": 0.0004482974628171478, |
| "loss": 3.3537, |
| "step": 43450 |
| }, |
| { |
| "epoch": 12.67855893669115, |
| "grad_norm": 0.3582957684993744, |
| "learning_rate": 0.0004481224846894138, |
| "loss": 3.3452, |
| "step": 43500 |
| }, |
| { |
| "epoch": 12.693132797015274, |
| "grad_norm": 0.3368239998817444, |
| "learning_rate": 0.0004479475065616798, |
| "loss": 3.3561, |
| "step": 43550 |
| }, |
| { |
| "epoch": 12.707706657339395, |
| "grad_norm": 0.3307804763317108, |
| "learning_rate": 0.00044777252843394567, |
| "loss": 3.3406, |
| "step": 43600 |
| }, |
| { |
| "epoch": 12.722280517663519, |
| "grad_norm": 0.35581493377685547, |
| "learning_rate": 0.00044759755030621167, |
| "loss": 3.3539, |
| "step": 43650 |
| }, |
| { |
| "epoch": 12.736854377987642, |
| "grad_norm": 0.37109094858169556, |
| "learning_rate": 0.00044742257217847767, |
| "loss": 3.3611, |
| "step": 43700 |
| }, |
| { |
| "epoch": 12.751428238311764, |
| "grad_norm": 0.35700997710227966, |
| "learning_rate": 0.00044724759405074366, |
| "loss": 3.3657, |
| "step": 43750 |
| }, |
| { |
| "epoch": 12.766002098635887, |
| "grad_norm": 0.3524216115474701, |
| "learning_rate": 0.00044707261592300955, |
| "loss": 3.3507, |
| "step": 43800 |
| }, |
| { |
| "epoch": 12.780575958960009, |
| "grad_norm": 0.36555764079093933, |
| "learning_rate": 0.00044689763779527555, |
| "loss": 3.3498, |
| "step": 43850 |
| }, |
| { |
| "epoch": 12.795149819284132, |
| "grad_norm": 0.39025068283081055, |
| "learning_rate": 0.00044672265966754155, |
| "loss": 3.3462, |
| "step": 43900 |
| }, |
| { |
| "epoch": 12.809723679608254, |
| "grad_norm": 0.32423970103263855, |
| "learning_rate": 0.0004465476815398075, |
| "loss": 3.3643, |
| "step": 43950 |
| }, |
| { |
| "epoch": 12.824297539932378, |
| "grad_norm": 0.3827119469642639, |
| "learning_rate": 0.00044637270341207344, |
| "loss": 3.3613, |
| "step": 44000 |
| }, |
| { |
| "epoch": 12.824297539932378, |
| "eval_accuracy": 0.36983464873404576, |
| "eval_loss": 3.554856538772583, |
| "eval_runtime": 53.5716, |
| "eval_samples_per_second": 310.37, |
| "eval_steps_per_second": 19.413, |
| "step": 44000 |
| }, |
| { |
| "epoch": 12.8388714002565, |
| "grad_norm": 0.37172967195510864, |
| "learning_rate": 0.00044619772528433943, |
| "loss": 3.3679, |
| "step": 44050 |
| }, |
| { |
| "epoch": 12.853445260580623, |
| "grad_norm": 0.33306294679641724, |
| "learning_rate": 0.0004460227471566054, |
| "loss": 3.3589, |
| "step": 44100 |
| }, |
| { |
| "epoch": 12.868019120904744, |
| "grad_norm": 0.3361258804798126, |
| "learning_rate": 0.0004458477690288714, |
| "loss": 3.35, |
| "step": 44150 |
| }, |
| { |
| "epoch": 12.882592981228868, |
| "grad_norm": 0.35810065269470215, |
| "learning_rate": 0.0004456727909011373, |
| "loss": 3.339, |
| "step": 44200 |
| }, |
| { |
| "epoch": 12.897166841552991, |
| "grad_norm": 0.332621693611145, |
| "learning_rate": 0.00044549781277340326, |
| "loss": 3.3578, |
| "step": 44250 |
| }, |
| { |
| "epoch": 12.911740701877113, |
| "grad_norm": 0.3466978073120117, |
| "learning_rate": 0.00044532283464566926, |
| "loss": 3.3699, |
| "step": 44300 |
| }, |
| { |
| "epoch": 12.926314562201236, |
| "grad_norm": 0.36178258061408997, |
| "learning_rate": 0.0004451478565179352, |
| "loss": 3.36, |
| "step": 44350 |
| }, |
| { |
| "epoch": 12.940888422525358, |
| "grad_norm": 0.34199562668800354, |
| "learning_rate": 0.00044497287839020115, |
| "loss": 3.3637, |
| "step": 44400 |
| }, |
| { |
| "epoch": 12.955462282849481, |
| "grad_norm": 0.3354377746582031, |
| "learning_rate": 0.00044479790026246714, |
| "loss": 3.3557, |
| "step": 44450 |
| }, |
| { |
| "epoch": 12.970036143173603, |
| "grad_norm": 0.3417295217514038, |
| "learning_rate": 0.00044462292213473314, |
| "loss": 3.362, |
| "step": 44500 |
| }, |
| { |
| "epoch": 12.984610003497727, |
| "grad_norm": 0.3830733597278595, |
| "learning_rate": 0.00044444794400699903, |
| "loss": 3.346, |
| "step": 44550 |
| }, |
| { |
| "epoch": 12.999183863821848, |
| "grad_norm": 0.3363073468208313, |
| "learning_rate": 0.00044427296587926503, |
| "loss": 3.3518, |
| "step": 44600 |
| }, |
| { |
| "epoch": 13.013699428704676, |
| "grad_norm": 0.32705244421958923, |
| "learning_rate": 0.000444097987751531, |
| "loss": 3.2613, |
| "step": 44650 |
| }, |
| { |
| "epoch": 13.028273289028798, |
| "grad_norm": 0.34872934222221375, |
| "learning_rate": 0.000443923009623797, |
| "loss": 3.2499, |
| "step": 44700 |
| }, |
| { |
| "epoch": 13.042847149352921, |
| "grad_norm": 0.34995242953300476, |
| "learning_rate": 0.0004437480314960629, |
| "loss": 3.2539, |
| "step": 44750 |
| }, |
| { |
| "epoch": 13.057421009677043, |
| "grad_norm": 0.3677576184272766, |
| "learning_rate": 0.0004435730533683289, |
| "loss": 3.2627, |
| "step": 44800 |
| }, |
| { |
| "epoch": 13.071994870001166, |
| "grad_norm": 0.3671504855155945, |
| "learning_rate": 0.0004433980752405949, |
| "loss": 3.2712, |
| "step": 44850 |
| }, |
| { |
| "epoch": 13.086568730325288, |
| "grad_norm": 0.3539280295372009, |
| "learning_rate": 0.0004432230971128609, |
| "loss": 3.2661, |
| "step": 44900 |
| }, |
| { |
| "epoch": 13.101142590649411, |
| "grad_norm": 0.33801642060279846, |
| "learning_rate": 0.0004430481189851268, |
| "loss": 3.2857, |
| "step": 44950 |
| }, |
| { |
| "epoch": 13.115716450973533, |
| "grad_norm": 0.3268982768058777, |
| "learning_rate": 0.0004428731408573928, |
| "loss": 3.2731, |
| "step": 45000 |
| }, |
| { |
| "epoch": 13.115716450973533, |
| "eval_accuracy": 0.3696539839181244, |
| "eval_loss": 3.5643928050994873, |
| "eval_runtime": 53.7367, |
| "eval_samples_per_second": 309.416, |
| "eval_steps_per_second": 19.354, |
| "step": 45000 |
| }, |
| { |
| "epoch": 13.130290311297657, |
| "grad_norm": 0.36461833119392395, |
| "learning_rate": 0.0004426981627296588, |
| "loss": 3.2744, |
| "step": 45050 |
| }, |
| { |
| "epoch": 13.14486417162178, |
| "grad_norm": 0.3448938727378845, |
| "learning_rate": 0.00044252318460192473, |
| "loss": 3.2765, |
| "step": 45100 |
| }, |
| { |
| "epoch": 13.159438031945902, |
| "grad_norm": 0.37296536564826965, |
| "learning_rate": 0.0004423482064741907, |
| "loss": 3.2731, |
| "step": 45150 |
| }, |
| { |
| "epoch": 13.174011892270025, |
| "grad_norm": 0.3660445511341095, |
| "learning_rate": 0.0004421732283464567, |
| "loss": 3.2786, |
| "step": 45200 |
| }, |
| { |
| "epoch": 13.188585752594147, |
| "grad_norm": 0.3396225571632385, |
| "learning_rate": 0.0004419982502187226, |
| "loss": 3.2926, |
| "step": 45250 |
| }, |
| { |
| "epoch": 13.20315961291827, |
| "grad_norm": 0.35958313941955566, |
| "learning_rate": 0.00044182327209098856, |
| "loss": 3.2899, |
| "step": 45300 |
| }, |
| { |
| "epoch": 13.217733473242392, |
| "grad_norm": 0.3425939977169037, |
| "learning_rate": 0.00044164829396325456, |
| "loss": 3.2836, |
| "step": 45350 |
| }, |
| { |
| "epoch": 13.232307333566515, |
| "grad_norm": 0.3608160614967346, |
| "learning_rate": 0.0004414733158355205, |
| "loss": 3.2962, |
| "step": 45400 |
| }, |
| { |
| "epoch": 13.246881193890637, |
| "grad_norm": 0.35369420051574707, |
| "learning_rate": 0.0004412983377077865, |
| "loss": 3.283, |
| "step": 45450 |
| }, |
| { |
| "epoch": 13.26145505421476, |
| "grad_norm": 0.3353852927684784, |
| "learning_rate": 0.00044112335958005244, |
| "loss": 3.3005, |
| "step": 45500 |
| }, |
| { |
| "epoch": 13.276028914538884, |
| "grad_norm": 0.34149834513664246, |
| "learning_rate": 0.0004409483814523184, |
| "loss": 3.2996, |
| "step": 45550 |
| }, |
| { |
| "epoch": 13.290602774863006, |
| "grad_norm": 0.3686482906341553, |
| "learning_rate": 0.0004407734033245844, |
| "loss": 3.3024, |
| "step": 45600 |
| }, |
| { |
| "epoch": 13.305176635187129, |
| "grad_norm": 0.3600018322467804, |
| "learning_rate": 0.0004405984251968504, |
| "loss": 3.2901, |
| "step": 45650 |
| }, |
| { |
| "epoch": 13.31975049551125, |
| "grad_norm": 0.3920084536075592, |
| "learning_rate": 0.0004404234470691163, |
| "loss": 3.2995, |
| "step": 45700 |
| }, |
| { |
| "epoch": 13.334324355835374, |
| "grad_norm": 0.3769385516643524, |
| "learning_rate": 0.00044024846894138227, |
| "loss": 3.3149, |
| "step": 45750 |
| }, |
| { |
| "epoch": 13.348898216159496, |
| "grad_norm": 0.3545381426811218, |
| "learning_rate": 0.00044007349081364827, |
| "loss": 3.3101, |
| "step": 45800 |
| }, |
| { |
| "epoch": 13.36347207648362, |
| "grad_norm": 0.3694087564945221, |
| "learning_rate": 0.00043989851268591427, |
| "loss": 3.2971, |
| "step": 45850 |
| }, |
| { |
| "epoch": 13.378045936807741, |
| "grad_norm": 0.36577796936035156, |
| "learning_rate": 0.00043972353455818016, |
| "loss": 3.3109, |
| "step": 45900 |
| }, |
| { |
| "epoch": 13.392619797131864, |
| "grad_norm": 0.36210718750953674, |
| "learning_rate": 0.00043954855643044615, |
| "loss": 3.309, |
| "step": 45950 |
| }, |
| { |
| "epoch": 13.407193657455988, |
| "grad_norm": 0.4298291802406311, |
| "learning_rate": 0.00043937357830271215, |
| "loss": 3.3164, |
| "step": 46000 |
| }, |
| { |
| "epoch": 13.407193657455988, |
| "eval_accuracy": 0.36993221950433813, |
| "eval_loss": 3.5596821308135986, |
| "eval_runtime": 53.6476, |
| "eval_samples_per_second": 309.93, |
| "eval_steps_per_second": 19.386, |
| "step": 46000 |
| }, |
| { |
| "epoch": 13.42176751778011, |
| "grad_norm": 0.3537709712982178, |
| "learning_rate": 0.0004391986001749781, |
| "loss": 3.3308, |
| "step": 46050 |
| }, |
| { |
| "epoch": 13.436341378104233, |
| "grad_norm": 0.3565349280834198, |
| "learning_rate": 0.00043902362204724404, |
| "loss": 3.3114, |
| "step": 46100 |
| }, |
| { |
| "epoch": 13.450915238428355, |
| "grad_norm": 0.3568958640098572, |
| "learning_rate": 0.00043884864391951004, |
| "loss": 3.321, |
| "step": 46150 |
| }, |
| { |
| "epoch": 13.465489098752478, |
| "grad_norm": 0.36677974462509155, |
| "learning_rate": 0.00043867366579177603, |
| "loss": 3.319, |
| "step": 46200 |
| }, |
| { |
| "epoch": 13.4800629590766, |
| "grad_norm": 0.37393441796302795, |
| "learning_rate": 0.0004384986876640419, |
| "loss": 3.32, |
| "step": 46250 |
| }, |
| { |
| "epoch": 13.494636819400723, |
| "grad_norm": 0.3644926846027374, |
| "learning_rate": 0.0004383237095363079, |
| "loss": 3.3186, |
| "step": 46300 |
| }, |
| { |
| "epoch": 13.509210679724845, |
| "grad_norm": 0.3858359754085541, |
| "learning_rate": 0.0004381487314085739, |
| "loss": 3.3288, |
| "step": 46350 |
| }, |
| { |
| "epoch": 13.523784540048968, |
| "grad_norm": 0.37332263588905334, |
| "learning_rate": 0.00043797375328083986, |
| "loss": 3.3238, |
| "step": 46400 |
| }, |
| { |
| "epoch": 13.538358400373092, |
| "grad_norm": 0.3647373616695404, |
| "learning_rate": 0.0004377987751531058, |
| "loss": 3.3188, |
| "step": 46450 |
| }, |
| { |
| "epoch": 13.552932260697213, |
| "grad_norm": 0.3774361312389374, |
| "learning_rate": 0.0004376237970253718, |
| "loss": 3.325, |
| "step": 46500 |
| }, |
| { |
| "epoch": 13.567506121021337, |
| "grad_norm": 0.34221887588500977, |
| "learning_rate": 0.00043744881889763775, |
| "loss": 3.3216, |
| "step": 46550 |
| }, |
| { |
| "epoch": 13.582079981345458, |
| "grad_norm": 0.3474332094192505, |
| "learning_rate": 0.00043727384076990374, |
| "loss": 3.3238, |
| "step": 46600 |
| }, |
| { |
| "epoch": 13.596653841669582, |
| "grad_norm": 0.36634954810142517, |
| "learning_rate": 0.0004370988626421697, |
| "loss": 3.3217, |
| "step": 46650 |
| }, |
| { |
| "epoch": 13.611227701993704, |
| "grad_norm": 0.3415551781654358, |
| "learning_rate": 0.00043692388451443563, |
| "loss": 3.3246, |
| "step": 46700 |
| }, |
| { |
| "epoch": 13.625801562317827, |
| "grad_norm": 0.33416080474853516, |
| "learning_rate": 0.00043674890638670163, |
| "loss": 3.3401, |
| "step": 46750 |
| }, |
| { |
| "epoch": 13.640375422641949, |
| "grad_norm": 0.3393879532814026, |
| "learning_rate": 0.0004365739282589676, |
| "loss": 3.3347, |
| "step": 46800 |
| }, |
| { |
| "epoch": 13.654949282966072, |
| "grad_norm": 0.3485293686389923, |
| "learning_rate": 0.0004363989501312335, |
| "loss": 3.344, |
| "step": 46850 |
| }, |
| { |
| "epoch": 13.669523143290196, |
| "grad_norm": 0.36705392599105835, |
| "learning_rate": 0.0004362239720034995, |
| "loss": 3.3304, |
| "step": 46900 |
| }, |
| { |
| "epoch": 13.684097003614317, |
| "grad_norm": 0.3389139175415039, |
| "learning_rate": 0.0004360489938757655, |
| "loss": 3.3394, |
| "step": 46950 |
| }, |
| { |
| "epoch": 13.69867086393844, |
| "grad_norm": 0.34914955496788025, |
| "learning_rate": 0.0004358740157480315, |
| "loss": 3.3302, |
| "step": 47000 |
| }, |
| { |
| "epoch": 13.69867086393844, |
| "eval_accuracy": 0.37021775230135784, |
| "eval_loss": 3.5525195598602295, |
| "eval_runtime": 53.674, |
| "eval_samples_per_second": 309.777, |
| "eval_steps_per_second": 19.376, |
| "step": 47000 |
| }, |
| { |
| "epoch": 13.713244724262562, |
| "grad_norm": 0.3674139678478241, |
| "learning_rate": 0.0004356990376202974, |
| "loss": 3.3422, |
| "step": 47050 |
| }, |
| { |
| "epoch": 13.727818584586686, |
| "grad_norm": 0.38315895199775696, |
| "learning_rate": 0.0004355240594925634, |
| "loss": 3.3291, |
| "step": 47100 |
| }, |
| { |
| "epoch": 13.742392444910807, |
| "grad_norm": 0.3434007167816162, |
| "learning_rate": 0.0004353490813648294, |
| "loss": 3.334, |
| "step": 47150 |
| }, |
| { |
| "epoch": 13.756966305234931, |
| "grad_norm": 0.34636348485946655, |
| "learning_rate": 0.0004351741032370953, |
| "loss": 3.3411, |
| "step": 47200 |
| }, |
| { |
| "epoch": 13.771540165559053, |
| "grad_norm": 0.38614946603775024, |
| "learning_rate": 0.0004349991251093613, |
| "loss": 3.342, |
| "step": 47250 |
| }, |
| { |
| "epoch": 13.786114025883176, |
| "grad_norm": 0.36161521077156067, |
| "learning_rate": 0.0004348241469816273, |
| "loss": 3.3345, |
| "step": 47300 |
| }, |
| { |
| "epoch": 13.8006878862073, |
| "grad_norm": 0.3345310389995575, |
| "learning_rate": 0.0004346491688538932, |
| "loss": 3.3347, |
| "step": 47350 |
| }, |
| { |
| "epoch": 13.815261746531421, |
| "grad_norm": 0.3997637629508972, |
| "learning_rate": 0.00043447419072615916, |
| "loss": 3.336, |
| "step": 47400 |
| }, |
| { |
| "epoch": 13.829835606855545, |
| "grad_norm": 0.34914571046829224, |
| "learning_rate": 0.00043429921259842516, |
| "loss": 3.3287, |
| "step": 47450 |
| }, |
| { |
| "epoch": 13.844409467179666, |
| "grad_norm": 0.3687745928764343, |
| "learning_rate": 0.00043412423447069116, |
| "loss": 3.3339, |
| "step": 47500 |
| }, |
| { |
| "epoch": 13.85898332750379, |
| "grad_norm": 0.35051122307777405, |
| "learning_rate": 0.0004339492563429571, |
| "loss": 3.3451, |
| "step": 47550 |
| }, |
| { |
| "epoch": 13.873557187827911, |
| "grad_norm": 0.37655699253082275, |
| "learning_rate": 0.00043377427821522305, |
| "loss": 3.3395, |
| "step": 47600 |
| }, |
| { |
| "epoch": 13.888131048152035, |
| "grad_norm": 0.36076241731643677, |
| "learning_rate": 0.00043359930008748904, |
| "loss": 3.345, |
| "step": 47650 |
| }, |
| { |
| "epoch": 13.902704908476156, |
| "grad_norm": 0.3756115436553955, |
| "learning_rate": 0.000433424321959755, |
| "loss": 3.3359, |
| "step": 47700 |
| }, |
| { |
| "epoch": 13.91727876880028, |
| "grad_norm": 0.32584476470947266, |
| "learning_rate": 0.000433249343832021, |
| "loss": 3.3462, |
| "step": 47750 |
| }, |
| { |
| "epoch": 13.931852629124403, |
| "grad_norm": 0.3678532540798187, |
| "learning_rate": 0.00043307436570428693, |
| "loss": 3.3531, |
| "step": 47800 |
| }, |
| { |
| "epoch": 13.946426489448525, |
| "grad_norm": 0.34742629528045654, |
| "learning_rate": 0.00043289938757655287, |
| "loss": 3.3505, |
| "step": 47850 |
| }, |
| { |
| "epoch": 13.961000349772648, |
| "grad_norm": 0.34186413884162903, |
| "learning_rate": 0.00043272440944881887, |
| "loss": 3.3402, |
| "step": 47900 |
| }, |
| { |
| "epoch": 13.97557421009677, |
| "grad_norm": 0.34656524658203125, |
| "learning_rate": 0.0004325494313210848, |
| "loss": 3.3415, |
| "step": 47950 |
| }, |
| { |
| "epoch": 13.990148070420894, |
| "grad_norm": 0.36282360553741455, |
| "learning_rate": 0.00043237445319335076, |
| "loss": 3.3409, |
| "step": 48000 |
| }, |
| { |
| "epoch": 13.990148070420894, |
| "eval_accuracy": 0.37056342823905236, |
| "eval_loss": 3.5480504035949707, |
| "eval_runtime": 53.5217, |
| "eval_samples_per_second": 310.659, |
| "eval_steps_per_second": 19.431, |
| "step": 48000 |
| }, |
| { |
| "epoch": 14.00466363530372, |
| "grad_norm": 0.38082319498062134, |
| "learning_rate": 0.00043219947506561676, |
| "loss": 3.3161, |
| "step": 48050 |
| }, |
| { |
| "epoch": 14.019237495627841, |
| "grad_norm": 0.348488450050354, |
| "learning_rate": 0.00043202449693788275, |
| "loss": 3.2295, |
| "step": 48100 |
| }, |
| { |
| "epoch": 14.033811355951965, |
| "grad_norm": 0.3672857880592346, |
| "learning_rate": 0.00043184951881014864, |
| "loss": 3.2446, |
| "step": 48150 |
| }, |
| { |
| "epoch": 14.048385216276086, |
| "grad_norm": 0.347853422164917, |
| "learning_rate": 0.00043167454068241464, |
| "loss": 3.2571, |
| "step": 48200 |
| }, |
| { |
| "epoch": 14.06295907660021, |
| "grad_norm": 0.38092395663261414, |
| "learning_rate": 0.00043149956255468064, |
| "loss": 3.2434, |
| "step": 48250 |
| }, |
| { |
| "epoch": 14.077532936924333, |
| "grad_norm": 0.38503310084342957, |
| "learning_rate": 0.00043132458442694664, |
| "loss": 3.2469, |
| "step": 48300 |
| }, |
| { |
| "epoch": 14.092106797248455, |
| "grad_norm": 0.37730053067207336, |
| "learning_rate": 0.0004311496062992125, |
| "loss": 3.2574, |
| "step": 48350 |
| }, |
| { |
| "epoch": 14.106680657572578, |
| "grad_norm": 0.3532187044620514, |
| "learning_rate": 0.0004309746281714785, |
| "loss": 3.2601, |
| "step": 48400 |
| }, |
| { |
| "epoch": 14.1212545178967, |
| "grad_norm": 0.36954471468925476, |
| "learning_rate": 0.0004307996500437445, |
| "loss": 3.2628, |
| "step": 48450 |
| }, |
| { |
| "epoch": 14.135828378220824, |
| "grad_norm": 0.37675559520721436, |
| "learning_rate": 0.00043062467191601046, |
| "loss": 3.27, |
| "step": 48500 |
| }, |
| { |
| "epoch": 14.150402238544945, |
| "grad_norm": 0.35064902901649475, |
| "learning_rate": 0.0004304496937882764, |
| "loss": 3.2608, |
| "step": 48550 |
| }, |
| { |
| "epoch": 14.164976098869069, |
| "grad_norm": 0.3470677435398102, |
| "learning_rate": 0.0004302747156605424, |
| "loss": 3.2687, |
| "step": 48600 |
| }, |
| { |
| "epoch": 14.17954995919319, |
| "grad_norm": 0.35394686460494995, |
| "learning_rate": 0.00043009973753280835, |
| "loss": 3.2741, |
| "step": 48650 |
| }, |
| { |
| "epoch": 14.194123819517314, |
| "grad_norm": 0.3794461488723755, |
| "learning_rate": 0.00042992475940507435, |
| "loss": 3.273, |
| "step": 48700 |
| }, |
| { |
| "epoch": 14.208697679841436, |
| "grad_norm": 0.35849666595458984, |
| "learning_rate": 0.0004297497812773403, |
| "loss": 3.2638, |
| "step": 48750 |
| }, |
| { |
| "epoch": 14.223271540165559, |
| "grad_norm": 0.36475321650505066, |
| "learning_rate": 0.0004295748031496063, |
| "loss": 3.2696, |
| "step": 48800 |
| }, |
| { |
| "epoch": 14.237845400489682, |
| "grad_norm": 0.36442142724990845, |
| "learning_rate": 0.00042939982502187223, |
| "loss": 3.2722, |
| "step": 48850 |
| }, |
| { |
| "epoch": 14.252419260813804, |
| "grad_norm": 0.36305540800094604, |
| "learning_rate": 0.0004292248468941382, |
| "loss": 3.2731, |
| "step": 48900 |
| }, |
| { |
| "epoch": 14.266993121137928, |
| "grad_norm": 0.3696446418762207, |
| "learning_rate": 0.00042904986876640417, |
| "loss": 3.2842, |
| "step": 48950 |
| }, |
| { |
| "epoch": 14.28156698146205, |
| "grad_norm": 0.3651410639286041, |
| "learning_rate": 0.0004288748906386701, |
| "loss": 3.2917, |
| "step": 49000 |
| }, |
| { |
| "epoch": 14.28156698146205, |
| "eval_accuracy": 0.3703023764073171, |
| "eval_loss": 3.5600101947784424, |
| "eval_runtime": 53.5293, |
| "eval_samples_per_second": 310.615, |
| "eval_steps_per_second": 19.429, |
| "step": 49000 |
| }, |
| { |
| "epoch": 14.296140841786173, |
| "grad_norm": 0.3510845899581909, |
| "learning_rate": 0.0004286999125109361, |
| "loss": 3.2906, |
| "step": 49050 |
| }, |
| { |
| "epoch": 14.310714702110294, |
| "grad_norm": 0.3592023253440857, |
| "learning_rate": 0.00042852493438320206, |
| "loss": 3.2814, |
| "step": 49100 |
| }, |
| { |
| "epoch": 14.325288562434418, |
| "grad_norm": 0.3811452090740204, |
| "learning_rate": 0.000428349956255468, |
| "loss": 3.2905, |
| "step": 49150 |
| }, |
| { |
| "epoch": 14.33986242275854, |
| "grad_norm": 0.37574225664138794, |
| "learning_rate": 0.000428174978127734, |
| "loss": 3.2812, |
| "step": 49200 |
| }, |
| { |
| "epoch": 14.354436283082663, |
| "grad_norm": 0.39398401975631714, |
| "learning_rate": 0.000428, |
| "loss": 3.2983, |
| "step": 49250 |
| }, |
| { |
| "epoch": 14.369010143406786, |
| "grad_norm": 0.3542300760746002, |
| "learning_rate": 0.0004278250218722659, |
| "loss": 3.291, |
| "step": 49300 |
| }, |
| { |
| "epoch": 14.383584003730908, |
| "grad_norm": 0.3780006766319275, |
| "learning_rate": 0.0004276500437445319, |
| "loss": 3.2916, |
| "step": 49350 |
| }, |
| { |
| "epoch": 14.398157864055031, |
| "grad_norm": 0.3502989709377289, |
| "learning_rate": 0.0004274750656167979, |
| "loss": 3.2936, |
| "step": 49400 |
| }, |
| { |
| "epoch": 14.412731724379153, |
| "grad_norm": 0.3444402515888214, |
| "learning_rate": 0.0004273000874890639, |
| "loss": 3.3004, |
| "step": 49450 |
| }, |
| { |
| "epoch": 14.427305584703277, |
| "grad_norm": 0.37139010429382324, |
| "learning_rate": 0.00042712510936132977, |
| "loss": 3.2947, |
| "step": 49500 |
| }, |
| { |
| "epoch": 14.441879445027398, |
| "grad_norm": 0.3862885534763336, |
| "learning_rate": 0.00042695013123359576, |
| "loss": 3.2898, |
| "step": 49550 |
| }, |
| { |
| "epoch": 14.456453305351522, |
| "grad_norm": 0.3839344084262848, |
| "learning_rate": 0.00042677515310586176, |
| "loss": 3.3125, |
| "step": 49600 |
| }, |
| { |
| "epoch": 14.471027165675643, |
| "grad_norm": 0.3504730463027954, |
| "learning_rate": 0.0004266001749781277, |
| "loss": 3.3016, |
| "step": 49650 |
| }, |
| { |
| "epoch": 14.485601025999767, |
| "grad_norm": 0.35012415051460266, |
| "learning_rate": 0.00042642519685039365, |
| "loss": 3.2979, |
| "step": 49700 |
| }, |
| { |
| "epoch": 14.50017488632389, |
| "grad_norm": 0.3540355861186981, |
| "learning_rate": 0.00042625021872265965, |
| "loss": 3.3021, |
| "step": 49750 |
| }, |
| { |
| "epoch": 14.514748746648012, |
| "grad_norm": 0.3770597577095032, |
| "learning_rate": 0.0004260752405949256, |
| "loss": 3.3037, |
| "step": 49800 |
| }, |
| { |
| "epoch": 14.529322606972135, |
| "grad_norm": 0.37958604097366333, |
| "learning_rate": 0.00042590026246719153, |
| "loss": 3.3148, |
| "step": 49850 |
| }, |
| { |
| "epoch": 14.543896467296257, |
| "grad_norm": 0.3547636866569519, |
| "learning_rate": 0.00042572528433945753, |
| "loss": 3.3137, |
| "step": 49900 |
| }, |
| { |
| "epoch": 14.55847032762038, |
| "grad_norm": 0.3873496949672699, |
| "learning_rate": 0.0004255503062117235, |
| "loss": 3.3036, |
| "step": 49950 |
| }, |
| { |
| "epoch": 14.573044187944502, |
| "grad_norm": 0.3540250062942505, |
| "learning_rate": 0.00042537532808398947, |
| "loss": 3.3057, |
| "step": 50000 |
| }, |
| { |
| "epoch": 14.573044187944502, |
| "eval_accuracy": 0.37080211765057586, |
| "eval_loss": 3.550915002822876, |
| "eval_runtime": 53.5546, |
| "eval_samples_per_second": 310.468, |
| "eval_steps_per_second": 19.419, |
| "step": 50000 |
| }, |
| { |
| "epoch": 14.587618048268626, |
| "grad_norm": 0.3510439097881317, |
| "learning_rate": 0.0004252003499562554, |
| "loss": 3.305, |
| "step": 50050 |
| }, |
| { |
| "epoch": 14.602191908592747, |
| "grad_norm": 0.3439718782901764, |
| "learning_rate": 0.0004250253718285214, |
| "loss": 3.2957, |
| "step": 50100 |
| }, |
| { |
| "epoch": 14.61676576891687, |
| "grad_norm": 0.38615429401397705, |
| "learning_rate": 0.00042485039370078736, |
| "loss": 3.3199, |
| "step": 50150 |
| }, |
| { |
| "epoch": 14.631339629240994, |
| "grad_norm": 0.37799420952796936, |
| "learning_rate": 0.00042467541557305335, |
| "loss": 3.3278, |
| "step": 50200 |
| }, |
| { |
| "epoch": 14.645913489565116, |
| "grad_norm": 0.3760044276714325, |
| "learning_rate": 0.0004245004374453193, |
| "loss": 3.318, |
| "step": 50250 |
| }, |
| { |
| "epoch": 14.66048734988924, |
| "grad_norm": 0.3453315496444702, |
| "learning_rate": 0.00042432545931758524, |
| "loss": 3.3134, |
| "step": 50300 |
| }, |
| { |
| "epoch": 14.67506121021336, |
| "grad_norm": 0.3765532970428467, |
| "learning_rate": 0.00042415048118985124, |
| "loss": 3.3141, |
| "step": 50350 |
| }, |
| { |
| "epoch": 14.689635070537484, |
| "grad_norm": 0.36476606130599976, |
| "learning_rate": 0.00042397550306211724, |
| "loss": 3.3275, |
| "step": 50400 |
| }, |
| { |
| "epoch": 14.704208930861606, |
| "grad_norm": 0.3756152391433716, |
| "learning_rate": 0.0004238005249343831, |
| "loss": 3.3155, |
| "step": 50450 |
| }, |
| { |
| "epoch": 14.71878279118573, |
| "grad_norm": 0.3862133026123047, |
| "learning_rate": 0.0004236255468066491, |
| "loss": 3.3205, |
| "step": 50500 |
| }, |
| { |
| "epoch": 14.733356651509851, |
| "grad_norm": 0.3885492980480194, |
| "learning_rate": 0.0004234505686789151, |
| "loss": 3.3208, |
| "step": 50550 |
| }, |
| { |
| "epoch": 14.747930511833975, |
| "grad_norm": 0.34860819578170776, |
| "learning_rate": 0.0004232755905511811, |
| "loss": 3.3242, |
| "step": 50600 |
| }, |
| { |
| "epoch": 14.762504372158098, |
| "grad_norm": 0.3799346685409546, |
| "learning_rate": 0.000423100612423447, |
| "loss": 3.3169, |
| "step": 50650 |
| }, |
| { |
| "epoch": 14.77707823248222, |
| "grad_norm": 0.37292400002479553, |
| "learning_rate": 0.000422925634295713, |
| "loss": 3.3275, |
| "step": 50700 |
| }, |
| { |
| "epoch": 14.791652092806343, |
| "grad_norm": 0.3854385018348694, |
| "learning_rate": 0.000422750656167979, |
| "loss": 3.3277, |
| "step": 50750 |
| }, |
| { |
| "epoch": 14.806225953130465, |
| "grad_norm": 0.35672706365585327, |
| "learning_rate": 0.0004225756780402449, |
| "loss": 3.3276, |
| "step": 50800 |
| }, |
| { |
| "epoch": 14.820799813454588, |
| "grad_norm": 0.3661268949508667, |
| "learning_rate": 0.0004224006999125109, |
| "loss": 3.331, |
| "step": 50850 |
| }, |
| { |
| "epoch": 14.83537367377871, |
| "grad_norm": 0.3479280471801758, |
| "learning_rate": 0.0004222257217847769, |
| "loss": 3.3177, |
| "step": 50900 |
| }, |
| { |
| "epoch": 14.849947534102833, |
| "grad_norm": 0.35523080825805664, |
| "learning_rate": 0.00042205074365704283, |
| "loss": 3.3202, |
| "step": 50950 |
| }, |
| { |
| "epoch": 14.864521394426955, |
| "grad_norm": 0.34677961468696594, |
| "learning_rate": 0.0004218757655293088, |
| "loss": 3.3215, |
| "step": 51000 |
| }, |
| { |
| "epoch": 14.864521394426955, |
| "eval_accuracy": 0.3711707444932246, |
| "eval_loss": 3.5454001426696777, |
| "eval_runtime": 53.456, |
| "eval_samples_per_second": 311.041, |
| "eval_steps_per_second": 19.455, |
| "step": 51000 |
| }, |
| { |
| "epoch": 14.879095254751078, |
| "grad_norm": 0.35299310088157654, |
| "learning_rate": 0.0004217007874015748, |
| "loss": 3.3365, |
| "step": 51050 |
| }, |
| { |
| "epoch": 14.893669115075202, |
| "grad_norm": 0.35436713695526123, |
| "learning_rate": 0.0004215258092738407, |
| "loss": 3.3388, |
| "step": 51100 |
| }, |
| { |
| "epoch": 14.908242975399324, |
| "grad_norm": 0.3453245162963867, |
| "learning_rate": 0.0004213508311461067, |
| "loss": 3.3331, |
| "step": 51150 |
| }, |
| { |
| "epoch": 14.922816835723447, |
| "grad_norm": 0.361872136592865, |
| "learning_rate": 0.00042117585301837266, |
| "loss": 3.3407, |
| "step": 51200 |
| }, |
| { |
| "epoch": 14.937390696047569, |
| "grad_norm": 0.3539639711380005, |
| "learning_rate": 0.0004210008748906386, |
| "loss": 3.3296, |
| "step": 51250 |
| }, |
| { |
| "epoch": 14.951964556371692, |
| "grad_norm": 0.36882373690605164, |
| "learning_rate": 0.0004208258967629046, |
| "loss": 3.3236, |
| "step": 51300 |
| }, |
| { |
| "epoch": 14.966538416695814, |
| "grad_norm": 0.37631767988204956, |
| "learning_rate": 0.0004206509186351706, |
| "loss": 3.3542, |
| "step": 51350 |
| }, |
| { |
| "epoch": 14.981112277019937, |
| "grad_norm": 0.3416411280632019, |
| "learning_rate": 0.00042047594050743654, |
| "loss": 3.3288, |
| "step": 51400 |
| }, |
| { |
| "epoch": 14.995686137344059, |
| "grad_norm": 0.3502333164215088, |
| "learning_rate": 0.0004203009623797025, |
| "loss": 3.3263, |
| "step": 51450 |
| }, |
| { |
| "epoch": 15.010201702226885, |
| "grad_norm": 0.3871927261352539, |
| "learning_rate": 0.0004201259842519685, |
| "loss": 3.2464, |
| "step": 51500 |
| }, |
| { |
| "epoch": 15.024775562551008, |
| "grad_norm": 0.3655627965927124, |
| "learning_rate": 0.0004199510061242344, |
| "loss": 3.2098, |
| "step": 51550 |
| }, |
| { |
| "epoch": 15.039349422875132, |
| "grad_norm": 0.3525364398956299, |
| "learning_rate": 0.00041977602799650037, |
| "loss": 3.215, |
| "step": 51600 |
| }, |
| { |
| "epoch": 15.053923283199254, |
| "grad_norm": 0.37742120027542114, |
| "learning_rate": 0.00041960104986876637, |
| "loss": 3.2407, |
| "step": 51650 |
| }, |
| { |
| "epoch": 15.068497143523377, |
| "grad_norm": 0.384676069021225, |
| "learning_rate": 0.00041942607174103236, |
| "loss": 3.2318, |
| "step": 51700 |
| }, |
| { |
| "epoch": 15.083071003847499, |
| "grad_norm": 0.3628941476345062, |
| "learning_rate": 0.00041925109361329825, |
| "loss": 3.24, |
| "step": 51750 |
| }, |
| { |
| "epoch": 15.097644864171622, |
| "grad_norm": 0.3832191824913025, |
| "learning_rate": 0.00041907611548556425, |
| "loss": 3.2426, |
| "step": 51800 |
| }, |
| { |
| "epoch": 15.112218724495744, |
| "grad_norm": 0.3713030517101288, |
| "learning_rate": 0.00041890113735783025, |
| "loss": 3.2535, |
| "step": 51850 |
| }, |
| { |
| "epoch": 15.126792584819867, |
| "grad_norm": 0.36321592330932617, |
| "learning_rate": 0.00041872615923009625, |
| "loss": 3.2493, |
| "step": 51900 |
| }, |
| { |
| "epoch": 15.141366445143989, |
| "grad_norm": 0.3557388484477997, |
| "learning_rate": 0.00041855118110236214, |
| "loss": 3.2499, |
| "step": 51950 |
| }, |
| { |
| "epoch": 15.155940305468112, |
| "grad_norm": 0.35523858666419983, |
| "learning_rate": 0.00041837620297462813, |
| "loss": 3.2559, |
| "step": 52000 |
| }, |
| { |
| "epoch": 15.155940305468112, |
| "eval_accuracy": 0.370396651663052, |
| "eval_loss": 3.562495231628418, |
| "eval_runtime": 53.689, |
| "eval_samples_per_second": 309.691, |
| "eval_steps_per_second": 19.371, |
| "step": 52000 |
| }, |
| { |
| "epoch": 15.170514165792236, |
| "grad_norm": 0.35087063908576965, |
| "learning_rate": 0.00041820122484689413, |
| "loss": 3.2476, |
| "step": 52050 |
| }, |
| { |
| "epoch": 15.185088026116357, |
| "grad_norm": 0.35440754890441895, |
| "learning_rate": 0.0004180262467191601, |
| "loss": 3.2723, |
| "step": 52100 |
| }, |
| { |
| "epoch": 15.19966188644048, |
| "grad_norm": 0.35535645484924316, |
| "learning_rate": 0.000417851268591426, |
| "loss": 3.2599, |
| "step": 52150 |
| }, |
| { |
| "epoch": 15.214235746764603, |
| "grad_norm": 0.3804803490638733, |
| "learning_rate": 0.000417676290463692, |
| "loss": 3.2455, |
| "step": 52200 |
| }, |
| { |
| "epoch": 15.228809607088726, |
| "grad_norm": 0.3786962330341339, |
| "learning_rate": 0.00041750131233595796, |
| "loss": 3.2718, |
| "step": 52250 |
| }, |
| { |
| "epoch": 15.243383467412848, |
| "grad_norm": 0.3691527247428894, |
| "learning_rate": 0.00041732633420822396, |
| "loss": 3.2616, |
| "step": 52300 |
| }, |
| { |
| "epoch": 15.257957327736971, |
| "grad_norm": 0.35802537202835083, |
| "learning_rate": 0.0004171513560804899, |
| "loss": 3.2727, |
| "step": 52350 |
| }, |
| { |
| "epoch": 15.272531188061093, |
| "grad_norm": 0.3633171319961548, |
| "learning_rate": 0.00041697637795275584, |
| "loss": 3.2702, |
| "step": 52400 |
| }, |
| { |
| "epoch": 15.287105048385216, |
| "grad_norm": 0.3410313129425049, |
| "learning_rate": 0.00041680139982502184, |
| "loss": 3.2908, |
| "step": 52450 |
| }, |
| { |
| "epoch": 15.30167890870934, |
| "grad_norm": 0.350529283285141, |
| "learning_rate": 0.0004166264216972878, |
| "loss": 3.2667, |
| "step": 52500 |
| }, |
| { |
| "epoch": 15.316252769033461, |
| "grad_norm": 0.3821249306201935, |
| "learning_rate": 0.00041645144356955373, |
| "loss": 3.2908, |
| "step": 52550 |
| }, |
| { |
| "epoch": 15.330826629357585, |
| "grad_norm": 0.36240100860595703, |
| "learning_rate": 0.0004162764654418197, |
| "loss": 3.2826, |
| "step": 52600 |
| }, |
| { |
| "epoch": 15.345400489681706, |
| "grad_norm": 0.3723773956298828, |
| "learning_rate": 0.0004161014873140857, |
| "loss": 3.2871, |
| "step": 52650 |
| }, |
| { |
| "epoch": 15.35997435000583, |
| "grad_norm": 0.3983280062675476, |
| "learning_rate": 0.00041592650918635167, |
| "loss": 3.278, |
| "step": 52700 |
| }, |
| { |
| "epoch": 15.374548210329952, |
| "grad_norm": 0.3579860031604767, |
| "learning_rate": 0.0004157515310586176, |
| "loss": 3.2737, |
| "step": 52750 |
| }, |
| { |
| "epoch": 15.389122070654075, |
| "grad_norm": 0.3578559160232544, |
| "learning_rate": 0.0004155765529308836, |
| "loss": 3.2872, |
| "step": 52800 |
| }, |
| { |
| "epoch": 15.403695930978197, |
| "grad_norm": 0.3501421809196472, |
| "learning_rate": 0.0004154015748031496, |
| "loss": 3.2645, |
| "step": 52850 |
| }, |
| { |
| "epoch": 15.41826979130232, |
| "grad_norm": 0.34641024470329285, |
| "learning_rate": 0.0004152265966754155, |
| "loss": 3.2805, |
| "step": 52900 |
| }, |
| { |
| "epoch": 15.432843651626444, |
| "grad_norm": 0.34736379981040955, |
| "learning_rate": 0.0004150516185476815, |
| "loss": 3.2979, |
| "step": 52950 |
| }, |
| { |
| "epoch": 15.447417511950565, |
| "grad_norm": 0.37465181946754456, |
| "learning_rate": 0.0004148766404199475, |
| "loss": 3.2948, |
| "step": 53000 |
| }, |
| { |
| "epoch": 15.447417511950565, |
| "eval_accuracy": 0.37067382797672943, |
| "eval_loss": 3.5552163124084473, |
| "eval_runtime": 53.5845, |
| "eval_samples_per_second": 310.295, |
| "eval_steps_per_second": 19.409, |
| "step": 53000 |
| }, |
| { |
| "epoch": 15.461991372274689, |
| "grad_norm": 0.38083088397979736, |
| "learning_rate": 0.0004147016622922135, |
| "loss": 3.2781, |
| "step": 53050 |
| }, |
| { |
| "epoch": 15.47656523259881, |
| "grad_norm": 0.3855237066745758, |
| "learning_rate": 0.0004145266841644794, |
| "loss": 3.2917, |
| "step": 53100 |
| }, |
| { |
| "epoch": 15.491139092922934, |
| "grad_norm": 0.36797034740448, |
| "learning_rate": 0.0004143517060367454, |
| "loss": 3.2858, |
| "step": 53150 |
| }, |
| { |
| "epoch": 15.505712953247055, |
| "grad_norm": 0.4288408160209656, |
| "learning_rate": 0.0004141767279090114, |
| "loss": 3.3005, |
| "step": 53200 |
| }, |
| { |
| "epoch": 15.520286813571179, |
| "grad_norm": 0.35422101616859436, |
| "learning_rate": 0.0004140017497812773, |
| "loss": 3.2917, |
| "step": 53250 |
| }, |
| { |
| "epoch": 15.5348606738953, |
| "grad_norm": 0.3595862090587616, |
| "learning_rate": 0.00041382677165354326, |
| "loss": 3.2984, |
| "step": 53300 |
| }, |
| { |
| "epoch": 15.549434534219424, |
| "grad_norm": 0.3656311631202698, |
| "learning_rate": 0.00041365179352580926, |
| "loss": 3.3029, |
| "step": 53350 |
| }, |
| { |
| "epoch": 15.564008394543547, |
| "grad_norm": 0.39458322525024414, |
| "learning_rate": 0.0004134768153980752, |
| "loss": 3.2866, |
| "step": 53400 |
| }, |
| { |
| "epoch": 15.57858225486767, |
| "grad_norm": 0.3724213242530823, |
| "learning_rate": 0.00041330183727034114, |
| "loss": 3.3019, |
| "step": 53450 |
| }, |
| { |
| "epoch": 15.593156115191793, |
| "grad_norm": 0.3543829321861267, |
| "learning_rate": 0.00041312685914260714, |
| "loss": 3.2956, |
| "step": 53500 |
| }, |
| { |
| "epoch": 15.607729975515914, |
| "grad_norm": 0.39582404494285583, |
| "learning_rate": 0.0004129518810148731, |
| "loss": 3.3011, |
| "step": 53550 |
| }, |
| { |
| "epoch": 15.622303835840038, |
| "grad_norm": 0.36523571610450745, |
| "learning_rate": 0.0004127769028871391, |
| "loss": 3.2913, |
| "step": 53600 |
| }, |
| { |
| "epoch": 15.63687769616416, |
| "grad_norm": 0.3662680983543396, |
| "learning_rate": 0.00041260192475940503, |
| "loss": 3.3023, |
| "step": 53650 |
| }, |
| { |
| "epoch": 15.651451556488283, |
| "grad_norm": 0.34417232871055603, |
| "learning_rate": 0.00041242694663167097, |
| "loss": 3.3055, |
| "step": 53700 |
| }, |
| { |
| "epoch": 15.666025416812404, |
| "grad_norm": 0.3848658502101898, |
| "learning_rate": 0.00041225196850393697, |
| "loss": 3.2883, |
| "step": 53750 |
| }, |
| { |
| "epoch": 15.680599277136528, |
| "grad_norm": 0.371334046125412, |
| "learning_rate": 0.00041207699037620297, |
| "loss": 3.2953, |
| "step": 53800 |
| }, |
| { |
| "epoch": 15.695173137460651, |
| "grad_norm": 0.36774909496307373, |
| "learning_rate": 0.00041190201224846886, |
| "loss": 3.3086, |
| "step": 53850 |
| }, |
| { |
| "epoch": 15.709746997784773, |
| "grad_norm": 0.37255731225013733, |
| "learning_rate": 0.00041172703412073485, |
| "loss": 3.3003, |
| "step": 53900 |
| }, |
| { |
| "epoch": 15.724320858108896, |
| "grad_norm": 0.3708425760269165, |
| "learning_rate": 0.00041155205599300085, |
| "loss": 3.3102, |
| "step": 53950 |
| }, |
| { |
| "epoch": 15.738894718433018, |
| "grad_norm": 0.34874001145362854, |
| "learning_rate": 0.00041137707786526685, |
| "loss": 3.3043, |
| "step": 54000 |
| }, |
| { |
| "epoch": 15.738894718433018, |
| "eval_accuracy": 0.37145333486653226, |
| "eval_loss": 3.5496866703033447, |
| "eval_runtime": 53.5172, |
| "eval_samples_per_second": 310.685, |
| "eval_steps_per_second": 19.433, |
| "step": 54000 |
| }, |
| { |
| "epoch": 15.753468578757142, |
| "grad_norm": 0.3423483073711395, |
| "learning_rate": 0.00041120209973753274, |
| "loss": 3.3136, |
| "step": 54050 |
| }, |
| { |
| "epoch": 15.768042439081263, |
| "grad_norm": 0.33736762404441833, |
| "learning_rate": 0.00041102712160979874, |
| "loss": 3.288, |
| "step": 54100 |
| }, |
| { |
| "epoch": 15.782616299405387, |
| "grad_norm": 0.3785526752471924, |
| "learning_rate": 0.00041085214348206473, |
| "loss": 3.3105, |
| "step": 54150 |
| }, |
| { |
| "epoch": 15.797190159729508, |
| "grad_norm": 0.3413451910018921, |
| "learning_rate": 0.0004106771653543306, |
| "loss": 3.3183, |
| "step": 54200 |
| }, |
| { |
| "epoch": 15.811764020053632, |
| "grad_norm": 0.3814006447792053, |
| "learning_rate": 0.0004105021872265966, |
| "loss": 3.3037, |
| "step": 54250 |
| }, |
| { |
| "epoch": 15.826337880377755, |
| "grad_norm": 0.4063456654548645, |
| "learning_rate": 0.0004103272090988626, |
| "loss": 3.3086, |
| "step": 54300 |
| }, |
| { |
| "epoch": 15.840911740701877, |
| "grad_norm": 0.3685641884803772, |
| "learning_rate": 0.0004101522309711286, |
| "loss": 3.3159, |
| "step": 54350 |
| }, |
| { |
| "epoch": 15.855485601026, |
| "grad_norm": 0.33264556527137756, |
| "learning_rate": 0.0004099772528433945, |
| "loss": 3.3197, |
| "step": 54400 |
| }, |
| { |
| "epoch": 15.870059461350122, |
| "grad_norm": 0.38500693440437317, |
| "learning_rate": 0.0004098022747156605, |
| "loss": 3.3082, |
| "step": 54450 |
| }, |
| { |
| "epoch": 15.884633321674245, |
| "grad_norm": 0.36302870512008667, |
| "learning_rate": 0.0004096272965879265, |
| "loss": 3.3206, |
| "step": 54500 |
| }, |
| { |
| "epoch": 15.899207181998367, |
| "grad_norm": 0.3871668577194214, |
| "learning_rate": 0.00040945231846019244, |
| "loss": 3.3035, |
| "step": 54550 |
| }, |
| { |
| "epoch": 15.91378104232249, |
| "grad_norm": 0.3546806573867798, |
| "learning_rate": 0.0004092773403324584, |
| "loss": 3.3207, |
| "step": 54600 |
| }, |
| { |
| "epoch": 15.928354902646612, |
| "grad_norm": 0.3954088091850281, |
| "learning_rate": 0.0004091023622047244, |
| "loss": 3.306, |
| "step": 54650 |
| }, |
| { |
| "epoch": 15.942928762970736, |
| "grad_norm": 0.35316231846809387, |
| "learning_rate": 0.00040892738407699033, |
| "loss": 3.3167, |
| "step": 54700 |
| }, |
| { |
| "epoch": 15.95750262329486, |
| "grad_norm": 0.3522266745567322, |
| "learning_rate": 0.0004087524059492563, |
| "loss": 3.3195, |
| "step": 54750 |
| }, |
| { |
| "epoch": 15.97207648361898, |
| "grad_norm": 0.35674938559532166, |
| "learning_rate": 0.00040857742782152227, |
| "loss": 3.311, |
| "step": 54800 |
| }, |
| { |
| "epoch": 15.986650343943104, |
| "grad_norm": 0.3551098704338074, |
| "learning_rate": 0.0004084024496937882, |
| "loss": 3.3246, |
| "step": 54850 |
| }, |
| { |
| "epoch": 16.00116590882593, |
| "grad_norm": 0.36500123143196106, |
| "learning_rate": 0.0004082274715660542, |
| "loss": 3.3121, |
| "step": 54900 |
| }, |
| { |
| "epoch": 16.015739769150052, |
| "grad_norm": 0.3751806318759918, |
| "learning_rate": 0.0004080524934383202, |
| "loss": 3.1939, |
| "step": 54950 |
| }, |
| { |
| "epoch": 16.030313629474175, |
| "grad_norm": 0.34337669610977173, |
| "learning_rate": 0.0004078775153105861, |
| "loss": 3.2092, |
| "step": 55000 |
| }, |
| { |
| "epoch": 16.030313629474175, |
| "eval_accuracy": 0.37094135314063126, |
| "eval_loss": 3.557356357574463, |
| "eval_runtime": 53.5988, |
| "eval_samples_per_second": 310.212, |
| "eval_steps_per_second": 19.403, |
| "step": 55000 |
| }, |
| { |
| "epoch": 16.0448874897983, |
| "grad_norm": 0.3447566330432892, |
| "learning_rate": 0.0004077025371828521, |
| "loss": 3.2146, |
| "step": 55050 |
| }, |
| { |
| "epoch": 16.05946135012242, |
| "grad_norm": 0.36830922961235046, |
| "learning_rate": 0.0004075275590551181, |
| "loss": 3.2183, |
| "step": 55100 |
| }, |
| { |
| "epoch": 16.074035210446542, |
| "grad_norm": 0.4203527271747589, |
| "learning_rate": 0.000407352580927384, |
| "loss": 3.2203, |
| "step": 55150 |
| }, |
| { |
| "epoch": 16.088609070770666, |
| "grad_norm": 0.3611992299556732, |
| "learning_rate": 0.00040717760279965, |
| "loss": 3.2373, |
| "step": 55200 |
| }, |
| { |
| "epoch": 16.10318293109479, |
| "grad_norm": 0.3881664276123047, |
| "learning_rate": 0.000407002624671916, |
| "loss": 3.2264, |
| "step": 55250 |
| }, |
| { |
| "epoch": 16.117756791418913, |
| "grad_norm": 0.3770337700843811, |
| "learning_rate": 0.000406827646544182, |
| "loss": 3.2366, |
| "step": 55300 |
| }, |
| { |
| "epoch": 16.132330651743032, |
| "grad_norm": 0.3896472454071045, |
| "learning_rate": 0.00040665266841644786, |
| "loss": 3.2259, |
| "step": 55350 |
| }, |
| { |
| "epoch": 16.146904512067156, |
| "grad_norm": 0.3588141202926636, |
| "learning_rate": 0.00040647769028871386, |
| "loss": 3.2449, |
| "step": 55400 |
| }, |
| { |
| "epoch": 16.16147837239128, |
| "grad_norm": 0.41494470834732056, |
| "learning_rate": 0.00040630271216097986, |
| "loss": 3.2374, |
| "step": 55450 |
| }, |
| { |
| "epoch": 16.176052232715403, |
| "grad_norm": 0.3604167401790619, |
| "learning_rate": 0.00040612773403324586, |
| "loss": 3.2435, |
| "step": 55500 |
| }, |
| { |
| "epoch": 16.190626093039523, |
| "grad_norm": 0.38306134939193726, |
| "learning_rate": 0.00040595275590551175, |
| "loss": 3.2395, |
| "step": 55550 |
| }, |
| { |
| "epoch": 16.205199953363646, |
| "grad_norm": 0.3704599142074585, |
| "learning_rate": 0.00040577777777777774, |
| "loss": 3.2382, |
| "step": 55600 |
| }, |
| { |
| "epoch": 16.21977381368777, |
| "grad_norm": 0.36852410435676575, |
| "learning_rate": 0.00040560279965004374, |
| "loss": 3.2607, |
| "step": 55650 |
| }, |
| { |
| "epoch": 16.234347674011893, |
| "grad_norm": 0.37379172444343567, |
| "learning_rate": 0.0004054278215223097, |
| "loss": 3.2476, |
| "step": 55700 |
| }, |
| { |
| "epoch": 16.248921534336016, |
| "grad_norm": 0.3873160481452942, |
| "learning_rate": 0.00040525284339457563, |
| "loss": 3.24, |
| "step": 55750 |
| }, |
| { |
| "epoch": 16.263495394660136, |
| "grad_norm": 0.35882189869880676, |
| "learning_rate": 0.0004050778652668416, |
| "loss": 3.2495, |
| "step": 55800 |
| }, |
| { |
| "epoch": 16.27806925498426, |
| "grad_norm": 0.36286455392837524, |
| "learning_rate": 0.00040490288713910757, |
| "loss": 3.2529, |
| "step": 55850 |
| }, |
| { |
| "epoch": 16.292643115308383, |
| "grad_norm": 0.4211234450340271, |
| "learning_rate": 0.00040472790901137357, |
| "loss": 3.2578, |
| "step": 55900 |
| }, |
| { |
| "epoch": 16.307216975632507, |
| "grad_norm": 0.3631596863269806, |
| "learning_rate": 0.0004045529308836395, |
| "loss": 3.269, |
| "step": 55950 |
| }, |
| { |
| "epoch": 16.321790835956627, |
| "grad_norm": 0.4015319049358368, |
| "learning_rate": 0.00040437795275590546, |
| "loss": 3.2667, |
| "step": 56000 |
| }, |
| { |
| "epoch": 16.321790835956627, |
| "eval_accuracy": 0.3711181339572527, |
| "eval_loss": 3.55796480178833, |
| "eval_runtime": 53.7308, |
| "eval_samples_per_second": 309.45, |
| "eval_steps_per_second": 19.356, |
| "step": 56000 |
| }, |
| { |
| "epoch": 16.33636469628075, |
| "grad_norm": 0.35500746965408325, |
| "learning_rate": 0.00040420297462817145, |
| "loss": 3.2623, |
| "step": 56050 |
| }, |
| { |
| "epoch": 16.350938556604873, |
| "grad_norm": 0.35954955220222473, |
| "learning_rate": 0.0004040279965004374, |
| "loss": 3.2715, |
| "step": 56100 |
| }, |
| { |
| "epoch": 16.365512416928997, |
| "grad_norm": 0.3758191466331482, |
| "learning_rate": 0.00040385301837270334, |
| "loss": 3.2709, |
| "step": 56150 |
| }, |
| { |
| "epoch": 16.38008627725312, |
| "grad_norm": 0.3986791670322418, |
| "learning_rate": 0.00040367804024496934, |
| "loss": 3.2694, |
| "step": 56200 |
| }, |
| { |
| "epoch": 16.39466013757724, |
| "grad_norm": 0.35960960388183594, |
| "learning_rate": 0.00040350306211723534, |
| "loss": 3.263, |
| "step": 56250 |
| }, |
| { |
| "epoch": 16.409233997901364, |
| "grad_norm": 0.3987464904785156, |
| "learning_rate": 0.0004033280839895012, |
| "loss": 3.2595, |
| "step": 56300 |
| }, |
| { |
| "epoch": 16.423807858225487, |
| "grad_norm": 0.3835877478122711, |
| "learning_rate": 0.0004031531058617672, |
| "loss": 3.2664, |
| "step": 56350 |
| }, |
| { |
| "epoch": 16.43838171854961, |
| "grad_norm": 0.35285845398902893, |
| "learning_rate": 0.0004029781277340332, |
| "loss": 3.279, |
| "step": 56400 |
| }, |
| { |
| "epoch": 16.45295557887373, |
| "grad_norm": 0.3625013828277588, |
| "learning_rate": 0.0004028031496062992, |
| "loss": 3.2624, |
| "step": 56450 |
| }, |
| { |
| "epoch": 16.467529439197854, |
| "grad_norm": 0.3883364200592041, |
| "learning_rate": 0.0004026281714785651, |
| "loss": 3.275, |
| "step": 56500 |
| }, |
| { |
| "epoch": 16.482103299521977, |
| "grad_norm": 0.3524991571903229, |
| "learning_rate": 0.0004024531933508311, |
| "loss": 3.2767, |
| "step": 56550 |
| }, |
| { |
| "epoch": 16.4966771598461, |
| "grad_norm": 0.4123859703540802, |
| "learning_rate": 0.0004022782152230971, |
| "loss": 3.2858, |
| "step": 56600 |
| }, |
| { |
| "epoch": 16.511251020170224, |
| "grad_norm": 0.36290034651756287, |
| "learning_rate": 0.0004021032370953631, |
| "loss": 3.2855, |
| "step": 56650 |
| }, |
| { |
| "epoch": 16.525824880494344, |
| "grad_norm": 0.3715699017047882, |
| "learning_rate": 0.000401928258967629, |
| "loss": 3.2806, |
| "step": 56700 |
| }, |
| { |
| "epoch": 16.540398740818468, |
| "grad_norm": 0.3986356258392334, |
| "learning_rate": 0.000401753280839895, |
| "loss": 3.2767, |
| "step": 56750 |
| }, |
| { |
| "epoch": 16.55497260114259, |
| "grad_norm": 0.41219544410705566, |
| "learning_rate": 0.000401578302712161, |
| "loss": 3.2875, |
| "step": 56800 |
| }, |
| { |
| "epoch": 16.569546461466715, |
| "grad_norm": 0.3760019540786743, |
| "learning_rate": 0.00040140332458442693, |
| "loss": 3.2798, |
| "step": 56850 |
| }, |
| { |
| "epoch": 16.584120321790834, |
| "grad_norm": 0.3611760139465332, |
| "learning_rate": 0.00040122834645669287, |
| "loss": 3.2811, |
| "step": 56900 |
| }, |
| { |
| "epoch": 16.598694182114958, |
| "grad_norm": 0.3871175944805145, |
| "learning_rate": 0.00040105336832895887, |
| "loss": 3.2864, |
| "step": 56950 |
| }, |
| { |
| "epoch": 16.61326804243908, |
| "grad_norm": 0.36425960063934326, |
| "learning_rate": 0.0004008783902012248, |
| "loss": 3.2924, |
| "step": 57000 |
| }, |
| { |
| "epoch": 16.61326804243908, |
| "eval_accuracy": 0.371368475366676, |
| "eval_loss": 3.549255847930908, |
| "eval_runtime": 53.7345, |
| "eval_samples_per_second": 309.429, |
| "eval_steps_per_second": 19.354, |
| "step": 57000 |
| }, |
| { |
| "epoch": 16.627841902763205, |
| "grad_norm": 0.3846484422683716, |
| "learning_rate": 0.00040070341207349076, |
| "loss": 3.2988, |
| "step": 57050 |
| }, |
| { |
| "epoch": 16.642415763087328, |
| "grad_norm": 0.39037346839904785, |
| "learning_rate": 0.00040052843394575675, |
| "loss": 3.2803, |
| "step": 57100 |
| }, |
| { |
| "epoch": 16.656989623411448, |
| "grad_norm": 0.3782590925693512, |
| "learning_rate": 0.0004003534558180227, |
| "loss": 3.3004, |
| "step": 57150 |
| }, |
| { |
| "epoch": 16.67156348373557, |
| "grad_norm": 0.3686920404434204, |
| "learning_rate": 0.0004001784776902887, |
| "loss": 3.2803, |
| "step": 57200 |
| }, |
| { |
| "epoch": 16.686137344059695, |
| "grad_norm": 0.3739873468875885, |
| "learning_rate": 0.00040000349956255464, |
| "loss": 3.2875, |
| "step": 57250 |
| }, |
| { |
| "epoch": 16.70071120438382, |
| "grad_norm": 0.3582131564617157, |
| "learning_rate": 0.0003998285214348206, |
| "loss": 3.2938, |
| "step": 57300 |
| }, |
| { |
| "epoch": 16.71528506470794, |
| "grad_norm": 0.3974229693412781, |
| "learning_rate": 0.0003996535433070866, |
| "loss": 3.3116, |
| "step": 57350 |
| }, |
| { |
| "epoch": 16.72985892503206, |
| "grad_norm": 0.3692742884159088, |
| "learning_rate": 0.0003994785651793526, |
| "loss": 3.3037, |
| "step": 57400 |
| }, |
| { |
| "epoch": 16.744432785356185, |
| "grad_norm": 0.37706458568573, |
| "learning_rate": 0.00039930358705161847, |
| "loss": 3.2889, |
| "step": 57450 |
| }, |
| { |
| "epoch": 16.75900664568031, |
| "grad_norm": 0.3647792935371399, |
| "learning_rate": 0.00039912860892388446, |
| "loss": 3.3026, |
| "step": 57500 |
| }, |
| { |
| "epoch": 16.773580506004432, |
| "grad_norm": 0.35807985067367554, |
| "learning_rate": 0.00039895363079615046, |
| "loss": 3.2887, |
| "step": 57550 |
| }, |
| { |
| "epoch": 16.788154366328552, |
| "grad_norm": 0.37481561303138733, |
| "learning_rate": 0.00039877865266841646, |
| "loss": 3.2955, |
| "step": 57600 |
| }, |
| { |
| "epoch": 16.802728226652675, |
| "grad_norm": 0.3432452380657196, |
| "learning_rate": 0.00039860367454068235, |
| "loss": 3.3023, |
| "step": 57650 |
| }, |
| { |
| "epoch": 16.8173020869768, |
| "grad_norm": 0.3837871253490448, |
| "learning_rate": 0.00039842869641294835, |
| "loss": 3.2921, |
| "step": 57700 |
| }, |
| { |
| "epoch": 16.831875947300922, |
| "grad_norm": 0.37672537565231323, |
| "learning_rate": 0.00039825371828521434, |
| "loss": 3.2974, |
| "step": 57750 |
| }, |
| { |
| "epoch": 16.846449807625042, |
| "grad_norm": 0.34477701783180237, |
| "learning_rate": 0.00039807874015748023, |
| "loss": 3.3047, |
| "step": 57800 |
| }, |
| { |
| "epoch": 16.861023667949166, |
| "grad_norm": 0.36180946230888367, |
| "learning_rate": 0.00039790376202974623, |
| "loss": 3.2889, |
| "step": 57850 |
| }, |
| { |
| "epoch": 16.87559752827329, |
| "grad_norm": 0.3875158429145813, |
| "learning_rate": 0.00039772878390201223, |
| "loss": 3.304, |
| "step": 57900 |
| }, |
| { |
| "epoch": 16.890171388597413, |
| "grad_norm": 0.3722599148750305, |
| "learning_rate": 0.0003975538057742782, |
| "loss": 3.3056, |
| "step": 57950 |
| }, |
| { |
| "epoch": 16.904745248921536, |
| "grad_norm": 0.34985819458961487, |
| "learning_rate": 0.0003973788276465441, |
| "loss": 3.2971, |
| "step": 58000 |
| }, |
| { |
| "epoch": 16.904745248921536, |
| "eval_accuracy": 0.3717624070532486, |
| "eval_loss": 3.5427567958831787, |
| "eval_runtime": 53.5238, |
| "eval_samples_per_second": 310.647, |
| "eval_steps_per_second": 19.431, |
| "step": 58000 |
| }, |
| { |
| "epoch": 16.919319109245656, |
| "grad_norm": 0.3880864381790161, |
| "learning_rate": 0.0003972038495188101, |
| "loss": 3.3045, |
| "step": 58050 |
| }, |
| { |
| "epoch": 16.93389296956978, |
| "grad_norm": 0.35843950510025024, |
| "learning_rate": 0.0003970288713910761, |
| "loss": 3.3097, |
| "step": 58100 |
| }, |
| { |
| "epoch": 16.948466829893903, |
| "grad_norm": 0.364650160074234, |
| "learning_rate": 0.00039685389326334205, |
| "loss": 3.2966, |
| "step": 58150 |
| }, |
| { |
| "epoch": 16.963040690218026, |
| "grad_norm": 0.37133607268333435, |
| "learning_rate": 0.000396678915135608, |
| "loss": 3.3059, |
| "step": 58200 |
| }, |
| { |
| "epoch": 16.977614550542146, |
| "grad_norm": 0.39477208256721497, |
| "learning_rate": 0.000396503937007874, |
| "loss": 3.3186, |
| "step": 58250 |
| }, |
| { |
| "epoch": 16.99218841086627, |
| "grad_norm": 0.36673828959465027, |
| "learning_rate": 0.00039632895888013994, |
| "loss": 3.3095, |
| "step": 58300 |
| }, |
| { |
| "epoch": 17.006703975749097, |
| "grad_norm": 0.3800864517688751, |
| "learning_rate": 0.00039615398075240594, |
| "loss": 3.2534, |
| "step": 58350 |
| }, |
| { |
| "epoch": 17.021277836073217, |
| "grad_norm": 0.385430246591568, |
| "learning_rate": 0.0003959790026246719, |
| "loss": 3.1981, |
| "step": 58400 |
| }, |
| { |
| "epoch": 17.03585169639734, |
| "grad_norm": 0.3510759472846985, |
| "learning_rate": 0.0003958040244969378, |
| "loss": 3.198, |
| "step": 58450 |
| }, |
| { |
| "epoch": 17.050425556721464, |
| "grad_norm": 0.39121222496032715, |
| "learning_rate": 0.0003956290463692038, |
| "loss": 3.1921, |
| "step": 58500 |
| }, |
| { |
| "epoch": 17.064999417045588, |
| "grad_norm": 0.4235476851463318, |
| "learning_rate": 0.0003954540682414698, |
| "loss": 3.2039, |
| "step": 58550 |
| }, |
| { |
| "epoch": 17.07957327736971, |
| "grad_norm": 0.3736433982849121, |
| "learning_rate": 0.0003952790901137357, |
| "loss": 3.2208, |
| "step": 58600 |
| }, |
| { |
| "epoch": 17.09414713769383, |
| "grad_norm": 0.3668088912963867, |
| "learning_rate": 0.0003951041119860017, |
| "loss": 3.2194, |
| "step": 58650 |
| }, |
| { |
| "epoch": 17.108720998017954, |
| "grad_norm": 0.3642267882823944, |
| "learning_rate": 0.0003949291338582677, |
| "loss": 3.2241, |
| "step": 58700 |
| }, |
| { |
| "epoch": 17.123294858342078, |
| "grad_norm": 0.36421388387680054, |
| "learning_rate": 0.0003947541557305336, |
| "loss": 3.224, |
| "step": 58750 |
| }, |
| { |
| "epoch": 17.1378687186662, |
| "grad_norm": 0.3846866190433502, |
| "learning_rate": 0.0003945791776027996, |
| "loss": 3.2258, |
| "step": 58800 |
| }, |
| { |
| "epoch": 17.15244257899032, |
| "grad_norm": 0.36491894721984863, |
| "learning_rate": 0.0003944041994750656, |
| "loss": 3.2122, |
| "step": 58850 |
| }, |
| { |
| "epoch": 17.167016439314445, |
| "grad_norm": 0.35470864176750183, |
| "learning_rate": 0.0003942292213473316, |
| "loss": 3.2239, |
| "step": 58900 |
| }, |
| { |
| "epoch": 17.181590299638568, |
| "grad_norm": 0.3830486238002777, |
| "learning_rate": 0.0003940542432195975, |
| "loss": 3.2317, |
| "step": 58950 |
| }, |
| { |
| "epoch": 17.19616415996269, |
| "grad_norm": 0.377537339925766, |
| "learning_rate": 0.0003938792650918635, |
| "loss": 3.2266, |
| "step": 59000 |
| }, |
| { |
| "epoch": 17.19616415996269, |
| "eval_accuracy": 0.3712499545395537, |
| "eval_loss": 3.5594046115875244, |
| "eval_runtime": 53.4132, |
| "eval_samples_per_second": 311.29, |
| "eval_steps_per_second": 19.471, |
| "step": 59000 |
| }, |
| { |
| "epoch": 17.210738020286815, |
| "grad_norm": 0.37255439162254333, |
| "learning_rate": 0.00039370428696412947, |
| "loss": 3.244, |
| "step": 59050 |
| }, |
| { |
| "epoch": 17.225311880610935, |
| "grad_norm": 0.36858004331588745, |
| "learning_rate": 0.0003935293088363954, |
| "loss": 3.2388, |
| "step": 59100 |
| }, |
| { |
| "epoch": 17.23988574093506, |
| "grad_norm": 0.39461615681648254, |
| "learning_rate": 0.00039335433070866136, |
| "loss": 3.2456, |
| "step": 59150 |
| }, |
| { |
| "epoch": 17.25445960125918, |
| "grad_norm": 0.41613295674324036, |
| "learning_rate": 0.00039317935258092736, |
| "loss": 3.2548, |
| "step": 59200 |
| }, |
| { |
| "epoch": 17.269033461583305, |
| "grad_norm": 0.3626161813735962, |
| "learning_rate": 0.00039300437445319335, |
| "loss": 3.2519, |
| "step": 59250 |
| }, |
| { |
| "epoch": 17.283607321907425, |
| "grad_norm": 0.3921727240085602, |
| "learning_rate": 0.0003928293963254593, |
| "loss": 3.2468, |
| "step": 59300 |
| }, |
| { |
| "epoch": 17.29818118223155, |
| "grad_norm": 0.374748170375824, |
| "learning_rate": 0.00039265441819772524, |
| "loss": 3.2391, |
| "step": 59350 |
| }, |
| { |
| "epoch": 17.312755042555672, |
| "grad_norm": 0.36504217982292175, |
| "learning_rate": 0.00039247944006999124, |
| "loss": 3.2501, |
| "step": 59400 |
| }, |
| { |
| "epoch": 17.327328902879795, |
| "grad_norm": 0.40052106976509094, |
| "learning_rate": 0.0003923044619422572, |
| "loss": 3.2519, |
| "step": 59450 |
| }, |
| { |
| "epoch": 17.34190276320392, |
| "grad_norm": 0.3901118040084839, |
| "learning_rate": 0.0003921294838145232, |
| "loss": 3.2538, |
| "step": 59500 |
| }, |
| { |
| "epoch": 17.35647662352804, |
| "grad_norm": 0.3533080816268921, |
| "learning_rate": 0.0003919545056867891, |
| "loss": 3.2583, |
| "step": 59550 |
| }, |
| { |
| "epoch": 17.371050483852162, |
| "grad_norm": 0.38468673825263977, |
| "learning_rate": 0.00039177952755905507, |
| "loss": 3.2559, |
| "step": 59600 |
| }, |
| { |
| "epoch": 17.385624344176286, |
| "grad_norm": 0.4263041615486145, |
| "learning_rate": 0.00039160454943132106, |
| "loss": 3.259, |
| "step": 59650 |
| }, |
| { |
| "epoch": 17.40019820450041, |
| "grad_norm": 0.37212032079696655, |
| "learning_rate": 0.000391429571303587, |
| "loss": 3.2627, |
| "step": 59700 |
| }, |
| { |
| "epoch": 17.41477206482453, |
| "grad_norm": 0.3558288812637329, |
| "learning_rate": 0.00039125459317585295, |
| "loss": 3.263, |
| "step": 59750 |
| }, |
| { |
| "epoch": 17.429345925148652, |
| "grad_norm": 0.37633559107780457, |
| "learning_rate": 0.00039107961504811895, |
| "loss": 3.2614, |
| "step": 59800 |
| }, |
| { |
| "epoch": 17.443919785472776, |
| "grad_norm": 0.3678731322288513, |
| "learning_rate": 0.00039090463692038495, |
| "loss": 3.255, |
| "step": 59850 |
| }, |
| { |
| "epoch": 17.4584936457969, |
| "grad_norm": 0.3690950572490692, |
| "learning_rate": 0.00039072965879265084, |
| "loss": 3.2527, |
| "step": 59900 |
| }, |
| { |
| "epoch": 17.473067506121023, |
| "grad_norm": 0.3583453893661499, |
| "learning_rate": 0.00039055468066491683, |
| "loss": 3.2621, |
| "step": 59950 |
| }, |
| { |
| "epoch": 17.487641366445143, |
| "grad_norm": 0.34633949398994446, |
| "learning_rate": 0.00039037970253718283, |
| "loss": 3.2635, |
| "step": 60000 |
| }, |
| { |
| "epoch": 17.487641366445143, |
| "eval_accuracy": 0.37159916138570265, |
| "eval_loss": 3.5501837730407715, |
| "eval_runtime": 53.5046, |
| "eval_samples_per_second": 310.758, |
| "eval_steps_per_second": 19.438, |
| "step": 60000 |
| }, |
| { |
| "epoch": 17.502215226769266, |
| "grad_norm": 0.389166921377182, |
| "learning_rate": 0.00039020472440944883, |
| "loss": 3.271, |
| "step": 60050 |
| }, |
| { |
| "epoch": 17.51678908709339, |
| "grad_norm": 0.3584425747394562, |
| "learning_rate": 0.0003900297462817147, |
| "loss": 3.2651, |
| "step": 60100 |
| }, |
| { |
| "epoch": 17.531362947417513, |
| "grad_norm": 0.3982011675834656, |
| "learning_rate": 0.0003898547681539807, |
| "loss": 3.2623, |
| "step": 60150 |
| }, |
| { |
| "epoch": 17.545936807741633, |
| "grad_norm": 0.3841356635093689, |
| "learning_rate": 0.0003896797900262467, |
| "loss": 3.2721, |
| "step": 60200 |
| }, |
| { |
| "epoch": 17.560510668065756, |
| "grad_norm": 0.36676225066185, |
| "learning_rate": 0.00038950481189851266, |
| "loss": 3.2803, |
| "step": 60250 |
| }, |
| { |
| "epoch": 17.57508452838988, |
| "grad_norm": 0.3859211504459381, |
| "learning_rate": 0.0003893298337707786, |
| "loss": 3.2567, |
| "step": 60300 |
| }, |
| { |
| "epoch": 17.589658388714003, |
| "grad_norm": 0.40538913011550903, |
| "learning_rate": 0.0003891548556430446, |
| "loss": 3.2761, |
| "step": 60350 |
| }, |
| { |
| "epoch": 17.604232249038127, |
| "grad_norm": 0.4039849638938904, |
| "learning_rate": 0.00038897987751531054, |
| "loss": 3.2757, |
| "step": 60400 |
| }, |
| { |
| "epoch": 17.618806109362247, |
| "grad_norm": 0.3618032932281494, |
| "learning_rate": 0.00038880489938757654, |
| "loss": 3.2756, |
| "step": 60450 |
| }, |
| { |
| "epoch": 17.63337996968637, |
| "grad_norm": 0.3855133056640625, |
| "learning_rate": 0.0003886299212598425, |
| "loss": 3.282, |
| "step": 60500 |
| }, |
| { |
| "epoch": 17.647953830010493, |
| "grad_norm": 0.3841269016265869, |
| "learning_rate": 0.0003884549431321085, |
| "loss": 3.272, |
| "step": 60550 |
| }, |
| { |
| "epoch": 17.662527690334617, |
| "grad_norm": 0.35645824670791626, |
| "learning_rate": 0.0003882799650043744, |
| "loss": 3.2771, |
| "step": 60600 |
| }, |
| { |
| "epoch": 17.677101550658737, |
| "grad_norm": 0.4001765251159668, |
| "learning_rate": 0.00038810498687664037, |
| "loss": 3.2765, |
| "step": 60650 |
| }, |
| { |
| "epoch": 17.69167541098286, |
| "grad_norm": 0.39104482531547546, |
| "learning_rate": 0.00038793000874890637, |
| "loss": 3.2801, |
| "step": 60700 |
| }, |
| { |
| "epoch": 17.706249271306984, |
| "grad_norm": 0.3950614631175995, |
| "learning_rate": 0.0003877550306211723, |
| "loss": 3.2821, |
| "step": 60750 |
| }, |
| { |
| "epoch": 17.720823131631107, |
| "grad_norm": 0.3843282461166382, |
| "learning_rate": 0.0003875800524934383, |
| "loss": 3.2814, |
| "step": 60800 |
| }, |
| { |
| "epoch": 17.73539699195523, |
| "grad_norm": 0.3883490264415741, |
| "learning_rate": 0.00038740507436570425, |
| "loss": 3.2764, |
| "step": 60850 |
| }, |
| { |
| "epoch": 17.74997085227935, |
| "grad_norm": 0.39616382122039795, |
| "learning_rate": 0.0003872300962379702, |
| "loss": 3.2751, |
| "step": 60900 |
| }, |
| { |
| "epoch": 17.764544712603474, |
| "grad_norm": 0.3767209053039551, |
| "learning_rate": 0.0003870551181102362, |
| "loss": 3.2792, |
| "step": 60950 |
| }, |
| { |
| "epoch": 17.779118572927597, |
| "grad_norm": 0.3787990212440491, |
| "learning_rate": 0.0003868801399825022, |
| "loss": 3.2861, |
| "step": 61000 |
| }, |
| { |
| "epoch": 17.779118572927597, |
| "eval_accuracy": 0.37203487548898667, |
| "eval_loss": 3.5421674251556396, |
| "eval_runtime": 53.5131, |
| "eval_samples_per_second": 310.709, |
| "eval_steps_per_second": 19.434, |
| "step": 61000 |
| }, |
| { |
| "epoch": 17.79369243325172, |
| "grad_norm": 0.3651289641857147, |
| "learning_rate": 0.0003867051618547681, |
| "loss": 3.2864, |
| "step": 61050 |
| }, |
| { |
| "epoch": 17.80826629357584, |
| "grad_norm": 0.4002183675765991, |
| "learning_rate": 0.0003865301837270341, |
| "loss": 3.2822, |
| "step": 61100 |
| }, |
| { |
| "epoch": 17.822840153899964, |
| "grad_norm": 0.3977372348308563, |
| "learning_rate": 0.0003863552055993001, |
| "loss": 3.3017, |
| "step": 61150 |
| }, |
| { |
| "epoch": 17.837414014224088, |
| "grad_norm": 0.3658791780471802, |
| "learning_rate": 0.00038618022747156607, |
| "loss": 3.3025, |
| "step": 61200 |
| }, |
| { |
| "epoch": 17.85198787454821, |
| "grad_norm": 0.36881786584854126, |
| "learning_rate": 0.00038600524934383196, |
| "loss": 3.2896, |
| "step": 61250 |
| }, |
| { |
| "epoch": 17.866561734872334, |
| "grad_norm": 0.3770887553691864, |
| "learning_rate": 0.00038583027121609796, |
| "loss": 3.2931, |
| "step": 61300 |
| }, |
| { |
| "epoch": 17.881135595196454, |
| "grad_norm": 0.3751731216907501, |
| "learning_rate": 0.00038565529308836396, |
| "loss": 3.2836, |
| "step": 61350 |
| }, |
| { |
| "epoch": 17.895709455520578, |
| "grad_norm": 0.4210517108440399, |
| "learning_rate": 0.00038548031496062984, |
| "loss": 3.2902, |
| "step": 61400 |
| }, |
| { |
| "epoch": 17.9102833158447, |
| "grad_norm": 0.37777844071388245, |
| "learning_rate": 0.00038530533683289584, |
| "loss": 3.2881, |
| "step": 61450 |
| }, |
| { |
| "epoch": 17.924857176168825, |
| "grad_norm": 0.37144628167152405, |
| "learning_rate": 0.00038513035870516184, |
| "loss": 3.2839, |
| "step": 61500 |
| }, |
| { |
| "epoch": 17.939431036492945, |
| "grad_norm": 0.368533730506897, |
| "learning_rate": 0.0003849553805774278, |
| "loss": 3.2908, |
| "step": 61550 |
| }, |
| { |
| "epoch": 17.954004896817068, |
| "grad_norm": 0.3613893687725067, |
| "learning_rate": 0.00038478040244969373, |
| "loss": 3.2812, |
| "step": 61600 |
| }, |
| { |
| "epoch": 17.96857875714119, |
| "grad_norm": 0.3668134808540344, |
| "learning_rate": 0.0003846054243219597, |
| "loss": 3.2867, |
| "step": 61650 |
| }, |
| { |
| "epoch": 17.983152617465315, |
| "grad_norm": 0.3757164478302002, |
| "learning_rate": 0.00038443044619422567, |
| "loss": 3.2948, |
| "step": 61700 |
| }, |
| { |
| "epoch": 17.99772647778944, |
| "grad_norm": 0.3673889636993408, |
| "learning_rate": 0.00038425546806649167, |
| "loss": 3.2968, |
| "step": 61750 |
| }, |
| { |
| "epoch": 18.012242042672263, |
| "grad_norm": 0.3745373487472534, |
| "learning_rate": 0.0003840804899387576, |
| "loss": 3.2101, |
| "step": 61800 |
| }, |
| { |
| "epoch": 18.026815902996386, |
| "grad_norm": 0.38842591643333435, |
| "learning_rate": 0.0003839055118110236, |
| "loss": 3.1894, |
| "step": 61850 |
| }, |
| { |
| "epoch": 18.04138976332051, |
| "grad_norm": 0.3884901702404022, |
| "learning_rate": 0.00038373053368328955, |
| "loss": 3.1821, |
| "step": 61900 |
| }, |
| { |
| "epoch": 18.05596362364463, |
| "grad_norm": 0.42632555961608887, |
| "learning_rate": 0.00038355555555555555, |
| "loss": 3.1679, |
| "step": 61950 |
| }, |
| { |
| "epoch": 18.070537483968753, |
| "grad_norm": 0.37444430589675903, |
| "learning_rate": 0.0003833805774278215, |
| "loss": 3.2064, |
| "step": 62000 |
| }, |
| { |
| "epoch": 18.070537483968753, |
| "eval_accuracy": 0.37141108166202685, |
| "eval_loss": 3.5579850673675537, |
| "eval_runtime": 53.5806, |
| "eval_samples_per_second": 310.318, |
| "eval_steps_per_second": 19.41, |
| "step": 62000 |
| }, |
| { |
| "epoch": 18.085111344292876, |
| "grad_norm": 0.38504013419151306, |
| "learning_rate": 0.00038320559930008744, |
| "loss": 3.2129, |
| "step": 62050 |
| }, |
| { |
| "epoch": 18.099685204617, |
| "grad_norm": 0.3946874439716339, |
| "learning_rate": 0.00038303062117235343, |
| "loss": 3.2077, |
| "step": 62100 |
| }, |
| { |
| "epoch": 18.114259064941123, |
| "grad_norm": 0.379470556974411, |
| "learning_rate": 0.00038285564304461943, |
| "loss": 3.1884, |
| "step": 62150 |
| }, |
| { |
| "epoch": 18.128832925265243, |
| "grad_norm": 0.3938375413417816, |
| "learning_rate": 0.0003826806649168853, |
| "loss": 3.2087, |
| "step": 62200 |
| }, |
| { |
| "epoch": 18.143406785589367, |
| "grad_norm": 0.3999002277851105, |
| "learning_rate": 0.0003825056867891513, |
| "loss": 3.2064, |
| "step": 62250 |
| }, |
| { |
| "epoch": 18.15798064591349, |
| "grad_norm": 0.40626150369644165, |
| "learning_rate": 0.0003823307086614173, |
| "loss": 3.2229, |
| "step": 62300 |
| }, |
| { |
| "epoch": 18.172554506237613, |
| "grad_norm": 0.39746424555778503, |
| "learning_rate": 0.0003821557305336832, |
| "loss": 3.2306, |
| "step": 62350 |
| }, |
| { |
| "epoch": 18.187128366561733, |
| "grad_norm": 0.3926345705986023, |
| "learning_rate": 0.0003819807524059492, |
| "loss": 3.2226, |
| "step": 62400 |
| }, |
| { |
| "epoch": 18.201702226885857, |
| "grad_norm": 0.43490874767303467, |
| "learning_rate": 0.0003818057742782152, |
| "loss": 3.2191, |
| "step": 62450 |
| }, |
| { |
| "epoch": 18.21627608720998, |
| "grad_norm": 0.3765084147453308, |
| "learning_rate": 0.0003816307961504812, |
| "loss": 3.2148, |
| "step": 62500 |
| }, |
| { |
| "epoch": 18.230849947534104, |
| "grad_norm": 0.3878106474876404, |
| "learning_rate": 0.0003814558180227471, |
| "loss": 3.2379, |
| "step": 62550 |
| }, |
| { |
| "epoch": 18.245423807858227, |
| "grad_norm": 0.38751399517059326, |
| "learning_rate": 0.0003812808398950131, |
| "loss": 3.2223, |
| "step": 62600 |
| }, |
| { |
| "epoch": 18.259997668182347, |
| "grad_norm": 0.40568989515304565, |
| "learning_rate": 0.0003811058617672791, |
| "loss": 3.2267, |
| "step": 62650 |
| }, |
| { |
| "epoch": 18.27457152850647, |
| "grad_norm": 0.3807388246059418, |
| "learning_rate": 0.000380930883639545, |
| "loss": 3.2305, |
| "step": 62700 |
| }, |
| { |
| "epoch": 18.289145388830594, |
| "grad_norm": 0.40289512276649475, |
| "learning_rate": 0.00038075590551181097, |
| "loss": 3.2388, |
| "step": 62750 |
| }, |
| { |
| "epoch": 18.303719249154717, |
| "grad_norm": 0.35825660824775696, |
| "learning_rate": 0.00038058092738407697, |
| "loss": 3.2285, |
| "step": 62800 |
| }, |
| { |
| "epoch": 18.318293109478837, |
| "grad_norm": 0.40298762917518616, |
| "learning_rate": 0.0003804059492563429, |
| "loss": 3.2376, |
| "step": 62850 |
| }, |
| { |
| "epoch": 18.33286696980296, |
| "grad_norm": 0.40649402141571045, |
| "learning_rate": 0.0003802309711286089, |
| "loss": 3.251, |
| "step": 62900 |
| }, |
| { |
| "epoch": 18.347440830127084, |
| "grad_norm": 0.3785764276981354, |
| "learning_rate": 0.00038005599300087485, |
| "loss": 3.2371, |
| "step": 62950 |
| }, |
| { |
| "epoch": 18.362014690451208, |
| "grad_norm": 0.37714409828186035, |
| "learning_rate": 0.0003798810148731408, |
| "loss": 3.2506, |
| "step": 63000 |
| }, |
| { |
| "epoch": 18.362014690451208, |
| "eval_accuracy": 0.3719151776923795, |
| "eval_loss": 3.552590847015381, |
| "eval_runtime": 53.6543, |
| "eval_samples_per_second": 309.891, |
| "eval_steps_per_second": 19.383, |
| "step": 63000 |
| }, |
| { |
| "epoch": 18.37658855077533, |
| "grad_norm": 0.3751346468925476, |
| "learning_rate": 0.0003797060367454068, |
| "loss": 3.2456, |
| "step": 63050 |
| }, |
| { |
| "epoch": 18.39116241109945, |
| "grad_norm": 0.3642127215862274, |
| "learning_rate": 0.0003795310586176728, |
| "loss": 3.2399, |
| "step": 63100 |
| }, |
| { |
| "epoch": 18.405736271423574, |
| "grad_norm": 0.38021567463874817, |
| "learning_rate": 0.00037935608048993873, |
| "loss": 3.2476, |
| "step": 63150 |
| }, |
| { |
| "epoch": 18.420310131747698, |
| "grad_norm": 0.373936265707016, |
| "learning_rate": 0.0003791811023622047, |
| "loss": 3.2411, |
| "step": 63200 |
| }, |
| { |
| "epoch": 18.43488399207182, |
| "grad_norm": 0.3796992003917694, |
| "learning_rate": 0.0003790061242344707, |
| "loss": 3.2564, |
| "step": 63250 |
| }, |
| { |
| "epoch": 18.44945785239594, |
| "grad_norm": 0.3802540600299835, |
| "learning_rate": 0.0003788311461067366, |
| "loss": 3.247, |
| "step": 63300 |
| }, |
| { |
| "epoch": 18.464031712720065, |
| "grad_norm": 0.35876718163490295, |
| "learning_rate": 0.00037865616797900256, |
| "loss": 3.2514, |
| "step": 63350 |
| }, |
| { |
| "epoch": 18.478605573044188, |
| "grad_norm": 0.36608242988586426, |
| "learning_rate": 0.00037848118985126856, |
| "loss": 3.2482, |
| "step": 63400 |
| }, |
| { |
| "epoch": 18.49317943336831, |
| "grad_norm": 0.39771997928619385, |
| "learning_rate": 0.00037830621172353456, |
| "loss": 3.2448, |
| "step": 63450 |
| }, |
| { |
| "epoch": 18.507753293692435, |
| "grad_norm": 0.38852259516716003, |
| "learning_rate": 0.00037813123359580045, |
| "loss": 3.263, |
| "step": 63500 |
| }, |
| { |
| "epoch": 18.522327154016555, |
| "grad_norm": 0.3723866641521454, |
| "learning_rate": 0.00037795625546806644, |
| "loss": 3.2552, |
| "step": 63550 |
| }, |
| { |
| "epoch": 18.53690101434068, |
| "grad_norm": 0.4147019684314728, |
| "learning_rate": 0.00037778127734033244, |
| "loss": 3.2597, |
| "step": 63600 |
| }, |
| { |
| "epoch": 18.5514748746648, |
| "grad_norm": 0.3557119369506836, |
| "learning_rate": 0.00037760629921259844, |
| "loss": 3.2586, |
| "step": 63650 |
| }, |
| { |
| "epoch": 18.566048734988925, |
| "grad_norm": 0.37144237756729126, |
| "learning_rate": 0.00037743132108486433, |
| "loss": 3.2592, |
| "step": 63700 |
| }, |
| { |
| "epoch": 18.580622595313045, |
| "grad_norm": 0.38424184918403625, |
| "learning_rate": 0.0003772563429571303, |
| "loss": 3.2744, |
| "step": 63750 |
| }, |
| { |
| "epoch": 18.59519645563717, |
| "grad_norm": 0.3793662488460541, |
| "learning_rate": 0.0003770813648293963, |
| "loss": 3.2656, |
| "step": 63800 |
| }, |
| { |
| "epoch": 18.609770315961292, |
| "grad_norm": 0.3815428912639618, |
| "learning_rate": 0.00037690638670166227, |
| "loss": 3.2648, |
| "step": 63850 |
| }, |
| { |
| "epoch": 18.624344176285415, |
| "grad_norm": 0.38336291909217834, |
| "learning_rate": 0.0003767314085739282, |
| "loss": 3.2698, |
| "step": 63900 |
| }, |
| { |
| "epoch": 18.63891803660954, |
| "grad_norm": 0.3922796845436096, |
| "learning_rate": 0.0003765564304461942, |
| "loss": 3.2667, |
| "step": 63950 |
| }, |
| { |
| "epoch": 18.65349189693366, |
| "grad_norm": 0.35125067830085754, |
| "learning_rate": 0.00037638145231846015, |
| "loss": 3.2589, |
| "step": 64000 |
| }, |
| { |
| "epoch": 18.65349189693366, |
| "eval_accuracy": 0.37210537596112797, |
| "eval_loss": 3.547069549560547, |
| "eval_runtime": 53.5682, |
| "eval_samples_per_second": 310.389, |
| "eval_steps_per_second": 19.414, |
| "step": 64000 |
| }, |
| { |
| "epoch": 18.668065757257782, |
| "grad_norm": 0.3918921947479248, |
| "learning_rate": 0.00037620647419072615, |
| "loss": 3.2622, |
| "step": 64050 |
| }, |
| { |
| "epoch": 18.682639617581906, |
| "grad_norm": 0.3910241723060608, |
| "learning_rate": 0.0003760314960629921, |
| "loss": 3.2627, |
| "step": 64100 |
| }, |
| { |
| "epoch": 18.69721347790603, |
| "grad_norm": 0.3765471279621124, |
| "learning_rate": 0.00037585651793525804, |
| "loss": 3.2641, |
| "step": 64150 |
| }, |
| { |
| "epoch": 18.71178733823015, |
| "grad_norm": 0.36692434549331665, |
| "learning_rate": 0.00037568153980752404, |
| "loss": 3.2687, |
| "step": 64200 |
| }, |
| { |
| "epoch": 18.726361198554272, |
| "grad_norm": 0.3759457767009735, |
| "learning_rate": 0.00037550656167979, |
| "loss": 3.2682, |
| "step": 64250 |
| }, |
| { |
| "epoch": 18.740935058878396, |
| "grad_norm": 0.395193874835968, |
| "learning_rate": 0.0003753315835520559, |
| "loss": 3.2548, |
| "step": 64300 |
| }, |
| { |
| "epoch": 18.75550891920252, |
| "grad_norm": 0.39433982968330383, |
| "learning_rate": 0.0003751566054243219, |
| "loss": 3.2614, |
| "step": 64350 |
| }, |
| { |
| "epoch": 18.770082779526643, |
| "grad_norm": 0.419454425573349, |
| "learning_rate": 0.0003749816272965879, |
| "loss": 3.2692, |
| "step": 64400 |
| }, |
| { |
| "epoch": 18.784656639850763, |
| "grad_norm": 0.3734700381755829, |
| "learning_rate": 0.00037480664916885386, |
| "loss": 3.2781, |
| "step": 64450 |
| }, |
| { |
| "epoch": 18.799230500174886, |
| "grad_norm": 0.37187695503234863, |
| "learning_rate": 0.0003746316710411198, |
| "loss": 3.2772, |
| "step": 64500 |
| }, |
| { |
| "epoch": 18.81380436049901, |
| "grad_norm": 0.3946555256843567, |
| "learning_rate": 0.0003744566929133858, |
| "loss": 3.2786, |
| "step": 64550 |
| }, |
| { |
| "epoch": 18.828378220823133, |
| "grad_norm": 0.3924318850040436, |
| "learning_rate": 0.0003742817147856518, |
| "loss": 3.2777, |
| "step": 64600 |
| }, |
| { |
| "epoch": 18.842952081147253, |
| "grad_norm": 0.39290758967399597, |
| "learning_rate": 0.0003741067366579177, |
| "loss": 3.2715, |
| "step": 64650 |
| }, |
| { |
| "epoch": 18.857525941471376, |
| "grad_norm": 0.3880336582660675, |
| "learning_rate": 0.0003739317585301837, |
| "loss": 3.2821, |
| "step": 64700 |
| }, |
| { |
| "epoch": 18.8720998017955, |
| "grad_norm": 0.36619827151298523, |
| "learning_rate": 0.0003737567804024497, |
| "loss": 3.2867, |
| "step": 64750 |
| }, |
| { |
| "epoch": 18.886673662119623, |
| "grad_norm": 0.42051753401756287, |
| "learning_rate": 0.0003735818022747157, |
| "loss": 3.2712, |
| "step": 64800 |
| }, |
| { |
| "epoch": 18.901247522443747, |
| "grad_norm": 0.4102894067764282, |
| "learning_rate": 0.00037340682414698157, |
| "loss": 3.2831, |
| "step": 64850 |
| }, |
| { |
| "epoch": 18.915821382767867, |
| "grad_norm": 0.3856189250946045, |
| "learning_rate": 0.00037323184601924757, |
| "loss": 3.2751, |
| "step": 64900 |
| }, |
| { |
| "epoch": 18.93039524309199, |
| "grad_norm": 0.389129638671875, |
| "learning_rate": 0.00037305686789151357, |
| "loss": 3.3009, |
| "step": 64950 |
| }, |
| { |
| "epoch": 18.944969103416113, |
| "grad_norm": 0.3810073435306549, |
| "learning_rate": 0.00037288188976377946, |
| "loss": 3.284, |
| "step": 65000 |
| }, |
| { |
| "epoch": 18.944969103416113, |
| "eval_accuracy": 0.3724745912885191, |
| "eval_loss": 3.5394740104675293, |
| "eval_runtime": 53.5525, |
| "eval_samples_per_second": 310.48, |
| "eval_steps_per_second": 19.42, |
| "step": 65000 |
| }, |
| { |
| "epoch": 18.959542963740237, |
| "grad_norm": 0.3946700990200043, |
| "learning_rate": 0.00037270691163604545, |
| "loss": 3.2832, |
| "step": 65050 |
| }, |
| { |
| "epoch": 18.974116824064357, |
| "grad_norm": 0.3840828537940979, |
| "learning_rate": 0.00037253193350831145, |
| "loss": 3.2812, |
| "step": 65100 |
| }, |
| { |
| "epoch": 18.98869068438848, |
| "grad_norm": 0.39696988463401794, |
| "learning_rate": 0.0003723569553805774, |
| "loss": 3.2861, |
| "step": 65150 |
| }, |
| { |
| "epoch": 19.003206249271308, |
| "grad_norm": 0.37383022904396057, |
| "learning_rate": 0.00037218197725284334, |
| "loss": 3.2601, |
| "step": 65200 |
| }, |
| { |
| "epoch": 19.017780109595428, |
| "grad_norm": 0.37770166993141174, |
| "learning_rate": 0.00037200699912510934, |
| "loss": 3.1687, |
| "step": 65250 |
| }, |
| { |
| "epoch": 19.03235396991955, |
| "grad_norm": 0.40059325098991394, |
| "learning_rate": 0.0003718320209973753, |
| "loss": 3.1812, |
| "step": 65300 |
| }, |
| { |
| "epoch": 19.046927830243675, |
| "grad_norm": 0.380967378616333, |
| "learning_rate": 0.0003716570428696413, |
| "loss": 3.1904, |
| "step": 65350 |
| }, |
| { |
| "epoch": 19.0615016905678, |
| "grad_norm": 0.39268967509269714, |
| "learning_rate": 0.0003714820647419072, |
| "loss": 3.1754, |
| "step": 65400 |
| }, |
| { |
| "epoch": 19.07607555089192, |
| "grad_norm": 0.36186572909355164, |
| "learning_rate": 0.00037130708661417316, |
| "loss": 3.1804, |
| "step": 65450 |
| }, |
| { |
| "epoch": 19.09064941121604, |
| "grad_norm": 0.3893151879310608, |
| "learning_rate": 0.00037113210848643916, |
| "loss": 3.1862, |
| "step": 65500 |
| }, |
| { |
| "epoch": 19.105223271540165, |
| "grad_norm": 0.37462306022644043, |
| "learning_rate": 0.00037095713035870516, |
| "loss": 3.1854, |
| "step": 65550 |
| }, |
| { |
| "epoch": 19.11979713186429, |
| "grad_norm": 0.37590491771698, |
| "learning_rate": 0.00037078215223097105, |
| "loss": 3.2095, |
| "step": 65600 |
| }, |
| { |
| "epoch": 19.134370992188412, |
| "grad_norm": 0.420132040977478, |
| "learning_rate": 0.00037060717410323705, |
| "loss": 3.1936, |
| "step": 65650 |
| }, |
| { |
| "epoch": 19.148944852512532, |
| "grad_norm": 0.39511385560035706, |
| "learning_rate": 0.00037043219597550304, |
| "loss": 3.1972, |
| "step": 65700 |
| }, |
| { |
| "epoch": 19.163518712836655, |
| "grad_norm": 0.40953826904296875, |
| "learning_rate": 0.00037025721784776904, |
| "loss": 3.208, |
| "step": 65750 |
| }, |
| { |
| "epoch": 19.17809257316078, |
| "grad_norm": 0.39270836114883423, |
| "learning_rate": 0.00037008223972003493, |
| "loss": 3.2024, |
| "step": 65800 |
| }, |
| { |
| "epoch": 19.192666433484902, |
| "grad_norm": 0.41319575905799866, |
| "learning_rate": 0.00036990726159230093, |
| "loss": 3.2066, |
| "step": 65850 |
| }, |
| { |
| "epoch": 19.207240293809026, |
| "grad_norm": 0.3758715093135834, |
| "learning_rate": 0.0003697322834645669, |
| "loss": 3.2169, |
| "step": 65900 |
| }, |
| { |
| "epoch": 19.221814154133146, |
| "grad_norm": 0.3666883409023285, |
| "learning_rate": 0.0003695573053368328, |
| "loss": 3.211, |
| "step": 65950 |
| }, |
| { |
| "epoch": 19.23638801445727, |
| "grad_norm": 0.4095924198627472, |
| "learning_rate": 0.0003693823272090988, |
| "loss": 3.2215, |
| "step": 66000 |
| }, |
| { |
| "epoch": 19.23638801445727, |
| "eval_accuracy": 0.3715518472124125, |
| "eval_loss": 3.55948543548584, |
| "eval_runtime": 53.8059, |
| "eval_samples_per_second": 309.018, |
| "eval_steps_per_second": 19.329, |
| "step": 66000 |
| }, |
| { |
| "epoch": 19.250961874781392, |
| "grad_norm": 0.38275042176246643, |
| "learning_rate": 0.0003692073490813648, |
| "loss": 3.2215, |
| "step": 66050 |
| }, |
| { |
| "epoch": 19.265535735105516, |
| "grad_norm": 0.37089359760284424, |
| "learning_rate": 0.0003690323709536308, |
| "loss": 3.2157, |
| "step": 66100 |
| }, |
| { |
| "epoch": 19.280109595429636, |
| "grad_norm": 0.3921582102775574, |
| "learning_rate": 0.0003688573928258967, |
| "loss": 3.2173, |
| "step": 66150 |
| }, |
| { |
| "epoch": 19.29468345575376, |
| "grad_norm": 0.41277655959129333, |
| "learning_rate": 0.0003686824146981627, |
| "loss": 3.2327, |
| "step": 66200 |
| }, |
| { |
| "epoch": 19.309257316077883, |
| "grad_norm": 0.4155130684375763, |
| "learning_rate": 0.0003685074365704287, |
| "loss": 3.227, |
| "step": 66250 |
| }, |
| { |
| "epoch": 19.323831176402006, |
| "grad_norm": 0.3777115046977997, |
| "learning_rate": 0.00036833245844269464, |
| "loss": 3.2389, |
| "step": 66300 |
| }, |
| { |
| "epoch": 19.33840503672613, |
| "grad_norm": 0.39838212728500366, |
| "learning_rate": 0.0003681574803149606, |
| "loss": 3.2367, |
| "step": 66350 |
| }, |
| { |
| "epoch": 19.35297889705025, |
| "grad_norm": 0.3535468578338623, |
| "learning_rate": 0.0003679825021872266, |
| "loss": 3.2382, |
| "step": 66400 |
| }, |
| { |
| "epoch": 19.367552757374373, |
| "grad_norm": 0.380537211894989, |
| "learning_rate": 0.0003678075240594925, |
| "loss": 3.2296, |
| "step": 66450 |
| }, |
| { |
| "epoch": 19.382126617698496, |
| "grad_norm": 0.3990768790245056, |
| "learning_rate": 0.0003676325459317585, |
| "loss": 3.2338, |
| "step": 66500 |
| }, |
| { |
| "epoch": 19.39670047802262, |
| "grad_norm": 0.37194758653640747, |
| "learning_rate": 0.00036745756780402446, |
| "loss": 3.2499, |
| "step": 66550 |
| }, |
| { |
| "epoch": 19.41127433834674, |
| "grad_norm": 0.3932860195636749, |
| "learning_rate": 0.0003672825896762904, |
| "loss": 3.2347, |
| "step": 66600 |
| }, |
| { |
| "epoch": 19.425848198670863, |
| "grad_norm": 0.37496042251586914, |
| "learning_rate": 0.0003671076115485564, |
| "loss": 3.235, |
| "step": 66650 |
| }, |
| { |
| "epoch": 19.440422058994987, |
| "grad_norm": 0.39806580543518066, |
| "learning_rate": 0.0003669326334208224, |
| "loss": 3.2352, |
| "step": 66700 |
| }, |
| { |
| "epoch": 19.45499591931911, |
| "grad_norm": 0.37920618057250977, |
| "learning_rate": 0.0003667576552930883, |
| "loss": 3.2293, |
| "step": 66750 |
| }, |
| { |
| "epoch": 19.469569779643233, |
| "grad_norm": 0.4015212655067444, |
| "learning_rate": 0.0003665826771653543, |
| "loss": 3.2369, |
| "step": 66800 |
| }, |
| { |
| "epoch": 19.484143639967353, |
| "grad_norm": 0.3791128098964691, |
| "learning_rate": 0.0003664076990376203, |
| "loss": 3.2499, |
| "step": 66850 |
| }, |
| { |
| "epoch": 19.498717500291477, |
| "grad_norm": 0.3863450288772583, |
| "learning_rate": 0.0003662327209098862, |
| "loss": 3.2384, |
| "step": 66900 |
| }, |
| { |
| "epoch": 19.5132913606156, |
| "grad_norm": 0.42094987630844116, |
| "learning_rate": 0.0003660577427821522, |
| "loss": 3.2569, |
| "step": 66950 |
| }, |
| { |
| "epoch": 19.527865220939724, |
| "grad_norm": 0.3729070723056793, |
| "learning_rate": 0.00036588276465441817, |
| "loss": 3.2425, |
| "step": 67000 |
| }, |
| { |
| "epoch": 19.527865220939724, |
| "eval_accuracy": 0.37223649036173806, |
| "eval_loss": 3.548274517059326, |
| "eval_runtime": 53.4422, |
| "eval_samples_per_second": 311.121, |
| "eval_steps_per_second": 19.46, |
| "step": 67000 |
| }, |
| { |
| "epoch": 19.542439081263844, |
| "grad_norm": 0.3721741735935211, |
| "learning_rate": 0.00036570778652668417, |
| "loss": 3.2468, |
| "step": 67050 |
| }, |
| { |
| "epoch": 19.557012941587967, |
| "grad_norm": 0.3784923851490021, |
| "learning_rate": 0.00036553280839895006, |
| "loss": 3.2583, |
| "step": 67100 |
| }, |
| { |
| "epoch": 19.57158680191209, |
| "grad_norm": 0.38066989183425903, |
| "learning_rate": 0.00036535783027121606, |
| "loss": 3.2563, |
| "step": 67150 |
| }, |
| { |
| "epoch": 19.586160662236214, |
| "grad_norm": 0.38815242052078247, |
| "learning_rate": 0.00036518285214348205, |
| "loss": 3.2499, |
| "step": 67200 |
| }, |
| { |
| "epoch": 19.600734522560337, |
| "grad_norm": 0.36890047788619995, |
| "learning_rate": 0.00036500787401574805, |
| "loss": 3.2442, |
| "step": 67250 |
| }, |
| { |
| "epoch": 19.615308382884457, |
| "grad_norm": 0.40046757459640503, |
| "learning_rate": 0.00036483289588801394, |
| "loss": 3.2517, |
| "step": 67300 |
| }, |
| { |
| "epoch": 19.62988224320858, |
| "grad_norm": 0.39612528681755066, |
| "learning_rate": 0.00036465791776027994, |
| "loss": 3.2386, |
| "step": 67350 |
| }, |
| { |
| "epoch": 19.644456103532704, |
| "grad_norm": 0.36910921335220337, |
| "learning_rate": 0.00036448293963254594, |
| "loss": 3.2482, |
| "step": 67400 |
| }, |
| { |
| "epoch": 19.659029963856828, |
| "grad_norm": 0.3974798917770386, |
| "learning_rate": 0.0003643079615048119, |
| "loss": 3.2639, |
| "step": 67450 |
| }, |
| { |
| "epoch": 19.673603824180947, |
| "grad_norm": 0.40347325801849365, |
| "learning_rate": 0.0003641329833770778, |
| "loss": 3.2623, |
| "step": 67500 |
| }, |
| { |
| "epoch": 19.68817768450507, |
| "grad_norm": 0.39685729146003723, |
| "learning_rate": 0.0003639580052493438, |
| "loss": 3.2673, |
| "step": 67550 |
| }, |
| { |
| "epoch": 19.702751544829194, |
| "grad_norm": 0.3747837543487549, |
| "learning_rate": 0.00036378302712160976, |
| "loss": 3.2553, |
| "step": 67600 |
| }, |
| { |
| "epoch": 19.717325405153318, |
| "grad_norm": 0.4383140802383423, |
| "learning_rate": 0.00036360804899387576, |
| "loss": 3.2567, |
| "step": 67650 |
| }, |
| { |
| "epoch": 19.73189926547744, |
| "grad_norm": 0.39656561613082886, |
| "learning_rate": 0.0003634330708661417, |
| "loss": 3.2588, |
| "step": 67700 |
| }, |
| { |
| "epoch": 19.74647312580156, |
| "grad_norm": 0.3830217123031616, |
| "learning_rate": 0.00036325809273840765, |
| "loss": 3.2609, |
| "step": 67750 |
| }, |
| { |
| "epoch": 19.761046986125685, |
| "grad_norm": 0.3830994963645935, |
| "learning_rate": 0.00036308311461067365, |
| "loss": 3.2541, |
| "step": 67800 |
| }, |
| { |
| "epoch": 19.775620846449808, |
| "grad_norm": 0.39494529366493225, |
| "learning_rate": 0.0003629081364829396, |
| "loss": 3.2475, |
| "step": 67850 |
| }, |
| { |
| "epoch": 19.79019470677393, |
| "grad_norm": 0.3626117706298828, |
| "learning_rate": 0.00036273315835520553, |
| "loss": 3.2718, |
| "step": 67900 |
| }, |
| { |
| "epoch": 19.80476856709805, |
| "grad_norm": 0.3724912405014038, |
| "learning_rate": 0.00036255818022747153, |
| "loss": 3.2562, |
| "step": 67950 |
| }, |
| { |
| "epoch": 19.819342427422175, |
| "grad_norm": 0.40011537075042725, |
| "learning_rate": 0.00036238320209973753, |
| "loss": 3.2593, |
| "step": 68000 |
| }, |
| { |
| "epoch": 19.819342427422175, |
| "eval_accuracy": 0.37288535363872477, |
| "eval_loss": 3.542919397354126, |
| "eval_runtime": 53.6133, |
| "eval_samples_per_second": 310.128, |
| "eval_steps_per_second": 19.398, |
| "step": 68000 |
| }, |
| { |
| "epoch": 19.8339162877463, |
| "grad_norm": 0.39811837673187256, |
| "learning_rate": 0.0003622082239720034, |
| "loss": 3.2572, |
| "step": 68050 |
| }, |
| { |
| "epoch": 19.84849014807042, |
| "grad_norm": 0.38407403230667114, |
| "learning_rate": 0.0003620332458442694, |
| "loss": 3.2681, |
| "step": 68100 |
| }, |
| { |
| "epoch": 19.863064008394545, |
| "grad_norm": 0.4049363434314728, |
| "learning_rate": 0.0003618582677165354, |
| "loss": 3.2652, |
| "step": 68150 |
| }, |
| { |
| "epoch": 19.877637868718665, |
| "grad_norm": 0.395565003156662, |
| "learning_rate": 0.0003616832895888014, |
| "loss": 3.2608, |
| "step": 68200 |
| }, |
| { |
| "epoch": 19.89221172904279, |
| "grad_norm": 0.3701721429824829, |
| "learning_rate": 0.0003615083114610673, |
| "loss": 3.2606, |
| "step": 68250 |
| }, |
| { |
| "epoch": 19.906785589366912, |
| "grad_norm": 0.36809027194976807, |
| "learning_rate": 0.0003613333333333333, |
| "loss": 3.2636, |
| "step": 68300 |
| }, |
| { |
| "epoch": 19.921359449691035, |
| "grad_norm": 0.39549127221107483, |
| "learning_rate": 0.0003611583552055993, |
| "loss": 3.2686, |
| "step": 68350 |
| }, |
| { |
| "epoch": 19.935933310015155, |
| "grad_norm": 0.3891831636428833, |
| "learning_rate": 0.00036098337707786524, |
| "loss": 3.271, |
| "step": 68400 |
| }, |
| { |
| "epoch": 19.95050717033928, |
| "grad_norm": 0.36730197072029114, |
| "learning_rate": 0.0003608083989501312, |
| "loss": 3.2636, |
| "step": 68450 |
| }, |
| { |
| "epoch": 19.965081030663402, |
| "grad_norm": 0.420949250459671, |
| "learning_rate": 0.0003606334208223972, |
| "loss": 3.274, |
| "step": 68500 |
| }, |
| { |
| "epoch": 19.979654890987526, |
| "grad_norm": 0.3941866457462311, |
| "learning_rate": 0.0003604584426946632, |
| "loss": 3.2839, |
| "step": 68550 |
| }, |
| { |
| "epoch": 19.99422875131165, |
| "grad_norm": 0.4152412414550781, |
| "learning_rate": 0.00036028346456692907, |
| "loss": 3.2773, |
| "step": 68600 |
| }, |
| { |
| "epoch": 20.008744316194473, |
| "grad_norm": 0.3634067475795746, |
| "learning_rate": 0.00036010848643919507, |
| "loss": 3.2141, |
| "step": 68650 |
| }, |
| { |
| "epoch": 20.023318176518597, |
| "grad_norm": 0.3978492021560669, |
| "learning_rate": 0.00035993350831146106, |
| "loss": 3.1604, |
| "step": 68700 |
| }, |
| { |
| "epoch": 20.03789203684272, |
| "grad_norm": 0.3966376781463623, |
| "learning_rate": 0.000359758530183727, |
| "loss": 3.1659, |
| "step": 68750 |
| }, |
| { |
| "epoch": 20.05246589716684, |
| "grad_norm": 0.3949337303638458, |
| "learning_rate": 0.00035958355205599295, |
| "loss": 3.179, |
| "step": 68800 |
| }, |
| { |
| "epoch": 20.067039757490964, |
| "grad_norm": 0.37825021147727966, |
| "learning_rate": 0.00035940857392825895, |
| "loss": 3.1713, |
| "step": 68850 |
| }, |
| { |
| "epoch": 20.081613617815087, |
| "grad_norm": 0.41876670718193054, |
| "learning_rate": 0.0003592335958005249, |
| "loss": 3.1785, |
| "step": 68900 |
| }, |
| { |
| "epoch": 20.09618747813921, |
| "grad_norm": 0.4341396689414978, |
| "learning_rate": 0.0003590586176727909, |
| "loss": 3.1799, |
| "step": 68950 |
| }, |
| { |
| "epoch": 20.110761338463334, |
| "grad_norm": 0.3682248294353485, |
| "learning_rate": 0.00035888363954505683, |
| "loss": 3.2034, |
| "step": 69000 |
| }, |
| { |
| "epoch": 20.110761338463334, |
| "eval_accuracy": 0.37208842760054645, |
| "eval_loss": 3.5517804622650146, |
| "eval_runtime": 53.5851, |
| "eval_samples_per_second": 310.291, |
| "eval_steps_per_second": 19.408, |
| "step": 69000 |
| }, |
| { |
| "epoch": 20.125335198787454, |
| "grad_norm": 0.4117501676082611, |
| "learning_rate": 0.0003587086614173228, |
| "loss": 3.1927, |
| "step": 69050 |
| }, |
| { |
| "epoch": 20.139909059111577, |
| "grad_norm": 0.3802715539932251, |
| "learning_rate": 0.0003585336832895888, |
| "loss": 3.188, |
| "step": 69100 |
| }, |
| { |
| "epoch": 20.1544829194357, |
| "grad_norm": 0.3827550709247589, |
| "learning_rate": 0.00035835870516185477, |
| "loss": 3.1896, |
| "step": 69150 |
| }, |
| { |
| "epoch": 20.169056779759824, |
| "grad_norm": 0.3890596628189087, |
| "learning_rate": 0.00035818372703412066, |
| "loss": 3.1847, |
| "step": 69200 |
| }, |
| { |
| "epoch": 20.183630640083944, |
| "grad_norm": 0.4047715365886688, |
| "learning_rate": 0.00035800874890638666, |
| "loss": 3.1976, |
| "step": 69250 |
| }, |
| { |
| "epoch": 20.198204500408067, |
| "grad_norm": 0.42132121324539185, |
| "learning_rate": 0.00035783377077865266, |
| "loss": 3.2113, |
| "step": 69300 |
| }, |
| { |
| "epoch": 20.21277836073219, |
| "grad_norm": 0.43125709891319275, |
| "learning_rate": 0.00035765879265091865, |
| "loss": 3.2003, |
| "step": 69350 |
| }, |
| { |
| "epoch": 20.227352221056314, |
| "grad_norm": 0.43643462657928467, |
| "learning_rate": 0.00035748381452318454, |
| "loss": 3.2018, |
| "step": 69400 |
| }, |
| { |
| "epoch": 20.241926081380438, |
| "grad_norm": 0.41692498326301575, |
| "learning_rate": 0.00035730883639545054, |
| "loss": 3.213, |
| "step": 69450 |
| }, |
| { |
| "epoch": 20.256499941704558, |
| "grad_norm": 0.35715001821517944, |
| "learning_rate": 0.00035713385826771654, |
| "loss": 3.2123, |
| "step": 69500 |
| }, |
| { |
| "epoch": 20.27107380202868, |
| "grad_norm": 0.40020912885665894, |
| "learning_rate": 0.00035695888013998243, |
| "loss": 3.1913, |
| "step": 69550 |
| }, |
| { |
| "epoch": 20.285647662352805, |
| "grad_norm": 0.41624048352241516, |
| "learning_rate": 0.0003567839020122484, |
| "loss": 3.2248, |
| "step": 69600 |
| }, |
| { |
| "epoch": 20.300221522676928, |
| "grad_norm": 0.4075501561164856, |
| "learning_rate": 0.0003566089238845144, |
| "loss": 3.2102, |
| "step": 69650 |
| }, |
| { |
| "epoch": 20.314795383001048, |
| "grad_norm": 0.4054107666015625, |
| "learning_rate": 0.00035643394575678037, |
| "loss": 3.2071, |
| "step": 69700 |
| }, |
| { |
| "epoch": 20.32936924332517, |
| "grad_norm": 0.3951246738433838, |
| "learning_rate": 0.0003562589676290463, |
| "loss": 3.2051, |
| "step": 69750 |
| }, |
| { |
| "epoch": 20.343943103649295, |
| "grad_norm": 0.40206143260002136, |
| "learning_rate": 0.0003560839895013123, |
| "loss": 3.2263, |
| "step": 69800 |
| }, |
| { |
| "epoch": 20.35851696397342, |
| "grad_norm": 0.38143521547317505, |
| "learning_rate": 0.0003559090113735783, |
| "loss": 3.2175, |
| "step": 69850 |
| }, |
| { |
| "epoch": 20.37309082429754, |
| "grad_norm": 0.3820487856864929, |
| "learning_rate": 0.00035573403324584425, |
| "loss": 3.2183, |
| "step": 69900 |
| }, |
| { |
| "epoch": 20.38766468462166, |
| "grad_norm": 0.36672526597976685, |
| "learning_rate": 0.0003555590551181102, |
| "loss": 3.2278, |
| "step": 69950 |
| }, |
| { |
| "epoch": 20.402238544945785, |
| "grad_norm": 0.4113920331001282, |
| "learning_rate": 0.0003553840769903762, |
| "loss": 3.2154, |
| "step": 70000 |
| }, |
| { |
| "epoch": 20.402238544945785, |
| "eval_accuracy": 0.372281803686904, |
| "eval_loss": 3.5472373962402344, |
| "eval_runtime": 53.6006, |
| "eval_samples_per_second": 310.202, |
| "eval_steps_per_second": 19.403, |
| "step": 70000 |
| }, |
| { |
| "epoch": 20.41681240526991, |
| "grad_norm": 0.37555912137031555, |
| "learning_rate": 0.00035520909886264213, |
| "loss": 3.2247, |
| "step": 70050 |
| }, |
| { |
| "epoch": 20.431386265594032, |
| "grad_norm": 0.3991236388683319, |
| "learning_rate": 0.00035503412073490813, |
| "loss": 3.2381, |
| "step": 70100 |
| }, |
| { |
| "epoch": 20.445960125918152, |
| "grad_norm": 0.4171861410140991, |
| "learning_rate": 0.0003548591426071741, |
| "loss": 3.2228, |
| "step": 70150 |
| }, |
| { |
| "epoch": 20.460533986242275, |
| "grad_norm": 0.4116719961166382, |
| "learning_rate": 0.00035468416447944, |
| "loss": 3.2369, |
| "step": 70200 |
| }, |
| { |
| "epoch": 20.4751078465664, |
| "grad_norm": 0.4075976312160492, |
| "learning_rate": 0.000354509186351706, |
| "loss": 3.2343, |
| "step": 70250 |
| }, |
| { |
| "epoch": 20.489681706890522, |
| "grad_norm": 0.38528257608413696, |
| "learning_rate": 0.000354334208223972, |
| "loss": 3.2295, |
| "step": 70300 |
| }, |
| { |
| "epoch": 20.504255567214642, |
| "grad_norm": 0.4236631989479065, |
| "learning_rate": 0.0003541592300962379, |
| "loss": 3.2404, |
| "step": 70350 |
| }, |
| { |
| "epoch": 20.518829427538765, |
| "grad_norm": 0.3921928405761719, |
| "learning_rate": 0.0003539842519685039, |
| "loss": 3.2314, |
| "step": 70400 |
| }, |
| { |
| "epoch": 20.53340328786289, |
| "grad_norm": 0.39726680517196655, |
| "learning_rate": 0.0003538092738407699, |
| "loss": 3.236, |
| "step": 70450 |
| }, |
| { |
| "epoch": 20.547977148187012, |
| "grad_norm": 0.3668453097343445, |
| "learning_rate": 0.0003536342957130358, |
| "loss": 3.2204, |
| "step": 70500 |
| }, |
| { |
| "epoch": 20.562551008511136, |
| "grad_norm": 0.39525458216667175, |
| "learning_rate": 0.0003534593175853018, |
| "loss": 3.2283, |
| "step": 70550 |
| }, |
| { |
| "epoch": 20.577124868835256, |
| "grad_norm": 0.38565030694007874, |
| "learning_rate": 0.0003532843394575678, |
| "loss": 3.2361, |
| "step": 70600 |
| }, |
| { |
| "epoch": 20.59169872915938, |
| "grad_norm": 0.385259211063385, |
| "learning_rate": 0.0003531093613298338, |
| "loss": 3.2505, |
| "step": 70650 |
| }, |
| { |
| "epoch": 20.606272589483503, |
| "grad_norm": 0.3972327709197998, |
| "learning_rate": 0.00035293438320209967, |
| "loss": 3.2521, |
| "step": 70700 |
| }, |
| { |
| "epoch": 20.620846449807626, |
| "grad_norm": 0.37978488206863403, |
| "learning_rate": 0.00035275940507436567, |
| "loss": 3.24, |
| "step": 70750 |
| }, |
| { |
| "epoch": 20.63542031013175, |
| "grad_norm": 0.40364915132522583, |
| "learning_rate": 0.00035258442694663166, |
| "loss": 3.2441, |
| "step": 70800 |
| }, |
| { |
| "epoch": 20.64999417045587, |
| "grad_norm": 0.3870035707950592, |
| "learning_rate": 0.0003524094488188976, |
| "loss": 3.2385, |
| "step": 70850 |
| }, |
| { |
| "epoch": 20.664568030779993, |
| "grad_norm": 0.38927018642425537, |
| "learning_rate": 0.00035223447069116355, |
| "loss": 3.2316, |
| "step": 70900 |
| }, |
| { |
| "epoch": 20.679141891104116, |
| "grad_norm": 0.3759223520755768, |
| "learning_rate": 0.00035205949256342955, |
| "loss": 3.2492, |
| "step": 70950 |
| }, |
| { |
| "epoch": 20.69371575142824, |
| "grad_norm": 0.4070005416870117, |
| "learning_rate": 0.0003518845144356955, |
| "loss": 3.2438, |
| "step": 71000 |
| }, |
| { |
| "epoch": 20.69371575142824, |
| "eval_accuracy": 0.3726712628894342, |
| "eval_loss": 3.543128728866577, |
| "eval_runtime": 53.6099, |
| "eval_samples_per_second": 310.148, |
| "eval_steps_per_second": 19.399, |
| "step": 71000 |
| }, |
| { |
| "epoch": 20.70828961175236, |
| "grad_norm": 0.4172914922237396, |
| "learning_rate": 0.0003517095363079615, |
| "loss": 3.2553, |
| "step": 71050 |
| }, |
| { |
| "epoch": 20.722863472076483, |
| "grad_norm": 0.405660480260849, |
| "learning_rate": 0.00035153455818022743, |
| "loss": 3.2396, |
| "step": 71100 |
| }, |
| { |
| "epoch": 20.737437332400606, |
| "grad_norm": 0.39252886176109314, |
| "learning_rate": 0.00035135958005249343, |
| "loss": 3.2425, |
| "step": 71150 |
| }, |
| { |
| "epoch": 20.75201119272473, |
| "grad_norm": 0.39773955941200256, |
| "learning_rate": 0.0003511846019247594, |
| "loss": 3.254, |
| "step": 71200 |
| }, |
| { |
| "epoch": 20.76658505304885, |
| "grad_norm": 0.39778029918670654, |
| "learning_rate": 0.0003510096237970253, |
| "loss": 3.243, |
| "step": 71250 |
| }, |
| { |
| "epoch": 20.781158913372973, |
| "grad_norm": 0.4346936047077179, |
| "learning_rate": 0.0003508346456692913, |
| "loss": 3.252, |
| "step": 71300 |
| }, |
| { |
| "epoch": 20.795732773697097, |
| "grad_norm": 0.3797796368598938, |
| "learning_rate": 0.00035065966754155726, |
| "loss": 3.2562, |
| "step": 71350 |
| }, |
| { |
| "epoch": 20.81030663402122, |
| "grad_norm": 0.3897905945777893, |
| "learning_rate": 0.00035048468941382326, |
| "loss": 3.243, |
| "step": 71400 |
| }, |
| { |
| "epoch": 20.824880494345344, |
| "grad_norm": 0.3894917368888855, |
| "learning_rate": 0.0003503097112860892, |
| "loss": 3.2527, |
| "step": 71450 |
| }, |
| { |
| "epoch": 20.839454354669463, |
| "grad_norm": 0.3748098909854889, |
| "learning_rate": 0.00035013473315835514, |
| "loss": 3.2464, |
| "step": 71500 |
| }, |
| { |
| "epoch": 20.854028214993587, |
| "grad_norm": 0.37157636880874634, |
| "learning_rate": 0.00034995975503062114, |
| "loss": 3.2674, |
| "step": 71550 |
| }, |
| { |
| "epoch": 20.86860207531771, |
| "grad_norm": 0.3743491768836975, |
| "learning_rate": 0.00034978477690288714, |
| "loss": 3.2637, |
| "step": 71600 |
| }, |
| { |
| "epoch": 20.883175935641834, |
| "grad_norm": 0.3712899684906006, |
| "learning_rate": 0.00034960979877515303, |
| "loss": 3.2615, |
| "step": 71650 |
| }, |
| { |
| "epoch": 20.897749795965957, |
| "grad_norm": 0.40136638283729553, |
| "learning_rate": 0.000349434820647419, |
| "loss": 3.2552, |
| "step": 71700 |
| }, |
| { |
| "epoch": 20.912323656290077, |
| "grad_norm": 0.3779069781303406, |
| "learning_rate": 0.000349259842519685, |
| "loss": 3.2616, |
| "step": 71750 |
| }, |
| { |
| "epoch": 20.9268975166142, |
| "grad_norm": 0.3790677487850189, |
| "learning_rate": 0.000349084864391951, |
| "loss": 3.2537, |
| "step": 71800 |
| }, |
| { |
| "epoch": 20.941471376938324, |
| "grad_norm": 0.3974206745624542, |
| "learning_rate": 0.0003489098862642169, |
| "loss": 3.2593, |
| "step": 71850 |
| }, |
| { |
| "epoch": 20.956045237262448, |
| "grad_norm": 0.3856559991836548, |
| "learning_rate": 0.0003487349081364829, |
| "loss": 3.2561, |
| "step": 71900 |
| }, |
| { |
| "epoch": 20.970619097586567, |
| "grad_norm": 0.3982665538787842, |
| "learning_rate": 0.0003485599300087489, |
| "loss": 3.2689, |
| "step": 71950 |
| }, |
| { |
| "epoch": 20.98519295791069, |
| "grad_norm": 0.4200937747955322, |
| "learning_rate": 0.00034838495188101485, |
| "loss": 3.2696, |
| "step": 72000 |
| }, |
| { |
| "epoch": 20.98519295791069, |
| "eval_accuracy": 0.37311862899061804, |
| "eval_loss": 3.534351110458374, |
| "eval_runtime": 53.542, |
| "eval_samples_per_second": 310.541, |
| "eval_steps_per_second": 19.424, |
| "step": 72000 |
| }, |
| { |
| "epoch": 20.999766818234814, |
| "grad_norm": 0.3610718250274658, |
| "learning_rate": 0.0003482099737532808, |
| "loss": 3.2691, |
| "step": 72050 |
| }, |
| { |
| "epoch": 21.01428238311764, |
| "grad_norm": 0.41233742237091064, |
| "learning_rate": 0.0003480349956255468, |
| "loss": 3.1613, |
| "step": 72100 |
| }, |
| { |
| "epoch": 21.028856243441762, |
| "grad_norm": 0.39416176080703735, |
| "learning_rate": 0.00034786001749781274, |
| "loss": 3.1585, |
| "step": 72150 |
| }, |
| { |
| "epoch": 21.043430103765886, |
| "grad_norm": 0.40667569637298584, |
| "learning_rate": 0.0003476850393700787, |
| "loss": 3.1551, |
| "step": 72200 |
| }, |
| { |
| "epoch": 21.05800396409001, |
| "grad_norm": 0.37689638137817383, |
| "learning_rate": 0.0003475100612423447, |
| "loss": 3.1613, |
| "step": 72250 |
| }, |
| { |
| "epoch": 21.072577824414132, |
| "grad_norm": 0.39775094389915466, |
| "learning_rate": 0.0003473350831146106, |
| "loss": 3.1525, |
| "step": 72300 |
| }, |
| { |
| "epoch": 21.087151684738252, |
| "grad_norm": 0.39118385314941406, |
| "learning_rate": 0.0003471601049868766, |
| "loss": 3.1728, |
| "step": 72350 |
| }, |
| { |
| "epoch": 21.101725545062376, |
| "grad_norm": 0.3962038457393646, |
| "learning_rate": 0.00034698512685914256, |
| "loss": 3.1766, |
| "step": 72400 |
| }, |
| { |
| "epoch": 21.1162994053865, |
| "grad_norm": 0.41186439990997314, |
| "learning_rate": 0.00034681014873140856, |
| "loss": 3.1745, |
| "step": 72450 |
| }, |
| { |
| "epoch": 21.130873265710623, |
| "grad_norm": 0.4209057688713074, |
| "learning_rate": 0.0003466351706036745, |
| "loss": 3.1766, |
| "step": 72500 |
| }, |
| { |
| "epoch": 21.145447126034743, |
| "grad_norm": 0.4396696388721466, |
| "learning_rate": 0.0003464601924759405, |
| "loss": 3.1923, |
| "step": 72550 |
| }, |
| { |
| "epoch": 21.160020986358866, |
| "grad_norm": 0.41312292218208313, |
| "learning_rate": 0.00034628521434820644, |
| "loss": 3.1864, |
| "step": 72600 |
| }, |
| { |
| "epoch": 21.17459484668299, |
| "grad_norm": 0.41308632493019104, |
| "learning_rate": 0.0003461102362204724, |
| "loss": 3.2035, |
| "step": 72650 |
| }, |
| { |
| "epoch": 21.189168707007113, |
| "grad_norm": 0.43140512704849243, |
| "learning_rate": 0.0003459352580927384, |
| "loss": 3.19, |
| "step": 72700 |
| }, |
| { |
| "epoch": 21.203742567331236, |
| "grad_norm": 0.4572611153125763, |
| "learning_rate": 0.0003457602799650044, |
| "loss": 3.2074, |
| "step": 72750 |
| }, |
| { |
| "epoch": 21.218316427655356, |
| "grad_norm": 0.37264949083328247, |
| "learning_rate": 0.00034558530183727027, |
| "loss": 3.1994, |
| "step": 72800 |
| }, |
| { |
| "epoch": 21.23289028797948, |
| "grad_norm": 0.4249832332134247, |
| "learning_rate": 0.00034541032370953627, |
| "loss": 3.203, |
| "step": 72850 |
| }, |
| { |
| "epoch": 21.247464148303603, |
| "grad_norm": 0.37957924604415894, |
| "learning_rate": 0.00034523534558180227, |
| "loss": 3.1987, |
| "step": 72900 |
| }, |
| { |
| "epoch": 21.262038008627727, |
| "grad_norm": 0.4169476628303528, |
| "learning_rate": 0.00034506036745406826, |
| "loss": 3.2013, |
| "step": 72950 |
| }, |
| { |
| "epoch": 21.276611868951846, |
| "grad_norm": 0.38779717683792114, |
| "learning_rate": 0.00034488538932633415, |
| "loss": 3.2131, |
| "step": 73000 |
| }, |
| { |
| "epoch": 21.276611868951846, |
| "eval_accuracy": 0.37226956320426174, |
| "eval_loss": 3.5550079345703125, |
| "eval_runtime": 53.6789, |
| "eval_samples_per_second": 309.749, |
| "eval_steps_per_second": 19.374, |
| "step": 73000 |
| }, |
| { |
| "epoch": 21.29118572927597, |
| "grad_norm": 0.3919151723384857, |
| "learning_rate": 0.00034471041119860015, |
| "loss": 3.1934, |
| "step": 73050 |
| }, |
| { |
| "epoch": 21.305759589600093, |
| "grad_norm": 0.3751165270805359, |
| "learning_rate": 0.00034453543307086615, |
| "loss": 3.2008, |
| "step": 73100 |
| }, |
| { |
| "epoch": 21.320333449924217, |
| "grad_norm": 0.3845295011997223, |
| "learning_rate": 0.00034436045494313204, |
| "loss": 3.201, |
| "step": 73150 |
| }, |
| { |
| "epoch": 21.33490731024834, |
| "grad_norm": 0.3787325918674469, |
| "learning_rate": 0.00034418547681539804, |
| "loss": 3.2187, |
| "step": 73200 |
| }, |
| { |
| "epoch": 21.34948117057246, |
| "grad_norm": 0.38317716121673584, |
| "learning_rate": 0.00034401049868766403, |
| "loss": 3.2073, |
| "step": 73250 |
| }, |
| { |
| "epoch": 21.364055030896584, |
| "grad_norm": 0.40111178159713745, |
| "learning_rate": 0.00034383552055993, |
| "loss": 3.2171, |
| "step": 73300 |
| }, |
| { |
| "epoch": 21.378628891220707, |
| "grad_norm": 0.3990297317504883, |
| "learning_rate": 0.0003436605424321959, |
| "loss": 3.207, |
| "step": 73350 |
| }, |
| { |
| "epoch": 21.39320275154483, |
| "grad_norm": 0.42444097995758057, |
| "learning_rate": 0.0003434855643044619, |
| "loss": 3.205, |
| "step": 73400 |
| }, |
| { |
| "epoch": 21.40777661186895, |
| "grad_norm": 0.3900032043457031, |
| "learning_rate": 0.00034331058617672786, |
| "loss": 3.2176, |
| "step": 73450 |
| }, |
| { |
| "epoch": 21.422350472193074, |
| "grad_norm": 0.4015502631664276, |
| "learning_rate": 0.00034313560804899386, |
| "loss": 3.2265, |
| "step": 73500 |
| }, |
| { |
| "epoch": 21.436924332517197, |
| "grad_norm": 0.41432175040245056, |
| "learning_rate": 0.0003429606299212598, |
| "loss": 3.2111, |
| "step": 73550 |
| }, |
| { |
| "epoch": 21.45149819284132, |
| "grad_norm": 0.38178685307502747, |
| "learning_rate": 0.00034278565179352575, |
| "loss": 3.2145, |
| "step": 73600 |
| }, |
| { |
| "epoch": 21.466072053165444, |
| "grad_norm": 0.4028846025466919, |
| "learning_rate": 0.00034261067366579174, |
| "loss": 3.2225, |
| "step": 73650 |
| }, |
| { |
| "epoch": 21.480645913489564, |
| "grad_norm": 0.4111641049385071, |
| "learning_rate": 0.00034243569553805774, |
| "loss": 3.2223, |
| "step": 73700 |
| }, |
| { |
| "epoch": 21.495219773813687, |
| "grad_norm": 0.3817656338214874, |
| "learning_rate": 0.0003422607174103237, |
| "loss": 3.2218, |
| "step": 73750 |
| }, |
| { |
| "epoch": 21.50979363413781, |
| "grad_norm": 0.38240301609039307, |
| "learning_rate": 0.00034208573928258963, |
| "loss": 3.2223, |
| "step": 73800 |
| }, |
| { |
| "epoch": 21.524367494461934, |
| "grad_norm": 0.38335689902305603, |
| "learning_rate": 0.0003419107611548556, |
| "loss": 3.2199, |
| "step": 73850 |
| }, |
| { |
| "epoch": 21.538941354786054, |
| "grad_norm": 0.4274984300136566, |
| "learning_rate": 0.0003417357830271216, |
| "loss": 3.2281, |
| "step": 73900 |
| }, |
| { |
| "epoch": 21.553515215110178, |
| "grad_norm": 0.40092307329177856, |
| "learning_rate": 0.0003415608048993875, |
| "loss": 3.2222, |
| "step": 73950 |
| }, |
| { |
| "epoch": 21.5680890754343, |
| "grad_norm": 0.40198472142219543, |
| "learning_rate": 0.0003413858267716535, |
| "loss": 3.2219, |
| "step": 74000 |
| }, |
| { |
| "epoch": 21.5680890754343, |
| "eval_accuracy": 0.3728718184896492, |
| "eval_loss": 3.545661211013794, |
| "eval_runtime": 53.6675, |
| "eval_samples_per_second": 309.815, |
| "eval_steps_per_second": 19.379, |
| "step": 74000 |
| }, |
| { |
| "epoch": 21.582662935758425, |
| "grad_norm": 0.4117603600025177, |
| "learning_rate": 0.0003412108486439195, |
| "loss": 3.235, |
| "step": 74050 |
| }, |
| { |
| "epoch": 21.597236796082548, |
| "grad_norm": 0.3871172070503235, |
| "learning_rate": 0.0003410358705161854, |
| "loss": 3.229, |
| "step": 74100 |
| }, |
| { |
| "epoch": 21.611810656406668, |
| "grad_norm": 0.3824181854724884, |
| "learning_rate": 0.0003408608923884514, |
| "loss": 3.221, |
| "step": 74150 |
| }, |
| { |
| "epoch": 21.62638451673079, |
| "grad_norm": 0.4447501599788666, |
| "learning_rate": 0.0003406859142607174, |
| "loss": 3.23, |
| "step": 74200 |
| }, |
| { |
| "epoch": 21.640958377054915, |
| "grad_norm": 0.4036960303783417, |
| "learning_rate": 0.0003405109361329834, |
| "loss": 3.2274, |
| "step": 74250 |
| }, |
| { |
| "epoch": 21.655532237379038, |
| "grad_norm": 0.41716432571411133, |
| "learning_rate": 0.0003403359580052493, |
| "loss": 3.2388, |
| "step": 74300 |
| }, |
| { |
| "epoch": 21.670106097703158, |
| "grad_norm": 0.3935403525829315, |
| "learning_rate": 0.0003401609798775153, |
| "loss": 3.2384, |
| "step": 74350 |
| }, |
| { |
| "epoch": 21.68467995802728, |
| "grad_norm": 0.3805677592754364, |
| "learning_rate": 0.0003399860017497813, |
| "loss": 3.2316, |
| "step": 74400 |
| }, |
| { |
| "epoch": 21.699253818351405, |
| "grad_norm": 0.4220235347747803, |
| "learning_rate": 0.0003398110236220472, |
| "loss": 3.2242, |
| "step": 74450 |
| }, |
| { |
| "epoch": 21.71382767867553, |
| "grad_norm": 0.38405632972717285, |
| "learning_rate": 0.00033963604549431316, |
| "loss": 3.2376, |
| "step": 74500 |
| }, |
| { |
| "epoch": 21.728401538999652, |
| "grad_norm": 0.4087572395801544, |
| "learning_rate": 0.00033946106736657916, |
| "loss": 3.2389, |
| "step": 74550 |
| }, |
| { |
| "epoch": 21.74297539932377, |
| "grad_norm": 0.37883898615837097, |
| "learning_rate": 0.0003392860892388451, |
| "loss": 3.2394, |
| "step": 74600 |
| }, |
| { |
| "epoch": 21.757549259647895, |
| "grad_norm": 0.4050464332103729, |
| "learning_rate": 0.0003391111111111111, |
| "loss": 3.2349, |
| "step": 74650 |
| }, |
| { |
| "epoch": 21.77212311997202, |
| "grad_norm": 0.466884970664978, |
| "learning_rate": 0.00033893613298337705, |
| "loss": 3.2426, |
| "step": 74700 |
| }, |
| { |
| "epoch": 21.786696980296142, |
| "grad_norm": 0.39415720105171204, |
| "learning_rate": 0.000338761154855643, |
| "loss": 3.227, |
| "step": 74750 |
| }, |
| { |
| "epoch": 21.801270840620262, |
| "grad_norm": 0.3947415351867676, |
| "learning_rate": 0.000338586176727909, |
| "loss": 3.2409, |
| "step": 74800 |
| }, |
| { |
| "epoch": 21.815844700944385, |
| "grad_norm": 0.37887927889823914, |
| "learning_rate": 0.00033841119860017493, |
| "loss": 3.2535, |
| "step": 74850 |
| }, |
| { |
| "epoch": 21.83041856126851, |
| "grad_norm": 0.3835974931716919, |
| "learning_rate": 0.0003382362204724409, |
| "loss": 3.2596, |
| "step": 74900 |
| }, |
| { |
| "epoch": 21.844992421592632, |
| "grad_norm": 0.4018533527851105, |
| "learning_rate": 0.00033806124234470687, |
| "loss": 3.2334, |
| "step": 74950 |
| }, |
| { |
| "epoch": 21.859566281916756, |
| "grad_norm": 0.3988489508628845, |
| "learning_rate": 0.00033788626421697287, |
| "loss": 3.2434, |
| "step": 75000 |
| }, |
| { |
| "epoch": 21.859566281916756, |
| "eval_accuracy": 0.3730929710558487, |
| "eval_loss": 3.540520191192627, |
| "eval_runtime": 53.6254, |
| "eval_samples_per_second": 310.058, |
| "eval_steps_per_second": 19.394, |
| "step": 75000 |
| }, |
| { |
| "epoch": 21.874140142240876, |
| "grad_norm": 0.388423353433609, |
| "learning_rate": 0.0003377112860892388, |
| "loss": 3.2444, |
| "step": 75050 |
| }, |
| { |
| "epoch": 21.888714002565, |
| "grad_norm": 0.4200911223888397, |
| "learning_rate": 0.00033753630796150476, |
| "loss": 3.2457, |
| "step": 75100 |
| }, |
| { |
| "epoch": 21.903287862889123, |
| "grad_norm": 0.41083720326423645, |
| "learning_rate": 0.00033736132983377075, |
| "loss": 3.253, |
| "step": 75150 |
| }, |
| { |
| "epoch": 21.917861723213246, |
| "grad_norm": 0.3731345534324646, |
| "learning_rate": 0.00033718635170603675, |
| "loss": 3.2407, |
| "step": 75200 |
| }, |
| { |
| "epoch": 21.932435583537366, |
| "grad_norm": 0.391739159822464, |
| "learning_rate": 0.00033701137357830264, |
| "loss": 3.2517, |
| "step": 75250 |
| }, |
| { |
| "epoch": 21.94700944386149, |
| "grad_norm": 0.4036067724227905, |
| "learning_rate": 0.00033683639545056864, |
| "loss": 3.244, |
| "step": 75300 |
| }, |
| { |
| "epoch": 21.961583304185613, |
| "grad_norm": 0.40137407183647156, |
| "learning_rate": 0.00033666141732283464, |
| "loss": 3.2464, |
| "step": 75350 |
| }, |
| { |
| "epoch": 21.976157164509736, |
| "grad_norm": 0.40169379115104675, |
| "learning_rate": 0.00033648643919510063, |
| "loss": 3.2589, |
| "step": 75400 |
| }, |
| { |
| "epoch": 21.99073102483386, |
| "grad_norm": 0.40501073002815247, |
| "learning_rate": 0.0003363114610673665, |
| "loss": 3.2552, |
| "step": 75450 |
| }, |
| { |
| "epoch": 22.005246589716684, |
| "grad_norm": 0.3766656816005707, |
| "learning_rate": 0.0003361364829396325, |
| "loss": 3.2075, |
| "step": 75500 |
| }, |
| { |
| "epoch": 22.019820450040807, |
| "grad_norm": 0.42506319284439087, |
| "learning_rate": 0.0003359615048118985, |
| "loss": 3.1334, |
| "step": 75550 |
| }, |
| { |
| "epoch": 22.03439431036493, |
| "grad_norm": 0.36939767003059387, |
| "learning_rate": 0.00033578652668416446, |
| "loss": 3.1431, |
| "step": 75600 |
| }, |
| { |
| "epoch": 22.04896817068905, |
| "grad_norm": 0.3918786644935608, |
| "learning_rate": 0.0003356115485564304, |
| "loss": 3.1536, |
| "step": 75650 |
| }, |
| { |
| "epoch": 22.063542031013174, |
| "grad_norm": 0.41587457060813904, |
| "learning_rate": 0.0003354365704286964, |
| "loss": 3.1569, |
| "step": 75700 |
| }, |
| { |
| "epoch": 22.078115891337298, |
| "grad_norm": 0.41171425580978394, |
| "learning_rate": 0.00033526159230096235, |
| "loss": 3.1802, |
| "step": 75750 |
| }, |
| { |
| "epoch": 22.09268975166142, |
| "grad_norm": 0.40625959634780884, |
| "learning_rate": 0.0003350866141732283, |
| "loss": 3.1744, |
| "step": 75800 |
| }, |
| { |
| "epoch": 22.10726361198554, |
| "grad_norm": 0.4250766336917877, |
| "learning_rate": 0.0003349116360454943, |
| "loss": 3.1737, |
| "step": 75850 |
| }, |
| { |
| "epoch": 22.121837472309664, |
| "grad_norm": 0.38940393924713135, |
| "learning_rate": 0.00033473665791776023, |
| "loss": 3.1688, |
| "step": 75900 |
| }, |
| { |
| "epoch": 22.136411332633788, |
| "grad_norm": 0.4095974564552307, |
| "learning_rate": 0.00033456167979002623, |
| "loss": 3.1845, |
| "step": 75950 |
| }, |
| { |
| "epoch": 22.15098519295791, |
| "grad_norm": 0.4068727493286133, |
| "learning_rate": 0.00033438670166229217, |
| "loss": 3.1707, |
| "step": 76000 |
| }, |
| { |
| "epoch": 22.15098519295791, |
| "eval_accuracy": 0.3726765592521159, |
| "eval_loss": 3.5525548458099365, |
| "eval_runtime": 53.4722, |
| "eval_samples_per_second": 310.946, |
| "eval_steps_per_second": 19.449, |
| "step": 76000 |
| }, |
| { |
| "epoch": 22.165559053282035, |
| "grad_norm": 0.4066833555698395, |
| "learning_rate": 0.0003342117235345581, |
| "loss": 3.1765, |
| "step": 76050 |
| }, |
| { |
| "epoch": 22.180132913606155, |
| "grad_norm": 0.4130534529685974, |
| "learning_rate": 0.0003340367454068241, |
| "loss": 3.1864, |
| "step": 76100 |
| }, |
| { |
| "epoch": 22.194706773930278, |
| "grad_norm": 0.422269731760025, |
| "learning_rate": 0.0003338617672790901, |
| "loss": 3.1754, |
| "step": 76150 |
| }, |
| { |
| "epoch": 22.2092806342544, |
| "grad_norm": 0.4179966151714325, |
| "learning_rate": 0.000333686789151356, |
| "loss": 3.1797, |
| "step": 76200 |
| }, |
| { |
| "epoch": 22.223854494578525, |
| "grad_norm": 0.4105667471885681, |
| "learning_rate": 0.000333511811023622, |
| "loss": 3.1762, |
| "step": 76250 |
| }, |
| { |
| "epoch": 22.238428354902645, |
| "grad_norm": 0.4112195074558258, |
| "learning_rate": 0.000333336832895888, |
| "loss": 3.1755, |
| "step": 76300 |
| }, |
| { |
| "epoch": 22.25300221522677, |
| "grad_norm": 0.4119960069656372, |
| "learning_rate": 0.000333161854768154, |
| "loss": 3.1816, |
| "step": 76350 |
| }, |
| { |
| "epoch": 22.267576075550892, |
| "grad_norm": 0.3936940133571625, |
| "learning_rate": 0.0003329868766404199, |
| "loss": 3.1952, |
| "step": 76400 |
| }, |
| { |
| "epoch": 22.282149935875015, |
| "grad_norm": 0.4321756958961487, |
| "learning_rate": 0.0003328118985126859, |
| "loss": 3.1955, |
| "step": 76450 |
| }, |
| { |
| "epoch": 22.29672379619914, |
| "grad_norm": 0.3987957239151001, |
| "learning_rate": 0.0003326369203849519, |
| "loss": 3.1963, |
| "step": 76500 |
| }, |
| { |
| "epoch": 22.31129765652326, |
| "grad_norm": 0.4003720283508301, |
| "learning_rate": 0.0003324619422572179, |
| "loss": 3.1841, |
| "step": 76550 |
| }, |
| { |
| "epoch": 22.325871516847382, |
| "grad_norm": 0.41864824295043945, |
| "learning_rate": 0.00033228696412948377, |
| "loss": 3.2056, |
| "step": 76600 |
| }, |
| { |
| "epoch": 22.340445377171505, |
| "grad_norm": 0.40899229049682617, |
| "learning_rate": 0.00033211198600174976, |
| "loss": 3.1989, |
| "step": 76650 |
| }, |
| { |
| "epoch": 22.35501923749563, |
| "grad_norm": 0.37372127175331116, |
| "learning_rate": 0.00033193700787401576, |
| "loss": 3.208, |
| "step": 76700 |
| }, |
| { |
| "epoch": 22.36959309781975, |
| "grad_norm": 0.4053705036640167, |
| "learning_rate": 0.00033176202974628165, |
| "loss": 3.2028, |
| "step": 76750 |
| }, |
| { |
| "epoch": 22.384166958143872, |
| "grad_norm": 0.38806775212287903, |
| "learning_rate": 0.00033158705161854765, |
| "loss": 3.2103, |
| "step": 76800 |
| }, |
| { |
| "epoch": 22.398740818467996, |
| "grad_norm": 0.397602379322052, |
| "learning_rate": 0.00033141207349081365, |
| "loss": 3.2053, |
| "step": 76850 |
| }, |
| { |
| "epoch": 22.41331467879212, |
| "grad_norm": 0.3891013264656067, |
| "learning_rate": 0.0003312370953630796, |
| "loss": 3.2098, |
| "step": 76900 |
| }, |
| { |
| "epoch": 22.427888539116243, |
| "grad_norm": 0.3974474370479584, |
| "learning_rate": 0.00033106211723534553, |
| "loss": 3.1977, |
| "step": 76950 |
| }, |
| { |
| "epoch": 22.442462399440362, |
| "grad_norm": 0.40681177377700806, |
| "learning_rate": 0.00033088713910761153, |
| "loss": 3.1933, |
| "step": 77000 |
| }, |
| { |
| "epoch": 22.442462399440362, |
| "eval_accuracy": 0.37280108262361095, |
| "eval_loss": 3.5484120845794678, |
| "eval_runtime": 53.5687, |
| "eval_samples_per_second": 310.386, |
| "eval_steps_per_second": 19.414, |
| "step": 77000 |
| }, |
| { |
| "epoch": 22.457036259764486, |
| "grad_norm": 0.376244455575943, |
| "learning_rate": 0.0003307121609798775, |
| "loss": 3.2125, |
| "step": 77050 |
| }, |
| { |
| "epoch": 22.47161012008861, |
| "grad_norm": 0.4017881155014038, |
| "learning_rate": 0.00033053718285214347, |
| "loss": 3.2109, |
| "step": 77100 |
| }, |
| { |
| "epoch": 22.486183980412733, |
| "grad_norm": 0.4230104088783264, |
| "learning_rate": 0.0003303622047244094, |
| "loss": 3.2, |
| "step": 77150 |
| }, |
| { |
| "epoch": 22.500757840736853, |
| "grad_norm": 0.41939836740493774, |
| "learning_rate": 0.00033018722659667536, |
| "loss": 3.2107, |
| "step": 77200 |
| }, |
| { |
| "epoch": 22.515331701060976, |
| "grad_norm": 0.41486603021621704, |
| "learning_rate": 0.00033001224846894136, |
| "loss": 3.2037, |
| "step": 77250 |
| }, |
| { |
| "epoch": 22.5299055613851, |
| "grad_norm": 0.41459715366363525, |
| "learning_rate": 0.00032983727034120735, |
| "loss": 3.2118, |
| "step": 77300 |
| }, |
| { |
| "epoch": 22.544479421709223, |
| "grad_norm": 0.4097362458705902, |
| "learning_rate": 0.00032966229221347324, |
| "loss": 3.2086, |
| "step": 77350 |
| }, |
| { |
| "epoch": 22.559053282033346, |
| "grad_norm": 0.4361289441585541, |
| "learning_rate": 0.00032948731408573924, |
| "loss": 3.2146, |
| "step": 77400 |
| }, |
| { |
| "epoch": 22.573627142357466, |
| "grad_norm": 0.39922842383384705, |
| "learning_rate": 0.00032931233595800524, |
| "loss": 3.2218, |
| "step": 77450 |
| }, |
| { |
| "epoch": 22.58820100268159, |
| "grad_norm": 0.3960963487625122, |
| "learning_rate": 0.00032913735783027124, |
| "loss": 3.2208, |
| "step": 77500 |
| }, |
| { |
| "epoch": 22.602774863005713, |
| "grad_norm": 0.38441357016563416, |
| "learning_rate": 0.0003289623797025371, |
| "loss": 3.2235, |
| "step": 77550 |
| }, |
| { |
| "epoch": 22.617348723329837, |
| "grad_norm": 0.41173240542411804, |
| "learning_rate": 0.0003287874015748031, |
| "loss": 3.2227, |
| "step": 77600 |
| }, |
| { |
| "epoch": 22.631922583653957, |
| "grad_norm": 0.39351511001586914, |
| "learning_rate": 0.0003286124234470691, |
| "loss": 3.2251, |
| "step": 77650 |
| }, |
| { |
| "epoch": 22.64649644397808, |
| "grad_norm": 0.4175865948200226, |
| "learning_rate": 0.000328437445319335, |
| "loss": 3.221, |
| "step": 77700 |
| }, |
| { |
| "epoch": 22.661070304302203, |
| "grad_norm": 0.39702001214027405, |
| "learning_rate": 0.000328262467191601, |
| "loss": 3.2248, |
| "step": 77750 |
| }, |
| { |
| "epoch": 22.675644164626327, |
| "grad_norm": 0.3889565169811249, |
| "learning_rate": 0.000328087489063867, |
| "loss": 3.2267, |
| "step": 77800 |
| }, |
| { |
| "epoch": 22.69021802495045, |
| "grad_norm": 0.43343624472618103, |
| "learning_rate": 0.000327912510936133, |
| "loss": 3.227, |
| "step": 77850 |
| }, |
| { |
| "epoch": 22.70479188527457, |
| "grad_norm": 0.41087186336517334, |
| "learning_rate": 0.0003277375328083989, |
| "loss": 3.2261, |
| "step": 77900 |
| }, |
| { |
| "epoch": 22.719365745598694, |
| "grad_norm": 0.41997984051704407, |
| "learning_rate": 0.0003275625546806649, |
| "loss": 3.2397, |
| "step": 77950 |
| }, |
| { |
| "epoch": 22.733939605922817, |
| "grad_norm": 0.40573835372924805, |
| "learning_rate": 0.0003273875765529309, |
| "loss": 3.2346, |
| "step": 78000 |
| }, |
| { |
| "epoch": 22.733939605922817, |
| "eval_accuracy": 0.37317335807166263, |
| "eval_loss": 3.5404129028320312, |
| "eval_runtime": 53.555, |
| "eval_samples_per_second": 310.466, |
| "eval_steps_per_second": 19.419, |
| "step": 78000 |
| }, |
| { |
| "epoch": 22.74851346624694, |
| "grad_norm": 0.3728252947330475, |
| "learning_rate": 0.00032721259842519683, |
| "loss": 3.2272, |
| "step": 78050 |
| }, |
| { |
| "epoch": 22.76308732657106, |
| "grad_norm": 0.4036720395088196, |
| "learning_rate": 0.0003270376202974628, |
| "loss": 3.2292, |
| "step": 78100 |
| }, |
| { |
| "epoch": 22.777661186895184, |
| "grad_norm": 0.3707255423069, |
| "learning_rate": 0.00032686264216972877, |
| "loss": 3.2311, |
| "step": 78150 |
| }, |
| { |
| "epoch": 22.792235047219307, |
| "grad_norm": 0.4215398132801056, |
| "learning_rate": 0.0003266876640419947, |
| "loss": 3.2266, |
| "step": 78200 |
| }, |
| { |
| "epoch": 22.80680890754343, |
| "grad_norm": 0.41581565141677856, |
| "learning_rate": 0.0003265126859142607, |
| "loss": 3.2274, |
| "step": 78250 |
| }, |
| { |
| "epoch": 22.821382767867554, |
| "grad_norm": 0.3970228135585785, |
| "learning_rate": 0.00032633770778652666, |
| "loss": 3.239, |
| "step": 78300 |
| }, |
| { |
| "epoch": 22.835956628191674, |
| "grad_norm": 0.3960101008415222, |
| "learning_rate": 0.0003261627296587926, |
| "loss": 3.2293, |
| "step": 78350 |
| }, |
| { |
| "epoch": 22.850530488515798, |
| "grad_norm": 0.383298397064209, |
| "learning_rate": 0.0003259877515310586, |
| "loss": 3.2367, |
| "step": 78400 |
| }, |
| { |
| "epoch": 22.86510434883992, |
| "grad_norm": 0.38575494289398193, |
| "learning_rate": 0.00032581277340332454, |
| "loss": 3.2232, |
| "step": 78450 |
| }, |
| { |
| "epoch": 22.879678209164044, |
| "grad_norm": 0.3949655592441559, |
| "learning_rate": 0.0003256377952755905, |
| "loss": 3.2318, |
| "step": 78500 |
| }, |
| { |
| "epoch": 22.894252069488164, |
| "grad_norm": 0.40856286883354187, |
| "learning_rate": 0.0003254628171478565, |
| "loss": 3.2409, |
| "step": 78550 |
| }, |
| { |
| "epoch": 22.908825929812288, |
| "grad_norm": 0.4039371907711029, |
| "learning_rate": 0.0003252878390201225, |
| "loss": 3.2345, |
| "step": 78600 |
| }, |
| { |
| "epoch": 22.92339979013641, |
| "grad_norm": 0.39217332005500793, |
| "learning_rate": 0.00032511286089238837, |
| "loss": 3.2388, |
| "step": 78650 |
| }, |
| { |
| "epoch": 22.937973650460535, |
| "grad_norm": 0.4055219292640686, |
| "learning_rate": 0.00032493788276465437, |
| "loss": 3.247, |
| "step": 78700 |
| }, |
| { |
| "epoch": 22.952547510784658, |
| "grad_norm": 0.4267149865627289, |
| "learning_rate": 0.00032476290463692036, |
| "loss": 3.2502, |
| "step": 78750 |
| }, |
| { |
| "epoch": 22.967121371108778, |
| "grad_norm": 0.3840542733669281, |
| "learning_rate": 0.00032458792650918636, |
| "loss": 3.2476, |
| "step": 78800 |
| }, |
| { |
| "epoch": 22.9816952314329, |
| "grad_norm": 0.38321739435195923, |
| "learning_rate": 0.00032441294838145225, |
| "loss": 3.2461, |
| "step": 78850 |
| }, |
| { |
| "epoch": 22.996269091757025, |
| "grad_norm": 0.39017432928085327, |
| "learning_rate": 0.00032423797025371825, |
| "loss": 3.236, |
| "step": 78900 |
| }, |
| { |
| "epoch": 23.01078465663985, |
| "grad_norm": 0.39079105854034424, |
| "learning_rate": 0.00032406299212598425, |
| "loss": 3.1573, |
| "step": 78950 |
| }, |
| { |
| "epoch": 23.025358516963973, |
| "grad_norm": 0.44053953886032104, |
| "learning_rate": 0.0003238880139982502, |
| "loss": 3.1206, |
| "step": 79000 |
| }, |
| { |
| "epoch": 23.025358516963973, |
| "eval_accuracy": 0.37291701411786665, |
| "eval_loss": 3.550906181335449, |
| "eval_runtime": 53.5191, |
| "eval_samples_per_second": 310.674, |
| "eval_steps_per_second": 19.432, |
| "step": 79000 |
| }, |
| { |
| "epoch": 23.039932377288096, |
| "grad_norm": 0.38476139307022095, |
| "learning_rate": 0.00032371303587051613, |
| "loss": 3.1507, |
| "step": 79050 |
| }, |
| { |
| "epoch": 23.05450623761222, |
| "grad_norm": 0.44088542461395264, |
| "learning_rate": 0.00032353805774278213, |
| "loss": 3.1624, |
| "step": 79100 |
| }, |
| { |
| "epoch": 23.069080097936343, |
| "grad_norm": 0.42090490460395813, |
| "learning_rate": 0.00032336307961504813, |
| "loss": 3.1443, |
| "step": 79150 |
| }, |
| { |
| "epoch": 23.083653958260463, |
| "grad_norm": 0.4365140199661255, |
| "learning_rate": 0.0003231881014873141, |
| "loss": 3.1595, |
| "step": 79200 |
| }, |
| { |
| "epoch": 23.098227818584586, |
| "grad_norm": 0.3900358974933624, |
| "learning_rate": 0.00032301312335958, |
| "loss": 3.1461, |
| "step": 79250 |
| }, |
| { |
| "epoch": 23.11280167890871, |
| "grad_norm": 0.4244283139705658, |
| "learning_rate": 0.000322838145231846, |
| "loss": 3.162, |
| "step": 79300 |
| }, |
| { |
| "epoch": 23.127375539232833, |
| "grad_norm": 0.4338662326335907, |
| "learning_rate": 0.00032266316710411196, |
| "loss": 3.1672, |
| "step": 79350 |
| }, |
| { |
| "epoch": 23.141949399556953, |
| "grad_norm": 0.42256635427474976, |
| "learning_rate": 0.0003224881889763779, |
| "loss": 3.1625, |
| "step": 79400 |
| }, |
| { |
| "epoch": 23.156523259881077, |
| "grad_norm": 0.4065759479999542, |
| "learning_rate": 0.0003223132108486439, |
| "loss": 3.1774, |
| "step": 79450 |
| }, |
| { |
| "epoch": 23.1710971202052, |
| "grad_norm": 0.42197123169898987, |
| "learning_rate": 0.00032213823272090984, |
| "loss": 3.166, |
| "step": 79500 |
| }, |
| { |
| "epoch": 23.185670980529324, |
| "grad_norm": 0.4154304563999176, |
| "learning_rate": 0.00032196325459317584, |
| "loss": 3.1655, |
| "step": 79550 |
| }, |
| { |
| "epoch": 23.200244840853447, |
| "grad_norm": 0.40782201290130615, |
| "learning_rate": 0.0003217882764654418, |
| "loss": 3.178, |
| "step": 79600 |
| }, |
| { |
| "epoch": 23.214818701177567, |
| "grad_norm": 0.4080769419670105, |
| "learning_rate": 0.0003216132983377077, |
| "loss": 3.1834, |
| "step": 79650 |
| }, |
| { |
| "epoch": 23.22939256150169, |
| "grad_norm": 0.4333416521549225, |
| "learning_rate": 0.0003214383202099737, |
| "loss": 3.1809, |
| "step": 79700 |
| }, |
| { |
| "epoch": 23.243966421825814, |
| "grad_norm": 0.4143783748149872, |
| "learning_rate": 0.0003212633420822397, |
| "loss": 3.1766, |
| "step": 79750 |
| }, |
| { |
| "epoch": 23.258540282149937, |
| "grad_norm": 0.43470337986946106, |
| "learning_rate": 0.0003210883639545056, |
| "loss": 3.1743, |
| "step": 79800 |
| }, |
| { |
| "epoch": 23.273114142474057, |
| "grad_norm": 0.4042425751686096, |
| "learning_rate": 0.0003209133858267716, |
| "loss": 3.1916, |
| "step": 79850 |
| }, |
| { |
| "epoch": 23.28768800279818, |
| "grad_norm": 0.39887237548828125, |
| "learning_rate": 0.0003207384076990376, |
| "loss": 3.1919, |
| "step": 79900 |
| }, |
| { |
| "epoch": 23.302261863122304, |
| "grad_norm": 0.4006640315055847, |
| "learning_rate": 0.0003205634295713036, |
| "loss": 3.1837, |
| "step": 79950 |
| }, |
| { |
| "epoch": 23.316835723446427, |
| "grad_norm": 0.394792765378952, |
| "learning_rate": 0.0003203884514435695, |
| "loss": 3.1772, |
| "step": 80000 |
| }, |
| { |
| "epoch": 23.316835723446427, |
| "eval_accuracy": 0.3725921705400536, |
| "eval_loss": 3.5538315773010254, |
| "eval_runtime": 53.5731, |
| "eval_samples_per_second": 310.361, |
| "eval_steps_per_second": 19.413, |
| "step": 80000 |
| }, |
| { |
| "epoch": 23.33140958377055, |
| "grad_norm": 0.4694288372993469, |
| "learning_rate": 0.0003202134733158355, |
| "loss": 3.1761, |
| "step": 80050 |
| }, |
| { |
| "epoch": 23.34598344409467, |
| "grad_norm": 0.4258243441581726, |
| "learning_rate": 0.0003200384951881015, |
| "loss": 3.1831, |
| "step": 80100 |
| }, |
| { |
| "epoch": 23.360557304418794, |
| "grad_norm": 0.3972517251968384, |
| "learning_rate": 0.00031986351706036743, |
| "loss": 3.1842, |
| "step": 80150 |
| }, |
| { |
| "epoch": 23.375131164742918, |
| "grad_norm": 0.40481069684028625, |
| "learning_rate": 0.0003196885389326334, |
| "loss": 3.194, |
| "step": 80200 |
| }, |
| { |
| "epoch": 23.38970502506704, |
| "grad_norm": 0.46562784910202026, |
| "learning_rate": 0.0003195135608048994, |
| "loss": 3.2019, |
| "step": 80250 |
| }, |
| { |
| "epoch": 23.40427888539116, |
| "grad_norm": 0.40546947717666626, |
| "learning_rate": 0.0003193385826771653, |
| "loss": 3.1886, |
| "step": 80300 |
| }, |
| { |
| "epoch": 23.418852745715284, |
| "grad_norm": 0.4361385405063629, |
| "learning_rate": 0.00031916360454943126, |
| "loss": 3.1878, |
| "step": 80350 |
| }, |
| { |
| "epoch": 23.433426606039408, |
| "grad_norm": 0.4106120467185974, |
| "learning_rate": 0.00031898862642169726, |
| "loss": 3.1985, |
| "step": 80400 |
| }, |
| { |
| "epoch": 23.44800046636353, |
| "grad_norm": 0.44011247158050537, |
| "learning_rate": 0.00031881364829396326, |
| "loss": 3.199, |
| "step": 80450 |
| }, |
| { |
| "epoch": 23.462574326687655, |
| "grad_norm": 0.40028128027915955, |
| "learning_rate": 0.0003186386701662292, |
| "loss": 3.1998, |
| "step": 80500 |
| }, |
| { |
| "epoch": 23.477148187011775, |
| "grad_norm": 0.42574769258499146, |
| "learning_rate": 0.00031846369203849514, |
| "loss": 3.1929, |
| "step": 80550 |
| }, |
| { |
| "epoch": 23.491722047335898, |
| "grad_norm": 0.417959600687027, |
| "learning_rate": 0.00031828871391076114, |
| "loss": 3.1983, |
| "step": 80600 |
| }, |
| { |
| "epoch": 23.50629590766002, |
| "grad_norm": 0.41737136244773865, |
| "learning_rate": 0.0003181137357830271, |
| "loss": 3.2019, |
| "step": 80650 |
| }, |
| { |
| "epoch": 23.520869767984145, |
| "grad_norm": 0.386445015668869, |
| "learning_rate": 0.0003179387576552931, |
| "loss": 3.2049, |
| "step": 80700 |
| }, |
| { |
| "epoch": 23.535443628308265, |
| "grad_norm": 0.44584667682647705, |
| "learning_rate": 0.000317763779527559, |
| "loss": 3.2015, |
| "step": 80750 |
| }, |
| { |
| "epoch": 23.55001748863239, |
| "grad_norm": 0.40984952449798584, |
| "learning_rate": 0.00031758880139982497, |
| "loss": 3.2132, |
| "step": 80800 |
| }, |
| { |
| "epoch": 23.56459134895651, |
| "grad_norm": 0.4484936594963074, |
| "learning_rate": 0.00031741382327209097, |
| "loss": 3.2085, |
| "step": 80850 |
| }, |
| { |
| "epoch": 23.579165209280635, |
| "grad_norm": 0.44321057200431824, |
| "learning_rate": 0.00031723884514435696, |
| "loss": 3.2025, |
| "step": 80900 |
| }, |
| { |
| "epoch": 23.59373906960476, |
| "grad_norm": 0.4272693693637848, |
| "learning_rate": 0.00031706386701662285, |
| "loss": 3.2099, |
| "step": 80950 |
| }, |
| { |
| "epoch": 23.60831292992888, |
| "grad_norm": 0.40105003118515015, |
| "learning_rate": 0.00031688888888888885, |
| "loss": 3.207, |
| "step": 81000 |
| }, |
| { |
| "epoch": 23.60831292992888, |
| "eval_accuracy": 0.37350832358704517, |
| "eval_loss": 3.5404655933380127, |
| "eval_runtime": 53.5759, |
| "eval_samples_per_second": 310.345, |
| "eval_steps_per_second": 19.412, |
| "step": 81000 |
| }, |
| { |
| "epoch": 23.622886790253002, |
| "grad_norm": 0.4260452687740326, |
| "learning_rate": 0.00031671391076115485, |
| "loss": 3.218, |
| "step": 81050 |
| }, |
| { |
| "epoch": 23.637460650577125, |
| "grad_norm": 0.41286465525627136, |
| "learning_rate": 0.00031653893263342085, |
| "loss": 3.2062, |
| "step": 81100 |
| }, |
| { |
| "epoch": 23.65203451090125, |
| "grad_norm": 0.4247594475746155, |
| "learning_rate": 0.00031636395450568674, |
| "loss": 3.2249, |
| "step": 81150 |
| }, |
| { |
| "epoch": 23.66660837122537, |
| "grad_norm": 0.4216274619102478, |
| "learning_rate": 0.00031618897637795273, |
| "loss": 3.2147, |
| "step": 81200 |
| }, |
| { |
| "epoch": 23.681182231549492, |
| "grad_norm": 0.4162251055240631, |
| "learning_rate": 0.00031601399825021873, |
| "loss": 3.2276, |
| "step": 81250 |
| }, |
| { |
| "epoch": 23.695756091873616, |
| "grad_norm": 0.3944651484489441, |
| "learning_rate": 0.0003158390201224846, |
| "loss": 3.2187, |
| "step": 81300 |
| }, |
| { |
| "epoch": 23.71032995219774, |
| "grad_norm": 0.4357243776321411, |
| "learning_rate": 0.0003156640419947506, |
| "loss": 3.2171, |
| "step": 81350 |
| }, |
| { |
| "epoch": 23.72490381252186, |
| "grad_norm": 0.4096743166446686, |
| "learning_rate": 0.0003154890638670166, |
| "loss": 3.2173, |
| "step": 81400 |
| }, |
| { |
| "epoch": 23.739477672845982, |
| "grad_norm": 0.42880937457084656, |
| "learning_rate": 0.00031531408573928256, |
| "loss": 3.217, |
| "step": 81450 |
| }, |
| { |
| "epoch": 23.754051533170106, |
| "grad_norm": 0.41319262981414795, |
| "learning_rate": 0.0003151391076115485, |
| "loss": 3.2243, |
| "step": 81500 |
| }, |
| { |
| "epoch": 23.76862539349423, |
| "grad_norm": 0.4365960359573364, |
| "learning_rate": 0.0003149641294838145, |
| "loss": 3.2121, |
| "step": 81550 |
| }, |
| { |
| "epoch": 23.783199253818353, |
| "grad_norm": 0.419045090675354, |
| "learning_rate": 0.00031478915135608044, |
| "loss": 3.2324, |
| "step": 81600 |
| }, |
| { |
| "epoch": 23.797773114142473, |
| "grad_norm": 0.43372637033462524, |
| "learning_rate": 0.00031461417322834644, |
| "loss": 3.2372, |
| "step": 81650 |
| }, |
| { |
| "epoch": 23.812346974466596, |
| "grad_norm": 0.4185149371623993, |
| "learning_rate": 0.0003144391951006124, |
| "loss": 3.2211, |
| "step": 81700 |
| }, |
| { |
| "epoch": 23.82692083479072, |
| "grad_norm": 0.38297638297080994, |
| "learning_rate": 0.0003142642169728784, |
| "loss": 3.2162, |
| "step": 81750 |
| }, |
| { |
| "epoch": 23.841494695114843, |
| "grad_norm": 0.39599165320396423, |
| "learning_rate": 0.0003140892388451443, |
| "loss": 3.2305, |
| "step": 81800 |
| }, |
| { |
| "epoch": 23.856068555438966, |
| "grad_norm": 0.38620686531066895, |
| "learning_rate": 0.0003139142607174103, |
| "loss": 3.2271, |
| "step": 81850 |
| }, |
| { |
| "epoch": 23.870642415763086, |
| "grad_norm": 0.41307684779167175, |
| "learning_rate": 0.00031373928258967627, |
| "loss": 3.2306, |
| "step": 81900 |
| }, |
| { |
| "epoch": 23.88521627608721, |
| "grad_norm": 0.42049381136894226, |
| "learning_rate": 0.0003135643044619422, |
| "loss": 3.2334, |
| "step": 81950 |
| }, |
| { |
| "epoch": 23.899790136411333, |
| "grad_norm": 0.3921961784362793, |
| "learning_rate": 0.0003133893263342082, |
| "loss": 3.216, |
| "step": 82000 |
| }, |
| { |
| "epoch": 23.899790136411333, |
| "eval_accuracy": 0.373993705802589, |
| "eval_loss": 3.5379347801208496, |
| "eval_runtime": 53.5832, |
| "eval_samples_per_second": 310.302, |
| "eval_steps_per_second": 19.409, |
| "step": 82000 |
| }, |
| { |
| "epoch": 23.914363996735457, |
| "grad_norm": 0.3855375051498413, |
| "learning_rate": 0.00031321434820647415, |
| "loss": 3.2272, |
| "step": 82050 |
| }, |
| { |
| "epoch": 23.928937857059577, |
| "grad_norm": 0.40274760127067566, |
| "learning_rate": 0.0003130393700787401, |
| "loss": 3.2277, |
| "step": 82100 |
| }, |
| { |
| "epoch": 23.9435117173837, |
| "grad_norm": 0.40383246541023254, |
| "learning_rate": 0.0003128643919510061, |
| "loss": 3.2323, |
| "step": 82150 |
| }, |
| { |
| "epoch": 23.958085577707823, |
| "grad_norm": 0.42298346757888794, |
| "learning_rate": 0.0003126894138232721, |
| "loss": 3.2334, |
| "step": 82200 |
| }, |
| { |
| "epoch": 23.972659438031947, |
| "grad_norm": 0.37038496136665344, |
| "learning_rate": 0.000312514435695538, |
| "loss": 3.2225, |
| "step": 82250 |
| }, |
| { |
| "epoch": 23.987233298356067, |
| "grad_norm": 0.44437092542648315, |
| "learning_rate": 0.000312339457567804, |
| "loss": 3.2296, |
| "step": 82300 |
| }, |
| { |
| "epoch": 24.001748863238895, |
| "grad_norm": 0.42472222447395325, |
| "learning_rate": 0.00031216447944007, |
| "loss": 3.2188, |
| "step": 82350 |
| }, |
| { |
| "epoch": 24.016322723563018, |
| "grad_norm": 0.43172892928123474, |
| "learning_rate": 0.000311989501312336, |
| "loss": 3.1294, |
| "step": 82400 |
| }, |
| { |
| "epoch": 24.03089658388714, |
| "grad_norm": 0.4458048343658447, |
| "learning_rate": 0.00031181452318460186, |
| "loss": 3.1414, |
| "step": 82450 |
| }, |
| { |
| "epoch": 24.04547044421126, |
| "grad_norm": 0.40629690885543823, |
| "learning_rate": 0.00031163954505686786, |
| "loss": 3.1308, |
| "step": 82500 |
| }, |
| { |
| "epoch": 24.060044304535385, |
| "grad_norm": 0.39976805448532104, |
| "learning_rate": 0.00031146456692913386, |
| "loss": 3.1328, |
| "step": 82550 |
| }, |
| { |
| "epoch": 24.07461816485951, |
| "grad_norm": 0.4165184795856476, |
| "learning_rate": 0.0003112895888013998, |
| "loss": 3.1381, |
| "step": 82600 |
| }, |
| { |
| "epoch": 24.089192025183632, |
| "grad_norm": 0.4216763377189636, |
| "learning_rate": 0.00031111461067366575, |
| "loss": 3.1385, |
| "step": 82650 |
| }, |
| { |
| "epoch": 24.10376588550775, |
| "grad_norm": 0.411807119846344, |
| "learning_rate": 0.00031093963254593174, |
| "loss": 3.1709, |
| "step": 82700 |
| }, |
| { |
| "epoch": 24.118339745831875, |
| "grad_norm": 0.42998647689819336, |
| "learning_rate": 0.0003107646544181977, |
| "loss": 3.1431, |
| "step": 82750 |
| }, |
| { |
| "epoch": 24.132913606156, |
| "grad_norm": 0.41308829188346863, |
| "learning_rate": 0.0003105896762904637, |
| "loss": 3.1546, |
| "step": 82800 |
| }, |
| { |
| "epoch": 24.147487466480122, |
| "grad_norm": 0.40349751710891724, |
| "learning_rate": 0.00031041469816272963, |
| "loss": 3.1499, |
| "step": 82850 |
| }, |
| { |
| "epoch": 24.162061326804245, |
| "grad_norm": 0.4231622517108917, |
| "learning_rate": 0.00031023972003499557, |
| "loss": 3.1638, |
| "step": 82900 |
| }, |
| { |
| "epoch": 24.176635187128365, |
| "grad_norm": 0.4381542503833771, |
| "learning_rate": 0.00031006474190726157, |
| "loss": 3.1434, |
| "step": 82950 |
| }, |
| { |
| "epoch": 24.19120904745249, |
| "grad_norm": 0.4462905824184418, |
| "learning_rate": 0.0003098897637795275, |
| "loss": 3.1572, |
| "step": 83000 |
| }, |
| { |
| "epoch": 24.19120904745249, |
| "eval_accuracy": 0.37264101477367406, |
| "eval_loss": 3.553147077560425, |
| "eval_runtime": 53.4273, |
| "eval_samples_per_second": 311.208, |
| "eval_steps_per_second": 19.466, |
| "step": 83000 |
| }, |
| { |
| "epoch": 24.205782907776612, |
| "grad_norm": 0.44927412271499634, |
| "learning_rate": 0.0003097147856517935, |
| "loss": 3.167, |
| "step": 83050 |
| }, |
| { |
| "epoch": 24.220356768100736, |
| "grad_norm": 0.4245893657207489, |
| "learning_rate": 0.00030953980752405945, |
| "loss": 3.1591, |
| "step": 83100 |
| }, |
| { |
| "epoch": 24.234930628424856, |
| "grad_norm": 0.41934916377067566, |
| "learning_rate": 0.00030936482939632545, |
| "loss": 3.1713, |
| "step": 83150 |
| }, |
| { |
| "epoch": 24.24950448874898, |
| "grad_norm": 0.42093417048454285, |
| "learning_rate": 0.0003091898512685914, |
| "loss": 3.1749, |
| "step": 83200 |
| }, |
| { |
| "epoch": 24.264078349073102, |
| "grad_norm": 0.39428791403770447, |
| "learning_rate": 0.00030901487314085734, |
| "loss": 3.1648, |
| "step": 83250 |
| }, |
| { |
| "epoch": 24.278652209397226, |
| "grad_norm": 0.3996634781360626, |
| "learning_rate": 0.00030883989501312334, |
| "loss": 3.1892, |
| "step": 83300 |
| }, |
| { |
| "epoch": 24.29322606972135, |
| "grad_norm": 0.41399145126342773, |
| "learning_rate": 0.00030866491688538933, |
| "loss": 3.1812, |
| "step": 83350 |
| }, |
| { |
| "epoch": 24.30779993004547, |
| "grad_norm": 0.40609946846961975, |
| "learning_rate": 0.0003084899387576552, |
| "loss": 3.1632, |
| "step": 83400 |
| }, |
| { |
| "epoch": 24.322373790369593, |
| "grad_norm": 0.3995642066001892, |
| "learning_rate": 0.0003083149606299212, |
| "loss": 3.1841, |
| "step": 83450 |
| }, |
| { |
| "epoch": 24.336947650693716, |
| "grad_norm": 0.4245721995830536, |
| "learning_rate": 0.0003081399825021872, |
| "loss": 3.1762, |
| "step": 83500 |
| }, |
| { |
| "epoch": 24.35152151101784, |
| "grad_norm": 0.3996485769748688, |
| "learning_rate": 0.0003079650043744532, |
| "loss": 3.1753, |
| "step": 83550 |
| }, |
| { |
| "epoch": 24.36609537134196, |
| "grad_norm": 0.40523093938827515, |
| "learning_rate": 0.0003077900262467191, |
| "loss": 3.1897, |
| "step": 83600 |
| }, |
| { |
| "epoch": 24.380669231666083, |
| "grad_norm": 0.46414250135421753, |
| "learning_rate": 0.0003076150481189851, |
| "loss": 3.1919, |
| "step": 83650 |
| }, |
| { |
| "epoch": 24.395243091990206, |
| "grad_norm": 0.4243074357509613, |
| "learning_rate": 0.0003074400699912511, |
| "loss": 3.1876, |
| "step": 83700 |
| }, |
| { |
| "epoch": 24.40981695231433, |
| "grad_norm": 0.4179231822490692, |
| "learning_rate": 0.00030726509186351704, |
| "loss": 3.1885, |
| "step": 83750 |
| }, |
| { |
| "epoch": 24.424390812638453, |
| "grad_norm": 0.41542547941207886, |
| "learning_rate": 0.000307090113735783, |
| "loss": 3.1899, |
| "step": 83800 |
| }, |
| { |
| "epoch": 24.438964672962573, |
| "grad_norm": 0.41838836669921875, |
| "learning_rate": 0.000306915135608049, |
| "loss": 3.1906, |
| "step": 83850 |
| }, |
| { |
| "epoch": 24.453538533286697, |
| "grad_norm": 0.4413517415523529, |
| "learning_rate": 0.00030674015748031493, |
| "loss": 3.1847, |
| "step": 83900 |
| }, |
| { |
| "epoch": 24.46811239361082, |
| "grad_norm": 0.4166496992111206, |
| "learning_rate": 0.00030656517935258087, |
| "loss": 3.19, |
| "step": 83950 |
| }, |
| { |
| "epoch": 24.482686253934943, |
| "grad_norm": 0.4076080620288849, |
| "learning_rate": 0.00030639020122484687, |
| "loss": 3.1916, |
| "step": 84000 |
| }, |
| { |
| "epoch": 24.482686253934943, |
| "eval_accuracy": 0.3732628077525097, |
| "eval_loss": 3.5452823638916016, |
| "eval_runtime": 53.5419, |
| "eval_samples_per_second": 310.542, |
| "eval_steps_per_second": 19.424, |
| "step": 84000 |
| }, |
| { |
| "epoch": 24.497260114259063, |
| "grad_norm": 0.3986864686012268, |
| "learning_rate": 0.0003062152230971128, |
| "loss": 3.2017, |
| "step": 84050 |
| }, |
| { |
| "epoch": 24.511833974583187, |
| "grad_norm": 0.39412063360214233, |
| "learning_rate": 0.0003060402449693788, |
| "loss": 3.1878, |
| "step": 84100 |
| }, |
| { |
| "epoch": 24.52640783490731, |
| "grad_norm": 0.39426127076148987, |
| "learning_rate": 0.00030586526684164475, |
| "loss": 3.1996, |
| "step": 84150 |
| }, |
| { |
| "epoch": 24.540981695231434, |
| "grad_norm": 0.38532009720802307, |
| "learning_rate": 0.0003056902887139107, |
| "loss": 3.2022, |
| "step": 84200 |
| }, |
| { |
| "epoch": 24.555555555555557, |
| "grad_norm": 0.41849541664123535, |
| "learning_rate": 0.0003055153105861767, |
| "loss": 3.1908, |
| "step": 84250 |
| }, |
| { |
| "epoch": 24.570129415879677, |
| "grad_norm": 0.4017096161842346, |
| "learning_rate": 0.0003053403324584427, |
| "loss": 3.2057, |
| "step": 84300 |
| }, |
| { |
| "epoch": 24.5847032762038, |
| "grad_norm": 0.42205843329429626, |
| "learning_rate": 0.00030516535433070864, |
| "loss": 3.1873, |
| "step": 84350 |
| }, |
| { |
| "epoch": 24.599277136527924, |
| "grad_norm": 0.43874719738960266, |
| "learning_rate": 0.0003049903762029746, |
| "loss": 3.2067, |
| "step": 84400 |
| }, |
| { |
| "epoch": 24.613850996852047, |
| "grad_norm": 0.3923831582069397, |
| "learning_rate": 0.0003048153980752406, |
| "loss": 3.1939, |
| "step": 84450 |
| }, |
| { |
| "epoch": 24.628424857176167, |
| "grad_norm": 0.3787786662578583, |
| "learning_rate": 0.0003046404199475066, |
| "loss": 3.2034, |
| "step": 84500 |
| }, |
| { |
| "epoch": 24.64299871750029, |
| "grad_norm": 0.42038998007774353, |
| "learning_rate": 0.00030446544181977247, |
| "loss": 3.2088, |
| "step": 84550 |
| }, |
| { |
| "epoch": 24.657572577824414, |
| "grad_norm": 0.43957141041755676, |
| "learning_rate": 0.00030429046369203846, |
| "loss": 3.211, |
| "step": 84600 |
| }, |
| { |
| "epoch": 24.672146438148538, |
| "grad_norm": 0.4145025610923767, |
| "learning_rate": 0.00030411548556430446, |
| "loss": 3.2039, |
| "step": 84650 |
| }, |
| { |
| "epoch": 24.68672029847266, |
| "grad_norm": 0.4103289246559143, |
| "learning_rate": 0.00030394050743657046, |
| "loss": 3.1947, |
| "step": 84700 |
| }, |
| { |
| "epoch": 24.70129415879678, |
| "grad_norm": 0.40595585107803345, |
| "learning_rate": 0.00030376552930883635, |
| "loss": 3.2063, |
| "step": 84750 |
| }, |
| { |
| "epoch": 24.715868019120904, |
| "grad_norm": 0.41054317355155945, |
| "learning_rate": 0.00030359055118110235, |
| "loss": 3.1998, |
| "step": 84800 |
| }, |
| { |
| "epoch": 24.730441879445028, |
| "grad_norm": 0.41700872778892517, |
| "learning_rate": 0.00030341557305336834, |
| "loss": 3.2124, |
| "step": 84850 |
| }, |
| { |
| "epoch": 24.74501573976915, |
| "grad_norm": 0.3898296058177948, |
| "learning_rate": 0.00030324059492563423, |
| "loss": 3.2013, |
| "step": 84900 |
| }, |
| { |
| "epoch": 24.75958960009327, |
| "grad_norm": 0.4132135510444641, |
| "learning_rate": 0.00030306561679790023, |
| "loss": 3.2098, |
| "step": 84950 |
| }, |
| { |
| "epoch": 24.774163460417395, |
| "grad_norm": 0.4450601637363434, |
| "learning_rate": 0.00030289063867016623, |
| "loss": 3.2191, |
| "step": 85000 |
| }, |
| { |
| "epoch": 24.774163460417395, |
| "eval_accuracy": 0.3736838097372333, |
| "eval_loss": 3.5401432514190674, |
| "eval_runtime": 53.6239, |
| "eval_samples_per_second": 310.067, |
| "eval_steps_per_second": 19.394, |
| "step": 85000 |
| }, |
| { |
| "epoch": 24.788737320741518, |
| "grad_norm": 0.43197497725486755, |
| "learning_rate": 0.00030271566054243217, |
| "loss": 3.2044, |
| "step": 85050 |
| }, |
| { |
| "epoch": 24.80331118106564, |
| "grad_norm": 0.42331868410110474, |
| "learning_rate": 0.0003025406824146981, |
| "loss": 3.2154, |
| "step": 85100 |
| }, |
| { |
| "epoch": 24.817885041389765, |
| "grad_norm": 0.453522652387619, |
| "learning_rate": 0.0003023657042869641, |
| "loss": 3.2237, |
| "step": 85150 |
| }, |
| { |
| "epoch": 24.832458901713885, |
| "grad_norm": 0.4122588634490967, |
| "learning_rate": 0.00030219072615923006, |
| "loss": 3.2135, |
| "step": 85200 |
| }, |
| { |
| "epoch": 24.84703276203801, |
| "grad_norm": 0.3891066312789917, |
| "learning_rate": 0.00030201574803149605, |
| "loss": 3.2201, |
| "step": 85250 |
| }, |
| { |
| "epoch": 24.86160662236213, |
| "grad_norm": 0.46326911449432373, |
| "learning_rate": 0.000301840769903762, |
| "loss": 3.2194, |
| "step": 85300 |
| }, |
| { |
| "epoch": 24.876180482686255, |
| "grad_norm": 0.4535176753997803, |
| "learning_rate": 0.00030166579177602794, |
| "loss": 3.2222, |
| "step": 85350 |
| }, |
| { |
| "epoch": 24.890754343010375, |
| "grad_norm": 0.4166485369205475, |
| "learning_rate": 0.00030149081364829394, |
| "loss": 3.222, |
| "step": 85400 |
| }, |
| { |
| "epoch": 24.9053282033345, |
| "grad_norm": 0.42368149757385254, |
| "learning_rate": 0.00030131583552055994, |
| "loss": 3.2258, |
| "step": 85450 |
| }, |
| { |
| "epoch": 24.919902063658622, |
| "grad_norm": 0.4298972189426422, |
| "learning_rate": 0.0003011408573928258, |
| "loss": 3.2162, |
| "step": 85500 |
| }, |
| { |
| "epoch": 24.934475923982745, |
| "grad_norm": 0.42533475160598755, |
| "learning_rate": 0.0003009658792650918, |
| "loss": 3.2261, |
| "step": 85550 |
| }, |
| { |
| "epoch": 24.94904978430687, |
| "grad_norm": 0.3993895947933197, |
| "learning_rate": 0.0003007909011373578, |
| "loss": 3.2181, |
| "step": 85600 |
| }, |
| { |
| "epoch": 24.96362364463099, |
| "grad_norm": 0.4366506040096283, |
| "learning_rate": 0.00030061592300962376, |
| "loss": 3.215, |
| "step": 85650 |
| }, |
| { |
| "epoch": 24.978197504955112, |
| "grad_norm": 0.4059385359287262, |
| "learning_rate": 0.0003004409448818897, |
| "loss": 3.2233, |
| "step": 85700 |
| }, |
| { |
| "epoch": 24.992771365279236, |
| "grad_norm": 0.42242342233657837, |
| "learning_rate": 0.0003002659667541557, |
| "loss": 3.2164, |
| "step": 85750 |
| }, |
| { |
| "epoch": 25.00728693016206, |
| "grad_norm": 0.4567851722240448, |
| "learning_rate": 0.0003000909886264217, |
| "loss": 3.1653, |
| "step": 85800 |
| }, |
| { |
| "epoch": 25.021860790486183, |
| "grad_norm": 0.41543054580688477, |
| "learning_rate": 0.00029991601049868765, |
| "loss": 3.1233, |
| "step": 85850 |
| }, |
| { |
| "epoch": 25.036434650810307, |
| "grad_norm": 0.45275938510894775, |
| "learning_rate": 0.0002997410323709536, |
| "loss": 3.1201, |
| "step": 85900 |
| }, |
| { |
| "epoch": 25.05100851113443, |
| "grad_norm": 0.413595050573349, |
| "learning_rate": 0.0002995660542432196, |
| "loss": 3.1308, |
| "step": 85950 |
| }, |
| { |
| "epoch": 25.065582371458554, |
| "grad_norm": 0.44148409366607666, |
| "learning_rate": 0.00029939107611548553, |
| "loss": 3.1353, |
| "step": 86000 |
| }, |
| { |
| "epoch": 25.065582371458554, |
| "eval_accuracy": 0.37286663982391594, |
| "eval_loss": 3.5544240474700928, |
| "eval_runtime": 53.6379, |
| "eval_samples_per_second": 309.986, |
| "eval_steps_per_second": 19.389, |
| "step": 86000 |
| }, |
| { |
| "epoch": 25.080156231782674, |
| "grad_norm": 0.4196157455444336, |
| "learning_rate": 0.00029921609798775153, |
| "loss": 3.1413, |
| "step": 86050 |
| }, |
| { |
| "epoch": 25.094730092106797, |
| "grad_norm": 0.44834548234939575, |
| "learning_rate": 0.00029904111986001747, |
| "loss": 3.1296, |
| "step": 86100 |
| }, |
| { |
| "epoch": 25.10930395243092, |
| "grad_norm": 0.420354425907135, |
| "learning_rate": 0.00029886614173228347, |
| "loss": 3.1417, |
| "step": 86150 |
| }, |
| { |
| "epoch": 25.123877812755044, |
| "grad_norm": 0.39772215485572815, |
| "learning_rate": 0.0002986911636045494, |
| "loss": 3.1562, |
| "step": 86200 |
| }, |
| { |
| "epoch": 25.138451673079164, |
| "grad_norm": 0.44562727212905884, |
| "learning_rate": 0.0002985161854768154, |
| "loss": 3.1419, |
| "step": 86250 |
| }, |
| { |
| "epoch": 25.153025533403287, |
| "grad_norm": 0.401484876871109, |
| "learning_rate": 0.00029834120734908135, |
| "loss": 3.1417, |
| "step": 86300 |
| }, |
| { |
| "epoch": 25.16759939372741, |
| "grad_norm": 0.41795679926872253, |
| "learning_rate": 0.0002981662292213473, |
| "loss": 3.1548, |
| "step": 86350 |
| }, |
| { |
| "epoch": 25.182173254051534, |
| "grad_norm": 0.40250077843666077, |
| "learning_rate": 0.0002979912510936133, |
| "loss": 3.1477, |
| "step": 86400 |
| }, |
| { |
| "epoch": 25.196747114375654, |
| "grad_norm": 0.40981054306030273, |
| "learning_rate": 0.00029781627296587924, |
| "loss": 3.1572, |
| "step": 86450 |
| }, |
| { |
| "epoch": 25.211320974699778, |
| "grad_norm": 0.4290998876094818, |
| "learning_rate": 0.0002976412948381452, |
| "loss": 3.1559, |
| "step": 86500 |
| }, |
| { |
| "epoch": 25.2258948350239, |
| "grad_norm": 0.41594749689102173, |
| "learning_rate": 0.0002974663167104112, |
| "loss": 3.1643, |
| "step": 86550 |
| }, |
| { |
| "epoch": 25.240468695348024, |
| "grad_norm": 0.4130031168460846, |
| "learning_rate": 0.0002972913385826771, |
| "loss": 3.1558, |
| "step": 86600 |
| }, |
| { |
| "epoch": 25.255042555672148, |
| "grad_norm": 0.3956284821033478, |
| "learning_rate": 0.00029711636045494307, |
| "loss": 3.1635, |
| "step": 86650 |
| }, |
| { |
| "epoch": 25.269616415996268, |
| "grad_norm": 0.43169260025024414, |
| "learning_rate": 0.00029694138232720906, |
| "loss": 3.1629, |
| "step": 86700 |
| }, |
| { |
| "epoch": 25.28419027632039, |
| "grad_norm": 0.406389057636261, |
| "learning_rate": 0.000296766404199475, |
| "loss": 3.1669, |
| "step": 86750 |
| }, |
| { |
| "epoch": 25.298764136644515, |
| "grad_norm": 0.42388877272605896, |
| "learning_rate": 0.000296591426071741, |
| "loss": 3.1621, |
| "step": 86800 |
| }, |
| { |
| "epoch": 25.313337996968638, |
| "grad_norm": 0.41170287132263184, |
| "learning_rate": 0.00029641644794400695, |
| "loss": 3.1758, |
| "step": 86850 |
| }, |
| { |
| "epoch": 25.327911857292758, |
| "grad_norm": 0.4237252175807953, |
| "learning_rate": 0.00029624146981627295, |
| "loss": 3.1599, |
| "step": 86900 |
| }, |
| { |
| "epoch": 25.34248571761688, |
| "grad_norm": 0.40915387868881226, |
| "learning_rate": 0.0002960664916885389, |
| "loss": 3.1648, |
| "step": 86950 |
| }, |
| { |
| "epoch": 25.357059577941005, |
| "grad_norm": 0.42069658637046814, |
| "learning_rate": 0.0002958915135608049, |
| "loss": 3.1683, |
| "step": 87000 |
| }, |
| { |
| "epoch": 25.357059577941005, |
| "eval_accuracy": 0.37334684337372653, |
| "eval_loss": 3.551579713821411, |
| "eval_runtime": 53.3841, |
| "eval_samples_per_second": 311.46, |
| "eval_steps_per_second": 19.481, |
| "step": 87000 |
| }, |
| { |
| "epoch": 25.37163343826513, |
| "grad_norm": 0.43754515051841736, |
| "learning_rate": 0.00029571653543307083, |
| "loss": 3.1719, |
| "step": 87050 |
| }, |
| { |
| "epoch": 25.38620729858925, |
| "grad_norm": 0.40791741013526917, |
| "learning_rate": 0.00029554155730533683, |
| "loss": 3.1721, |
| "step": 87100 |
| }, |
| { |
| "epoch": 25.40078115891337, |
| "grad_norm": 0.4316633939743042, |
| "learning_rate": 0.0002953665791776028, |
| "loss": 3.1827, |
| "step": 87150 |
| }, |
| { |
| "epoch": 25.415355019237495, |
| "grad_norm": 0.40897324681282043, |
| "learning_rate": 0.00029519160104986877, |
| "loss": 3.1769, |
| "step": 87200 |
| }, |
| { |
| "epoch": 25.42992887956162, |
| "grad_norm": 0.4291824400424957, |
| "learning_rate": 0.0002950166229221347, |
| "loss": 3.1802, |
| "step": 87250 |
| }, |
| { |
| "epoch": 25.444502739885742, |
| "grad_norm": 0.43355199694633484, |
| "learning_rate": 0.0002948416447944007, |
| "loss": 3.1747, |
| "step": 87300 |
| }, |
| { |
| "epoch": 25.459076600209862, |
| "grad_norm": 0.40116190910339355, |
| "learning_rate": 0.00029466666666666666, |
| "loss": 3.1834, |
| "step": 87350 |
| }, |
| { |
| "epoch": 25.473650460533985, |
| "grad_norm": 0.43621543049812317, |
| "learning_rate": 0.0002944916885389326, |
| "loss": 3.1795, |
| "step": 87400 |
| }, |
| { |
| "epoch": 25.48822432085811, |
| "grad_norm": 0.40916112065315247, |
| "learning_rate": 0.0002943167104111986, |
| "loss": 3.1814, |
| "step": 87450 |
| }, |
| { |
| "epoch": 25.502798181182232, |
| "grad_norm": 0.42583590745925903, |
| "learning_rate": 0.00029414173228346454, |
| "loss": 3.1872, |
| "step": 87500 |
| }, |
| { |
| "epoch": 25.517372041506356, |
| "grad_norm": 0.43467631936073303, |
| "learning_rate": 0.00029396675415573054, |
| "loss": 3.1909, |
| "step": 87550 |
| }, |
| { |
| "epoch": 25.531945901830476, |
| "grad_norm": 0.4192965030670166, |
| "learning_rate": 0.0002937917760279965, |
| "loss": 3.1862, |
| "step": 87600 |
| }, |
| { |
| "epoch": 25.5465197621546, |
| "grad_norm": 0.4481930732727051, |
| "learning_rate": 0.0002936167979002624, |
| "loss": 3.1951, |
| "step": 87650 |
| }, |
| { |
| "epoch": 25.561093622478722, |
| "grad_norm": 0.40336722135543823, |
| "learning_rate": 0.0002934418197725284, |
| "loss": 3.1867, |
| "step": 87700 |
| }, |
| { |
| "epoch": 25.575667482802846, |
| "grad_norm": 0.39473244547843933, |
| "learning_rate": 0.00029326684164479437, |
| "loss": 3.1867, |
| "step": 87750 |
| }, |
| { |
| "epoch": 25.59024134312697, |
| "grad_norm": 0.4513109624385834, |
| "learning_rate": 0.0002930918635170603, |
| "loss": 3.1936, |
| "step": 87800 |
| }, |
| { |
| "epoch": 25.60481520345109, |
| "grad_norm": 0.41294845938682556, |
| "learning_rate": 0.0002929168853893263, |
| "loss": 3.1944, |
| "step": 87850 |
| }, |
| { |
| "epoch": 25.619389063775213, |
| "grad_norm": 0.43818268179893494, |
| "learning_rate": 0.00029274190726159225, |
| "loss": 3.201, |
| "step": 87900 |
| }, |
| { |
| "epoch": 25.633962924099336, |
| "grad_norm": 0.4113070070743561, |
| "learning_rate": 0.00029256692913385825, |
| "loss": 3.2102, |
| "step": 87950 |
| }, |
| { |
| "epoch": 25.64853678442346, |
| "grad_norm": 0.42678162455558777, |
| "learning_rate": 0.0002923919510061242, |
| "loss": 3.2059, |
| "step": 88000 |
| }, |
| { |
| "epoch": 25.64853678442346, |
| "eval_accuracy": 0.37366544901327, |
| "eval_loss": 3.542567729949951, |
| "eval_runtime": 53.514, |
| "eval_samples_per_second": 310.704, |
| "eval_steps_per_second": 19.434, |
| "step": 88000 |
| }, |
| { |
| "epoch": 25.66311064474758, |
| "grad_norm": 0.42839187383651733, |
| "learning_rate": 0.0002922169728783902, |
| "loss": 3.1819, |
| "step": 88050 |
| }, |
| { |
| "epoch": 25.677684505071703, |
| "grad_norm": 0.42220258712768555, |
| "learning_rate": 0.00029204199475065613, |
| "loss": 3.1957, |
| "step": 88100 |
| }, |
| { |
| "epoch": 25.692258365395826, |
| "grad_norm": 0.4478667676448822, |
| "learning_rate": 0.00029186701662292213, |
| "loss": 3.1959, |
| "step": 88150 |
| }, |
| { |
| "epoch": 25.70683222571995, |
| "grad_norm": 0.4184144139289856, |
| "learning_rate": 0.0002916920384951881, |
| "loss": 3.1992, |
| "step": 88200 |
| }, |
| { |
| "epoch": 25.72140608604407, |
| "grad_norm": 0.41506487131118774, |
| "learning_rate": 0.00029151706036745407, |
| "loss": 3.2053, |
| "step": 88250 |
| }, |
| { |
| "epoch": 25.735979946368193, |
| "grad_norm": 0.41918498277664185, |
| "learning_rate": 0.00029134208223972, |
| "loss": 3.1906, |
| "step": 88300 |
| }, |
| { |
| "epoch": 25.750553806692317, |
| "grad_norm": 0.40549084544181824, |
| "learning_rate": 0.00029116710411198596, |
| "loss": 3.2129, |
| "step": 88350 |
| }, |
| { |
| "epoch": 25.76512766701644, |
| "grad_norm": 0.41350501775741577, |
| "learning_rate": 0.00029099212598425196, |
| "loss": 3.2143, |
| "step": 88400 |
| }, |
| { |
| "epoch": 25.779701527340563, |
| "grad_norm": 0.44957444071769714, |
| "learning_rate": 0.0002908171478565179, |
| "loss": 3.1998, |
| "step": 88450 |
| }, |
| { |
| "epoch": 25.794275387664683, |
| "grad_norm": 0.4271428883075714, |
| "learning_rate": 0.0002906421697287839, |
| "loss": 3.2005, |
| "step": 88500 |
| }, |
| { |
| "epoch": 25.808849247988807, |
| "grad_norm": 0.4413732886314392, |
| "learning_rate": 0.00029046719160104984, |
| "loss": 3.2163, |
| "step": 88550 |
| }, |
| { |
| "epoch": 25.82342310831293, |
| "grad_norm": 0.4298846125602722, |
| "learning_rate": 0.00029029221347331584, |
| "loss": 3.1998, |
| "step": 88600 |
| }, |
| { |
| "epoch": 25.837996968637054, |
| "grad_norm": 0.4098442792892456, |
| "learning_rate": 0.0002901172353455818, |
| "loss": 3.2162, |
| "step": 88650 |
| }, |
| { |
| "epoch": 25.852570828961174, |
| "grad_norm": 0.4474171996116638, |
| "learning_rate": 0.0002899422572178477, |
| "loss": 3.2006, |
| "step": 88700 |
| }, |
| { |
| "epoch": 25.867144689285297, |
| "grad_norm": 0.4159180223941803, |
| "learning_rate": 0.0002897672790901137, |
| "loss": 3.2028, |
| "step": 88750 |
| }, |
| { |
| "epoch": 25.88171854960942, |
| "grad_norm": 0.40655815601348877, |
| "learning_rate": 0.00028959230096237967, |
| "loss": 3.2056, |
| "step": 88800 |
| }, |
| { |
| "epoch": 25.896292409933544, |
| "grad_norm": 0.41476768255233765, |
| "learning_rate": 0.00028941732283464566, |
| "loss": 3.2041, |
| "step": 88850 |
| }, |
| { |
| "epoch": 25.910866270257667, |
| "grad_norm": 0.41896602511405945, |
| "learning_rate": 0.0002892423447069116, |
| "loss": 3.2096, |
| "step": 88900 |
| }, |
| { |
| "epoch": 25.925440130581787, |
| "grad_norm": 0.4241361916065216, |
| "learning_rate": 0.00028906736657917755, |
| "loss": 3.2099, |
| "step": 88950 |
| }, |
| { |
| "epoch": 25.94001399090591, |
| "grad_norm": 0.44193774461746216, |
| "learning_rate": 0.00028889238845144355, |
| "loss": 3.2038, |
| "step": 89000 |
| }, |
| { |
| "epoch": 25.94001399090591, |
| "eval_accuracy": 0.3741535382586289, |
| "eval_loss": 3.5345892906188965, |
| "eval_runtime": 53.6848, |
| "eval_samples_per_second": 309.715, |
| "eval_steps_per_second": 19.372, |
| "step": 89000 |
| }, |
| { |
| "epoch": 25.954587851230034, |
| "grad_norm": 0.4183708429336548, |
| "learning_rate": 0.0002887174103237095, |
| "loss": 3.2093, |
| "step": 89050 |
| }, |
| { |
| "epoch": 25.969161711554158, |
| "grad_norm": 0.44829314947128296, |
| "learning_rate": 0.0002885424321959755, |
| "loss": 3.2096, |
| "step": 89100 |
| }, |
| { |
| "epoch": 25.983735571878277, |
| "grad_norm": 0.4053567349910736, |
| "learning_rate": 0.00028836745406824143, |
| "loss": 3.1987, |
| "step": 89150 |
| }, |
| { |
| "epoch": 25.9983094322024, |
| "grad_norm": 0.47260692715644836, |
| "learning_rate": 0.00028819247594050743, |
| "loss": 3.2249, |
| "step": 89200 |
| }, |
| { |
| "epoch": 26.01282499708523, |
| "grad_norm": 0.4070647060871124, |
| "learning_rate": 0.0002880174978127734, |
| "loss": 3.1164, |
| "step": 89250 |
| }, |
| { |
| "epoch": 26.027398857409352, |
| "grad_norm": 0.4365572929382324, |
| "learning_rate": 0.0002878425196850393, |
| "loss": 3.1104, |
| "step": 89300 |
| }, |
| { |
| "epoch": 26.041972717733472, |
| "grad_norm": 0.42117395997047424, |
| "learning_rate": 0.0002876675415573053, |
| "loss": 3.1094, |
| "step": 89350 |
| }, |
| { |
| "epoch": 26.056546578057596, |
| "grad_norm": 0.4205396771430969, |
| "learning_rate": 0.00028749256342957126, |
| "loss": 3.1059, |
| "step": 89400 |
| }, |
| { |
| "epoch": 26.07112043838172, |
| "grad_norm": 0.40787479281425476, |
| "learning_rate": 0.00028731758530183726, |
| "loss": 3.1266, |
| "step": 89450 |
| }, |
| { |
| "epoch": 26.085694298705842, |
| "grad_norm": 0.4280753433704376, |
| "learning_rate": 0.0002871426071741032, |
| "loss": 3.1168, |
| "step": 89500 |
| }, |
| { |
| "epoch": 26.100268159029962, |
| "grad_norm": 0.4404989182949066, |
| "learning_rate": 0.0002869676290463692, |
| "loss": 3.1399, |
| "step": 89550 |
| }, |
| { |
| "epoch": 26.114842019354086, |
| "grad_norm": 0.42946237325668335, |
| "learning_rate": 0.00028679265091863514, |
| "loss": 3.1219, |
| "step": 89600 |
| }, |
| { |
| "epoch": 26.12941587967821, |
| "grad_norm": 0.43695181608200073, |
| "learning_rate": 0.00028661767279090114, |
| "loss": 3.1428, |
| "step": 89650 |
| }, |
| { |
| "epoch": 26.143989740002333, |
| "grad_norm": 0.45373693108558655, |
| "learning_rate": 0.0002864426946631671, |
| "loss": 3.1457, |
| "step": 89700 |
| }, |
| { |
| "epoch": 26.158563600326456, |
| "grad_norm": 0.44140884280204773, |
| "learning_rate": 0.0002862677165354331, |
| "loss": 3.1503, |
| "step": 89750 |
| }, |
| { |
| "epoch": 26.173137460650576, |
| "grad_norm": 0.8031109571456909, |
| "learning_rate": 0.000286092738407699, |
| "loss": 3.144, |
| "step": 89800 |
| }, |
| { |
| "epoch": 26.1877113209747, |
| "grad_norm": 0.41350263357162476, |
| "learning_rate": 0.00028591776027996497, |
| "loss": 3.1521, |
| "step": 89850 |
| }, |
| { |
| "epoch": 26.202285181298823, |
| "grad_norm": 0.4209875166416168, |
| "learning_rate": 0.00028574278215223097, |
| "loss": 3.1555, |
| "step": 89900 |
| }, |
| { |
| "epoch": 26.216859041622946, |
| "grad_norm": 0.4206160604953766, |
| "learning_rate": 0.0002855678040244969, |
| "loss": 3.1333, |
| "step": 89950 |
| }, |
| { |
| "epoch": 26.231432901947066, |
| "grad_norm": 0.41024914383888245, |
| "learning_rate": 0.00028539282589676285, |
| "loss": 3.1591, |
| "step": 90000 |
| }, |
| { |
| "epoch": 26.231432901947066, |
| "eval_accuracy": 0.37301187785834394, |
| "eval_loss": 3.5533785820007324, |
| "eval_runtime": 53.4418, |
| "eval_samples_per_second": 311.124, |
| "eval_steps_per_second": 19.46, |
| "step": 90000 |
| }, |
| { |
| "epoch": 26.24600676227119, |
| "grad_norm": 0.4198101758956909, |
| "learning_rate": 0.00028521784776902885, |
| "loss": 3.1546, |
| "step": 90050 |
| }, |
| { |
| "epoch": 26.260580622595313, |
| "grad_norm": 0.42669153213500977, |
| "learning_rate": 0.0002850428696412948, |
| "loss": 3.1546, |
| "step": 90100 |
| }, |
| { |
| "epoch": 26.275154482919437, |
| "grad_norm": 0.43775099515914917, |
| "learning_rate": 0.00028486789151356074, |
| "loss": 3.1431, |
| "step": 90150 |
| }, |
| { |
| "epoch": 26.28972834324356, |
| "grad_norm": 0.4364624619483948, |
| "learning_rate": 0.00028469291338582673, |
| "loss": 3.1721, |
| "step": 90200 |
| }, |
| { |
| "epoch": 26.30430220356768, |
| "grad_norm": 0.41487571597099304, |
| "learning_rate": 0.0002845179352580927, |
| "loss": 3.1624, |
| "step": 90250 |
| }, |
| { |
| "epoch": 26.318876063891803, |
| "grad_norm": 0.4365042746067047, |
| "learning_rate": 0.0002843429571303587, |
| "loss": 3.165, |
| "step": 90300 |
| }, |
| { |
| "epoch": 26.333449924215927, |
| "grad_norm": 0.4010760486125946, |
| "learning_rate": 0.0002841679790026246, |
| "loss": 3.1573, |
| "step": 90350 |
| }, |
| { |
| "epoch": 26.34802378454005, |
| "grad_norm": 0.45000582933425903, |
| "learning_rate": 0.0002839930008748906, |
| "loss": 3.1701, |
| "step": 90400 |
| }, |
| { |
| "epoch": 26.36259764486417, |
| "grad_norm": 0.41207069158554077, |
| "learning_rate": 0.00028381802274715656, |
| "loss": 3.1702, |
| "step": 90450 |
| }, |
| { |
| "epoch": 26.377171505188294, |
| "grad_norm": 0.4389991760253906, |
| "learning_rate": 0.00028364304461942256, |
| "loss": 3.1671, |
| "step": 90500 |
| }, |
| { |
| "epoch": 26.391745365512417, |
| "grad_norm": 0.4085967242717743, |
| "learning_rate": 0.0002834680664916885, |
| "loss": 3.1785, |
| "step": 90550 |
| }, |
| { |
| "epoch": 26.40631922583654, |
| "grad_norm": 0.43534621596336365, |
| "learning_rate": 0.0002832930883639545, |
| "loss": 3.1674, |
| "step": 90600 |
| }, |
| { |
| "epoch": 26.420893086160664, |
| "grad_norm": 0.42330414056777954, |
| "learning_rate": 0.00028311811023622044, |
| "loss": 3.1838, |
| "step": 90650 |
| }, |
| { |
| "epoch": 26.435466946484784, |
| "grad_norm": 0.4248245358467102, |
| "learning_rate": 0.00028294313210848644, |
| "loss": 3.1794, |
| "step": 90700 |
| }, |
| { |
| "epoch": 26.450040806808907, |
| "grad_norm": 0.45084935426712036, |
| "learning_rate": 0.0002827681539807524, |
| "loss": 3.1836, |
| "step": 90750 |
| }, |
| { |
| "epoch": 26.46461466713303, |
| "grad_norm": 0.4310995936393738, |
| "learning_rate": 0.0002825931758530184, |
| "loss": 3.1701, |
| "step": 90800 |
| }, |
| { |
| "epoch": 26.479188527457154, |
| "grad_norm": 0.3991900086402893, |
| "learning_rate": 0.0002824181977252843, |
| "loss": 3.1744, |
| "step": 90850 |
| }, |
| { |
| "epoch": 26.493762387781274, |
| "grad_norm": 0.4258917570114136, |
| "learning_rate": 0.0002822432195975503, |
| "loss": 3.1852, |
| "step": 90900 |
| }, |
| { |
| "epoch": 26.508336248105397, |
| "grad_norm": 0.4781590700149536, |
| "learning_rate": 0.00028206824146981627, |
| "loss": 3.1764, |
| "step": 90950 |
| }, |
| { |
| "epoch": 26.52291010842952, |
| "grad_norm": 0.4917435348033905, |
| "learning_rate": 0.0002818932633420822, |
| "loss": 3.1674, |
| "step": 91000 |
| }, |
| { |
| "epoch": 26.52291010842952, |
| "eval_accuracy": 0.37358623896694093, |
| "eval_loss": 3.545701742172241, |
| "eval_runtime": 53.437, |
| "eval_samples_per_second": 311.151, |
| "eval_steps_per_second": 19.462, |
| "step": 91000 |
| }, |
| { |
| "epoch": 26.537483968753644, |
| "grad_norm": 0.4191431701183319, |
| "learning_rate": 0.0002817182852143482, |
| "loss": 3.1829, |
| "step": 91050 |
| }, |
| { |
| "epoch": 26.552057829077768, |
| "grad_norm": 0.4149972200393677, |
| "learning_rate": 0.00028154330708661415, |
| "loss": 3.173, |
| "step": 91100 |
| }, |
| { |
| "epoch": 26.566631689401888, |
| "grad_norm": 0.4207434356212616, |
| "learning_rate": 0.0002813683289588801, |
| "loss": 3.1768, |
| "step": 91150 |
| }, |
| { |
| "epoch": 26.58120554972601, |
| "grad_norm": 0.46031084656715393, |
| "learning_rate": 0.0002811933508311461, |
| "loss": 3.1889, |
| "step": 91200 |
| }, |
| { |
| "epoch": 26.595779410050135, |
| "grad_norm": 0.4277399778366089, |
| "learning_rate": 0.00028101837270341204, |
| "loss": 3.188, |
| "step": 91250 |
| }, |
| { |
| "epoch": 26.610353270374258, |
| "grad_norm": 0.44055789709091187, |
| "learning_rate": 0.000280843394575678, |
| "loss": 3.1835, |
| "step": 91300 |
| }, |
| { |
| "epoch": 26.624927130698378, |
| "grad_norm": 0.4088766276836395, |
| "learning_rate": 0.000280668416447944, |
| "loss": 3.1839, |
| "step": 91350 |
| }, |
| { |
| "epoch": 26.6395009910225, |
| "grad_norm": 0.4153987169265747, |
| "learning_rate": 0.0002804934383202099, |
| "loss": 3.1741, |
| "step": 91400 |
| }, |
| { |
| "epoch": 26.654074851346625, |
| "grad_norm": 0.4546470642089844, |
| "learning_rate": 0.0002803184601924759, |
| "loss": 3.1887, |
| "step": 91450 |
| }, |
| { |
| "epoch": 26.66864871167075, |
| "grad_norm": 0.4202099144458771, |
| "learning_rate": 0.00028014348206474186, |
| "loss": 3.184, |
| "step": 91500 |
| }, |
| { |
| "epoch": 26.68322257199487, |
| "grad_norm": 0.3985988199710846, |
| "learning_rate": 0.00027996850393700786, |
| "loss": 3.193, |
| "step": 91550 |
| }, |
| { |
| "epoch": 26.69779643231899, |
| "grad_norm": 0.4250155985355377, |
| "learning_rate": 0.0002797935258092738, |
| "loss": 3.1982, |
| "step": 91600 |
| }, |
| { |
| "epoch": 26.712370292643115, |
| "grad_norm": 0.4258333146572113, |
| "learning_rate": 0.0002796185476815398, |
| "loss": 3.1863, |
| "step": 91650 |
| }, |
| { |
| "epoch": 26.72694415296724, |
| "grad_norm": 0.4628980755805969, |
| "learning_rate": 0.00027944356955380574, |
| "loss": 3.184, |
| "step": 91700 |
| }, |
| { |
| "epoch": 26.741518013291362, |
| "grad_norm": 0.40205731987953186, |
| "learning_rate": 0.00027926859142607174, |
| "loss": 3.1891, |
| "step": 91750 |
| }, |
| { |
| "epoch": 26.756091873615482, |
| "grad_norm": 0.39519068598747253, |
| "learning_rate": 0.0002790936132983377, |
| "loss": 3.1967, |
| "step": 91800 |
| }, |
| { |
| "epoch": 26.770665733939605, |
| "grad_norm": 0.42985662817955017, |
| "learning_rate": 0.0002789186351706037, |
| "loss": 3.1987, |
| "step": 91850 |
| }, |
| { |
| "epoch": 26.78523959426373, |
| "grad_norm": 0.414476603269577, |
| "learning_rate": 0.0002787436570428696, |
| "loss": 3.2048, |
| "step": 91900 |
| }, |
| { |
| "epoch": 26.799813454587852, |
| "grad_norm": 0.4109380543231964, |
| "learning_rate": 0.00027856867891513557, |
| "loss": 3.1974, |
| "step": 91950 |
| }, |
| { |
| "epoch": 26.814387314911976, |
| "grad_norm": 0.4429774284362793, |
| "learning_rate": 0.00027839370078740157, |
| "loss": 3.2005, |
| "step": 92000 |
| }, |
| { |
| "epoch": 26.814387314911976, |
| "eval_accuracy": 0.3741004569348631, |
| "eval_loss": 3.5386786460876465, |
| "eval_runtime": 53.7358, |
| "eval_samples_per_second": 309.421, |
| "eval_steps_per_second": 19.354, |
| "step": 92000 |
| }, |
| { |
| "epoch": 26.814387314911976, |
| "step": 92000, |
| "total_flos": 1.922898754142208e+18, |
| "train_loss": 3.4168865719670833, |
| "train_runtime": 40958.8697, |
| "train_samples_per_second": 335.03, |
| "train_steps_per_second": 4.188 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 171550, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 50, |
| "save_steps": 10000, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 20, |
| "early_stopping_threshold": 0.0 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 18 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.922898754142208e+18, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|