| { |
| "best_metric": 3.3018171787261963, |
| "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_495/checkpoint-90000", |
| "epoch": 10.0, |
| "eval_steps": 1000, |
| "global_step": 92910, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.005381552039608223, |
| "grad_norm": 2.1159660816192627, |
| "learning_rate": 0.00028199999999999997, |
| "loss": 8.9316, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.010763104079216447, |
| "grad_norm": 1.9895623922348022, |
| "learning_rate": 0.0005819999999999999, |
| "loss": 6.9096, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.01614465611882467, |
| "grad_norm": 1.8703136444091797, |
| "learning_rate": 0.0005996961534317422, |
| "loss": 6.4463, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.021526208158432893, |
| "grad_norm": 1.2748037576675415, |
| "learning_rate": 0.0005993729124016807, |
| "loss": 6.2017, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.026907760198041114, |
| "grad_norm": 1.3479403257369995, |
| "learning_rate": 0.0005990496713716194, |
| "loss": 6.1031, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.03228931223764934, |
| "grad_norm": 1.0388602018356323, |
| "learning_rate": 0.000598726430341558, |
| "loss": 5.9726, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.03767086427725756, |
| "grad_norm": 1.1332666873931885, |
| "learning_rate": 0.0005984031893114966, |
| "loss": 5.886, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.04305241631686579, |
| "grad_norm": 1.398230791091919, |
| "learning_rate": 0.0005980799482814351, |
| "loss": 5.8019, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.048433968356474004, |
| "grad_norm": 1.5357556343078613, |
| "learning_rate": 0.0005977567072513737, |
| "loss": 5.7437, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.05381552039608223, |
| "grad_norm": 1.175200343132019, |
| "learning_rate": 0.0005974334662213123, |
| "loss": 5.6616, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.05919707243569045, |
| "grad_norm": 1.386881709098816, |
| "learning_rate": 0.0005971102251912509, |
| "loss": 5.5895, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.06457862447529868, |
| "grad_norm": 1.3578399419784546, |
| "learning_rate": 0.0005967869841611895, |
| "loss": 5.4912, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.0699601765149069, |
| "grad_norm": 1.2151751518249512, |
| "learning_rate": 0.000596463743131128, |
| "loss": 5.4275, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.07534172855451512, |
| "grad_norm": 1.3390365839004517, |
| "learning_rate": 0.0005961405021010667, |
| "loss": 5.3731, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.08072328059412334, |
| "grad_norm": 1.103965401649475, |
| "learning_rate": 0.0005958172610710052, |
| "loss": 5.3186, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.08610483263373157, |
| "grad_norm": 1.1866347789764404, |
| "learning_rate": 0.0005954940200409439, |
| "loss": 5.2687, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.09148638467333979, |
| "grad_norm": 1.0097291469573975, |
| "learning_rate": 0.0005951707790108824, |
| "loss": 5.2072, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.09686793671294801, |
| "grad_norm": 1.3843412399291992, |
| "learning_rate": 0.0005948475379808209, |
| "loss": 5.1549, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.10224948875255624, |
| "grad_norm": 1.1859126091003418, |
| "learning_rate": 0.0005945242969507596, |
| "loss": 5.0977, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.10763104079216446, |
| "grad_norm": 1.1825942993164062, |
| "learning_rate": 0.0005942010559206981, |
| "loss": 5.0644, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.10763104079216446, |
| "eval_accuracy": 0.22798678911267642, |
| "eval_loss": 5.015804290771484, |
| "eval_runtime": 186.987, |
| "eval_samples_per_second": 96.322, |
| "eval_steps_per_second": 6.022, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.11301259283177269, |
| "grad_norm": 1.036710262298584, |
| "learning_rate": 0.0005938778148906367, |
| "loss": 5.045, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.1183941448713809, |
| "grad_norm": 1.1089870929718018, |
| "learning_rate": 0.0005935545738605753, |
| "loss": 5.0081, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.12377569691098914, |
| "grad_norm": 0.9135867357254028, |
| "learning_rate": 0.000593231332830514, |
| "loss": 4.9925, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.12915724895059735, |
| "grad_norm": 0.8327316641807556, |
| "learning_rate": 0.0005929080918004525, |
| "loss": 4.9437, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.13453880099020557, |
| "grad_norm": 1.0137102603912354, |
| "learning_rate": 0.000592584850770391, |
| "loss": 4.9288, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.1399203530298138, |
| "grad_norm": 1.620171070098877, |
| "learning_rate": 0.0005922616097403296, |
| "loss": 4.8813, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.14530190506942203, |
| "grad_norm": 1.1357035636901855, |
| "learning_rate": 0.0005919383687102682, |
| "loss": 4.8713, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.15068345710903025, |
| "grad_norm": 0.7975968718528748, |
| "learning_rate": 0.0005916151276802069, |
| "loss": 4.835, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.15606500914863847, |
| "grad_norm": 0.9254979491233826, |
| "learning_rate": 0.0005912918866501454, |
| "loss": 4.806, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.16144656118824668, |
| "grad_norm": 0.8283946514129639, |
| "learning_rate": 0.000590968645620084, |
| "loss": 4.8075, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.1668281132278549, |
| "grad_norm": 1.1234453916549683, |
| "learning_rate": 0.0005906454045900226, |
| "loss": 4.7791, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.17220966526746315, |
| "grad_norm": 0.9318310618400574, |
| "learning_rate": 0.0005903221635599612, |
| "loss": 4.7354, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.17759121730707136, |
| "grad_norm": 0.8706691861152649, |
| "learning_rate": 0.0005899989225298998, |
| "loss": 4.7278, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.18297276934667958, |
| "grad_norm": 1.0983690023422241, |
| "learning_rate": 0.0005896756814998383, |
| "loss": 4.7116, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.1883543213862878, |
| "grad_norm": 1.1487178802490234, |
| "learning_rate": 0.0005893524404697769, |
| "loss": 4.7026, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.19373587342589602, |
| "grad_norm": 0.8954223394393921, |
| "learning_rate": 0.0005890291994397155, |
| "loss": 4.6491, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.19911742546550426, |
| "grad_norm": 1.1864057779312134, |
| "learning_rate": 0.0005887059584096541, |
| "loss": 4.6526, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.20449897750511248, |
| "grad_norm": 0.945884108543396, |
| "learning_rate": 0.0005883827173795926, |
| "loss": 4.6132, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.2098805295447207, |
| "grad_norm": 1.2601373195648193, |
| "learning_rate": 0.0005880594763495313, |
| "loss": 4.6044, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.2152620815843289, |
| "grad_norm": 0.9447725415229797, |
| "learning_rate": 0.0005877362353194698, |
| "loss": 4.5877, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.2152620815843289, |
| "eval_accuracy": 0.27040933128385014, |
| "eval_loss": 4.508606910705566, |
| "eval_runtime": 184.8398, |
| "eval_samples_per_second": 97.441, |
| "eval_steps_per_second": 6.092, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.22064363362393713, |
| "grad_norm": 0.7851864695549011, |
| "learning_rate": 0.0005874129942894084, |
| "loss": 4.5643, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.22602518566354537, |
| "grad_norm": 0.862130880355835, |
| "learning_rate": 0.000587089753259347, |
| "loss": 4.5481, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.2314067377031536, |
| "grad_norm": 0.8470994830131531, |
| "learning_rate": 0.0005867665122292855, |
| "loss": 4.5219, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.2367882897427618, |
| "grad_norm": 0.8713746070861816, |
| "learning_rate": 0.0005864432711992242, |
| "loss": 4.511, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.24216984178237003, |
| "grad_norm": 0.8879591226577759, |
| "learning_rate": 0.0005861200301691628, |
| "loss": 4.5126, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.24755139382197827, |
| "grad_norm": 0.9361649751663208, |
| "learning_rate": 0.0005857967891391014, |
| "loss": 4.4803, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.2529329458615865, |
| "grad_norm": 0.8480615019798279, |
| "learning_rate": 0.0005854735481090399, |
| "loss": 4.4446, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.2583144979011947, |
| "grad_norm": 1.0228937864303589, |
| "learning_rate": 0.0005851503070789784, |
| "loss": 4.4508, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.2636960499408029, |
| "grad_norm": 0.9693055152893066, |
| "learning_rate": 0.0005848270660489171, |
| "loss": 4.4283, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.26907760198041114, |
| "grad_norm": 0.7247486114501953, |
| "learning_rate": 0.0005845038250188556, |
| "loss": 4.4015, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.27445915402001936, |
| "grad_norm": 0.7183290123939514, |
| "learning_rate": 0.0005841805839887943, |
| "loss": 4.3918, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.2798407060596276, |
| "grad_norm": 0.900417149066925, |
| "learning_rate": 0.0005838573429587328, |
| "loss": 4.3701, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.2852222580992358, |
| "grad_norm": 0.8292120099067688, |
| "learning_rate": 0.0005835341019286715, |
| "loss": 4.3891, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.29060381013884407, |
| "grad_norm": 0.798037052154541, |
| "learning_rate": 0.00058321086089861, |
| "loss": 4.36, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.2959853621784523, |
| "grad_norm": 0.8176091313362122, |
| "learning_rate": 0.0005828876198685486, |
| "loss": 4.3491, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.3013669142180605, |
| "grad_norm": 0.8014965057373047, |
| "learning_rate": 0.0005825643788384872, |
| "loss": 4.3642, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.3067484662576687, |
| "grad_norm": 1.2503150701522827, |
| "learning_rate": 0.0005822411378084257, |
| "loss": 4.3251, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.31213001829727693, |
| "grad_norm": 0.7830126285552979, |
| "learning_rate": 0.0005819178967783644, |
| "loss": 4.3251, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.31751157033688515, |
| "grad_norm": 0.9035398364067078, |
| "learning_rate": 0.0005815946557483029, |
| "loss": 4.3313, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.32289312237649337, |
| "grad_norm": 0.6857808828353882, |
| "learning_rate": 0.0005812714147182415, |
| "loss": 4.2989, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.32289312237649337, |
| "eval_accuracy": 0.2990024252411089, |
| "eval_loss": 4.2293620109558105, |
| "eval_runtime": 185.031, |
| "eval_samples_per_second": 97.34, |
| "eval_steps_per_second": 6.085, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.3282746744161016, |
| "grad_norm": 0.7547754049301147, |
| "learning_rate": 0.0005809481736881801, |
| "loss": 4.296, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.3336562264557098, |
| "grad_norm": 0.7557756304740906, |
| "learning_rate": 0.0005806249326581187, |
| "loss": 4.2866, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.3390377784953181, |
| "grad_norm": 0.7948616147041321, |
| "learning_rate": 0.0005803016916280573, |
| "loss": 4.2736, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.3444193305349263, |
| "grad_norm": 0.864937424659729, |
| "learning_rate": 0.0005799784505979959, |
| "loss": 4.2902, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.3498008825745345, |
| "grad_norm": 0.8035250902175903, |
| "learning_rate": 0.0005796552095679344, |
| "loss": 4.2524, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.35518243461414273, |
| "grad_norm": 0.7827973365783691, |
| "learning_rate": 0.000579331968537873, |
| "loss": 4.2573, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.36056398665375095, |
| "grad_norm": 0.7145817875862122, |
| "learning_rate": 0.0005790087275078116, |
| "loss": 4.233, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.36594553869335916, |
| "grad_norm": 0.7260717749595642, |
| "learning_rate": 0.0005786854864777502, |
| "loss": 4.2351, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.3713270907329674, |
| "grad_norm": 0.846215009689331, |
| "learning_rate": 0.0005783622454476888, |
| "loss": 4.236, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.3767086427725756, |
| "grad_norm": 0.6381611227989197, |
| "learning_rate": 0.0005780390044176273, |
| "loss": 4.2186, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.3820901948121838, |
| "grad_norm": 0.7176727652549744, |
| "learning_rate": 0.000577715763387566, |
| "loss": 4.2159, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.38747174685179203, |
| "grad_norm": 0.7199215888977051, |
| "learning_rate": 0.0005773925223575045, |
| "loss": 4.2187, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.3928532988914003, |
| "grad_norm": 0.7120049595832825, |
| "learning_rate": 0.0005770692813274432, |
| "loss": 4.1858, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.3982348509310085, |
| "grad_norm": 0.682905912399292, |
| "learning_rate": 0.0005767460402973817, |
| "loss": 4.2067, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.40361640297061674, |
| "grad_norm": 0.8106050491333008, |
| "learning_rate": 0.0005764227992673203, |
| "loss": 4.1901, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.40899795501022496, |
| "grad_norm": 0.6139677166938782, |
| "learning_rate": 0.0005760995582372589, |
| "loss": 4.1922, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.4143795070498332, |
| "grad_norm": 0.6996195912361145, |
| "learning_rate": 0.0005757763172071974, |
| "loss": 4.1884, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.4197610590894414, |
| "grad_norm": 0.6972913146018982, |
| "learning_rate": 0.000575453076177136, |
| "loss": 4.1469, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.4251426111290496, |
| "grad_norm": 0.7805142998695374, |
| "learning_rate": 0.0005751298351470746, |
| "loss": 4.1681, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.4305241631686578, |
| "grad_norm": 0.6750819683074951, |
| "learning_rate": 0.0005748065941170133, |
| "loss": 4.1677, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.4305241631686578, |
| "eval_accuracy": 0.3131229545414788, |
| "eval_loss": 4.085065841674805, |
| "eval_runtime": 184.7939, |
| "eval_samples_per_second": 97.465, |
| "eval_steps_per_second": 6.093, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.43590571520826604, |
| "grad_norm": 0.743445098400116, |
| "learning_rate": 0.0005744833530869518, |
| "loss": 4.1605, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.44128726724787426, |
| "grad_norm": 0.6816270351409912, |
| "learning_rate": 0.0005741601120568903, |
| "loss": 4.1494, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.44666881928748253, |
| "grad_norm": 0.6031078696250916, |
| "learning_rate": 0.0005738368710268289, |
| "loss": 4.1373, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.45205037132709075, |
| "grad_norm": 0.869440495967865, |
| "learning_rate": 0.0005735136299967675, |
| "loss": 4.1274, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.45743192336669897, |
| "grad_norm": 0.6965010762214661, |
| "learning_rate": 0.0005731903889667062, |
| "loss": 4.1363, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.4628134754063072, |
| "grad_norm": 0.6586082577705383, |
| "learning_rate": 0.0005728671479366447, |
| "loss": 4.1073, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.4681950274459154, |
| "grad_norm": 0.6008425951004028, |
| "learning_rate": 0.0005725439069065833, |
| "loss": 4.1131, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.4735765794855236, |
| "grad_norm": 0.6233063340187073, |
| "learning_rate": 0.0005722206658765219, |
| "loss": 4.1266, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.47895813152513184, |
| "grad_norm": 0.7806559205055237, |
| "learning_rate": 0.0005718974248464605, |
| "loss": 4.1052, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.48433968356474005, |
| "grad_norm": 0.7604005336761475, |
| "learning_rate": 0.000571574183816399, |
| "loss": 4.0944, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.48972123560434827, |
| "grad_norm": 0.8495274782180786, |
| "learning_rate": 0.0005712509427863376, |
| "loss": 4.1176, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.49510278764395654, |
| "grad_norm": 0.6198326349258423, |
| "learning_rate": 0.0005709277017562762, |
| "loss": 4.1016, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.5004843396835648, |
| "grad_norm": 0.6641437411308289, |
| "learning_rate": 0.0005706044607262148, |
| "loss": 4.0835, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.505865891723173, |
| "grad_norm": 0.7691634297370911, |
| "learning_rate": 0.0005702812196961534, |
| "loss": 4.0815, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.5112474437627812, |
| "grad_norm": 0.7156651020050049, |
| "learning_rate": 0.0005699579786660919, |
| "loss": 4.0818, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.5166289958023894, |
| "grad_norm": 0.6472979784011841, |
| "learning_rate": 0.0005696347376360306, |
| "loss": 4.067, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.5220105478419976, |
| "grad_norm": 0.724949300289154, |
| "learning_rate": 0.0005693114966059691, |
| "loss": 4.0615, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.5273920998816058, |
| "grad_norm": 0.7559602856636047, |
| "learning_rate": 0.0005689882555759077, |
| "loss": 4.0676, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.5327736519212141, |
| "grad_norm": 0.6394193768501282, |
| "learning_rate": 0.0005686650145458463, |
| "loss": 4.0461, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.5381552039608223, |
| "grad_norm": 0.7001257538795471, |
| "learning_rate": 0.0005683417735157848, |
| "loss": 4.0729, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.5381552039608223, |
| "eval_accuracy": 0.320991922635667, |
| "eval_loss": 3.990946054458618, |
| "eval_runtime": 184.8283, |
| "eval_samples_per_second": 97.447, |
| "eval_steps_per_second": 6.092, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.5435367560004305, |
| "grad_norm": 0.7765357494354248, |
| "learning_rate": 0.0005680185324857235, |
| "loss": 4.0425, |
| "step": 5050 |
| }, |
| { |
| "epoch": 0.5489183080400387, |
| "grad_norm": 0.6510383486747742, |
| "learning_rate": 0.0005676952914556621, |
| "loss": 4.0239, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.5542998600796469, |
| "grad_norm": 0.6882658004760742, |
| "learning_rate": 0.0005673720504256007, |
| "loss": 4.0242, |
| "step": 5150 |
| }, |
| { |
| "epoch": 0.5596814121192552, |
| "grad_norm": 0.7188264727592468, |
| "learning_rate": 0.0005670488093955392, |
| "loss": 4.0442, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.5650629641588634, |
| "grad_norm": 0.6599631905555725, |
| "learning_rate": 0.0005667255683654777, |
| "loss": 4.0497, |
| "step": 5250 |
| }, |
| { |
| "epoch": 0.5704445161984716, |
| "grad_norm": 0.5674473643302917, |
| "learning_rate": 0.0005664023273354164, |
| "loss": 4.0271, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.5758260682380799, |
| "grad_norm": 0.6670402884483337, |
| "learning_rate": 0.0005660790863053549, |
| "loss": 4.0284, |
| "step": 5350 |
| }, |
| { |
| "epoch": 0.5812076202776881, |
| "grad_norm": 0.7389664649963379, |
| "learning_rate": 0.0005657558452752936, |
| "loss": 4.0449, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.5865891723172963, |
| "grad_norm": 0.6841723322868347, |
| "learning_rate": 0.0005654326042452321, |
| "loss": 4.051, |
| "step": 5450 |
| }, |
| { |
| "epoch": 0.5919707243569046, |
| "grad_norm": 0.6119431853294373, |
| "learning_rate": 0.0005651093632151708, |
| "loss": 4.0158, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.5973522763965128, |
| "grad_norm": 0.8850950598716736, |
| "learning_rate": 0.0005647861221851093, |
| "loss": 4.0183, |
| "step": 5550 |
| }, |
| { |
| "epoch": 0.602733828436121, |
| "grad_norm": 0.6376111507415771, |
| "learning_rate": 0.0005644628811550479, |
| "loss": 4.0076, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.6081153804757292, |
| "grad_norm": 0.5115376114845276, |
| "learning_rate": 0.0005641396401249865, |
| "loss": 4.0006, |
| "step": 5650 |
| }, |
| { |
| "epoch": 0.6134969325153374, |
| "grad_norm": 0.6021561026573181, |
| "learning_rate": 0.000563816399094925, |
| "loss": 4.0056, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.6188784845549457, |
| "grad_norm": 0.5795146226882935, |
| "learning_rate": 0.0005634931580648637, |
| "loss": 4.0159, |
| "step": 5750 |
| }, |
| { |
| "epoch": 0.6242600365945539, |
| "grad_norm": 0.6918138861656189, |
| "learning_rate": 0.0005631699170348022, |
| "loss": 3.991, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.6296415886341621, |
| "grad_norm": 0.6135892271995544, |
| "learning_rate": 0.0005628466760047408, |
| "loss": 4.0127, |
| "step": 5850 |
| }, |
| { |
| "epoch": 0.6350231406737703, |
| "grad_norm": 0.6184191703796387, |
| "learning_rate": 0.0005625234349746794, |
| "loss": 3.9862, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.6404046927133785, |
| "grad_norm": 0.6259024739265442, |
| "learning_rate": 0.000562200193944618, |
| "loss": 3.9922, |
| "step": 5950 |
| }, |
| { |
| "epoch": 0.6457862447529867, |
| "grad_norm": 0.5602654814720154, |
| "learning_rate": 0.0005618769529145566, |
| "loss": 3.995, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.6457862447529867, |
| "eval_accuracy": 0.32913100180896193, |
| "eval_loss": 3.9143569469451904, |
| "eval_runtime": 184.5106, |
| "eval_samples_per_second": 97.615, |
| "eval_steps_per_second": 6.103, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.651167796792595, |
| "grad_norm": 0.7381386160850525, |
| "learning_rate": 0.0005615537118844952, |
| "loss": 3.9749, |
| "step": 6050 |
| }, |
| { |
| "epoch": 0.6565493488322032, |
| "grad_norm": 0.6130694150924683, |
| "learning_rate": 0.0005612304708544337, |
| "loss": 3.9791, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.6619309008718114, |
| "grad_norm": 0.6282258629798889, |
| "learning_rate": 0.0005609072298243723, |
| "loss": 3.9766, |
| "step": 6150 |
| }, |
| { |
| "epoch": 0.6673124529114196, |
| "grad_norm": 0.7320833802223206, |
| "learning_rate": 0.0005605839887943109, |
| "loss": 3.9744, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.6726940049510278, |
| "grad_norm": 0.7067676782608032, |
| "learning_rate": 0.0005602607477642495, |
| "loss": 3.9755, |
| "step": 6250 |
| }, |
| { |
| "epoch": 0.6780755569906362, |
| "grad_norm": 0.6241488456726074, |
| "learning_rate": 0.0005599375067341881, |
| "loss": 3.9639, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.6834571090302444, |
| "grad_norm": 0.6031643152236938, |
| "learning_rate": 0.0005596142657041266, |
| "loss": 3.9535, |
| "step": 6350 |
| }, |
| { |
| "epoch": 0.6888386610698526, |
| "grad_norm": 0.5485600829124451, |
| "learning_rate": 0.0005592910246740653, |
| "loss": 3.9616, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.6942202131094608, |
| "grad_norm": 0.6337101459503174, |
| "learning_rate": 0.0005589677836440038, |
| "loss": 3.9509, |
| "step": 6450 |
| }, |
| { |
| "epoch": 0.699601765149069, |
| "grad_norm": 0.546737790107727, |
| "learning_rate": 0.0005586445426139425, |
| "loss": 3.9416, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.7049833171886772, |
| "grad_norm": 0.6616541147232056, |
| "learning_rate": 0.000558321301583881, |
| "loss": 3.9618, |
| "step": 6550 |
| }, |
| { |
| "epoch": 0.7103648692282855, |
| "grad_norm": 0.5739938020706177, |
| "learning_rate": 0.0005579980605538196, |
| "loss": 3.949, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.7157464212678937, |
| "grad_norm": 0.5481446981430054, |
| "learning_rate": 0.0005576748195237582, |
| "loss": 3.961, |
| "step": 6650 |
| }, |
| { |
| "epoch": 0.7211279733075019, |
| "grad_norm": 0.6824391484260559, |
| "learning_rate": 0.0005573515784936967, |
| "loss": 3.9376, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.7265095253471101, |
| "grad_norm": 0.6663408279418945, |
| "learning_rate": 0.0005570283374636353, |
| "loss": 3.9525, |
| "step": 6750 |
| }, |
| { |
| "epoch": 0.7318910773867183, |
| "grad_norm": 0.5435226559638977, |
| "learning_rate": 0.0005567050964335739, |
| "loss": 3.9249, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.7372726294263265, |
| "grad_norm": 0.6284464001655579, |
| "learning_rate": 0.0005563818554035126, |
| "loss": 3.9307, |
| "step": 6850 |
| }, |
| { |
| "epoch": 0.7426541814659348, |
| "grad_norm": 0.5826367139816284, |
| "learning_rate": 0.0005560586143734511, |
| "loss": 3.9251, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.748035733505543, |
| "grad_norm": 0.5755831003189087, |
| "learning_rate": 0.0005557353733433896, |
| "loss": 3.9271, |
| "step": 6950 |
| }, |
| { |
| "epoch": 0.7534172855451512, |
| "grad_norm": 0.6272289156913757, |
| "learning_rate": 0.0005554121323133283, |
| "loss": 3.9517, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.7534172855451512, |
| "eval_accuracy": 0.33305206722441094, |
| "eval_loss": 3.8637194633483887, |
| "eval_runtime": 184.9315, |
| "eval_samples_per_second": 97.393, |
| "eval_steps_per_second": 6.089, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.7587988375847594, |
| "grad_norm": 0.6227930188179016, |
| "learning_rate": 0.0005550888912832668, |
| "loss": 3.916, |
| "step": 7050 |
| }, |
| { |
| "epoch": 0.7641803896243676, |
| "grad_norm": 0.6726216077804565, |
| "learning_rate": 0.0005547656502532055, |
| "loss": 3.9217, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.7695619416639758, |
| "grad_norm": 0.5420123338699341, |
| "learning_rate": 0.000554442409223144, |
| "loss": 3.9238, |
| "step": 7150 |
| }, |
| { |
| "epoch": 0.7749434937035841, |
| "grad_norm": 0.5879136919975281, |
| "learning_rate": 0.0005541191681930826, |
| "loss": 3.9191, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.7803250457431924, |
| "grad_norm": 0.5138105750083923, |
| "learning_rate": 0.0005537959271630212, |
| "loss": 3.9145, |
| "step": 7250 |
| }, |
| { |
| "epoch": 0.7857065977828006, |
| "grad_norm": 0.5751107931137085, |
| "learning_rate": 0.0005534726861329598, |
| "loss": 3.9067, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.7910881498224088, |
| "grad_norm": 0.6342102289199829, |
| "learning_rate": 0.0005531494451028983, |
| "loss": 3.9038, |
| "step": 7350 |
| }, |
| { |
| "epoch": 0.796469701862017, |
| "grad_norm": 0.5363113284111023, |
| "learning_rate": 0.0005528262040728369, |
| "loss": 3.9121, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.8018512539016253, |
| "grad_norm": 0.6412177681922913, |
| "learning_rate": 0.0005525029630427755, |
| "loss": 3.894, |
| "step": 7450 |
| }, |
| { |
| "epoch": 0.8072328059412335, |
| "grad_norm": 0.6429057121276855, |
| "learning_rate": 0.0005521797220127141, |
| "loss": 3.904, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.8126143579808417, |
| "grad_norm": 0.6362873315811157, |
| "learning_rate": 0.0005518564809826527, |
| "loss": 3.9328, |
| "step": 7550 |
| }, |
| { |
| "epoch": 0.8179959100204499, |
| "grad_norm": 0.6349565386772156, |
| "learning_rate": 0.0005515332399525912, |
| "loss": 3.8807, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.8233774620600581, |
| "grad_norm": 0.6769669651985168, |
| "learning_rate": 0.0005512099989225299, |
| "loss": 3.8932, |
| "step": 7650 |
| }, |
| { |
| "epoch": 0.8287590140996663, |
| "grad_norm": 0.601761519908905, |
| "learning_rate": 0.0005508867578924685, |
| "loss": 3.897, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.8341405661392746, |
| "grad_norm": 0.6315547823905945, |
| "learning_rate": 0.000550563516862407, |
| "loss": 3.8814, |
| "step": 7750 |
| }, |
| { |
| "epoch": 0.8395221181788828, |
| "grad_norm": 0.5754820108413696, |
| "learning_rate": 0.0005502402758323456, |
| "loss": 3.9101, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.844903670218491, |
| "grad_norm": 0.627307653427124, |
| "learning_rate": 0.0005499170348022841, |
| "loss": 3.9035, |
| "step": 7850 |
| }, |
| { |
| "epoch": 0.8502852222580992, |
| "grad_norm": 0.6498885154724121, |
| "learning_rate": 0.0005495937937722228, |
| "loss": 3.9012, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.8556667742977074, |
| "grad_norm": 0.5503144264221191, |
| "learning_rate": 0.0005492705527421614, |
| "loss": 3.8942, |
| "step": 7950 |
| }, |
| { |
| "epoch": 0.8610483263373157, |
| "grad_norm": 0.6200529336929321, |
| "learning_rate": 0.0005489473117121, |
| "loss": 3.8717, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.8610483263373157, |
| "eval_accuracy": 0.3380203291726159, |
| "eval_loss": 3.8143210411071777, |
| "eval_runtime": 184.8318, |
| "eval_samples_per_second": 97.445, |
| "eval_steps_per_second": 6.092, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.8664298783769239, |
| "grad_norm": 0.7157406210899353, |
| "learning_rate": 0.0005486240706820385, |
| "loss": 3.8746, |
| "step": 8050 |
| }, |
| { |
| "epoch": 0.8718114304165321, |
| "grad_norm": 0.643976628780365, |
| "learning_rate": 0.0005483008296519772, |
| "loss": 3.8859, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.8771929824561403, |
| "grad_norm": 0.5374308824539185, |
| "learning_rate": 0.0005479775886219157, |
| "loss": 3.8586, |
| "step": 8150 |
| }, |
| { |
| "epoch": 0.8825745344957485, |
| "grad_norm": 0.6169512867927551, |
| "learning_rate": 0.0005476543475918542, |
| "loss": 3.8734, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.8879560865353568, |
| "grad_norm": 0.6148831248283386, |
| "learning_rate": 0.0005473311065617929, |
| "loss": 3.8731, |
| "step": 8250 |
| }, |
| { |
| "epoch": 0.8933376385749651, |
| "grad_norm": 0.5680239796638489, |
| "learning_rate": 0.0005470078655317314, |
| "loss": 3.8674, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.8987191906145733, |
| "grad_norm": 0.5679119825363159, |
| "learning_rate": 0.0005466846245016701, |
| "loss": 3.8556, |
| "step": 8350 |
| }, |
| { |
| "epoch": 0.9041007426541815, |
| "grad_norm": 0.5395457744598389, |
| "learning_rate": 0.0005463613834716086, |
| "loss": 3.8536, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.9094822946937897, |
| "grad_norm": 0.5603843331336975, |
| "learning_rate": 0.0005460381424415472, |
| "loss": 3.8579, |
| "step": 8450 |
| }, |
| { |
| "epoch": 0.9148638467333979, |
| "grad_norm": 0.5550517439842224, |
| "learning_rate": 0.0005457149014114858, |
| "loss": 3.858, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.9202453987730062, |
| "grad_norm": 0.6112827062606812, |
| "learning_rate": 0.0005453916603814243, |
| "loss": 3.8563, |
| "step": 8550 |
| }, |
| { |
| "epoch": 0.9256269508126144, |
| "grad_norm": 0.6317242383956909, |
| "learning_rate": 0.000545068419351363, |
| "loss": 3.854, |
| "step": 8600 |
| }, |
| { |
| "epoch": 0.9310085028522226, |
| "grad_norm": 0.574309766292572, |
| "learning_rate": 0.0005447451783213015, |
| "loss": 3.8455, |
| "step": 8650 |
| }, |
| { |
| "epoch": 0.9363900548918308, |
| "grad_norm": 0.5467745065689087, |
| "learning_rate": 0.0005444219372912401, |
| "loss": 3.858, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.941771606931439, |
| "grad_norm": 0.5530813932418823, |
| "learning_rate": 0.0005440986962611787, |
| "loss": 3.8518, |
| "step": 8750 |
| }, |
| { |
| "epoch": 0.9471531589710472, |
| "grad_norm": 0.7071423530578613, |
| "learning_rate": 0.0005437754552311173, |
| "loss": 3.8473, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.9525347110106555, |
| "grad_norm": 0.5127449631690979, |
| "learning_rate": 0.0005434522142010559, |
| "loss": 3.8471, |
| "step": 8850 |
| }, |
| { |
| "epoch": 0.9579162630502637, |
| "grad_norm": 0.5855455994606018, |
| "learning_rate": 0.0005431289731709945, |
| "loss": 3.8589, |
| "step": 8900 |
| }, |
| { |
| "epoch": 0.9632978150898719, |
| "grad_norm": 0.6145156025886536, |
| "learning_rate": 0.000542805732140933, |
| "loss": 3.8295, |
| "step": 8950 |
| }, |
| { |
| "epoch": 0.9686793671294801, |
| "grad_norm": 0.5550306439399719, |
| "learning_rate": 0.0005424824911108716, |
| "loss": 3.8378, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.9686793671294801, |
| "eval_accuracy": 0.3415591537287335, |
| "eval_loss": 3.7806262969970703, |
| "eval_runtime": 184.435, |
| "eval_samples_per_second": 97.655, |
| "eval_steps_per_second": 6.105, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.9740609191690883, |
| "grad_norm": 0.5801270008087158, |
| "learning_rate": 0.0005421592500808102, |
| "loss": 3.8502, |
| "step": 9050 |
| }, |
| { |
| "epoch": 0.9794424712086965, |
| "grad_norm": 0.5607963800430298, |
| "learning_rate": 0.0005418360090507488, |
| "loss": 3.825, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.9848240232483048, |
| "grad_norm": 0.5534636974334717, |
| "learning_rate": 0.0005415127680206874, |
| "loss": 3.8361, |
| "step": 9150 |
| }, |
| { |
| "epoch": 0.9902055752879131, |
| "grad_norm": 0.692423939704895, |
| "learning_rate": 0.0005411895269906259, |
| "loss": 3.8318, |
| "step": 9200 |
| }, |
| { |
| "epoch": 0.9955871273275213, |
| "grad_norm": 0.5722038745880127, |
| "learning_rate": 0.0005408662859605646, |
| "loss": 3.8376, |
| "step": 9250 |
| }, |
| { |
| "epoch": 1.0009686793671295, |
| "grad_norm": 0.6226180195808411, |
| "learning_rate": 0.0005405430449305031, |
| "loss": 3.8294, |
| "step": 9300 |
| }, |
| { |
| "epoch": 1.0063502314067376, |
| "grad_norm": 0.6096346974372864, |
| "learning_rate": 0.0005402198039004416, |
| "loss": 3.7878, |
| "step": 9350 |
| }, |
| { |
| "epoch": 1.011731783446346, |
| "grad_norm": 0.5371617078781128, |
| "learning_rate": 0.0005398965628703803, |
| "loss": 3.7627, |
| "step": 9400 |
| }, |
| { |
| "epoch": 1.017113335485954, |
| "grad_norm": 0.6130861639976501, |
| "learning_rate": 0.0005395733218403189, |
| "loss": 3.769, |
| "step": 9450 |
| }, |
| { |
| "epoch": 1.0224948875255624, |
| "grad_norm": 0.5652937889099121, |
| "learning_rate": 0.0005392500808102575, |
| "loss": 3.7714, |
| "step": 9500 |
| }, |
| { |
| "epoch": 1.0278764395651705, |
| "grad_norm": 0.5810158252716064, |
| "learning_rate": 0.000538926839780196, |
| "loss": 3.7566, |
| "step": 9550 |
| }, |
| { |
| "epoch": 1.0332579916047788, |
| "grad_norm": 0.6366006731987, |
| "learning_rate": 0.0005386035987501346, |
| "loss": 3.7647, |
| "step": 9600 |
| }, |
| { |
| "epoch": 1.0386395436443872, |
| "grad_norm": 0.5298298597335815, |
| "learning_rate": 0.0005382803577200732, |
| "loss": 3.7488, |
| "step": 9650 |
| }, |
| { |
| "epoch": 1.0440210956839953, |
| "grad_norm": 1.2535181045532227, |
| "learning_rate": 0.0005379571166900119, |
| "loss": 3.7539, |
| "step": 9700 |
| }, |
| { |
| "epoch": 1.0494026477236036, |
| "grad_norm": 0.601189374923706, |
| "learning_rate": 0.0005376338756599504, |
| "loss": 3.7654, |
| "step": 9750 |
| }, |
| { |
| "epoch": 1.0547841997632117, |
| "grad_norm": 0.6602137684822083, |
| "learning_rate": 0.0005373106346298889, |
| "loss": 3.7686, |
| "step": 9800 |
| }, |
| { |
| "epoch": 1.06016575180282, |
| "grad_norm": 0.6176168918609619, |
| "learning_rate": 0.0005369873935998276, |
| "loss": 3.7589, |
| "step": 9850 |
| }, |
| { |
| "epoch": 1.0655473038424281, |
| "grad_norm": 0.6260287165641785, |
| "learning_rate": 0.0005366641525697661, |
| "loss": 3.7655, |
| "step": 9900 |
| }, |
| { |
| "epoch": 1.0709288558820365, |
| "grad_norm": 0.5720372796058655, |
| "learning_rate": 0.0005363409115397048, |
| "loss": 3.7559, |
| "step": 9950 |
| }, |
| { |
| "epoch": 1.0763104079216446, |
| "grad_norm": 0.5716636180877686, |
| "learning_rate": 0.0005360176705096433, |
| "loss": 3.7573, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.0763104079216446, |
| "eval_accuracy": 0.3451164492757796, |
| "eval_loss": 3.7458271980285645, |
| "eval_runtime": 185.1344, |
| "eval_samples_per_second": 97.286, |
| "eval_steps_per_second": 6.082, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.081691959961253, |
| "grad_norm": 0.6392249464988708, |
| "learning_rate": 0.0005356944294795819, |
| "loss": 3.746, |
| "step": 10050 |
| }, |
| { |
| "epoch": 1.087073512000861, |
| "grad_norm": 0.5700231194496155, |
| "learning_rate": 0.0005353711884495205, |
| "loss": 3.7913, |
| "step": 10100 |
| }, |
| { |
| "epoch": 1.0924550640404693, |
| "grad_norm": 0.5578026175498962, |
| "learning_rate": 0.000535047947419459, |
| "loss": 3.7517, |
| "step": 10150 |
| }, |
| { |
| "epoch": 1.0978366160800774, |
| "grad_norm": 0.5823580026626587, |
| "learning_rate": 0.0005347247063893976, |
| "loss": 3.7495, |
| "step": 10200 |
| }, |
| { |
| "epoch": 1.1032181681196858, |
| "grad_norm": 0.6097853779792786, |
| "learning_rate": 0.0005344014653593362, |
| "loss": 3.748, |
| "step": 10250 |
| }, |
| { |
| "epoch": 1.1085997201592939, |
| "grad_norm": 0.6023664474487305, |
| "learning_rate": 0.0005340782243292748, |
| "loss": 3.7464, |
| "step": 10300 |
| }, |
| { |
| "epoch": 1.1139812721989022, |
| "grad_norm": 0.6338013410568237, |
| "learning_rate": 0.0005337549832992134, |
| "loss": 3.7546, |
| "step": 10350 |
| }, |
| { |
| "epoch": 1.1193628242385103, |
| "grad_norm": 0.5424166917800903, |
| "learning_rate": 0.000533431742269152, |
| "loss": 3.7676, |
| "step": 10400 |
| }, |
| { |
| "epoch": 1.1247443762781186, |
| "grad_norm": 0.5553328990936279, |
| "learning_rate": 0.0005331085012390905, |
| "loss": 3.7516, |
| "step": 10450 |
| }, |
| { |
| "epoch": 1.1301259283177267, |
| "grad_norm": 0.5472305417060852, |
| "learning_rate": 0.0005327852602090292, |
| "loss": 3.7578, |
| "step": 10500 |
| }, |
| { |
| "epoch": 1.135507480357335, |
| "grad_norm": 0.6304797530174255, |
| "learning_rate": 0.0005324620191789678, |
| "loss": 3.7302, |
| "step": 10550 |
| }, |
| { |
| "epoch": 1.1408890323969434, |
| "grad_norm": 0.5144755244255066, |
| "learning_rate": 0.0005321387781489063, |
| "loss": 3.7487, |
| "step": 10600 |
| }, |
| { |
| "epoch": 1.1462705844365515, |
| "grad_norm": 0.5785931348800659, |
| "learning_rate": 0.0005318155371188449, |
| "loss": 3.7557, |
| "step": 10650 |
| }, |
| { |
| "epoch": 1.1516521364761596, |
| "grad_norm": 0.5502986907958984, |
| "learning_rate": 0.0005314922960887834, |
| "loss": 3.7452, |
| "step": 10700 |
| }, |
| { |
| "epoch": 1.157033688515768, |
| "grad_norm": 0.6079390048980713, |
| "learning_rate": 0.0005311690550587221, |
| "loss": 3.7587, |
| "step": 10750 |
| }, |
| { |
| "epoch": 1.1624152405553763, |
| "grad_norm": 0.5040614008903503, |
| "learning_rate": 0.0005308458140286607, |
| "loss": 3.7611, |
| "step": 10800 |
| }, |
| { |
| "epoch": 1.1677967925949844, |
| "grad_norm": 0.5929045677185059, |
| "learning_rate": 0.0005305225729985993, |
| "loss": 3.7669, |
| "step": 10850 |
| }, |
| { |
| "epoch": 1.1731783446345927, |
| "grad_norm": 0.535692572593689, |
| "learning_rate": 0.0005301993319685378, |
| "loss": 3.764, |
| "step": 10900 |
| }, |
| { |
| "epoch": 1.1785598966742008, |
| "grad_norm": 0.5377613306045532, |
| "learning_rate": 0.0005298760909384765, |
| "loss": 3.7536, |
| "step": 10950 |
| }, |
| { |
| "epoch": 1.1839414487138091, |
| "grad_norm": 0.6084091663360596, |
| "learning_rate": 0.000529552849908415, |
| "loss": 3.7359, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.1839414487138091, |
| "eval_accuracy": 0.3476050350182825, |
| "eval_loss": 3.722259044647217, |
| "eval_runtime": 184.9277, |
| "eval_samples_per_second": 97.395, |
| "eval_steps_per_second": 6.089, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.1893230007534172, |
| "grad_norm": 0.5570613741874695, |
| "learning_rate": 0.0005292296088783535, |
| "loss": 3.7553, |
| "step": 11050 |
| }, |
| { |
| "epoch": 1.1947045527930256, |
| "grad_norm": 0.5923260450363159, |
| "learning_rate": 0.0005289063678482922, |
| "loss": 3.7489, |
| "step": 11100 |
| }, |
| { |
| "epoch": 1.2000861048326337, |
| "grad_norm": 0.6002750396728516, |
| "learning_rate": 0.0005285831268182307, |
| "loss": 3.742, |
| "step": 11150 |
| }, |
| { |
| "epoch": 1.205467656872242, |
| "grad_norm": 0.6400654911994934, |
| "learning_rate": 0.0005282598857881694, |
| "loss": 3.7462, |
| "step": 11200 |
| }, |
| { |
| "epoch": 1.21084920891185, |
| "grad_norm": 0.5764461755752563, |
| "learning_rate": 0.0005279366447581079, |
| "loss": 3.7248, |
| "step": 11250 |
| }, |
| { |
| "epoch": 1.2162307609514584, |
| "grad_norm": 0.5291248559951782, |
| "learning_rate": 0.0005276134037280465, |
| "loss": 3.7395, |
| "step": 11300 |
| }, |
| { |
| "epoch": 1.2216123129910665, |
| "grad_norm": 0.5363642573356628, |
| "learning_rate": 0.0005272966275185863, |
| "loss": 3.7486, |
| "step": 11350 |
| }, |
| { |
| "epoch": 1.2269938650306749, |
| "grad_norm": 0.5614747405052185, |
| "learning_rate": 0.0005269733864885249, |
| "loss": 3.7352, |
| "step": 11400 |
| }, |
| { |
| "epoch": 1.232375417070283, |
| "grad_norm": 0.568588137626648, |
| "learning_rate": 0.0005266501454584636, |
| "loss": 3.7354, |
| "step": 11450 |
| }, |
| { |
| "epoch": 1.2377569691098913, |
| "grad_norm": 0.5658771991729736, |
| "learning_rate": 0.0005263269044284021, |
| "loss": 3.7457, |
| "step": 11500 |
| }, |
| { |
| "epoch": 1.2431385211494996, |
| "grad_norm": 0.5699712038040161, |
| "learning_rate": 0.0005260036633983406, |
| "loss": 3.7583, |
| "step": 11550 |
| }, |
| { |
| "epoch": 1.2485200731891077, |
| "grad_norm": 0.6179584264755249, |
| "learning_rate": 0.0005256804223682792, |
| "loss": 3.7221, |
| "step": 11600 |
| }, |
| { |
| "epoch": 1.2539016252287158, |
| "grad_norm": 0.5554487109184265, |
| "learning_rate": 0.0005253571813382178, |
| "loss": 3.7318, |
| "step": 11650 |
| }, |
| { |
| "epoch": 1.2592831772683242, |
| "grad_norm": 0.5801360011100769, |
| "learning_rate": 0.0005250339403081564, |
| "loss": 3.7272, |
| "step": 11700 |
| }, |
| { |
| "epoch": 1.2646647293079325, |
| "grad_norm": 0.58757084608078, |
| "learning_rate": 0.000524710699278095, |
| "loss": 3.7316, |
| "step": 11750 |
| }, |
| { |
| "epoch": 1.2700462813475406, |
| "grad_norm": 0.6320570111274719, |
| "learning_rate": 0.0005243874582480336, |
| "loss": 3.7323, |
| "step": 11800 |
| }, |
| { |
| "epoch": 1.275427833387149, |
| "grad_norm": 0.5550466179847717, |
| "learning_rate": 0.0005240642172179722, |
| "loss": 3.7152, |
| "step": 11850 |
| }, |
| { |
| "epoch": 1.280809385426757, |
| "grad_norm": 0.7199887633323669, |
| "learning_rate": 0.0005237409761879107, |
| "loss": 3.724, |
| "step": 11900 |
| }, |
| { |
| "epoch": 1.2861909374663654, |
| "grad_norm": 0.5186243653297424, |
| "learning_rate": 0.0005234177351578493, |
| "loss": 3.7295, |
| "step": 11950 |
| }, |
| { |
| "epoch": 1.2915724895059735, |
| "grad_norm": 0.5249941945075989, |
| "learning_rate": 0.0005230944941277878, |
| "loss": 3.7189, |
| "step": 12000 |
| }, |
| { |
| "epoch": 1.2915724895059735, |
| "eval_accuracy": 0.34949287894405906, |
| "eval_loss": 3.6969494819641113, |
| "eval_runtime": 184.7681, |
| "eval_samples_per_second": 97.479, |
| "eval_steps_per_second": 6.094, |
| "step": 12000 |
| }, |
| { |
| "epoch": 1.2969540415455818, |
| "grad_norm": 0.5758280158042908, |
| "learning_rate": 0.0005227712530977265, |
| "loss": 3.7271, |
| "step": 12050 |
| }, |
| { |
| "epoch": 1.30233559358519, |
| "grad_norm": 0.5738679766654968, |
| "learning_rate": 0.0005224480120676651, |
| "loss": 3.7388, |
| "step": 12100 |
| }, |
| { |
| "epoch": 1.3077171456247982, |
| "grad_norm": 0.6011477708816528, |
| "learning_rate": 0.0005221247710376037, |
| "loss": 3.7381, |
| "step": 12150 |
| }, |
| { |
| "epoch": 1.3130986976644063, |
| "grad_norm": 0.5592882037162781, |
| "learning_rate": 0.0005218015300075422, |
| "loss": 3.7326, |
| "step": 12200 |
| }, |
| { |
| "epoch": 1.3184802497040147, |
| "grad_norm": 0.5867257714271545, |
| "learning_rate": 0.0005214782889774809, |
| "loss": 3.7186, |
| "step": 12250 |
| }, |
| { |
| "epoch": 1.3238618017436228, |
| "grad_norm": 0.5920559167861938, |
| "learning_rate": 0.0005211550479474194, |
| "loss": 3.7018, |
| "step": 12300 |
| }, |
| { |
| "epoch": 1.329243353783231, |
| "grad_norm": 0.5726664066314697, |
| "learning_rate": 0.000520831806917358, |
| "loss": 3.7209, |
| "step": 12350 |
| }, |
| { |
| "epoch": 1.3346249058228392, |
| "grad_norm": 0.5191407799720764, |
| "learning_rate": 0.0005205085658872966, |
| "loss": 3.7221, |
| "step": 12400 |
| }, |
| { |
| "epoch": 1.3400064578624475, |
| "grad_norm": 0.5495452284812927, |
| "learning_rate": 0.0005201853248572351, |
| "loss": 3.7186, |
| "step": 12450 |
| }, |
| { |
| "epoch": 1.3453880099020559, |
| "grad_norm": 0.536133348941803, |
| "learning_rate": 0.0005198620838271738, |
| "loss": 3.727, |
| "step": 12500 |
| }, |
| { |
| "epoch": 1.350769561941664, |
| "grad_norm": 0.6578084826469421, |
| "learning_rate": 0.0005195388427971123, |
| "loss": 3.7175, |
| "step": 12550 |
| }, |
| { |
| "epoch": 1.356151113981272, |
| "grad_norm": 0.697626531124115, |
| "learning_rate": 0.000519215601767051, |
| "loss": 3.7129, |
| "step": 12600 |
| }, |
| { |
| "epoch": 1.3615326660208804, |
| "grad_norm": 0.5493432879447937, |
| "learning_rate": 0.0005188923607369895, |
| "loss": 3.7113, |
| "step": 12650 |
| }, |
| { |
| "epoch": 1.3669142180604887, |
| "grad_norm": 0.5725867748260498, |
| "learning_rate": 0.000518569119706928, |
| "loss": 3.7209, |
| "step": 12700 |
| }, |
| { |
| "epoch": 1.3722957701000968, |
| "grad_norm": 0.5644043684005737, |
| "learning_rate": 0.0005182458786768667, |
| "loss": 3.7148, |
| "step": 12750 |
| }, |
| { |
| "epoch": 1.3776773221397052, |
| "grad_norm": 0.516650378704071, |
| "learning_rate": 0.0005179226376468052, |
| "loss": 3.7039, |
| "step": 12800 |
| }, |
| { |
| "epoch": 1.3830588741793133, |
| "grad_norm": 0.5859173536300659, |
| "learning_rate": 0.0005175993966167438, |
| "loss": 3.7156, |
| "step": 12850 |
| }, |
| { |
| "epoch": 1.3884404262189216, |
| "grad_norm": 0.4950462579727173, |
| "learning_rate": 0.0005172761555866824, |
| "loss": 3.7052, |
| "step": 12900 |
| }, |
| { |
| "epoch": 1.3938219782585297, |
| "grad_norm": 0.6020005941390991, |
| "learning_rate": 0.0005169529145566211, |
| "loss": 3.7147, |
| "step": 12950 |
| }, |
| { |
| "epoch": 1.399203530298138, |
| "grad_norm": 0.5504841208457947, |
| "learning_rate": 0.0005166296735265596, |
| "loss": 3.7161, |
| "step": 13000 |
| }, |
| { |
| "epoch": 1.399203530298138, |
| "eval_accuracy": 0.3519344179861383, |
| "eval_loss": 3.6728596687316895, |
| "eval_runtime": 184.9723, |
| "eval_samples_per_second": 97.371, |
| "eval_steps_per_second": 6.087, |
| "step": 13000 |
| }, |
| { |
| "epoch": 1.4045850823377461, |
| "grad_norm": 0.5823192000389099, |
| "learning_rate": 0.0005163064324964982, |
| "loss": 3.7071, |
| "step": 13050 |
| }, |
| { |
| "epoch": 1.4099666343773545, |
| "grad_norm": 0.5856388807296753, |
| "learning_rate": 0.0005159831914664367, |
| "loss": 3.6995, |
| "step": 13100 |
| }, |
| { |
| "epoch": 1.4153481864169626, |
| "grad_norm": 0.6325433850288391, |
| "learning_rate": 0.0005156599504363753, |
| "loss": 3.717, |
| "step": 13150 |
| }, |
| { |
| "epoch": 1.420729738456571, |
| "grad_norm": 0.6164390444755554, |
| "learning_rate": 0.000515336709406314, |
| "loss": 3.7094, |
| "step": 13200 |
| }, |
| { |
| "epoch": 1.426111290496179, |
| "grad_norm": 0.5653883218765259, |
| "learning_rate": 0.0005150134683762525, |
| "loss": 3.6838, |
| "step": 13250 |
| }, |
| { |
| "epoch": 1.4314928425357873, |
| "grad_norm": 0.6306009888648987, |
| "learning_rate": 0.0005146902273461911, |
| "loss": 3.7257, |
| "step": 13300 |
| }, |
| { |
| "epoch": 1.4368743945753955, |
| "grad_norm": 0.5829269886016846, |
| "learning_rate": 0.0005143669863161297, |
| "loss": 3.7158, |
| "step": 13350 |
| }, |
| { |
| "epoch": 1.4422559466150038, |
| "grad_norm": 0.5895189642906189, |
| "learning_rate": 0.0005140437452860683, |
| "loss": 3.7031, |
| "step": 13400 |
| }, |
| { |
| "epoch": 1.447637498654612, |
| "grad_norm": 0.5675577521324158, |
| "learning_rate": 0.0005137205042560069, |
| "loss": 3.7066, |
| "step": 13450 |
| }, |
| { |
| "epoch": 1.4530190506942202, |
| "grad_norm": 0.5363256335258484, |
| "learning_rate": 0.0005133972632259455, |
| "loss": 3.6979, |
| "step": 13500 |
| }, |
| { |
| "epoch": 1.4584006027338283, |
| "grad_norm": 0.6230180859565735, |
| "learning_rate": 0.000513074022195884, |
| "loss": 3.7036, |
| "step": 13550 |
| }, |
| { |
| "epoch": 1.4637821547734367, |
| "grad_norm": 0.566116988658905, |
| "learning_rate": 0.0005127507811658226, |
| "loss": 3.7168, |
| "step": 13600 |
| }, |
| { |
| "epoch": 1.469163706813045, |
| "grad_norm": 0.5180698037147522, |
| "learning_rate": 0.0005124275401357612, |
| "loss": 3.7075, |
| "step": 13650 |
| }, |
| { |
| "epoch": 1.474545258852653, |
| "grad_norm": 0.743535578250885, |
| "learning_rate": 0.0005121042991056997, |
| "loss": 3.7021, |
| "step": 13700 |
| }, |
| { |
| "epoch": 1.4799268108922612, |
| "grad_norm": 0.5717049241065979, |
| "learning_rate": 0.0005117810580756384, |
| "loss": 3.7071, |
| "step": 13750 |
| }, |
| { |
| "epoch": 1.4853083629318695, |
| "grad_norm": 0.5536872148513794, |
| "learning_rate": 0.0005114578170455769, |
| "loss": 3.7012, |
| "step": 13800 |
| }, |
| { |
| "epoch": 1.4906899149714778, |
| "grad_norm": 0.5160924196243286, |
| "learning_rate": 0.0005111345760155156, |
| "loss": 3.7051, |
| "step": 13850 |
| }, |
| { |
| "epoch": 1.496071467011086, |
| "grad_norm": 0.5639503598213196, |
| "learning_rate": 0.0005108113349854541, |
| "loss": 3.6988, |
| "step": 13900 |
| }, |
| { |
| "epoch": 1.501453019050694, |
| "grad_norm": 0.5460876822471619, |
| "learning_rate": 0.0005104880939553926, |
| "loss": 3.6936, |
| "step": 13950 |
| }, |
| { |
| "epoch": 1.5068345710903024, |
| "grad_norm": 0.5820977091789246, |
| "learning_rate": 0.0005101648529253313, |
| "loss": 3.7014, |
| "step": 14000 |
| }, |
| { |
| "epoch": 1.5068345710903024, |
| "eval_accuracy": 0.3535329192716649, |
| "eval_loss": 3.655021905899048, |
| "eval_runtime": 184.7328, |
| "eval_samples_per_second": 97.498, |
| "eval_steps_per_second": 6.095, |
| "step": 14000 |
| }, |
| { |
| "epoch": 1.5122161231299107, |
| "grad_norm": 0.5606687664985657, |
| "learning_rate": 0.0005098416118952699, |
| "loss": 3.6949, |
| "step": 14050 |
| }, |
| { |
| "epoch": 1.5175976751695188, |
| "grad_norm": 0.5651271343231201, |
| "learning_rate": 0.0005095183708652085, |
| "loss": 3.6798, |
| "step": 14100 |
| }, |
| { |
| "epoch": 1.5229792272091272, |
| "grad_norm": 0.5150020122528076, |
| "learning_rate": 0.000509195129835147, |
| "loss": 3.68, |
| "step": 14150 |
| }, |
| { |
| "epoch": 1.5283607792487355, |
| "grad_norm": 0.5977632999420166, |
| "learning_rate": 0.0005088718888050856, |
| "loss": 3.6851, |
| "step": 14200 |
| }, |
| { |
| "epoch": 1.5337423312883436, |
| "grad_norm": 0.6188676953315735, |
| "learning_rate": 0.0005085486477750242, |
| "loss": 3.6703, |
| "step": 14250 |
| }, |
| { |
| "epoch": 1.5391238833279517, |
| "grad_norm": 0.5410431027412415, |
| "learning_rate": 0.0005082254067449629, |
| "loss": 3.6897, |
| "step": 14300 |
| }, |
| { |
| "epoch": 1.54450543536756, |
| "grad_norm": 0.5880251526832581, |
| "learning_rate": 0.0005079021657149014, |
| "loss": 3.7039, |
| "step": 14350 |
| }, |
| { |
| "epoch": 1.5498869874071683, |
| "grad_norm": 0.5555596947669983, |
| "learning_rate": 0.0005075789246848399, |
| "loss": 3.6537, |
| "step": 14400 |
| }, |
| { |
| "epoch": 1.5552685394467765, |
| "grad_norm": 0.5395658612251282, |
| "learning_rate": 0.0005072556836547785, |
| "loss": 3.6913, |
| "step": 14450 |
| }, |
| { |
| "epoch": 1.5606500914863846, |
| "grad_norm": 0.5492722392082214, |
| "learning_rate": 0.0005069324426247171, |
| "loss": 3.6815, |
| "step": 14500 |
| }, |
| { |
| "epoch": 1.566031643525993, |
| "grad_norm": 0.5226956605911255, |
| "learning_rate": 0.0005066092015946557, |
| "loss": 3.6833, |
| "step": 14550 |
| }, |
| { |
| "epoch": 1.5714131955656012, |
| "grad_norm": 0.5579638481140137, |
| "learning_rate": 0.0005062859605645943, |
| "loss": 3.6818, |
| "step": 14600 |
| }, |
| { |
| "epoch": 1.5767947476052093, |
| "grad_norm": 0.5651870965957642, |
| "learning_rate": 0.0005059627195345329, |
| "loss": 3.6777, |
| "step": 14650 |
| }, |
| { |
| "epoch": 1.5821762996448174, |
| "grad_norm": 0.5880829095840454, |
| "learning_rate": 0.0005056394785044715, |
| "loss": 3.6853, |
| "step": 14700 |
| }, |
| { |
| "epoch": 1.5875578516844258, |
| "grad_norm": 0.6156719326972961, |
| "learning_rate": 0.00050531623747441, |
| "loss": 3.6936, |
| "step": 14750 |
| }, |
| { |
| "epoch": 1.592939403724034, |
| "grad_norm": 0.5112903118133545, |
| "learning_rate": 0.0005049929964443486, |
| "loss": 3.6819, |
| "step": 14800 |
| }, |
| { |
| "epoch": 1.5983209557636422, |
| "grad_norm": 0.6731923818588257, |
| "learning_rate": 0.0005046697554142871, |
| "loss": 3.6723, |
| "step": 14850 |
| }, |
| { |
| "epoch": 1.6037025078032503, |
| "grad_norm": 0.6107453107833862, |
| "learning_rate": 0.0005043465143842258, |
| "loss": 3.7043, |
| "step": 14900 |
| }, |
| { |
| "epoch": 1.6090840598428586, |
| "grad_norm": 0.6314976811408997, |
| "learning_rate": 0.0005040232733541644, |
| "loss": 3.6723, |
| "step": 14950 |
| }, |
| { |
| "epoch": 1.614465611882467, |
| "grad_norm": 0.5651878714561462, |
| "learning_rate": 0.000503700032324103, |
| "loss": 3.684, |
| "step": 15000 |
| }, |
| { |
| "epoch": 1.614465611882467, |
| "eval_accuracy": 0.35533384088718994, |
| "eval_loss": 3.6347434520721436, |
| "eval_runtime": 184.7829, |
| "eval_samples_per_second": 97.471, |
| "eval_steps_per_second": 6.094, |
| "step": 15000 |
| }, |
| { |
| "epoch": 1.619847163922075, |
| "grad_norm": 0.5857282280921936, |
| "learning_rate": 0.0005033767912940415, |
| "loss": 3.6826, |
| "step": 15050 |
| }, |
| { |
| "epoch": 1.6252287159616834, |
| "grad_norm": 0.5991663336753845, |
| "learning_rate": 0.0005030535502639802, |
| "loss": 3.6961, |
| "step": 15100 |
| }, |
| { |
| "epoch": 1.6306102680012917, |
| "grad_norm": 0.5337246060371399, |
| "learning_rate": 0.0005027303092339187, |
| "loss": 3.6949, |
| "step": 15150 |
| }, |
| { |
| "epoch": 1.6359918200408998, |
| "grad_norm": 0.6633757948875427, |
| "learning_rate": 0.0005024070682038573, |
| "loss": 3.6859, |
| "step": 15200 |
| }, |
| { |
| "epoch": 1.641373372080508, |
| "grad_norm": 0.5476294755935669, |
| "learning_rate": 0.0005020838271737959, |
| "loss": 3.6768, |
| "step": 15250 |
| }, |
| { |
| "epoch": 1.6467549241201163, |
| "grad_norm": 0.5833688974380493, |
| "learning_rate": 0.0005017605861437344, |
| "loss": 3.6723, |
| "step": 15300 |
| }, |
| { |
| "epoch": 1.6521364761597246, |
| "grad_norm": 0.5485004782676697, |
| "learning_rate": 0.0005014373451136731, |
| "loss": 3.6814, |
| "step": 15350 |
| }, |
| { |
| "epoch": 1.6575180281993327, |
| "grad_norm": 0.5738468170166016, |
| "learning_rate": 0.0005011205689042129, |
| "loss": 3.6761, |
| "step": 15400 |
| }, |
| { |
| "epoch": 1.6628995802389408, |
| "grad_norm": 0.5266857147216797, |
| "learning_rate": 0.0005007973278741514, |
| "loss": 3.683, |
| "step": 15450 |
| }, |
| { |
| "epoch": 1.6682811322785491, |
| "grad_norm": 0.5718429088592529, |
| "learning_rate": 0.00050047408684409, |
| "loss": 3.6676, |
| "step": 15500 |
| }, |
| { |
| "epoch": 1.6736626843181575, |
| "grad_norm": 0.5431683659553528, |
| "learning_rate": 0.0005001508458140286, |
| "loss": 3.6648, |
| "step": 15550 |
| }, |
| { |
| "epoch": 1.6790442363577656, |
| "grad_norm": 0.6572660803794861, |
| "learning_rate": 0.0004998276047839673, |
| "loss": 3.6898, |
| "step": 15600 |
| }, |
| { |
| "epoch": 1.6844257883973737, |
| "grad_norm": 0.5896181464195251, |
| "learning_rate": 0.0004995043637539058, |
| "loss": 3.6605, |
| "step": 15650 |
| }, |
| { |
| "epoch": 1.689807340436982, |
| "grad_norm": 0.549990713596344, |
| "learning_rate": 0.0004991811227238443, |
| "loss": 3.657, |
| "step": 15700 |
| }, |
| { |
| "epoch": 1.6951888924765903, |
| "grad_norm": 0.5397534966468811, |
| "learning_rate": 0.0004988578816937829, |
| "loss": 3.6706, |
| "step": 15750 |
| }, |
| { |
| "epoch": 1.7005704445161984, |
| "grad_norm": 0.5501967668533325, |
| "learning_rate": 0.0004985346406637215, |
| "loss": 3.6697, |
| "step": 15800 |
| }, |
| { |
| "epoch": 1.7059519965558065, |
| "grad_norm": 0.6153730154037476, |
| "learning_rate": 0.0004982113996336602, |
| "loss": 3.6671, |
| "step": 15850 |
| }, |
| { |
| "epoch": 1.7113335485954149, |
| "grad_norm": 0.619310200214386, |
| "learning_rate": 0.0004978881586035987, |
| "loss": 3.6858, |
| "step": 15900 |
| }, |
| { |
| "epoch": 1.7167151006350232, |
| "grad_norm": 0.5859758853912354, |
| "learning_rate": 0.0004975649175735373, |
| "loss": 3.6639, |
| "step": 15950 |
| }, |
| { |
| "epoch": 1.7220966526746313, |
| "grad_norm": 0.5947201251983643, |
| "learning_rate": 0.0004972416765434759, |
| "loss": 3.6546, |
| "step": 16000 |
| }, |
| { |
| "epoch": 1.7220966526746313, |
| "eval_accuracy": 0.3573106715280866, |
| "eval_loss": 3.6192057132720947, |
| "eval_runtime": 184.9464, |
| "eval_samples_per_second": 97.385, |
| "eval_steps_per_second": 6.088, |
| "step": 16000 |
| }, |
| { |
| "epoch": 1.7274782047142396, |
| "grad_norm": 0.6014895439147949, |
| "learning_rate": 0.0004969184355134145, |
| "loss": 3.6555, |
| "step": 16050 |
| }, |
| { |
| "epoch": 1.732859756753848, |
| "grad_norm": 0.6242139935493469, |
| "learning_rate": 0.0004965951944833531, |
| "loss": 3.6725, |
| "step": 16100 |
| }, |
| { |
| "epoch": 1.738241308793456, |
| "grad_norm": 0.6019642949104309, |
| "learning_rate": 0.0004962719534532916, |
| "loss": 3.6383, |
| "step": 16150 |
| }, |
| { |
| "epoch": 1.7436228608330642, |
| "grad_norm": 0.5532971620559692, |
| "learning_rate": 0.0004959487124232302, |
| "loss": 3.6586, |
| "step": 16200 |
| }, |
| { |
| "epoch": 1.7490044128726725, |
| "grad_norm": 0.684308648109436, |
| "learning_rate": 0.0004956254713931688, |
| "loss": 3.6486, |
| "step": 16250 |
| }, |
| { |
| "epoch": 1.7543859649122808, |
| "grad_norm": 0.5315883755683899, |
| "learning_rate": 0.0004953022303631074, |
| "loss": 3.6488, |
| "step": 16300 |
| }, |
| { |
| "epoch": 1.759767516951889, |
| "grad_norm": 0.5260655879974365, |
| "learning_rate": 0.0004949789893330459, |
| "loss": 3.6647, |
| "step": 16350 |
| }, |
| { |
| "epoch": 1.765149068991497, |
| "grad_norm": 0.5171688795089722, |
| "learning_rate": 0.0004946557483029846, |
| "loss": 3.6572, |
| "step": 16400 |
| }, |
| { |
| "epoch": 1.7705306210311054, |
| "grad_norm": 0.5283517241477966, |
| "learning_rate": 0.0004943325072729231, |
| "loss": 3.6654, |
| "step": 16450 |
| }, |
| { |
| "epoch": 1.7759121730707137, |
| "grad_norm": 0.5800235867500305, |
| "learning_rate": 0.0004940092662428617, |
| "loss": 3.673, |
| "step": 16500 |
| }, |
| { |
| "epoch": 1.7812937251103218, |
| "grad_norm": 0.6345435380935669, |
| "learning_rate": 0.0004936860252128003, |
| "loss": 3.671, |
| "step": 16550 |
| }, |
| { |
| "epoch": 1.78667527714993, |
| "grad_norm": 0.5385580658912659, |
| "learning_rate": 0.0004933627841827388, |
| "loss": 3.6765, |
| "step": 16600 |
| }, |
| { |
| "epoch": 1.7920568291895382, |
| "grad_norm": 0.521317720413208, |
| "learning_rate": 0.0004930395431526775, |
| "loss": 3.6577, |
| "step": 16650 |
| }, |
| { |
| "epoch": 1.7974383812291466, |
| "grad_norm": 0.596302330493927, |
| "learning_rate": 0.0004927163021226161, |
| "loss": 3.6477, |
| "step": 16700 |
| }, |
| { |
| "epoch": 1.8028199332687547, |
| "grad_norm": 0.5540564656257629, |
| "learning_rate": 0.0004923930610925547, |
| "loss": 3.6626, |
| "step": 16750 |
| }, |
| { |
| "epoch": 1.8082014853083628, |
| "grad_norm": 0.5950270295143127, |
| "learning_rate": 0.0004920698200624932, |
| "loss": 3.6484, |
| "step": 16800 |
| }, |
| { |
| "epoch": 1.813583037347971, |
| "grad_norm": 0.5792022943496704, |
| "learning_rate": 0.0004917465790324317, |
| "loss": 3.6412, |
| "step": 16850 |
| }, |
| { |
| "epoch": 1.8189645893875794, |
| "grad_norm": 0.5268118381500244, |
| "learning_rate": 0.0004914233380023704, |
| "loss": 3.6561, |
| "step": 16900 |
| }, |
| { |
| "epoch": 1.8243461414271875, |
| "grad_norm": 0.5259914994239807, |
| "learning_rate": 0.0004911000969723089, |
| "loss": 3.6361, |
| "step": 16950 |
| }, |
| { |
| "epoch": 1.8297276934667959, |
| "grad_norm": 0.5924632549285889, |
| "learning_rate": 0.0004907768559422476, |
| "loss": 3.6475, |
| "step": 17000 |
| }, |
| { |
| "epoch": 1.8297276934667959, |
| "eval_accuracy": 0.3591788492811688, |
| "eval_loss": 3.601672410964966, |
| "eval_runtime": 184.7368, |
| "eval_samples_per_second": 97.495, |
| "eval_steps_per_second": 6.095, |
| "step": 17000 |
| }, |
| { |
| "epoch": 1.8351092455064042, |
| "grad_norm": 0.5890408754348755, |
| "learning_rate": 0.0004904536149121861, |
| "loss": 3.6432, |
| "step": 17050 |
| }, |
| { |
| "epoch": 1.8404907975460123, |
| "grad_norm": 0.5994479656219482, |
| "learning_rate": 0.0004901303738821248, |
| "loss": 3.645, |
| "step": 17100 |
| }, |
| { |
| "epoch": 1.8458723495856204, |
| "grad_norm": 0.5794470310211182, |
| "learning_rate": 0.0004898071328520633, |
| "loss": 3.6375, |
| "step": 17150 |
| }, |
| { |
| "epoch": 1.8512539016252287, |
| "grad_norm": 0.5518653392791748, |
| "learning_rate": 0.0004894838918220019, |
| "loss": 3.6343, |
| "step": 17200 |
| }, |
| { |
| "epoch": 1.856635453664837, |
| "grad_norm": 0.5658548474311829, |
| "learning_rate": 0.0004891606507919405, |
| "loss": 3.6607, |
| "step": 17250 |
| }, |
| { |
| "epoch": 1.8620170057044452, |
| "grad_norm": 0.5588201880455017, |
| "learning_rate": 0.000488837409761879, |
| "loss": 3.6551, |
| "step": 17300 |
| }, |
| { |
| "epoch": 1.8673985577440533, |
| "grad_norm": 0.5978080034255981, |
| "learning_rate": 0.0004885141687318177, |
| "loss": 3.6435, |
| "step": 17350 |
| }, |
| { |
| "epoch": 1.8727801097836616, |
| "grad_norm": 0.5438842177391052, |
| "learning_rate": 0.00048819092770175623, |
| "loss": 3.6491, |
| "step": 17400 |
| }, |
| { |
| "epoch": 1.87816166182327, |
| "grad_norm": 0.5227386951446533, |
| "learning_rate": 0.00048787415149229604, |
| "loss": 3.6431, |
| "step": 17450 |
| }, |
| { |
| "epoch": 1.883543213862878, |
| "grad_norm": 0.5770353078842163, |
| "learning_rate": 0.00048755091046223464, |
| "loss": 3.652, |
| "step": 17500 |
| }, |
| { |
| "epoch": 1.8889247659024861, |
| "grad_norm": 0.6320988535881042, |
| "learning_rate": 0.0004872276694321732, |
| "loss": 3.6296, |
| "step": 17550 |
| }, |
| { |
| "epoch": 1.8943063179420945, |
| "grad_norm": 0.5457765460014343, |
| "learning_rate": 0.00048690442840211177, |
| "loss": 3.6411, |
| "step": 17600 |
| }, |
| { |
| "epoch": 1.8996878699817028, |
| "grad_norm": 0.5683311820030212, |
| "learning_rate": 0.0004865811873720504, |
| "loss": 3.6369, |
| "step": 17650 |
| }, |
| { |
| "epoch": 1.905069422021311, |
| "grad_norm": 0.6170719861984253, |
| "learning_rate": 0.00048625794634198896, |
| "loss": 3.6392, |
| "step": 17700 |
| }, |
| { |
| "epoch": 1.910450974060919, |
| "grad_norm": 0.5767971277236938, |
| "learning_rate": 0.00048593470531192756, |
| "loss": 3.6482, |
| "step": 17750 |
| }, |
| { |
| "epoch": 1.9158325261005273, |
| "grad_norm": 0.5953500270843506, |
| "learning_rate": 0.00048561146428186615, |
| "loss": 3.6474, |
| "step": 17800 |
| }, |
| { |
| "epoch": 1.9212140781401357, |
| "grad_norm": 0.5317740440368652, |
| "learning_rate": 0.0004852882232518047, |
| "loss": 3.6312, |
| "step": 17850 |
| }, |
| { |
| "epoch": 1.9265956301797438, |
| "grad_norm": 0.5526648163795471, |
| "learning_rate": 0.00048496498222174334, |
| "loss": 3.6434, |
| "step": 17900 |
| }, |
| { |
| "epoch": 1.931977182219352, |
| "grad_norm": 0.5383221507072449, |
| "learning_rate": 0.00048464174119168193, |
| "loss": 3.6373, |
| "step": 17950 |
| }, |
| { |
| "epoch": 1.9373587342589604, |
| "grad_norm": 0.567500114440918, |
| "learning_rate": 0.0004843185001616205, |
| "loss": 3.6584, |
| "step": 18000 |
| }, |
| { |
| "epoch": 1.9373587342589604, |
| "eval_accuracy": 0.3605316863873469, |
| "eval_loss": 3.5917465686798096, |
| "eval_runtime": 184.8222, |
| "eval_samples_per_second": 97.45, |
| "eval_steps_per_second": 6.092, |
| "step": 18000 |
| }, |
| { |
| "epoch": 1.9427402862985685, |
| "grad_norm": 0.5618447065353394, |
| "learning_rate": 0.00048399525913155907, |
| "loss": 3.6331, |
| "step": 18050 |
| }, |
| { |
| "epoch": 1.9481218383381766, |
| "grad_norm": 0.5482614636421204, |
| "learning_rate": 0.0004836720181014976, |
| "loss": 3.6441, |
| "step": 18100 |
| }, |
| { |
| "epoch": 1.953503390377785, |
| "grad_norm": 0.5669938325881958, |
| "learning_rate": 0.0004833487770714362, |
| "loss": 3.6514, |
| "step": 18150 |
| }, |
| { |
| "epoch": 1.9588849424173933, |
| "grad_norm": 0.5920496582984924, |
| "learning_rate": 0.00048302553604137485, |
| "loss": 3.6434, |
| "step": 18200 |
| }, |
| { |
| "epoch": 1.9642664944570014, |
| "grad_norm": 0.5437381267547607, |
| "learning_rate": 0.0004827022950113134, |
| "loss": 3.664, |
| "step": 18250 |
| }, |
| { |
| "epoch": 1.9696480464966095, |
| "grad_norm": 0.5581496357917786, |
| "learning_rate": 0.000482379053981252, |
| "loss": 3.6224, |
| "step": 18300 |
| }, |
| { |
| "epoch": 1.9750295985362178, |
| "grad_norm": 0.5657931566238403, |
| "learning_rate": 0.0004820558129511906, |
| "loss": 3.6262, |
| "step": 18350 |
| }, |
| { |
| "epoch": 1.9804111505758262, |
| "grad_norm": 0.5733588933944702, |
| "learning_rate": 0.0004817325719211291, |
| "loss": 3.6271, |
| "step": 18400 |
| }, |
| { |
| "epoch": 1.9857927026154343, |
| "grad_norm": 0.5454577803611755, |
| "learning_rate": 0.0004814093308910677, |
| "loss": 3.6385, |
| "step": 18450 |
| }, |
| { |
| "epoch": 1.9911742546550424, |
| "grad_norm": 0.5400912165641785, |
| "learning_rate": 0.00048108608986100637, |
| "loss": 3.6269, |
| "step": 18500 |
| }, |
| { |
| "epoch": 1.9965558066946507, |
| "grad_norm": 0.5676894187927246, |
| "learning_rate": 0.0004807628488309449, |
| "loss": 3.6297, |
| "step": 18550 |
| }, |
| { |
| "epoch": 2.001937358734259, |
| "grad_norm": 0.5944864153862, |
| "learning_rate": 0.0004804396078008835, |
| "loss": 3.578, |
| "step": 18600 |
| }, |
| { |
| "epoch": 2.007318910773867, |
| "grad_norm": 0.5926184058189392, |
| "learning_rate": 0.00048011636677082204, |
| "loss": 3.5433, |
| "step": 18650 |
| }, |
| { |
| "epoch": 2.0127004628134753, |
| "grad_norm": 0.507103443145752, |
| "learning_rate": 0.00047979312574076064, |
| "loss": 3.5372, |
| "step": 18700 |
| }, |
| { |
| "epoch": 2.018082014853084, |
| "grad_norm": 0.5739589929580688, |
| "learning_rate": 0.0004794698847106992, |
| "loss": 3.5428, |
| "step": 18750 |
| }, |
| { |
| "epoch": 2.023463566892692, |
| "grad_norm": 0.5899766087532043, |
| "learning_rate": 0.0004791466436806378, |
| "loss": 3.5556, |
| "step": 18800 |
| }, |
| { |
| "epoch": 2.0288451189323, |
| "grad_norm": 0.603664755821228, |
| "learning_rate": 0.0004788234026505764, |
| "loss": 3.5476, |
| "step": 18850 |
| }, |
| { |
| "epoch": 2.034226670971908, |
| "grad_norm": 0.6328676342964172, |
| "learning_rate": 0.00047850016162051496, |
| "loss": 3.5374, |
| "step": 18900 |
| }, |
| { |
| "epoch": 2.0396082230115167, |
| "grad_norm": 0.5715523362159729, |
| "learning_rate": 0.00047817692059045356, |
| "loss": 3.5504, |
| "step": 18950 |
| }, |
| { |
| "epoch": 2.044989775051125, |
| "grad_norm": 0.554076075553894, |
| "learning_rate": 0.00047785367956039215, |
| "loss": 3.5579, |
| "step": 19000 |
| }, |
| { |
| "epoch": 2.044989775051125, |
| "eval_accuracy": 0.3616910127003274, |
| "eval_loss": 3.57794189453125, |
| "eval_runtime": 184.5487, |
| "eval_samples_per_second": 97.595, |
| "eval_steps_per_second": 6.101, |
| "step": 19000 |
| }, |
| { |
| "epoch": 2.050371327090733, |
| "grad_norm": 0.5813206434249878, |
| "learning_rate": 0.00047753043853033075, |
| "loss": 3.5526, |
| "step": 19050 |
| }, |
| { |
| "epoch": 2.055752879130341, |
| "grad_norm": 0.5949262976646423, |
| "learning_rate": 0.00047720719750026934, |
| "loss": 3.5472, |
| "step": 19100 |
| }, |
| { |
| "epoch": 2.0611344311699495, |
| "grad_norm": 0.5201438069343567, |
| "learning_rate": 0.00047688395647020793, |
| "loss": 3.5551, |
| "step": 19150 |
| }, |
| { |
| "epoch": 2.0665159832095576, |
| "grad_norm": 0.5755208730697632, |
| "learning_rate": 0.0004765607154401465, |
| "loss": 3.5279, |
| "step": 19200 |
| }, |
| { |
| "epoch": 2.0718975352491658, |
| "grad_norm": 0.5886051654815674, |
| "learning_rate": 0.00047623747441008507, |
| "loss": 3.5456, |
| "step": 19250 |
| }, |
| { |
| "epoch": 2.0772790872887743, |
| "grad_norm": 0.581571102142334, |
| "learning_rate": 0.0004759142333800236, |
| "loss": 3.5523, |
| "step": 19300 |
| }, |
| { |
| "epoch": 2.0826606393283824, |
| "grad_norm": 0.5398459434509277, |
| "learning_rate": 0.00047559099234996226, |
| "loss": 3.5488, |
| "step": 19350 |
| }, |
| { |
| "epoch": 2.0880421913679905, |
| "grad_norm": 0.5748805403709412, |
| "learning_rate": 0.00047526775131990085, |
| "loss": 3.5538, |
| "step": 19400 |
| }, |
| { |
| "epoch": 2.0934237434075986, |
| "grad_norm": 0.5900923013687134, |
| "learning_rate": 0.0004749445102898394, |
| "loss": 3.5743, |
| "step": 19450 |
| }, |
| { |
| "epoch": 2.098805295447207, |
| "grad_norm": 0.5444471836090088, |
| "learning_rate": 0.000474621269259778, |
| "loss": 3.5705, |
| "step": 19500 |
| }, |
| { |
| "epoch": 2.1041868474868153, |
| "grad_norm": 0.5641753673553467, |
| "learning_rate": 0.0004742980282297166, |
| "loss": 3.5555, |
| "step": 19550 |
| }, |
| { |
| "epoch": 2.1095683995264234, |
| "grad_norm": 0.5689592957496643, |
| "learning_rate": 0.0004739812520202564, |
| "loss": 3.5609, |
| "step": 19600 |
| }, |
| { |
| "epoch": 2.1149499515660315, |
| "grad_norm": 0.6442001461982727, |
| "learning_rate": 0.000473658010990195, |
| "loss": 3.5649, |
| "step": 19650 |
| }, |
| { |
| "epoch": 2.12033150360564, |
| "grad_norm": 0.5538535118103027, |
| "learning_rate": 0.0004733347699601336, |
| "loss": 3.5624, |
| "step": 19700 |
| }, |
| { |
| "epoch": 2.125713055645248, |
| "grad_norm": 0.5702688097953796, |
| "learning_rate": 0.0004730115289300722, |
| "loss": 3.5425, |
| "step": 19750 |
| }, |
| { |
| "epoch": 2.1310946076848563, |
| "grad_norm": 0.5623447895050049, |
| "learning_rate": 0.000472694752720612, |
| "loss": 3.5462, |
| "step": 19800 |
| }, |
| { |
| "epoch": 2.1364761597244644, |
| "grad_norm": 0.6005106568336487, |
| "learning_rate": 0.0004723715116905506, |
| "loss": 3.5623, |
| "step": 19850 |
| }, |
| { |
| "epoch": 2.141857711764073, |
| "grad_norm": 0.530021607875824, |
| "learning_rate": 0.0004720482706604891, |
| "loss": 3.552, |
| "step": 19900 |
| }, |
| { |
| "epoch": 2.147239263803681, |
| "grad_norm": 0.6486440896987915, |
| "learning_rate": 0.0004717250296304277, |
| "loss": 3.5587, |
| "step": 19950 |
| }, |
| { |
| "epoch": 2.152620815843289, |
| "grad_norm": 0.5921491980552673, |
| "learning_rate": 0.0004714017886003663, |
| "loss": 3.5664, |
| "step": 20000 |
| }, |
| { |
| "epoch": 2.152620815843289, |
| "eval_accuracy": 0.36275624561246056, |
| "eval_loss": 3.568859815597534, |
| "eval_runtime": 184.9316, |
| "eval_samples_per_second": 97.393, |
| "eval_steps_per_second": 6.089, |
| "step": 20000 |
| }, |
| { |
| "epoch": 2.1580023678828972, |
| "grad_norm": 0.5587154030799866, |
| "learning_rate": 0.00047107854757030485, |
| "loss": 3.5344, |
| "step": 20050 |
| }, |
| { |
| "epoch": 2.163383919922506, |
| "grad_norm": 0.6131019592285156, |
| "learning_rate": 0.0004707553065402435, |
| "loss": 3.563, |
| "step": 20100 |
| }, |
| { |
| "epoch": 2.168765471962114, |
| "grad_norm": 0.602177083492279, |
| "learning_rate": 0.0004704320655101821, |
| "loss": 3.5386, |
| "step": 20150 |
| }, |
| { |
| "epoch": 2.174147024001722, |
| "grad_norm": 0.5748916864395142, |
| "learning_rate": 0.00047010882448012063, |
| "loss": 3.5412, |
| "step": 20200 |
| }, |
| { |
| "epoch": 2.1795285760413305, |
| "grad_norm": 0.5983401536941528, |
| "learning_rate": 0.0004697855834500592, |
| "loss": 3.5439, |
| "step": 20250 |
| }, |
| { |
| "epoch": 2.1849101280809387, |
| "grad_norm": 0.5558173656463623, |
| "learning_rate": 0.00046946234241999776, |
| "loss": 3.5476, |
| "step": 20300 |
| }, |
| { |
| "epoch": 2.1902916801205468, |
| "grad_norm": 0.5931748151779175, |
| "learning_rate": 0.00046913910138993636, |
| "loss": 3.5686, |
| "step": 20350 |
| }, |
| { |
| "epoch": 2.195673232160155, |
| "grad_norm": 0.5662775635719299, |
| "learning_rate": 0.000468815860359875, |
| "loss": 3.5691, |
| "step": 20400 |
| }, |
| { |
| "epoch": 2.2010547841997634, |
| "grad_norm": 0.6415780186653137, |
| "learning_rate": 0.00046849261932981355, |
| "loss": 3.54, |
| "step": 20450 |
| }, |
| { |
| "epoch": 2.2064363362393715, |
| "grad_norm": 0.5790057182312012, |
| "learning_rate": 0.00046816937829975214, |
| "loss": 3.5471, |
| "step": 20500 |
| }, |
| { |
| "epoch": 2.2118178882789796, |
| "grad_norm": 0.559398353099823, |
| "learning_rate": 0.00046784613726969074, |
| "loss": 3.5688, |
| "step": 20550 |
| }, |
| { |
| "epoch": 2.2171994403185877, |
| "grad_norm": 0.5323479175567627, |
| "learning_rate": 0.0004675228962396293, |
| "loss": 3.5536, |
| "step": 20600 |
| }, |
| { |
| "epoch": 2.2225809923581963, |
| "grad_norm": 0.5640736818313599, |
| "learning_rate": 0.0004671996552095679, |
| "loss": 3.5444, |
| "step": 20650 |
| }, |
| { |
| "epoch": 2.2279625443978044, |
| "grad_norm": 0.6212958097457886, |
| "learning_rate": 0.0004668764141795065, |
| "loss": 3.5705, |
| "step": 20700 |
| }, |
| { |
| "epoch": 2.2333440964374125, |
| "grad_norm": 0.5601683855056763, |
| "learning_rate": 0.00046655317314944506, |
| "loss": 3.5538, |
| "step": 20750 |
| }, |
| { |
| "epoch": 2.2387256484770206, |
| "grad_norm": 0.5923741459846497, |
| "learning_rate": 0.00046622993211938366, |
| "loss": 3.5587, |
| "step": 20800 |
| }, |
| { |
| "epoch": 2.244107200516629, |
| "grad_norm": 0.5725792646408081, |
| "learning_rate": 0.0004659066910893222, |
| "loss": 3.5558, |
| "step": 20850 |
| }, |
| { |
| "epoch": 2.2494887525562373, |
| "grad_norm": 0.577241063117981, |
| "learning_rate": 0.0004655834500592608, |
| "loss": 3.5422, |
| "step": 20900 |
| }, |
| { |
| "epoch": 2.2548703045958454, |
| "grad_norm": 0.5885173082351685, |
| "learning_rate": 0.00046526020902919944, |
| "loss": 3.5613, |
| "step": 20950 |
| }, |
| { |
| "epoch": 2.2602518566354535, |
| "grad_norm": 0.6301099061965942, |
| "learning_rate": 0.000464936967999138, |
| "loss": 3.5668, |
| "step": 21000 |
| }, |
| { |
| "epoch": 2.2602518566354535, |
| "eval_accuracy": 0.3639710935511143, |
| "eval_loss": 3.557680606842041, |
| "eval_runtime": 184.4972, |
| "eval_samples_per_second": 97.622, |
| "eval_steps_per_second": 6.103, |
| "step": 21000 |
| }, |
| { |
| "epoch": 2.265633408675062, |
| "grad_norm": 0.6086539030075073, |
| "learning_rate": 0.0004646137269690766, |
| "loss": 3.5513, |
| "step": 21050 |
| }, |
| { |
| "epoch": 2.27101496071467, |
| "grad_norm": 0.5513246059417725, |
| "learning_rate": 0.00046429048593901517, |
| "loss": 3.5549, |
| "step": 21100 |
| }, |
| { |
| "epoch": 2.2763965127542782, |
| "grad_norm": 0.5785967111587524, |
| "learning_rate": 0.0004639672449089537, |
| "loss": 3.5596, |
| "step": 21150 |
| }, |
| { |
| "epoch": 2.281778064793887, |
| "grad_norm": 0.5956328511238098, |
| "learning_rate": 0.0004636440038788923, |
| "loss": 3.5427, |
| "step": 21200 |
| }, |
| { |
| "epoch": 2.287159616833495, |
| "grad_norm": 0.6081483364105225, |
| "learning_rate": 0.00046332076284883095, |
| "loss": 3.5325, |
| "step": 21250 |
| }, |
| { |
| "epoch": 2.292541168873103, |
| "grad_norm": 0.5518943071365356, |
| "learning_rate": 0.0004629975218187695, |
| "loss": 3.5368, |
| "step": 21300 |
| }, |
| { |
| "epoch": 2.297922720912711, |
| "grad_norm": 0.5766976475715637, |
| "learning_rate": 0.0004626742807887081, |
| "loss": 3.5642, |
| "step": 21350 |
| }, |
| { |
| "epoch": 2.303304272952319, |
| "grad_norm": 0.582249104976654, |
| "learning_rate": 0.00046235103975864663, |
| "loss": 3.5492, |
| "step": 21400 |
| }, |
| { |
| "epoch": 2.3086858249919278, |
| "grad_norm": 0.6385564208030701, |
| "learning_rate": 0.0004620277987285852, |
| "loss": 3.5574, |
| "step": 21450 |
| }, |
| { |
| "epoch": 2.314067377031536, |
| "grad_norm": 0.5533836483955383, |
| "learning_rate": 0.0004617045576985239, |
| "loss": 3.5461, |
| "step": 21500 |
| }, |
| { |
| "epoch": 2.319448929071144, |
| "grad_norm": 0.6168516278266907, |
| "learning_rate": 0.0004613813166684624, |
| "loss": 3.536, |
| "step": 21550 |
| }, |
| { |
| "epoch": 2.3248304811107525, |
| "grad_norm": 0.5422675013542175, |
| "learning_rate": 0.000461058075638401, |
| "loss": 3.5478, |
| "step": 21600 |
| }, |
| { |
| "epoch": 2.3302120331503606, |
| "grad_norm": 0.673985481262207, |
| "learning_rate": 0.00046073483460833955, |
| "loss": 3.5648, |
| "step": 21650 |
| }, |
| { |
| "epoch": 2.3355935851899687, |
| "grad_norm": 0.6507932543754578, |
| "learning_rate": 0.00046041159357827814, |
| "loss": 3.5514, |
| "step": 21700 |
| }, |
| { |
| "epoch": 2.340975137229577, |
| "grad_norm": 0.5993643403053284, |
| "learning_rate": 0.00046008835254821674, |
| "loss": 3.5723, |
| "step": 21750 |
| }, |
| { |
| "epoch": 2.3463566892691854, |
| "grad_norm": 0.629294753074646, |
| "learning_rate": 0.0004597651115181554, |
| "loss": 3.5385, |
| "step": 21800 |
| }, |
| { |
| "epoch": 2.3517382413087935, |
| "grad_norm": 0.5397011041641235, |
| "learning_rate": 0.00045944187048809393, |
| "loss": 3.5378, |
| "step": 21850 |
| }, |
| { |
| "epoch": 2.3571197933484016, |
| "grad_norm": 0.5139660835266113, |
| "learning_rate": 0.0004591186294580325, |
| "loss": 3.5584, |
| "step": 21900 |
| }, |
| { |
| "epoch": 2.3625013453880097, |
| "grad_norm": 0.6737929582595825, |
| "learning_rate": 0.00045879538842797106, |
| "loss": 3.5637, |
| "step": 21950 |
| }, |
| { |
| "epoch": 2.3678828974276183, |
| "grad_norm": 0.5924611687660217, |
| "learning_rate": 0.00045847214739790966, |
| "loss": 3.5642, |
| "step": 22000 |
| }, |
| { |
| "epoch": 2.3678828974276183, |
| "eval_accuracy": 0.36480739482862234, |
| "eval_loss": 3.550454616546631, |
| "eval_runtime": 184.9625, |
| "eval_samples_per_second": 97.376, |
| "eval_steps_per_second": 6.088, |
| "step": 22000 |
| }, |
| { |
| "epoch": 2.3732644494672264, |
| "grad_norm": 0.5367729067802429, |
| "learning_rate": 0.0004581489063678482, |
| "loss": 3.5564, |
| "step": 22050 |
| }, |
| { |
| "epoch": 2.3786460015068345, |
| "grad_norm": 0.6676405072212219, |
| "learning_rate": 0.00045782566533778685, |
| "loss": 3.5521, |
| "step": 22100 |
| }, |
| { |
| "epoch": 2.384027553546443, |
| "grad_norm": 0.5662420988082886, |
| "learning_rate": 0.00045750242430772544, |
| "loss": 3.5702, |
| "step": 22150 |
| }, |
| { |
| "epoch": 2.389409105586051, |
| "grad_norm": 0.5759710073471069, |
| "learning_rate": 0.000457179183277664, |
| "loss": 3.5667, |
| "step": 22200 |
| }, |
| { |
| "epoch": 2.3947906576256592, |
| "grad_norm": 0.5874856114387512, |
| "learning_rate": 0.0004568559422476026, |
| "loss": 3.5669, |
| "step": 22250 |
| }, |
| { |
| "epoch": 2.4001722096652673, |
| "grad_norm": 0.6054186820983887, |
| "learning_rate": 0.00045653270121754117, |
| "loss": 3.5455, |
| "step": 22300 |
| }, |
| { |
| "epoch": 2.4055537617048754, |
| "grad_norm": 0.5471639037132263, |
| "learning_rate": 0.0004562094601874797, |
| "loss": 3.5433, |
| "step": 22350 |
| }, |
| { |
| "epoch": 2.410935313744484, |
| "grad_norm": 0.5708964467048645, |
| "learning_rate": 0.00045588621915741836, |
| "loss": 3.5434, |
| "step": 22400 |
| }, |
| { |
| "epoch": 2.416316865784092, |
| "grad_norm": 0.5709865093231201, |
| "learning_rate": 0.00045556297812735696, |
| "loss": 3.5641, |
| "step": 22450 |
| }, |
| { |
| "epoch": 2.4216984178237, |
| "grad_norm": 0.608943521976471, |
| "learning_rate": 0.0004552397370972955, |
| "loss": 3.5373, |
| "step": 22500 |
| }, |
| { |
| "epoch": 2.4270799698633088, |
| "grad_norm": 0.5688168406486511, |
| "learning_rate": 0.0004549164960672341, |
| "loss": 3.5529, |
| "step": 22550 |
| }, |
| { |
| "epoch": 2.432461521902917, |
| "grad_norm": 0.5552303791046143, |
| "learning_rate": 0.00045459325503717263, |
| "loss": 3.5403, |
| "step": 22600 |
| }, |
| { |
| "epoch": 2.437843073942525, |
| "grad_norm": 0.5680282711982727, |
| "learning_rate": 0.0004542700140071113, |
| "loss": 3.5457, |
| "step": 22650 |
| }, |
| { |
| "epoch": 2.443224625982133, |
| "grad_norm": 0.584951639175415, |
| "learning_rate": 0.0004539467729770499, |
| "loss": 3.5575, |
| "step": 22700 |
| }, |
| { |
| "epoch": 2.4486061780217416, |
| "grad_norm": 0.5572960376739502, |
| "learning_rate": 0.0004536235319469884, |
| "loss": 3.5445, |
| "step": 22750 |
| }, |
| { |
| "epoch": 2.4539877300613497, |
| "grad_norm": 0.5390388369560242, |
| "learning_rate": 0.000453300290916927, |
| "loss": 3.5575, |
| "step": 22800 |
| }, |
| { |
| "epoch": 2.459369282100958, |
| "grad_norm": 0.6123374700546265, |
| "learning_rate": 0.0004529770498868656, |
| "loss": 3.5767, |
| "step": 22850 |
| }, |
| { |
| "epoch": 2.464750834140566, |
| "grad_norm": 0.5894783139228821, |
| "learning_rate": 0.00045265380885680414, |
| "loss": 3.5542, |
| "step": 22900 |
| }, |
| { |
| "epoch": 2.4701323861801745, |
| "grad_norm": 0.5763325095176697, |
| "learning_rate": 0.0004523305678267428, |
| "loss": 3.5624, |
| "step": 22950 |
| }, |
| { |
| "epoch": 2.4755139382197826, |
| "grad_norm": 0.5496013760566711, |
| "learning_rate": 0.0004520073267966814, |
| "loss": 3.5787, |
| "step": 23000 |
| }, |
| { |
| "epoch": 2.4755139382197826, |
| "eval_accuracy": 0.3661909807020519, |
| "eval_loss": 3.5384585857391357, |
| "eval_runtime": 184.7612, |
| "eval_samples_per_second": 97.483, |
| "eval_steps_per_second": 6.094, |
| "step": 23000 |
| }, |
| { |
| "epoch": 2.4808954902593907, |
| "grad_norm": 0.6314433217048645, |
| "learning_rate": 0.00045168408576661993, |
| "loss": 3.538, |
| "step": 23050 |
| }, |
| { |
| "epoch": 2.4862770422989993, |
| "grad_norm": 0.5926424860954285, |
| "learning_rate": 0.0004513608447365585, |
| "loss": 3.5478, |
| "step": 23100 |
| }, |
| { |
| "epoch": 2.4916585943386074, |
| "grad_norm": 0.5751678943634033, |
| "learning_rate": 0.00045103760370649706, |
| "loss": 3.5523, |
| "step": 23150 |
| }, |
| { |
| "epoch": 2.4970401463782155, |
| "grad_norm": 0.5919656753540039, |
| "learning_rate": 0.00045071436267643566, |
| "loss": 3.5424, |
| "step": 23200 |
| }, |
| { |
| "epoch": 2.5024216984178236, |
| "grad_norm": 0.5607296228408813, |
| "learning_rate": 0.0004503911216463743, |
| "loss": 3.5337, |
| "step": 23250 |
| }, |
| { |
| "epoch": 2.5078032504574317, |
| "grad_norm": 0.5334439873695374, |
| "learning_rate": 0.00045006788061631285, |
| "loss": 3.5375, |
| "step": 23300 |
| }, |
| { |
| "epoch": 2.5131848024970402, |
| "grad_norm": 0.5940549969673157, |
| "learning_rate": 0.00044974463958625144, |
| "loss": 3.5429, |
| "step": 23350 |
| }, |
| { |
| "epoch": 2.5185663545366483, |
| "grad_norm": 0.5540521740913391, |
| "learning_rate": 0.00044942139855619004, |
| "loss": 3.5407, |
| "step": 23400 |
| }, |
| { |
| "epoch": 2.5239479065762565, |
| "grad_norm": 0.6090623140335083, |
| "learning_rate": 0.0004490981575261286, |
| "loss": 3.5343, |
| "step": 23450 |
| }, |
| { |
| "epoch": 2.529329458615865, |
| "grad_norm": 0.5577390789985657, |
| "learning_rate": 0.0004487749164960672, |
| "loss": 3.5486, |
| "step": 23500 |
| }, |
| { |
| "epoch": 2.534711010655473, |
| "grad_norm": 0.5548477172851562, |
| "learning_rate": 0.0004484516754660058, |
| "loss": 3.5276, |
| "step": 23550 |
| }, |
| { |
| "epoch": 2.540092562695081, |
| "grad_norm": 0.6063026189804077, |
| "learning_rate": 0.00044812843443594436, |
| "loss": 3.5385, |
| "step": 23600 |
| }, |
| { |
| "epoch": 2.5454741147346893, |
| "grad_norm": 0.5368787050247192, |
| "learning_rate": 0.00044780519340588296, |
| "loss": 3.5378, |
| "step": 23650 |
| }, |
| { |
| "epoch": 2.550855666774298, |
| "grad_norm": 0.6063023805618286, |
| "learning_rate": 0.0004474819523758215, |
| "loss": 3.5421, |
| "step": 23700 |
| }, |
| { |
| "epoch": 2.556237218813906, |
| "grad_norm": 0.5546424984931946, |
| "learning_rate": 0.0004471587113457601, |
| "loss": 3.5394, |
| "step": 23750 |
| }, |
| { |
| "epoch": 2.561618770853514, |
| "grad_norm": 0.5690875053405762, |
| "learning_rate": 0.00044683547031569874, |
| "loss": 3.5454, |
| "step": 23800 |
| }, |
| { |
| "epoch": 2.567000322893122, |
| "grad_norm": 0.5386408567428589, |
| "learning_rate": 0.0004465186941062385, |
| "loss": 3.5472, |
| "step": 23850 |
| }, |
| { |
| "epoch": 2.5723818749327307, |
| "grad_norm": 0.5763099193572998, |
| "learning_rate": 0.00044619545307617714, |
| "loss": 3.5556, |
| "step": 23900 |
| }, |
| { |
| "epoch": 2.577763426972339, |
| "grad_norm": 0.5599791407585144, |
| "learning_rate": 0.0004458722120461157, |
| "loss": 3.5531, |
| "step": 23950 |
| }, |
| { |
| "epoch": 2.583144979011947, |
| "grad_norm": 0.5776103138923645, |
| "learning_rate": 0.0004455489710160543, |
| "loss": 3.5471, |
| "step": 24000 |
| }, |
| { |
| "epoch": 2.583144979011947, |
| "eval_accuracy": 0.3669588306602369, |
| "eval_loss": 3.5304083824157715, |
| "eval_runtime": 185.0981, |
| "eval_samples_per_second": 97.305, |
| "eval_steps_per_second": 6.083, |
| "step": 24000 |
| }, |
| { |
| "epoch": 2.5885265310515555, |
| "grad_norm": 0.5959745049476624, |
| "learning_rate": 0.0004452257299859928, |
| "loss": 3.5302, |
| "step": 24050 |
| }, |
| { |
| "epoch": 2.5939080830911636, |
| "grad_norm": 0.5823758840560913, |
| "learning_rate": 0.0004449024889559314, |
| "loss": 3.5519, |
| "step": 24100 |
| }, |
| { |
| "epoch": 2.5992896351307717, |
| "grad_norm": 0.5374528765678406, |
| "learning_rate": 0.00044457924792587, |
| "loss": 3.541, |
| "step": 24150 |
| }, |
| { |
| "epoch": 2.60467118717038, |
| "grad_norm": 0.5732862949371338, |
| "learning_rate": 0.0004442560068958086, |
| "loss": 3.5239, |
| "step": 24200 |
| }, |
| { |
| "epoch": 2.610052739209988, |
| "grad_norm": 0.5892502069473267, |
| "learning_rate": 0.0004439327658657472, |
| "loss": 3.5252, |
| "step": 24250 |
| }, |
| { |
| "epoch": 2.6154342912495965, |
| "grad_norm": 0.572582483291626, |
| "learning_rate": 0.0004436095248356858, |
| "loss": 3.5351, |
| "step": 24300 |
| }, |
| { |
| "epoch": 2.6208158432892046, |
| "grad_norm": 0.5686654448509216, |
| "learning_rate": 0.00044328628380562433, |
| "loss": 3.5611, |
| "step": 24350 |
| }, |
| { |
| "epoch": 2.6261973953288127, |
| "grad_norm": 0.6537500619888306, |
| "learning_rate": 0.0004429630427755629, |
| "loss": 3.5398, |
| "step": 24400 |
| }, |
| { |
| "epoch": 2.6315789473684212, |
| "grad_norm": 0.5685673356056213, |
| "learning_rate": 0.0004426398017455016, |
| "loss": 3.5416, |
| "step": 24450 |
| }, |
| { |
| "epoch": 2.6369604994080293, |
| "grad_norm": 0.5964141488075256, |
| "learning_rate": 0.0004423165607154401, |
| "loss": 3.5345, |
| "step": 24500 |
| }, |
| { |
| "epoch": 2.6423420514476375, |
| "grad_norm": 0.5742844343185425, |
| "learning_rate": 0.0004419933196853787, |
| "loss": 3.5365, |
| "step": 24550 |
| }, |
| { |
| "epoch": 2.6477236034872456, |
| "grad_norm": 0.6200198531150818, |
| "learning_rate": 0.00044167007865531725, |
| "loss": 3.5483, |
| "step": 24600 |
| }, |
| { |
| "epoch": 2.653105155526854, |
| "grad_norm": 0.5747848153114319, |
| "learning_rate": 0.00044134683762525584, |
| "loss": 3.5481, |
| "step": 24650 |
| }, |
| { |
| "epoch": 2.658486707566462, |
| "grad_norm": 0.5651057958602905, |
| "learning_rate": 0.00044102359659519444, |
| "loss": 3.5375, |
| "step": 24700 |
| }, |
| { |
| "epoch": 2.6638682596060703, |
| "grad_norm": 0.6583673357963562, |
| "learning_rate": 0.00044070035556513303, |
| "loss": 3.5528, |
| "step": 24750 |
| }, |
| { |
| "epoch": 2.6692498116456784, |
| "grad_norm": 0.6136596202850342, |
| "learning_rate": 0.00044037711453507163, |
| "loss": 3.5511, |
| "step": 24800 |
| }, |
| { |
| "epoch": 2.674631363685287, |
| "grad_norm": 0.534138560295105, |
| "learning_rate": 0.0004400538735050102, |
| "loss": 3.5538, |
| "step": 24850 |
| }, |
| { |
| "epoch": 2.680012915724895, |
| "grad_norm": 0.5683159828186035, |
| "learning_rate": 0.00043973063247494876, |
| "loss": 3.5404, |
| "step": 24900 |
| }, |
| { |
| "epoch": 2.685394467764503, |
| "grad_norm": 0.5965563058853149, |
| "learning_rate": 0.00043940739144488736, |
| "loss": 3.5361, |
| "step": 24950 |
| }, |
| { |
| "epoch": 2.6907760198041117, |
| "grad_norm": 0.6091514825820923, |
| "learning_rate": 0.0004390841504148259, |
| "loss": 3.5407, |
| "step": 25000 |
| }, |
| { |
| "epoch": 2.6907760198041117, |
| "eval_accuracy": 0.368015914605784, |
| "eval_loss": 3.5202255249023438, |
| "eval_runtime": 184.4748, |
| "eval_samples_per_second": 97.634, |
| "eval_steps_per_second": 6.104, |
| "step": 25000 |
| }, |
| { |
| "epoch": 2.69615757184372, |
| "grad_norm": 0.5996729135513306, |
| "learning_rate": 0.00043876090938476455, |
| "loss": 3.5246, |
| "step": 25050 |
| }, |
| { |
| "epoch": 2.701539123883328, |
| "grad_norm": 0.5679774284362793, |
| "learning_rate": 0.00043843766835470314, |
| "loss": 3.5314, |
| "step": 25100 |
| }, |
| { |
| "epoch": 2.706920675922936, |
| "grad_norm": 0.6458651423454285, |
| "learning_rate": 0.0004381144273246417, |
| "loss": 3.5488, |
| "step": 25150 |
| }, |
| { |
| "epoch": 2.712302227962544, |
| "grad_norm": 0.5411641597747803, |
| "learning_rate": 0.0004377911862945803, |
| "loss": 3.5364, |
| "step": 25200 |
| }, |
| { |
| "epoch": 2.7176837800021527, |
| "grad_norm": 0.5480161309242249, |
| "learning_rate": 0.00043746794526451887, |
| "loss": 3.5359, |
| "step": 25250 |
| }, |
| { |
| "epoch": 2.723065332041761, |
| "grad_norm": 0.5879181623458862, |
| "learning_rate": 0.00043714470423445747, |
| "loss": 3.5329, |
| "step": 25300 |
| }, |
| { |
| "epoch": 2.728446884081369, |
| "grad_norm": 0.5946080088615417, |
| "learning_rate": 0.00043682146320439606, |
| "loss": 3.54, |
| "step": 25350 |
| }, |
| { |
| "epoch": 2.7338284361209775, |
| "grad_norm": 0.5855832099914551, |
| "learning_rate": 0.00043649822217433466, |
| "loss": 3.5489, |
| "step": 25400 |
| }, |
| { |
| "epoch": 2.7392099881605856, |
| "grad_norm": 0.5638436675071716, |
| "learning_rate": 0.0004361749811442732, |
| "loss": 3.5306, |
| "step": 25450 |
| }, |
| { |
| "epoch": 2.7445915402001937, |
| "grad_norm": 0.5559232831001282, |
| "learning_rate": 0.0004358517401142118, |
| "loss": 3.5255, |
| "step": 25500 |
| }, |
| { |
| "epoch": 2.749973092239802, |
| "grad_norm": 0.5973818302154541, |
| "learning_rate": 0.00043552849908415033, |
| "loss": 3.5437, |
| "step": 25550 |
| }, |
| { |
| "epoch": 2.7553546442794103, |
| "grad_norm": 0.5868799090385437, |
| "learning_rate": 0.000435205258054089, |
| "loss": 3.5466, |
| "step": 25600 |
| }, |
| { |
| "epoch": 2.7607361963190185, |
| "grad_norm": 0.6294220089912415, |
| "learning_rate": 0.0004348820170240276, |
| "loss": 3.543, |
| "step": 25650 |
| }, |
| { |
| "epoch": 2.7661177483586266, |
| "grad_norm": 0.5231223702430725, |
| "learning_rate": 0.0004345587759939661, |
| "loss": 3.5294, |
| "step": 25700 |
| }, |
| { |
| "epoch": 2.7714993003982347, |
| "grad_norm": 0.634873628616333, |
| "learning_rate": 0.0004342355349639047, |
| "loss": 3.537, |
| "step": 25750 |
| }, |
| { |
| "epoch": 2.776880852437843, |
| "grad_norm": 0.5652748346328735, |
| "learning_rate": 0.0004339122939338433, |
| "loss": 3.5232, |
| "step": 25800 |
| }, |
| { |
| "epoch": 2.7822624044774513, |
| "grad_norm": 0.6224278807640076, |
| "learning_rate": 0.00043358905290378184, |
| "loss": 3.5462, |
| "step": 25850 |
| }, |
| { |
| "epoch": 2.7876439565170594, |
| "grad_norm": 0.6616294980049133, |
| "learning_rate": 0.0004332658118737205, |
| "loss": 3.5231, |
| "step": 25900 |
| }, |
| { |
| "epoch": 2.793025508556668, |
| "grad_norm": 0.566481351852417, |
| "learning_rate": 0.0004329425708436591, |
| "loss": 3.5384, |
| "step": 25950 |
| }, |
| { |
| "epoch": 2.798407060596276, |
| "grad_norm": 0.5798117518424988, |
| "learning_rate": 0.00043261932981359763, |
| "loss": 3.5461, |
| "step": 26000 |
| }, |
| { |
| "epoch": 2.798407060596276, |
| "eval_accuracy": 0.3685674366643303, |
| "eval_loss": 3.5130412578582764, |
| "eval_runtime": 184.8813, |
| "eval_samples_per_second": 97.419, |
| "eval_steps_per_second": 6.09, |
| "step": 26000 |
| }, |
| { |
| "epoch": 2.803788612635884, |
| "grad_norm": 0.5411988496780396, |
| "learning_rate": 0.0004322960887835362, |
| "loss": 3.5413, |
| "step": 26050 |
| }, |
| { |
| "epoch": 2.8091701646754923, |
| "grad_norm": 0.554651141166687, |
| "learning_rate": 0.00043197284775347476, |
| "loss": 3.5336, |
| "step": 26100 |
| }, |
| { |
| "epoch": 2.8145517167151004, |
| "grad_norm": 0.5864933133125305, |
| "learning_rate": 0.00043164960672341336, |
| "loss": 3.5489, |
| "step": 26150 |
| }, |
| { |
| "epoch": 2.819933268754709, |
| "grad_norm": 0.549628734588623, |
| "learning_rate": 0.000431326365693352, |
| "loss": 3.534, |
| "step": 26200 |
| }, |
| { |
| "epoch": 2.825314820794317, |
| "grad_norm": 0.5752499103546143, |
| "learning_rate": 0.00043100312466329055, |
| "loss": 3.5373, |
| "step": 26250 |
| }, |
| { |
| "epoch": 2.830696372833925, |
| "grad_norm": 0.6056687831878662, |
| "learning_rate": 0.00043067988363322914, |
| "loss": 3.5467, |
| "step": 26300 |
| }, |
| { |
| "epoch": 2.8360779248735337, |
| "grad_norm": 0.5557623505592346, |
| "learning_rate": 0.00043035664260316774, |
| "loss": 3.5347, |
| "step": 26350 |
| }, |
| { |
| "epoch": 2.841459476913142, |
| "grad_norm": 0.5566017031669617, |
| "learning_rate": 0.00043003986639370754, |
| "loss": 3.5568, |
| "step": 26400 |
| }, |
| { |
| "epoch": 2.84684102895275, |
| "grad_norm": 0.6013063192367554, |
| "learning_rate": 0.0004297166253636461, |
| "loss": 3.5329, |
| "step": 26450 |
| }, |
| { |
| "epoch": 2.852222580992358, |
| "grad_norm": 0.560697078704834, |
| "learning_rate": 0.0004293933843335847, |
| "loss": 3.5274, |
| "step": 26500 |
| }, |
| { |
| "epoch": 2.857604133031966, |
| "grad_norm": 0.5921282768249512, |
| "learning_rate": 0.00042907014330352333, |
| "loss": 3.538, |
| "step": 26550 |
| }, |
| { |
| "epoch": 2.8629856850715747, |
| "grad_norm": 0.5770512223243713, |
| "learning_rate": 0.00042874690227346187, |
| "loss": 3.5341, |
| "step": 26600 |
| }, |
| { |
| "epoch": 2.868367237111183, |
| "grad_norm": 0.5724422931671143, |
| "learning_rate": 0.00042842366124340046, |
| "loss": 3.5228, |
| "step": 26650 |
| }, |
| { |
| "epoch": 2.873748789150791, |
| "grad_norm": 0.593283474445343, |
| "learning_rate": 0.00042810042021333906, |
| "loss": 3.535, |
| "step": 26700 |
| }, |
| { |
| "epoch": 2.8791303411903995, |
| "grad_norm": 0.5896148681640625, |
| "learning_rate": 0.0004277771791832776, |
| "loss": 3.5122, |
| "step": 26750 |
| }, |
| { |
| "epoch": 2.8845118932300076, |
| "grad_norm": 0.5596243143081665, |
| "learning_rate": 0.0004274539381532162, |
| "loss": 3.5354, |
| "step": 26800 |
| }, |
| { |
| "epoch": 2.8898934452696157, |
| "grad_norm": 0.592008650302887, |
| "learning_rate": 0.00042713069712315484, |
| "loss": 3.5354, |
| "step": 26850 |
| }, |
| { |
| "epoch": 2.895274997309224, |
| "grad_norm": 0.5777035355567932, |
| "learning_rate": 0.0004268074560930934, |
| "loss": 3.5294, |
| "step": 26900 |
| }, |
| { |
| "epoch": 2.9006565493488323, |
| "grad_norm": 0.6243908405303955, |
| "learning_rate": 0.000426484215063032, |
| "loss": 3.5134, |
| "step": 26950 |
| }, |
| { |
| "epoch": 2.9060381013884404, |
| "grad_norm": 0.5806100368499756, |
| "learning_rate": 0.0004261609740329705, |
| "loss": 3.5291, |
| "step": 27000 |
| }, |
| { |
| "epoch": 2.9060381013884404, |
| "eval_accuracy": 0.36983183031982736, |
| "eval_loss": 3.504441738128662, |
| "eval_runtime": 185.054, |
| "eval_samples_per_second": 97.328, |
| "eval_steps_per_second": 6.085, |
| "step": 27000 |
| }, |
| { |
| "epoch": 2.9114196534280485, |
| "grad_norm": 0.6097036600112915, |
| "learning_rate": 0.0004258377330029091, |
| "loss": 3.5165, |
| "step": 27050 |
| }, |
| { |
| "epoch": 2.9168012054676566, |
| "grad_norm": 0.6032792925834656, |
| "learning_rate": 0.00042551449197284776, |
| "loss": 3.543, |
| "step": 27100 |
| }, |
| { |
| "epoch": 2.922182757507265, |
| "grad_norm": 0.5959144830703735, |
| "learning_rate": 0.0004251912509427863, |
| "loss": 3.5216, |
| "step": 27150 |
| }, |
| { |
| "epoch": 2.9275643095468733, |
| "grad_norm": 0.6524012684822083, |
| "learning_rate": 0.0004248680099127249, |
| "loss": 3.5105, |
| "step": 27200 |
| }, |
| { |
| "epoch": 2.9329458615864814, |
| "grad_norm": 0.5885307788848877, |
| "learning_rate": 0.0004245447688826635, |
| "loss": 3.5274, |
| "step": 27250 |
| }, |
| { |
| "epoch": 2.93832741362609, |
| "grad_norm": 0.6017599105834961, |
| "learning_rate": 0.00042422152785260203, |
| "loss": 3.5332, |
| "step": 27300 |
| }, |
| { |
| "epoch": 2.943708965665698, |
| "grad_norm": 0.6001046895980835, |
| "learning_rate": 0.0004238982868225406, |
| "loss": 3.5223, |
| "step": 27350 |
| }, |
| { |
| "epoch": 2.949090517705306, |
| "grad_norm": 0.5998550057411194, |
| "learning_rate": 0.0004235750457924793, |
| "loss": 3.5066, |
| "step": 27400 |
| }, |
| { |
| "epoch": 2.9544720697449143, |
| "grad_norm": 0.6128471493721008, |
| "learning_rate": 0.0004232518047624178, |
| "loss": 3.5448, |
| "step": 27450 |
| }, |
| { |
| "epoch": 2.9598536217845224, |
| "grad_norm": 0.5720791816711426, |
| "learning_rate": 0.0004229285637323564, |
| "loss": 3.5259, |
| "step": 27500 |
| }, |
| { |
| "epoch": 2.965235173824131, |
| "grad_norm": 0.5852633118629456, |
| "learning_rate": 0.00042260532270229495, |
| "loss": 3.5241, |
| "step": 27550 |
| }, |
| { |
| "epoch": 2.970616725863739, |
| "grad_norm": 0.6088577508926392, |
| "learning_rate": 0.00042228208167223354, |
| "loss": 3.518, |
| "step": 27600 |
| }, |
| { |
| "epoch": 2.975998277903347, |
| "grad_norm": 0.5983795523643494, |
| "learning_rate": 0.00042195884064217214, |
| "loss": 3.5264, |
| "step": 27650 |
| }, |
| { |
| "epoch": 2.9813798299429557, |
| "grad_norm": 0.5886030197143555, |
| "learning_rate": 0.00042163559961211073, |
| "loss": 3.5261, |
| "step": 27700 |
| }, |
| { |
| "epoch": 2.986761381982564, |
| "grad_norm": 0.612271249294281, |
| "learning_rate": 0.00042131235858204933, |
| "loss": 3.5311, |
| "step": 27750 |
| }, |
| { |
| "epoch": 2.992142934022172, |
| "grad_norm": 0.6254613399505615, |
| "learning_rate": 0.0004209891175519879, |
| "loss": 3.5097, |
| "step": 27800 |
| }, |
| { |
| "epoch": 2.9975244860617805, |
| "grad_norm": 0.6479254364967346, |
| "learning_rate": 0.00042066587652192646, |
| "loss": 3.5238, |
| "step": 27850 |
| }, |
| { |
| "epoch": 3.0029060381013886, |
| "grad_norm": 0.6095746755599976, |
| "learning_rate": 0.00042034263549186506, |
| "loss": 3.4804, |
| "step": 27900 |
| }, |
| { |
| "epoch": 3.0082875901409967, |
| "grad_norm": 0.6377231478691101, |
| "learning_rate": 0.0004200193944618036, |
| "loss": 3.4438, |
| "step": 27950 |
| }, |
| { |
| "epoch": 3.0136691421806048, |
| "grad_norm": 0.5965224504470825, |
| "learning_rate": 0.00041969615343174225, |
| "loss": 3.4265, |
| "step": 28000 |
| }, |
| { |
| "epoch": 3.0136691421806048, |
| "eval_accuracy": 0.3701457285127234, |
| "eval_loss": 3.5000498294830322, |
| "eval_runtime": 184.7752, |
| "eval_samples_per_second": 97.475, |
| "eval_steps_per_second": 6.094, |
| "step": 28000 |
| }, |
| { |
| "epoch": 3.0190506942202133, |
| "grad_norm": 0.5932812690734863, |
| "learning_rate": 0.00041937291240168084, |
| "loss": 3.4383, |
| "step": 28050 |
| }, |
| { |
| "epoch": 3.0244322462598214, |
| "grad_norm": 0.6348116993904114, |
| "learning_rate": 0.0004190496713716194, |
| "loss": 3.4435, |
| "step": 28100 |
| }, |
| { |
| "epoch": 3.0298137982994295, |
| "grad_norm": 0.623078465461731, |
| "learning_rate": 0.000418726430341558, |
| "loss": 3.4472, |
| "step": 28150 |
| }, |
| { |
| "epoch": 3.0351953503390376, |
| "grad_norm": 0.5842487215995789, |
| "learning_rate": 0.00041840318931149657, |
| "loss": 3.4284, |
| "step": 28200 |
| }, |
| { |
| "epoch": 3.040576902378646, |
| "grad_norm": 0.6037656664848328, |
| "learning_rate": 0.00041807994828143517, |
| "loss": 3.4273, |
| "step": 28250 |
| }, |
| { |
| "epoch": 3.0459584544182543, |
| "grad_norm": 0.5624446868896484, |
| "learning_rate": 0.00041775670725137376, |
| "loss": 3.4341, |
| "step": 28300 |
| }, |
| { |
| "epoch": 3.0513400064578624, |
| "grad_norm": 0.6236454248428345, |
| "learning_rate": 0.00041743346622131236, |
| "loss": 3.4384, |
| "step": 28350 |
| }, |
| { |
| "epoch": 3.0567215584974705, |
| "grad_norm": 0.5729033350944519, |
| "learning_rate": 0.0004171102251912509, |
| "loss": 3.4509, |
| "step": 28400 |
| }, |
| { |
| "epoch": 3.062103110537079, |
| "grad_norm": 0.570470929145813, |
| "learning_rate": 0.0004167869841611895, |
| "loss": 3.4454, |
| "step": 28450 |
| }, |
| { |
| "epoch": 3.067484662576687, |
| "grad_norm": 0.5834895372390747, |
| "learning_rate": 0.00041646374313112803, |
| "loss": 3.4471, |
| "step": 28500 |
| }, |
| { |
| "epoch": 3.0728662146162953, |
| "grad_norm": 0.6381838917732239, |
| "learning_rate": 0.0004161405021010667, |
| "loss": 3.4475, |
| "step": 28550 |
| }, |
| { |
| "epoch": 3.0782477666559034, |
| "grad_norm": 0.611271321773529, |
| "learning_rate": 0.00041582372589160643, |
| "loss": 3.452, |
| "step": 28600 |
| }, |
| { |
| "epoch": 3.083629318695512, |
| "grad_norm": 0.6530306339263916, |
| "learning_rate": 0.0004155004848615451, |
| "loss": 3.4317, |
| "step": 28650 |
| }, |
| { |
| "epoch": 3.08901087073512, |
| "grad_norm": 0.6146801710128784, |
| "learning_rate": 0.0004151772438314837, |
| "loss": 3.4379, |
| "step": 28700 |
| }, |
| { |
| "epoch": 3.094392422774728, |
| "grad_norm": 0.5884636640548706, |
| "learning_rate": 0.0004148540028014222, |
| "loss": 3.4381, |
| "step": 28750 |
| }, |
| { |
| "epoch": 3.0997739748143363, |
| "grad_norm": 0.6561384797096252, |
| "learning_rate": 0.0004145307617713608, |
| "loss": 3.4375, |
| "step": 28800 |
| }, |
| { |
| "epoch": 3.105155526853945, |
| "grad_norm": 0.5567984580993652, |
| "learning_rate": 0.00041420752074129935, |
| "loss": 3.4381, |
| "step": 28850 |
| }, |
| { |
| "epoch": 3.110537078893553, |
| "grad_norm": 0.583527684211731, |
| "learning_rate": 0.000413884279711238, |
| "loss": 3.4351, |
| "step": 28900 |
| }, |
| { |
| "epoch": 3.115918630933161, |
| "grad_norm": 0.580026388168335, |
| "learning_rate": 0.0004135610386811766, |
| "loss": 3.443, |
| "step": 28950 |
| }, |
| { |
| "epoch": 3.121300182972769, |
| "grad_norm": 0.6263661980628967, |
| "learning_rate": 0.00041323779765111514, |
| "loss": 3.4472, |
| "step": 29000 |
| }, |
| { |
| "epoch": 3.121300182972769, |
| "eval_accuracy": 0.37119868364853353, |
| "eval_loss": 3.4976308345794678, |
| "eval_runtime": 184.8903, |
| "eval_samples_per_second": 97.415, |
| "eval_steps_per_second": 6.09, |
| "step": 29000 |
| }, |
| { |
| "epoch": 3.1266817350123777, |
| "grad_norm": 0.5766123533248901, |
| "learning_rate": 0.00041291455662105373, |
| "loss": 3.4438, |
| "step": 29050 |
| }, |
| { |
| "epoch": 3.132063287051986, |
| "grad_norm": 0.5877207517623901, |
| "learning_rate": 0.0004125913155909923, |
| "loss": 3.4321, |
| "step": 29100 |
| }, |
| { |
| "epoch": 3.137444839091594, |
| "grad_norm": 0.6023469567298889, |
| "learning_rate": 0.00041226807456093087, |
| "loss": 3.4396, |
| "step": 29150 |
| }, |
| { |
| "epoch": 3.1428263911312024, |
| "grad_norm": 0.5752248167991638, |
| "learning_rate": 0.0004119448335308695, |
| "loss": 3.4291, |
| "step": 29200 |
| }, |
| { |
| "epoch": 3.1482079431708105, |
| "grad_norm": 0.5830261707305908, |
| "learning_rate": 0.0004116215925008081, |
| "loss": 3.4434, |
| "step": 29250 |
| }, |
| { |
| "epoch": 3.1535894952104186, |
| "grad_norm": 0.6252485513687134, |
| "learning_rate": 0.00041129835147074665, |
| "loss": 3.458, |
| "step": 29300 |
| }, |
| { |
| "epoch": 3.1589710472500268, |
| "grad_norm": 0.6096091866493225, |
| "learning_rate": 0.00041097511044068524, |
| "loss": 3.4718, |
| "step": 29350 |
| }, |
| { |
| "epoch": 3.1643525992896353, |
| "grad_norm": 0.6204565167427063, |
| "learning_rate": 0.0004106518694106238, |
| "loss": 3.4728, |
| "step": 29400 |
| }, |
| { |
| "epoch": 3.1697341513292434, |
| "grad_norm": 0.6110973954200745, |
| "learning_rate": 0.0004103286283805624, |
| "loss": 3.4484, |
| "step": 29450 |
| }, |
| { |
| "epoch": 3.1751157033688515, |
| "grad_norm": 0.6513490080833435, |
| "learning_rate": 0.00041000538735050103, |
| "loss": 3.4548, |
| "step": 29500 |
| }, |
| { |
| "epoch": 3.1804972554084596, |
| "grad_norm": 0.6093899607658386, |
| "learning_rate": 0.00040968214632043957, |
| "loss": 3.4613, |
| "step": 29550 |
| }, |
| { |
| "epoch": 3.185878807448068, |
| "grad_norm": 0.6108792424201965, |
| "learning_rate": 0.00040935890529037816, |
| "loss": 3.4307, |
| "step": 29600 |
| }, |
| { |
| "epoch": 3.1912603594876763, |
| "grad_norm": 0.6220118999481201, |
| "learning_rate": 0.00040903566426031676, |
| "loss": 3.4599, |
| "step": 29650 |
| }, |
| { |
| "epoch": 3.1966419115272844, |
| "grad_norm": 0.5830521583557129, |
| "learning_rate": 0.0004087124232302553, |
| "loss": 3.4543, |
| "step": 29700 |
| }, |
| { |
| "epoch": 3.2020234635668925, |
| "grad_norm": 0.6141335368156433, |
| "learning_rate": 0.0004083891822001939, |
| "loss": 3.4441, |
| "step": 29750 |
| }, |
| { |
| "epoch": 3.207405015606501, |
| "grad_norm": 0.6098508238792419, |
| "learning_rate": 0.00040806594117013254, |
| "loss": 3.4538, |
| "step": 29800 |
| }, |
| { |
| "epoch": 3.212786567646109, |
| "grad_norm": 0.5632219314575195, |
| "learning_rate": 0.0004077427001400711, |
| "loss": 3.4533, |
| "step": 29850 |
| }, |
| { |
| "epoch": 3.2181681196857173, |
| "grad_norm": 0.6396539807319641, |
| "learning_rate": 0.0004074194591100097, |
| "loss": 3.4618, |
| "step": 29900 |
| }, |
| { |
| "epoch": 3.2235496717253254, |
| "grad_norm": 0.6783267855644226, |
| "learning_rate": 0.0004070962180799482, |
| "loss": 3.4458, |
| "step": 29950 |
| }, |
| { |
| "epoch": 3.228931223764934, |
| "grad_norm": 0.6168944239616394, |
| "learning_rate": 0.0004067729770498868, |
| "loss": 3.4667, |
| "step": 30000 |
| }, |
| { |
| "epoch": 3.228931223764934, |
| "eval_accuracy": 0.37168392744551304, |
| "eval_loss": 3.4887137413024902, |
| "eval_runtime": 184.9069, |
| "eval_samples_per_second": 97.406, |
| "eval_steps_per_second": 6.09, |
| "step": 30000 |
| }, |
| { |
| "epoch": 3.234312775804542, |
| "grad_norm": 0.6674462556838989, |
| "learning_rate": 0.00040644973601982546, |
| "loss": 3.4648, |
| "step": 30050 |
| }, |
| { |
| "epoch": 3.23969432784415, |
| "grad_norm": 0.5786466598510742, |
| "learning_rate": 0.000406126494989764, |
| "loss": 3.4439, |
| "step": 30100 |
| }, |
| { |
| "epoch": 3.2450758798837587, |
| "grad_norm": 0.6295211315155029, |
| "learning_rate": 0.0004058032539597026, |
| "loss": 3.4525, |
| "step": 30150 |
| }, |
| { |
| "epoch": 3.250457431923367, |
| "grad_norm": 0.659146249294281, |
| "learning_rate": 0.00040548001292964114, |
| "loss": 3.4615, |
| "step": 30200 |
| }, |
| { |
| "epoch": 3.255838983962975, |
| "grad_norm": 0.5946276187896729, |
| "learning_rate": 0.00040515677189957973, |
| "loss": 3.4612, |
| "step": 30250 |
| }, |
| { |
| "epoch": 3.261220536002583, |
| "grad_norm": 0.6724011898040771, |
| "learning_rate": 0.0004048335308695183, |
| "loss": 3.4394, |
| "step": 30300 |
| }, |
| { |
| "epoch": 3.2666020880421915, |
| "grad_norm": 0.6301489472389221, |
| "learning_rate": 0.0004045102898394569, |
| "loss": 3.4501, |
| "step": 30350 |
| }, |
| { |
| "epoch": 3.2719836400817996, |
| "grad_norm": 0.666102945804596, |
| "learning_rate": 0.0004041870488093955, |
| "loss": 3.4573, |
| "step": 30400 |
| }, |
| { |
| "epoch": 3.2773651921214078, |
| "grad_norm": 0.6429094672203064, |
| "learning_rate": 0.0004038638077793341, |
| "loss": 3.4675, |
| "step": 30450 |
| }, |
| { |
| "epoch": 3.282746744161016, |
| "grad_norm": 0.6529164910316467, |
| "learning_rate": 0.00040354056674927265, |
| "loss": 3.4523, |
| "step": 30500 |
| }, |
| { |
| "epoch": 3.2881282962006244, |
| "grad_norm": 0.593303918838501, |
| "learning_rate": 0.00040321732571921124, |
| "loss": 3.4455, |
| "step": 30550 |
| }, |
| { |
| "epoch": 3.2935098482402325, |
| "grad_norm": 0.587743878364563, |
| "learning_rate": 0.0004029070143303523, |
| "loss": 3.4432, |
| "step": 30600 |
| }, |
| { |
| "epoch": 3.2988914002798406, |
| "grad_norm": 0.6212865114212036, |
| "learning_rate": 0.00040258377330029086, |
| "loss": 3.4705, |
| "step": 30650 |
| }, |
| { |
| "epoch": 3.304272952319449, |
| "grad_norm": 0.5814943313598633, |
| "learning_rate": 0.00040226053227022945, |
| "loss": 3.4567, |
| "step": 30700 |
| }, |
| { |
| "epoch": 3.3096545043590573, |
| "grad_norm": 0.5858567953109741, |
| "learning_rate": 0.00040193729124016805, |
| "loss": 3.453, |
| "step": 30750 |
| }, |
| { |
| "epoch": 3.3150360563986654, |
| "grad_norm": 0.6212705373764038, |
| "learning_rate": 0.00040161405021010664, |
| "loss": 3.4501, |
| "step": 30800 |
| }, |
| { |
| "epoch": 3.3204176084382735, |
| "grad_norm": 0.6279528141021729, |
| "learning_rate": 0.00040129080918004524, |
| "loss": 3.4556, |
| "step": 30850 |
| }, |
| { |
| "epoch": 3.3257991604778816, |
| "grad_norm": 0.6483830213546753, |
| "learning_rate": 0.00040096756814998383, |
| "loss": 3.4473, |
| "step": 30900 |
| }, |
| { |
| "epoch": 3.33118071251749, |
| "grad_norm": 0.6192697286605835, |
| "learning_rate": 0.00040064432711992237, |
| "loss": 3.47, |
| "step": 30950 |
| }, |
| { |
| "epoch": 3.3365622645570983, |
| "grad_norm": 0.6230787038803101, |
| "learning_rate": 0.00040032108608986097, |
| "loss": 3.4857, |
| "step": 31000 |
| }, |
| { |
| "epoch": 3.3365622645570983, |
| "eval_accuracy": 0.3728218491395941, |
| "eval_loss": 3.4819231033325195, |
| "eval_runtime": 185.0123, |
| "eval_samples_per_second": 97.35, |
| "eval_steps_per_second": 6.086, |
| "step": 31000 |
| }, |
| { |
| "epoch": 3.3419438165967064, |
| "grad_norm": 0.6070190072059631, |
| "learning_rate": 0.0003999978450597995, |
| "loss": 3.4448, |
| "step": 31050 |
| }, |
| { |
| "epoch": 3.347325368636315, |
| "grad_norm": 0.638231098651886, |
| "learning_rate": 0.00039967460402973816, |
| "loss": 3.4718, |
| "step": 31100 |
| }, |
| { |
| "epoch": 3.352706920675923, |
| "grad_norm": 0.7029089331626892, |
| "learning_rate": 0.00039935136299967675, |
| "loss": 3.4692, |
| "step": 31150 |
| }, |
| { |
| "epoch": 3.358088472715531, |
| "grad_norm": 0.565448522567749, |
| "learning_rate": 0.0003990281219696153, |
| "loss": 3.4601, |
| "step": 31200 |
| }, |
| { |
| "epoch": 3.3634700247551392, |
| "grad_norm": 0.6335359811782837, |
| "learning_rate": 0.0003987048809395539, |
| "loss": 3.4573, |
| "step": 31250 |
| }, |
| { |
| "epoch": 3.368851576794748, |
| "grad_norm": 0.604525625705719, |
| "learning_rate": 0.0003983816399094925, |
| "loss": 3.4508, |
| "step": 31300 |
| }, |
| { |
| "epoch": 3.374233128834356, |
| "grad_norm": 0.634780764579773, |
| "learning_rate": 0.000398058398879431, |
| "loss": 3.4762, |
| "step": 31350 |
| }, |
| { |
| "epoch": 3.379614680873964, |
| "grad_norm": 0.636223316192627, |
| "learning_rate": 0.00039773515784936967, |
| "loss": 3.4521, |
| "step": 31400 |
| }, |
| { |
| "epoch": 3.384996232913572, |
| "grad_norm": 0.6109070181846619, |
| "learning_rate": 0.00039741191681930826, |
| "loss": 3.4659, |
| "step": 31450 |
| }, |
| { |
| "epoch": 3.3903777849531807, |
| "grad_norm": 0.6315115094184875, |
| "learning_rate": 0.0003970886757892468, |
| "loss": 3.4598, |
| "step": 31500 |
| }, |
| { |
| "epoch": 3.3957593369927888, |
| "grad_norm": 0.5966545343399048, |
| "learning_rate": 0.0003967654347591854, |
| "loss": 3.458, |
| "step": 31550 |
| }, |
| { |
| "epoch": 3.401140889032397, |
| "grad_norm": 0.6154278516769409, |
| "learning_rate": 0.00039644219372912394, |
| "loss": 3.4428, |
| "step": 31600 |
| }, |
| { |
| "epoch": 3.4065224410720054, |
| "grad_norm": 0.6580645442008972, |
| "learning_rate": 0.0003961189526990626, |
| "loss": 3.4463, |
| "step": 31650 |
| }, |
| { |
| "epoch": 3.4119039931116135, |
| "grad_norm": 0.6818550825119019, |
| "learning_rate": 0.0003957957116690012, |
| "loss": 3.4425, |
| "step": 31700 |
| }, |
| { |
| "epoch": 3.4172855451512216, |
| "grad_norm": 0.6167941689491272, |
| "learning_rate": 0.0003954724706389397, |
| "loss": 3.4583, |
| "step": 31750 |
| }, |
| { |
| "epoch": 3.4226670971908297, |
| "grad_norm": 0.6051561236381531, |
| "learning_rate": 0.0003951492296088783, |
| "loss": 3.4626, |
| "step": 31800 |
| }, |
| { |
| "epoch": 3.428048649230438, |
| "grad_norm": 0.604656457901001, |
| "learning_rate": 0.0003948259885788169, |
| "loss": 3.456, |
| "step": 31850 |
| }, |
| { |
| "epoch": 3.4334302012700464, |
| "grad_norm": 0.6075143814086914, |
| "learning_rate": 0.00039450274754875545, |
| "loss": 3.4499, |
| "step": 31900 |
| }, |
| { |
| "epoch": 3.4388117533096545, |
| "grad_norm": 0.6349363327026367, |
| "learning_rate": 0.0003941795065186941, |
| "loss": 3.4608, |
| "step": 31950 |
| }, |
| { |
| "epoch": 3.4441933053492626, |
| "grad_norm": 0.6251042485237122, |
| "learning_rate": 0.0003938562654886327, |
| "loss": 3.4601, |
| "step": 32000 |
| }, |
| { |
| "epoch": 3.4441933053492626, |
| "eval_accuracy": 0.372848251791333, |
| "eval_loss": 3.476959466934204, |
| "eval_runtime": 185.0651, |
| "eval_samples_per_second": 97.323, |
| "eval_steps_per_second": 6.084, |
| "step": 32000 |
| }, |
| { |
| "epoch": 3.449574857388871, |
| "grad_norm": 0.6375144720077515, |
| "learning_rate": 0.00039353302445857124, |
| "loss": 3.4643, |
| "step": 32050 |
| }, |
| { |
| "epoch": 3.4549564094284793, |
| "grad_norm": 0.6365067958831787, |
| "learning_rate": 0.00039320978342850983, |
| "loss": 3.4534, |
| "step": 32100 |
| }, |
| { |
| "epoch": 3.4603379614680874, |
| "grad_norm": 0.6374215483665466, |
| "learning_rate": 0.00039288654239844837, |
| "loss": 3.4464, |
| "step": 32150 |
| }, |
| { |
| "epoch": 3.4657195135076955, |
| "grad_norm": 0.6080241203308105, |
| "learning_rate": 0.00039256330136838697, |
| "loss": 3.4582, |
| "step": 32200 |
| }, |
| { |
| "epoch": 3.471101065547304, |
| "grad_norm": 0.5984114408493042, |
| "learning_rate": 0.0003922400603383256, |
| "loss": 3.456, |
| "step": 32250 |
| }, |
| { |
| "epoch": 3.476482617586912, |
| "grad_norm": 0.6118507981300354, |
| "learning_rate": 0.00039191681930826416, |
| "loss": 3.4786, |
| "step": 32300 |
| }, |
| { |
| "epoch": 3.4818641696265202, |
| "grad_norm": 0.6618357300758362, |
| "learning_rate": 0.00039159357827820275, |
| "loss": 3.4559, |
| "step": 32350 |
| }, |
| { |
| "epoch": 3.4872457216661283, |
| "grad_norm": 0.6183602809906006, |
| "learning_rate": 0.00039127033724814135, |
| "loss": 3.462, |
| "step": 32400 |
| }, |
| { |
| "epoch": 3.492627273705737, |
| "grad_norm": 0.6030476689338684, |
| "learning_rate": 0.0003909470962180799, |
| "loss": 3.4661, |
| "step": 32450 |
| }, |
| { |
| "epoch": 3.498008825745345, |
| "grad_norm": 0.6316002011299133, |
| "learning_rate": 0.00039062385518801854, |
| "loss": 3.4361, |
| "step": 32500 |
| }, |
| { |
| "epoch": 3.503390377784953, |
| "grad_norm": 0.592577338218689, |
| "learning_rate": 0.00039030061415795713, |
| "loss": 3.438, |
| "step": 32550 |
| }, |
| { |
| "epoch": 3.5087719298245617, |
| "grad_norm": 0.6018152236938477, |
| "learning_rate": 0.00038997737312789567, |
| "loss": 3.4465, |
| "step": 32600 |
| }, |
| { |
| "epoch": 3.5141534818641698, |
| "grad_norm": 0.657094419002533, |
| "learning_rate": 0.00038965413209783426, |
| "loss": 3.4449, |
| "step": 32650 |
| }, |
| { |
| "epoch": 3.519535033903778, |
| "grad_norm": 0.6502700448036194, |
| "learning_rate": 0.0003893308910677728, |
| "loss": 3.4711, |
| "step": 32700 |
| }, |
| { |
| "epoch": 3.524916585943386, |
| "grad_norm": 0.6325830817222595, |
| "learning_rate": 0.0003890076500377114, |
| "loss": 3.4545, |
| "step": 32750 |
| }, |
| { |
| "epoch": 3.530298137982994, |
| "grad_norm": 0.6116040349006653, |
| "learning_rate": 0.00038868440900765005, |
| "loss": 3.4498, |
| "step": 32800 |
| }, |
| { |
| "epoch": 3.5356796900226026, |
| "grad_norm": 0.6353474855422974, |
| "learning_rate": 0.0003883611679775886, |
| "loss": 3.4359, |
| "step": 32850 |
| }, |
| { |
| "epoch": 3.5410612420622107, |
| "grad_norm": 0.6394508481025696, |
| "learning_rate": 0.0003880379269475272, |
| "loss": 3.4538, |
| "step": 32900 |
| }, |
| { |
| "epoch": 3.546442794101819, |
| "grad_norm": 0.5920386910438538, |
| "learning_rate": 0.0003877146859174657, |
| "loss": 3.4638, |
| "step": 32950 |
| }, |
| { |
| "epoch": 3.5518243461414274, |
| "grad_norm": 0.6352172493934631, |
| "learning_rate": 0.0003873914448874043, |
| "loss": 3.4542, |
| "step": 33000 |
| }, |
| { |
| "epoch": 3.5518243461414274, |
| "eval_accuracy": 0.3739409738840832, |
| "eval_loss": 3.467642068862915, |
| "eval_runtime": 184.7948, |
| "eval_samples_per_second": 97.465, |
| "eval_steps_per_second": 6.093, |
| "step": 33000 |
| }, |
| { |
| "epoch": 3.5572058981810355, |
| "grad_norm": 0.5835293531417847, |
| "learning_rate": 0.0003870682038573429, |
| "loss": 3.4507, |
| "step": 33050 |
| }, |
| { |
| "epoch": 3.5625874502206436, |
| "grad_norm": 0.6255052089691162, |
| "learning_rate": 0.0003867449628272815, |
| "loss": 3.4522, |
| "step": 33100 |
| }, |
| { |
| "epoch": 3.5679690022602517, |
| "grad_norm": 0.6663792133331299, |
| "learning_rate": 0.0003864217217972201, |
| "loss": 3.4373, |
| "step": 33150 |
| }, |
| { |
| "epoch": 3.57335055429986, |
| "grad_norm": 0.5987926125526428, |
| "learning_rate": 0.0003860984807671587, |
| "loss": 3.4538, |
| "step": 33200 |
| }, |
| { |
| "epoch": 3.5787321063394684, |
| "grad_norm": 0.6144534945487976, |
| "learning_rate": 0.00038577523973709724, |
| "loss": 3.466, |
| "step": 33250 |
| }, |
| { |
| "epoch": 3.5841136583790765, |
| "grad_norm": 0.6252511739730835, |
| "learning_rate": 0.00038545199870703583, |
| "loss": 3.4454, |
| "step": 33300 |
| }, |
| { |
| "epoch": 3.5894952104186846, |
| "grad_norm": 0.6555866599082947, |
| "learning_rate": 0.0003851287576769744, |
| "loss": 3.455, |
| "step": 33350 |
| }, |
| { |
| "epoch": 3.594876762458293, |
| "grad_norm": 0.6036868691444397, |
| "learning_rate": 0.000384805516646913, |
| "loss": 3.4569, |
| "step": 33400 |
| }, |
| { |
| "epoch": 3.6002583144979012, |
| "grad_norm": 0.6310072541236877, |
| "learning_rate": 0.0003844822756168516, |
| "loss": 3.4375, |
| "step": 33450 |
| }, |
| { |
| "epoch": 3.6056398665375093, |
| "grad_norm": 0.6304979920387268, |
| "learning_rate": 0.00038415903458679016, |
| "loss": 3.4567, |
| "step": 33500 |
| }, |
| { |
| "epoch": 3.611021418577118, |
| "grad_norm": 0.6037408113479614, |
| "learning_rate": 0.00038383579355672875, |
| "loss": 3.4643, |
| "step": 33550 |
| }, |
| { |
| "epoch": 3.616402970616726, |
| "grad_norm": 0.6693500280380249, |
| "learning_rate": 0.00038351255252666735, |
| "loss": 3.4298, |
| "step": 33600 |
| }, |
| { |
| "epoch": 3.621784522656334, |
| "grad_norm": 0.6045053005218506, |
| "learning_rate": 0.00038318931149660594, |
| "loss": 3.4583, |
| "step": 33650 |
| }, |
| { |
| "epoch": 3.627166074695942, |
| "grad_norm": 0.5890701413154602, |
| "learning_rate": 0.00038286607046654454, |
| "loss": 3.4497, |
| "step": 33700 |
| }, |
| { |
| "epoch": 3.6325476267355503, |
| "grad_norm": 0.6815541386604309, |
| "learning_rate": 0.00038254282943648313, |
| "loss": 3.4522, |
| "step": 33750 |
| }, |
| { |
| "epoch": 3.637929178775159, |
| "grad_norm": 0.6552605032920837, |
| "learning_rate": 0.00038221958840642167, |
| "loss": 3.466, |
| "step": 33800 |
| }, |
| { |
| "epoch": 3.643310730814767, |
| "grad_norm": 0.6101818084716797, |
| "learning_rate": 0.00038189634737636027, |
| "loss": 3.4601, |
| "step": 33850 |
| }, |
| { |
| "epoch": 3.648692282854375, |
| "grad_norm": 0.636333703994751, |
| "learning_rate": 0.0003815731063462988, |
| "loss": 3.4451, |
| "step": 33900 |
| }, |
| { |
| "epoch": 3.6540738348939836, |
| "grad_norm": 0.6014490723609924, |
| "learning_rate": 0.00038124986531623745, |
| "loss": 3.4576, |
| "step": 33950 |
| }, |
| { |
| "epoch": 3.6594553869335917, |
| "grad_norm": 0.6211981773376465, |
| "learning_rate": 0.00038092662428617605, |
| "loss": 3.4527, |
| "step": 34000 |
| }, |
| { |
| "epoch": 3.6594553869335917, |
| "eval_accuracy": 0.374348748172051, |
| "eval_loss": 3.4629900455474854, |
| "eval_runtime": 184.9732, |
| "eval_samples_per_second": 97.371, |
| "eval_steps_per_second": 6.087, |
| "step": 34000 |
| }, |
| { |
| "epoch": 3.6648369389732, |
| "grad_norm": 0.65215665102005, |
| "learning_rate": 0.0003806033832561146, |
| "loss": 3.4392, |
| "step": 34050 |
| }, |
| { |
| "epoch": 3.670218491012808, |
| "grad_norm": 0.6055262684822083, |
| "learning_rate": 0.0003802801422260532, |
| "loss": 3.4235, |
| "step": 34100 |
| }, |
| { |
| "epoch": 3.675600043052416, |
| "grad_norm": 0.6137178540229797, |
| "learning_rate": 0.0003799569011959918, |
| "loss": 3.4482, |
| "step": 34150 |
| }, |
| { |
| "epoch": 3.6809815950920246, |
| "grad_norm": 0.6887802481651306, |
| "learning_rate": 0.0003796336601659303, |
| "loss": 3.4466, |
| "step": 34200 |
| }, |
| { |
| "epoch": 3.6863631471316327, |
| "grad_norm": 0.634081244468689, |
| "learning_rate": 0.00037931041913586897, |
| "loss": 3.4466, |
| "step": 34250 |
| }, |
| { |
| "epoch": 3.691744699171241, |
| "grad_norm": 0.622760534286499, |
| "learning_rate": 0.00037898717810580756, |
| "loss": 3.4431, |
| "step": 34300 |
| }, |
| { |
| "epoch": 3.6971262512108494, |
| "grad_norm": 0.616561770439148, |
| "learning_rate": 0.0003786639370757461, |
| "loss": 3.454, |
| "step": 34350 |
| }, |
| { |
| "epoch": 3.7025078032504575, |
| "grad_norm": 0.6342828273773193, |
| "learning_rate": 0.0003783406960456847, |
| "loss": 3.4736, |
| "step": 34400 |
| }, |
| { |
| "epoch": 3.7078893552900656, |
| "grad_norm": 0.7093660235404968, |
| "learning_rate": 0.00037801745501562324, |
| "loss": 3.4731, |
| "step": 34450 |
| }, |
| { |
| "epoch": 3.713270907329674, |
| "grad_norm": 0.6421014666557312, |
| "learning_rate": 0.0003776942139855619, |
| "loss": 3.4593, |
| "step": 34500 |
| }, |
| { |
| "epoch": 3.7186524593692822, |
| "grad_norm": 0.6160980463027954, |
| "learning_rate": 0.0003773709729555005, |
| "loss": 3.4535, |
| "step": 34550 |
| }, |
| { |
| "epoch": 3.7240340114088903, |
| "grad_norm": 0.6290106177330017, |
| "learning_rate": 0.000377047731925439, |
| "loss": 3.461, |
| "step": 34600 |
| }, |
| { |
| "epoch": 3.7294155634484984, |
| "grad_norm": 0.6211118698120117, |
| "learning_rate": 0.0003767244908953776, |
| "loss": 3.4576, |
| "step": 34650 |
| }, |
| { |
| "epoch": 3.7347971154881066, |
| "grad_norm": 0.6442049145698547, |
| "learning_rate": 0.0003764077146859174, |
| "loss": 3.4573, |
| "step": 34700 |
| }, |
| { |
| "epoch": 3.740178667527715, |
| "grad_norm": 0.6050986647605896, |
| "learning_rate": 0.000376084473655856, |
| "loss": 3.4425, |
| "step": 34750 |
| }, |
| { |
| "epoch": 3.745560219567323, |
| "grad_norm": 0.6353182792663574, |
| "learning_rate": 0.00037576123262579456, |
| "loss": 3.437, |
| "step": 34800 |
| }, |
| { |
| "epoch": 3.7509417716069313, |
| "grad_norm": 0.6667701005935669, |
| "learning_rate": 0.00037543799159573315, |
| "loss": 3.4571, |
| "step": 34850 |
| }, |
| { |
| "epoch": 3.75632332364654, |
| "grad_norm": 0.6093037128448486, |
| "learning_rate": 0.0003751147505656718, |
| "loss": 3.4453, |
| "step": 34900 |
| }, |
| { |
| "epoch": 3.761704875686148, |
| "grad_norm": 0.5967413783073425, |
| "learning_rate": 0.00037479150953561034, |
| "loss": 3.4659, |
| "step": 34950 |
| }, |
| { |
| "epoch": 3.767086427725756, |
| "grad_norm": 0.6182402968406677, |
| "learning_rate": 0.00037446826850554894, |
| "loss": 3.4603, |
| "step": 35000 |
| }, |
| { |
| "epoch": 3.767086427725756, |
| "eval_accuracy": 0.37532618955083, |
| "eval_loss": 3.454987049102783, |
| "eval_runtime": 184.7797, |
| "eval_samples_per_second": 97.473, |
| "eval_steps_per_second": 6.094, |
| "step": 35000 |
| }, |
| { |
| "epoch": 3.772467979765364, |
| "grad_norm": 0.6796460747718811, |
| "learning_rate": 0.00037414502747548753, |
| "loss": 3.4427, |
| "step": 35050 |
| }, |
| { |
| "epoch": 3.7778495318049723, |
| "grad_norm": 0.6759485006332397, |
| "learning_rate": 0.00037382178644542607, |
| "loss": 3.4735, |
| "step": 35100 |
| }, |
| { |
| "epoch": 3.783231083844581, |
| "grad_norm": 0.6431259512901306, |
| "learning_rate": 0.00037349854541536467, |
| "loss": 3.4669, |
| "step": 35150 |
| }, |
| { |
| "epoch": 3.788612635884189, |
| "grad_norm": 0.6208698153495789, |
| "learning_rate": 0.0003731753043853033, |
| "loss": 3.4517, |
| "step": 35200 |
| }, |
| { |
| "epoch": 3.793994187923797, |
| "grad_norm": 0.6531832218170166, |
| "learning_rate": 0.00037285206335524186, |
| "loss": 3.459, |
| "step": 35250 |
| }, |
| { |
| "epoch": 3.7993757399634056, |
| "grad_norm": 0.6968291997909546, |
| "learning_rate": 0.00037252882232518045, |
| "loss": 3.45, |
| "step": 35300 |
| }, |
| { |
| "epoch": 3.8047572920030137, |
| "grad_norm": 0.6449758410453796, |
| "learning_rate": 0.000372205581295119, |
| "loss": 3.4481, |
| "step": 35350 |
| }, |
| { |
| "epoch": 3.810138844042622, |
| "grad_norm": 0.6235898733139038, |
| "learning_rate": 0.0003718823402650576, |
| "loss": 3.4659, |
| "step": 35400 |
| }, |
| { |
| "epoch": 3.8155203960822304, |
| "grad_norm": 0.7655190825462341, |
| "learning_rate": 0.00037155909923499624, |
| "loss": 3.4616, |
| "step": 35450 |
| }, |
| { |
| "epoch": 3.8209019481218385, |
| "grad_norm": 0.6297181248664856, |
| "learning_rate": 0.0003712358582049348, |
| "loss": 3.4774, |
| "step": 35500 |
| }, |
| { |
| "epoch": 3.8262835001614466, |
| "grad_norm": 0.6621859073638916, |
| "learning_rate": 0.00037091261717487337, |
| "loss": 3.4521, |
| "step": 35550 |
| }, |
| { |
| "epoch": 3.8316650522010547, |
| "grad_norm": 0.5900010466575623, |
| "learning_rate": 0.00037058937614481197, |
| "loss": 3.447, |
| "step": 35600 |
| }, |
| { |
| "epoch": 3.837046604240663, |
| "grad_norm": 0.6074267625808716, |
| "learning_rate": 0.0003702661351147505, |
| "loss": 3.4478, |
| "step": 35650 |
| }, |
| { |
| "epoch": 3.8424281562802713, |
| "grad_norm": 0.616184413433075, |
| "learning_rate": 0.0003699428940846891, |
| "loss": 3.431, |
| "step": 35700 |
| }, |
| { |
| "epoch": 3.8478097083198795, |
| "grad_norm": 0.6139175891876221, |
| "learning_rate": 0.00036961965305462775, |
| "loss": 3.4688, |
| "step": 35750 |
| }, |
| { |
| "epoch": 3.8531912603594876, |
| "grad_norm": 0.6691179871559143, |
| "learning_rate": 0.0003692964120245663, |
| "loss": 3.4461, |
| "step": 35800 |
| }, |
| { |
| "epoch": 3.858572812399096, |
| "grad_norm": 0.6166695356369019, |
| "learning_rate": 0.0003689731709945049, |
| "loss": 3.4642, |
| "step": 35850 |
| }, |
| { |
| "epoch": 3.863954364438704, |
| "grad_norm": 0.6620895266532898, |
| "learning_rate": 0.0003686499299644434, |
| "loss": 3.4599, |
| "step": 35900 |
| }, |
| { |
| "epoch": 3.8693359164783123, |
| "grad_norm": 0.58867347240448, |
| "learning_rate": 0.000368326688934382, |
| "loss": 3.4494, |
| "step": 35950 |
| }, |
| { |
| "epoch": 3.8747174685179204, |
| "grad_norm": 0.6114178895950317, |
| "learning_rate": 0.0003680034479043206, |
| "loss": 3.4545, |
| "step": 36000 |
| }, |
| { |
| "epoch": 3.8747174685179204, |
| "eval_accuracy": 0.37561944369504136, |
| "eval_loss": 3.451740026473999, |
| "eval_runtime": 184.9248, |
| "eval_samples_per_second": 97.396, |
| "eval_steps_per_second": 6.089, |
| "step": 36000 |
| }, |
| { |
| "epoch": 3.8800990205575285, |
| "grad_norm": 0.6434826850891113, |
| "learning_rate": 0.0003676802068742592, |
| "loss": 3.4546, |
| "step": 36050 |
| }, |
| { |
| "epoch": 3.885480572597137, |
| "grad_norm": 0.6056340336799622, |
| "learning_rate": 0.0003673569658441978, |
| "loss": 3.4587, |
| "step": 36100 |
| }, |
| { |
| "epoch": 3.890862124636745, |
| "grad_norm": 0.6036874055862427, |
| "learning_rate": 0.0003670337248141364, |
| "loss": 3.4457, |
| "step": 36150 |
| }, |
| { |
| "epoch": 3.8962436766763533, |
| "grad_norm": 0.6100910305976868, |
| "learning_rate": 0.00036671048378407494, |
| "loss": 3.4508, |
| "step": 36200 |
| }, |
| { |
| "epoch": 3.901625228715962, |
| "grad_norm": 0.6239060759544373, |
| "learning_rate": 0.00036638724275401353, |
| "loss": 3.4465, |
| "step": 36250 |
| }, |
| { |
| "epoch": 3.90700678075557, |
| "grad_norm": 0.6256414651870728, |
| "learning_rate": 0.00036607046654455334, |
| "loss": 3.4362, |
| "step": 36300 |
| }, |
| { |
| "epoch": 3.912388332795178, |
| "grad_norm": 0.5970678329467773, |
| "learning_rate": 0.00036574722551449193, |
| "loss": 3.4634, |
| "step": 36350 |
| }, |
| { |
| "epoch": 3.9177698848347866, |
| "grad_norm": 0.6506580114364624, |
| "learning_rate": 0.00036542398448443053, |
| "loss": 3.4448, |
| "step": 36400 |
| }, |
| { |
| "epoch": 3.9231514368743947, |
| "grad_norm": 0.64057856798172, |
| "learning_rate": 0.0003651007434543691, |
| "loss": 3.445, |
| "step": 36450 |
| }, |
| { |
| "epoch": 3.928532988914003, |
| "grad_norm": 0.6665002703666687, |
| "learning_rate": 0.0003647775024243077, |
| "loss": 3.4366, |
| "step": 36500 |
| }, |
| { |
| "epoch": 3.933914540953611, |
| "grad_norm": 0.6165843605995178, |
| "learning_rate": 0.00036445426139424626, |
| "loss": 3.4316, |
| "step": 36550 |
| }, |
| { |
| "epoch": 3.939296092993219, |
| "grad_norm": 0.6074864268302917, |
| "learning_rate": 0.00036413102036418485, |
| "loss": 3.4501, |
| "step": 36600 |
| }, |
| { |
| "epoch": 3.9446776450328276, |
| "grad_norm": 0.7129690647125244, |
| "learning_rate": 0.0003638077793341234, |
| "loss": 3.4409, |
| "step": 36650 |
| }, |
| { |
| "epoch": 3.9500591970724357, |
| "grad_norm": 0.6536222100257874, |
| "learning_rate": 0.00036348453830406204, |
| "loss": 3.4472, |
| "step": 36700 |
| }, |
| { |
| "epoch": 3.955440749112044, |
| "grad_norm": 0.5931937098503113, |
| "learning_rate": 0.00036316129727400064, |
| "loss": 3.4439, |
| "step": 36750 |
| }, |
| { |
| "epoch": 3.9608223011516523, |
| "grad_norm": 0.6528921127319336, |
| "learning_rate": 0.0003628380562439392, |
| "loss": 3.4427, |
| "step": 36800 |
| }, |
| { |
| "epoch": 3.9662038531912605, |
| "grad_norm": 0.5929076075553894, |
| "learning_rate": 0.00036251481521387777, |
| "loss": 3.4536, |
| "step": 36850 |
| }, |
| { |
| "epoch": 3.9715854052308686, |
| "grad_norm": 0.6636005640029907, |
| "learning_rate": 0.00036219157418381637, |
| "loss": 3.4588, |
| "step": 36900 |
| }, |
| { |
| "epoch": 3.9769669572704767, |
| "grad_norm": 0.5989009737968445, |
| "learning_rate": 0.0003618683331537549, |
| "loss": 3.4483, |
| "step": 36950 |
| }, |
| { |
| "epoch": 3.9823485093100848, |
| "grad_norm": 0.6094232201576233, |
| "learning_rate": 0.00036154509212369356, |
| "loss": 3.456, |
| "step": 37000 |
| }, |
| { |
| "epoch": 3.9823485093100848, |
| "eval_accuracy": 0.37639761567756863, |
| "eval_loss": 3.444646120071411, |
| "eval_runtime": 184.7393, |
| "eval_samples_per_second": 97.494, |
| "eval_steps_per_second": 6.095, |
| "step": 37000 |
| }, |
| { |
| "epoch": 3.9877300613496933, |
| "grad_norm": 0.6560502648353577, |
| "learning_rate": 0.00036122185109363215, |
| "loss": 3.4508, |
| "step": 37050 |
| }, |
| { |
| "epoch": 3.9931116133893014, |
| "grad_norm": 0.5955866575241089, |
| "learning_rate": 0.0003608986100635707, |
| "loss": 3.4503, |
| "step": 37100 |
| }, |
| { |
| "epoch": 3.9984931654289095, |
| "grad_norm": 0.6154381036758423, |
| "learning_rate": 0.0003605753690335093, |
| "loss": 3.4548, |
| "step": 37150 |
| }, |
| { |
| "epoch": 4.003874717468518, |
| "grad_norm": 0.6376234292984009, |
| "learning_rate": 0.0003602521280034478, |
| "loss": 3.3711, |
| "step": 37200 |
| }, |
| { |
| "epoch": 4.009256269508126, |
| "grad_norm": 0.6253707408905029, |
| "learning_rate": 0.0003599288869733865, |
| "loss": 3.3483, |
| "step": 37250 |
| }, |
| { |
| "epoch": 4.014637821547734, |
| "grad_norm": 0.6334782838821411, |
| "learning_rate": 0.00035960564594332507, |
| "loss": 3.3408, |
| "step": 37300 |
| }, |
| { |
| "epoch": 4.020019373587343, |
| "grad_norm": 0.6270228624343872, |
| "learning_rate": 0.0003592824049132636, |
| "loss": 3.3435, |
| "step": 37350 |
| }, |
| { |
| "epoch": 4.0254009256269505, |
| "grad_norm": 0.6254666447639465, |
| "learning_rate": 0.0003589591638832022, |
| "loss": 3.3457, |
| "step": 37400 |
| }, |
| { |
| "epoch": 4.030782477666559, |
| "grad_norm": 0.7249552607536316, |
| "learning_rate": 0.0003586359228531408, |
| "loss": 3.3514, |
| "step": 37450 |
| }, |
| { |
| "epoch": 4.036164029706168, |
| "grad_norm": 0.6332011222839355, |
| "learning_rate": 0.00035831268182307934, |
| "loss": 3.3702, |
| "step": 37500 |
| }, |
| { |
| "epoch": 4.041545581745775, |
| "grad_norm": 0.6144351959228516, |
| "learning_rate": 0.000357989440793018, |
| "loss": 3.3694, |
| "step": 37550 |
| }, |
| { |
| "epoch": 4.046927133785384, |
| "grad_norm": 0.6586161851882935, |
| "learning_rate": 0.0003576661997629566, |
| "loss": 3.3739, |
| "step": 37600 |
| }, |
| { |
| "epoch": 4.0523086858249915, |
| "grad_norm": 0.6490854024887085, |
| "learning_rate": 0.0003573429587328951, |
| "loss": 3.3556, |
| "step": 37650 |
| }, |
| { |
| "epoch": 4.0576902378646, |
| "grad_norm": 0.6474526524543762, |
| "learning_rate": 0.0003570197177028337, |
| "loss": 3.3655, |
| "step": 37700 |
| }, |
| { |
| "epoch": 4.063071789904209, |
| "grad_norm": 0.6664512157440186, |
| "learning_rate": 0.00035669647667277226, |
| "loss": 3.362, |
| "step": 37750 |
| }, |
| { |
| "epoch": 4.068453341943816, |
| "grad_norm": 0.6286465525627136, |
| "learning_rate": 0.00035637323564271085, |
| "loss": 3.385, |
| "step": 37800 |
| }, |
| { |
| "epoch": 4.073834893983425, |
| "grad_norm": 0.6096540093421936, |
| "learning_rate": 0.0003560499946126495, |
| "loss": 3.3604, |
| "step": 37850 |
| }, |
| { |
| "epoch": 4.079216446023033, |
| "grad_norm": 0.6467686295509338, |
| "learning_rate": 0.00035572675358258804, |
| "loss": 3.3638, |
| "step": 37900 |
| }, |
| { |
| "epoch": 4.084597998062641, |
| "grad_norm": 0.6298966407775879, |
| "learning_rate": 0.00035540351255252664, |
| "loss": 3.3726, |
| "step": 37950 |
| }, |
| { |
| "epoch": 4.08997955010225, |
| "grad_norm": 0.7064566612243652, |
| "learning_rate": 0.00035508027152246523, |
| "loss": 3.376, |
| "step": 38000 |
| }, |
| { |
| "epoch": 4.08997955010225, |
| "eval_accuracy": 0.37656059500929034, |
| "eval_loss": 3.446718692779541, |
| "eval_runtime": 184.5393, |
| "eval_samples_per_second": 97.6, |
| "eval_steps_per_second": 6.102, |
| "step": 38000 |
| }, |
| { |
| "epoch": 4.095361102141858, |
| "grad_norm": 0.6466246843338013, |
| "learning_rate": 0.0003547570304924038, |
| "loss": 3.3837, |
| "step": 38050 |
| }, |
| { |
| "epoch": 4.100742654181466, |
| "grad_norm": 0.6541309952735901, |
| "learning_rate": 0.0003544337894623424, |
| "loss": 3.3768, |
| "step": 38100 |
| }, |
| { |
| "epoch": 4.106124206221074, |
| "grad_norm": 0.6219347715377808, |
| "learning_rate": 0.000354110548432281, |
| "loss": 3.361, |
| "step": 38150 |
| }, |
| { |
| "epoch": 4.111505758260682, |
| "grad_norm": 0.6315196752548218, |
| "learning_rate": 0.00035378730740221956, |
| "loss": 3.3712, |
| "step": 38200 |
| }, |
| { |
| "epoch": 4.1168873103002905, |
| "grad_norm": 0.6126587390899658, |
| "learning_rate": 0.00035346406637215815, |
| "loss": 3.3761, |
| "step": 38250 |
| }, |
| { |
| "epoch": 4.122268862339899, |
| "grad_norm": 0.7233909368515015, |
| "learning_rate": 0.0003531408253420967, |
| "loss": 3.344, |
| "step": 38300 |
| }, |
| { |
| "epoch": 4.127650414379507, |
| "grad_norm": 0.6340131163597107, |
| "learning_rate": 0.0003528175843120353, |
| "loss": 3.3787, |
| "step": 38350 |
| }, |
| { |
| "epoch": 4.133031966419115, |
| "grad_norm": 0.6910709738731384, |
| "learning_rate": 0.00035249434328197394, |
| "loss": 3.3778, |
| "step": 38400 |
| }, |
| { |
| "epoch": 4.138413518458724, |
| "grad_norm": 0.6195011138916016, |
| "learning_rate": 0.0003521711022519125, |
| "loss": 3.3878, |
| "step": 38450 |
| }, |
| { |
| "epoch": 4.1437950704983315, |
| "grad_norm": 0.6645599007606506, |
| "learning_rate": 0.00035184786122185107, |
| "loss": 3.3773, |
| "step": 38500 |
| }, |
| { |
| "epoch": 4.14917662253794, |
| "grad_norm": 0.6299995183944702, |
| "learning_rate": 0.0003515310850123909, |
| "loss": 3.3798, |
| "step": 38550 |
| }, |
| { |
| "epoch": 4.154558174577549, |
| "grad_norm": 0.6655153632164001, |
| "learning_rate": 0.00035120784398232947, |
| "loss": 3.3805, |
| "step": 38600 |
| }, |
| { |
| "epoch": 4.159939726617156, |
| "grad_norm": 0.626811683177948, |
| "learning_rate": 0.000350884602952268, |
| "loss": 3.3592, |
| "step": 38650 |
| }, |
| { |
| "epoch": 4.165321278656765, |
| "grad_norm": 0.647689938545227, |
| "learning_rate": 0.0003505613619222066, |
| "loss": 3.383, |
| "step": 38700 |
| }, |
| { |
| "epoch": 4.1707028306963725, |
| "grad_norm": 0.6576713919639587, |
| "learning_rate": 0.0003502381208921452, |
| "loss": 3.3792, |
| "step": 38750 |
| }, |
| { |
| "epoch": 4.176084382735981, |
| "grad_norm": 0.6431198716163635, |
| "learning_rate": 0.0003499148798620838, |
| "loss": 3.3732, |
| "step": 38800 |
| }, |
| { |
| "epoch": 4.18146593477559, |
| "grad_norm": 0.691774845123291, |
| "learning_rate": 0.0003495916388320224, |
| "loss": 3.3809, |
| "step": 38850 |
| }, |
| { |
| "epoch": 4.186847486815197, |
| "grad_norm": 0.6469290852546692, |
| "learning_rate": 0.000349268397801961, |
| "loss": 3.3734, |
| "step": 38900 |
| }, |
| { |
| "epoch": 4.192229038854806, |
| "grad_norm": 0.7078692317008972, |
| "learning_rate": 0.0003489451567718995, |
| "loss": 3.3864, |
| "step": 38950 |
| }, |
| { |
| "epoch": 4.197610590894414, |
| "grad_norm": 0.6617368459701538, |
| "learning_rate": 0.0003486219157418381, |
| "loss": 3.3961, |
| "step": 39000 |
| }, |
| { |
| "epoch": 4.197610590894414, |
| "eval_accuracy": 0.3772209872614268, |
| "eval_loss": 3.4436137676239014, |
| "eval_runtime": 184.7722, |
| "eval_samples_per_second": 97.477, |
| "eval_steps_per_second": 6.094, |
| "step": 39000 |
| }, |
| { |
| "epoch": 4.202992142934022, |
| "grad_norm": 0.6935890316963196, |
| "learning_rate": 0.00034829867471177677, |
| "loss": 3.3808, |
| "step": 39050 |
| }, |
| { |
| "epoch": 4.208373694973631, |
| "grad_norm": 0.6806594133377075, |
| "learning_rate": 0.0003479754336817153, |
| "loss": 3.3771, |
| "step": 39100 |
| }, |
| { |
| "epoch": 4.213755247013238, |
| "grad_norm": 0.6456026434898376, |
| "learning_rate": 0.0003476521926516539, |
| "loss": 3.4124, |
| "step": 39150 |
| }, |
| { |
| "epoch": 4.219136799052847, |
| "grad_norm": 0.5707885026931763, |
| "learning_rate": 0.00034732895162159245, |
| "loss": 3.3916, |
| "step": 39200 |
| }, |
| { |
| "epoch": 4.224518351092455, |
| "grad_norm": 0.6426082849502563, |
| "learning_rate": 0.00034700571059153104, |
| "loss": 3.376, |
| "step": 39250 |
| }, |
| { |
| "epoch": 4.229899903132063, |
| "grad_norm": 0.6489585638046265, |
| "learning_rate": 0.00034668246956146963, |
| "loss": 3.3809, |
| "step": 39300 |
| }, |
| { |
| "epoch": 4.2352814551716715, |
| "grad_norm": 0.6257960796356201, |
| "learning_rate": 0.00034635922853140823, |
| "loss": 3.3873, |
| "step": 39350 |
| }, |
| { |
| "epoch": 4.24066300721128, |
| "grad_norm": 0.698000967502594, |
| "learning_rate": 0.0003460359875013468, |
| "loss": 3.3771, |
| "step": 39400 |
| }, |
| { |
| "epoch": 4.246044559250888, |
| "grad_norm": 0.6346425414085388, |
| "learning_rate": 0.0003457127464712854, |
| "loss": 3.3877, |
| "step": 39450 |
| }, |
| { |
| "epoch": 4.251426111290496, |
| "grad_norm": 0.6701180338859558, |
| "learning_rate": 0.00034538950544122396, |
| "loss": 3.3697, |
| "step": 39500 |
| }, |
| { |
| "epoch": 4.256807663330104, |
| "grad_norm": 0.6371496319770813, |
| "learning_rate": 0.00034506626441116255, |
| "loss": 3.3843, |
| "step": 39550 |
| }, |
| { |
| "epoch": 4.2621892153697125, |
| "grad_norm": 0.6401668190956116, |
| "learning_rate": 0.0003447430233811011, |
| "loss": 3.3913, |
| "step": 39600 |
| }, |
| { |
| "epoch": 4.267570767409321, |
| "grad_norm": 0.6551806926727295, |
| "learning_rate": 0.00034441978235103974, |
| "loss": 3.3918, |
| "step": 39650 |
| }, |
| { |
| "epoch": 4.272952319448929, |
| "grad_norm": 0.6413207054138184, |
| "learning_rate": 0.00034409654132097834, |
| "loss": 3.3873, |
| "step": 39700 |
| }, |
| { |
| "epoch": 4.278333871488537, |
| "grad_norm": 0.6472829580307007, |
| "learning_rate": 0.0003437733002909169, |
| "loss": 3.3974, |
| "step": 39750 |
| }, |
| { |
| "epoch": 4.283715423528146, |
| "grad_norm": 0.6183416247367859, |
| "learning_rate": 0.00034345005926085547, |
| "loss": 3.3703, |
| "step": 39800 |
| }, |
| { |
| "epoch": 4.2890969755677535, |
| "grad_norm": 0.715303361415863, |
| "learning_rate": 0.00034312681823079407, |
| "loss": 3.3877, |
| "step": 39850 |
| }, |
| { |
| "epoch": 4.294478527607362, |
| "grad_norm": 0.6685398817062378, |
| "learning_rate": 0.00034280357720073266, |
| "loss": 3.3878, |
| "step": 39900 |
| }, |
| { |
| "epoch": 4.299860079646971, |
| "grad_norm": 0.6901618242263794, |
| "learning_rate": 0.00034248033617067126, |
| "loss": 3.3789, |
| "step": 39950 |
| }, |
| { |
| "epoch": 4.305241631686578, |
| "grad_norm": 0.6296851634979248, |
| "learning_rate": 0.00034215709514060985, |
| "loss": 3.3729, |
| "step": 40000 |
| }, |
| { |
| "epoch": 4.305241631686578, |
| "eval_accuracy": 0.3777952177735263, |
| "eval_loss": 3.437654972076416, |
| "eval_runtime": 184.7949, |
| "eval_samples_per_second": 97.465, |
| "eval_steps_per_second": 6.093, |
| "step": 40000 |
| }, |
| { |
| "epoch": 4.310623183726187, |
| "grad_norm": 0.7259793877601624, |
| "learning_rate": 0.0003418338541105484, |
| "loss": 3.3616, |
| "step": 40050 |
| }, |
| { |
| "epoch": 4.3160047357657945, |
| "grad_norm": 0.6109891533851624, |
| "learning_rate": 0.000341510613080487, |
| "loss": 3.3857, |
| "step": 40100 |
| }, |
| { |
| "epoch": 4.321386287805403, |
| "grad_norm": 0.6450995802879333, |
| "learning_rate": 0.0003411873720504255, |
| "loss": 3.389, |
| "step": 40150 |
| }, |
| { |
| "epoch": 4.326767839845012, |
| "grad_norm": 0.6640621423721313, |
| "learning_rate": 0.0003408641310203642, |
| "loss": 3.3955, |
| "step": 40200 |
| }, |
| { |
| "epoch": 4.332149391884619, |
| "grad_norm": 0.6404122710227966, |
| "learning_rate": 0.00034054088999030277, |
| "loss": 3.3787, |
| "step": 40250 |
| }, |
| { |
| "epoch": 4.337530943924228, |
| "grad_norm": 0.6834564208984375, |
| "learning_rate": 0.0003402176489602413, |
| "loss": 3.3903, |
| "step": 40300 |
| }, |
| { |
| "epoch": 4.342912495963836, |
| "grad_norm": 0.7080681324005127, |
| "learning_rate": 0.0003398944079301799, |
| "loss": 3.3965, |
| "step": 40350 |
| }, |
| { |
| "epoch": 4.348294048003444, |
| "grad_norm": 0.6617289185523987, |
| "learning_rate": 0.0003395711669001185, |
| "loss": 3.4053, |
| "step": 40400 |
| }, |
| { |
| "epoch": 4.3536756000430525, |
| "grad_norm": 0.6993653178215027, |
| "learning_rate": 0.00033924792587005704, |
| "loss": 3.3868, |
| "step": 40450 |
| }, |
| { |
| "epoch": 4.359057152082661, |
| "grad_norm": 0.6167160868644714, |
| "learning_rate": 0.0003389246848399957, |
| "loss": 3.3719, |
| "step": 40500 |
| }, |
| { |
| "epoch": 4.364438704122269, |
| "grad_norm": 0.669032096862793, |
| "learning_rate": 0.0003386014438099343, |
| "loss": 3.3802, |
| "step": 40550 |
| }, |
| { |
| "epoch": 4.369820256161877, |
| "grad_norm": 0.619784414768219, |
| "learning_rate": 0.0003382782027798728, |
| "loss": 3.3882, |
| "step": 40600 |
| }, |
| { |
| "epoch": 4.375201808201485, |
| "grad_norm": 0.6738349199295044, |
| "learning_rate": 0.0003379549617498114, |
| "loss": 3.3755, |
| "step": 40650 |
| }, |
| { |
| "epoch": 4.3805833602410935, |
| "grad_norm": 0.6499971151351929, |
| "learning_rate": 0.00033763172071974996, |
| "loss": 3.3892, |
| "step": 40700 |
| }, |
| { |
| "epoch": 4.385964912280702, |
| "grad_norm": 0.6084420084953308, |
| "learning_rate": 0.00033730847968968855, |
| "loss": 3.3835, |
| "step": 40750 |
| }, |
| { |
| "epoch": 4.39134646432031, |
| "grad_norm": 0.6882473230361938, |
| "learning_rate": 0.0003369852386596272, |
| "loss": 3.3852, |
| "step": 40800 |
| }, |
| { |
| "epoch": 4.396728016359918, |
| "grad_norm": 0.6895729303359985, |
| "learning_rate": 0.00033666199762956574, |
| "loss": 3.3923, |
| "step": 40850 |
| }, |
| { |
| "epoch": 4.402109568399527, |
| "grad_norm": 0.6523504853248596, |
| "learning_rate": 0.00033633875659950434, |
| "loss": 3.3847, |
| "step": 40900 |
| }, |
| { |
| "epoch": 4.4074911204391345, |
| "grad_norm": 0.619976282119751, |
| "learning_rate": 0.0003360155155694429, |
| "loss": 3.3709, |
| "step": 40950 |
| }, |
| { |
| "epoch": 4.412872672478743, |
| "grad_norm": 0.6657698750495911, |
| "learning_rate": 0.0003356922745393815, |
| "loss": 3.3866, |
| "step": 41000 |
| }, |
| { |
| "epoch": 4.412872672478743, |
| "eval_accuracy": 0.3780848863724397, |
| "eval_loss": 3.4327142238616943, |
| "eval_runtime": 184.5831, |
| "eval_samples_per_second": 97.577, |
| "eval_steps_per_second": 6.1, |
| "step": 41000 |
| }, |
| { |
| "epoch": 4.418254224518351, |
| "grad_norm": 0.6433376669883728, |
| "learning_rate": 0.0003353690335093201, |
| "loss": 3.3842, |
| "step": 41050 |
| }, |
| { |
| "epoch": 4.423635776557959, |
| "grad_norm": 0.676450252532959, |
| "learning_rate": 0.0003350457924792587, |
| "loss": 3.4097, |
| "step": 41100 |
| }, |
| { |
| "epoch": 4.429017328597568, |
| "grad_norm": 0.6363053321838379, |
| "learning_rate": 0.00033472255144919726, |
| "loss": 3.389, |
| "step": 41150 |
| }, |
| { |
| "epoch": 4.4343988806371755, |
| "grad_norm": 0.6782971620559692, |
| "learning_rate": 0.00033439931041913585, |
| "loss": 3.4028, |
| "step": 41200 |
| }, |
| { |
| "epoch": 4.439780432676784, |
| "grad_norm": 0.6248264908790588, |
| "learning_rate": 0.00033408253420967566, |
| "loss": 3.3992, |
| "step": 41250 |
| }, |
| { |
| "epoch": 4.445161984716393, |
| "grad_norm": 0.617743194103241, |
| "learning_rate": 0.00033375929317961425, |
| "loss": 3.3682, |
| "step": 41300 |
| }, |
| { |
| "epoch": 4.450543536756, |
| "grad_norm": 0.7119070887565613, |
| "learning_rate": 0.0003334360521495528, |
| "loss": 3.3864, |
| "step": 41350 |
| }, |
| { |
| "epoch": 4.455925088795609, |
| "grad_norm": 0.6565623879432678, |
| "learning_rate": 0.0003331128111194914, |
| "loss": 3.3814, |
| "step": 41400 |
| }, |
| { |
| "epoch": 4.461306640835216, |
| "grad_norm": 0.7476305365562439, |
| "learning_rate": 0.00033278957008943004, |
| "loss": 3.384, |
| "step": 41450 |
| }, |
| { |
| "epoch": 4.466688192874825, |
| "grad_norm": 0.6427007913589478, |
| "learning_rate": 0.0003324663290593686, |
| "loss": 3.3901, |
| "step": 41500 |
| }, |
| { |
| "epoch": 4.4720697449144335, |
| "grad_norm": 0.6926642656326294, |
| "learning_rate": 0.00033214308802930717, |
| "loss": 3.3919, |
| "step": 41550 |
| }, |
| { |
| "epoch": 4.477451296954041, |
| "grad_norm": 0.687822163105011, |
| "learning_rate": 0.0003318198469992457, |
| "loss": 3.3926, |
| "step": 41600 |
| }, |
| { |
| "epoch": 4.48283284899365, |
| "grad_norm": 0.695188045501709, |
| "learning_rate": 0.0003314966059691843, |
| "loss": 3.3966, |
| "step": 41650 |
| }, |
| { |
| "epoch": 4.488214401033258, |
| "grad_norm": 0.6738727688789368, |
| "learning_rate": 0.00033117336493912296, |
| "loss": 3.4002, |
| "step": 41700 |
| }, |
| { |
| "epoch": 4.493595953072866, |
| "grad_norm": 0.6818880438804626, |
| "learning_rate": 0.0003308501239090615, |
| "loss": 3.3993, |
| "step": 41750 |
| }, |
| { |
| "epoch": 4.4989775051124745, |
| "grad_norm": 0.6498254537582397, |
| "learning_rate": 0.0003305268828790001, |
| "loss": 3.4005, |
| "step": 41800 |
| }, |
| { |
| "epoch": 4.504359057152083, |
| "grad_norm": 0.6302522420883179, |
| "learning_rate": 0.0003302036418489387, |
| "loss": 3.3798, |
| "step": 41850 |
| }, |
| { |
| "epoch": 4.509740609191691, |
| "grad_norm": 0.6108139157295227, |
| "learning_rate": 0.0003298804008188772, |
| "loss": 3.36, |
| "step": 41900 |
| }, |
| { |
| "epoch": 4.515122161231299, |
| "grad_norm": 0.6787891983985901, |
| "learning_rate": 0.0003295571597888158, |
| "loss": 3.3822, |
| "step": 41950 |
| }, |
| { |
| "epoch": 4.520503713270907, |
| "grad_norm": 0.6845345497131348, |
| "learning_rate": 0.00032923391875875447, |
| "loss": 3.3773, |
| "step": 42000 |
| }, |
| { |
| "epoch": 4.520503713270907, |
| "eval_accuracy": 0.3784473524061888, |
| "eval_loss": 3.427448272705078, |
| "eval_runtime": 184.8487, |
| "eval_samples_per_second": 97.436, |
| "eval_steps_per_second": 6.091, |
| "step": 42000 |
| }, |
| { |
| "epoch": 4.5258852653105155, |
| "grad_norm": 0.6910591125488281, |
| "learning_rate": 0.000328910677728693, |
| "loss": 3.3944, |
| "step": 42050 |
| }, |
| { |
| "epoch": 4.531266817350124, |
| "grad_norm": 0.6579834222793579, |
| "learning_rate": 0.0003285874366986316, |
| "loss": 3.3767, |
| "step": 42100 |
| }, |
| { |
| "epoch": 4.536648369389732, |
| "grad_norm": 0.6646836996078491, |
| "learning_rate": 0.00032826419566857015, |
| "loss": 3.3925, |
| "step": 42150 |
| }, |
| { |
| "epoch": 4.54202992142934, |
| "grad_norm": 0.6295957565307617, |
| "learning_rate": 0.00032794095463850874, |
| "loss": 3.3939, |
| "step": 42200 |
| }, |
| { |
| "epoch": 4.547411473468949, |
| "grad_norm": 0.6931584477424622, |
| "learning_rate": 0.0003276177136084473, |
| "loss": 3.3813, |
| "step": 42250 |
| }, |
| { |
| "epoch": 4.5527930255085565, |
| "grad_norm": 0.728244423866272, |
| "learning_rate": 0.00032729447257838593, |
| "loss": 3.3823, |
| "step": 42300 |
| }, |
| { |
| "epoch": 4.558174577548165, |
| "grad_norm": 0.6908766627311707, |
| "learning_rate": 0.0003269712315483245, |
| "loss": 3.3996, |
| "step": 42350 |
| }, |
| { |
| "epoch": 4.563556129587774, |
| "grad_norm": 0.6593059301376343, |
| "learning_rate": 0.00032664799051826306, |
| "loss": 3.3678, |
| "step": 42400 |
| }, |
| { |
| "epoch": 4.568937681627381, |
| "grad_norm": 0.7266299724578857, |
| "learning_rate": 0.00032632474948820166, |
| "loss": 3.3908, |
| "step": 42450 |
| }, |
| { |
| "epoch": 4.57431923366699, |
| "grad_norm": 0.6872548460960388, |
| "learning_rate": 0.00032600150845814025, |
| "loss": 3.3937, |
| "step": 42500 |
| }, |
| { |
| "epoch": 4.579700785706597, |
| "grad_norm": 0.7212083339691162, |
| "learning_rate": 0.0003256782674280788, |
| "loss": 3.3884, |
| "step": 42550 |
| }, |
| { |
| "epoch": 4.585082337746206, |
| "grad_norm": 0.6755133867263794, |
| "learning_rate": 0.00032535502639801744, |
| "loss": 3.3797, |
| "step": 42600 |
| }, |
| { |
| "epoch": 4.5904638897858145, |
| "grad_norm": 0.634061336517334, |
| "learning_rate": 0.00032503178536795604, |
| "loss": 3.3579, |
| "step": 42650 |
| }, |
| { |
| "epoch": 4.595845441825422, |
| "grad_norm": 0.6569266319274902, |
| "learning_rate": 0.0003247085443378946, |
| "loss": 3.3904, |
| "step": 42700 |
| }, |
| { |
| "epoch": 4.601226993865031, |
| "grad_norm": 0.6839199066162109, |
| "learning_rate": 0.0003243853033078332, |
| "loss": 3.3873, |
| "step": 42750 |
| }, |
| { |
| "epoch": 4.606608545904638, |
| "grad_norm": 0.6381998062133789, |
| "learning_rate": 0.0003240620622777717, |
| "loss": 3.3958, |
| "step": 42800 |
| }, |
| { |
| "epoch": 4.611990097944247, |
| "grad_norm": 0.6814762949943542, |
| "learning_rate": 0.00032373882124771036, |
| "loss": 3.3978, |
| "step": 42850 |
| }, |
| { |
| "epoch": 4.6173716499838555, |
| "grad_norm": 0.6227290630340576, |
| "learning_rate": 0.00032341558021764896, |
| "loss": 3.3834, |
| "step": 42900 |
| }, |
| { |
| "epoch": 4.622753202023463, |
| "grad_norm": 0.644250750541687, |
| "learning_rate": 0.0003230923391875875, |
| "loss": 3.3759, |
| "step": 42950 |
| }, |
| { |
| "epoch": 4.628134754063072, |
| "grad_norm": 0.6935346722602844, |
| "learning_rate": 0.0003227690981575261, |
| "loss": 3.4146, |
| "step": 43000 |
| }, |
| { |
| "epoch": 4.628134754063072, |
| "eval_accuracy": 0.37911926186443357, |
| "eval_loss": 3.4232747554779053, |
| "eval_runtime": 184.608, |
| "eval_samples_per_second": 97.563, |
| "eval_steps_per_second": 6.099, |
| "step": 43000 |
| }, |
| { |
| "epoch": 4.63351630610268, |
| "grad_norm": 0.6674992442131042, |
| "learning_rate": 0.0003224458571274647, |
| "loss": 3.3878, |
| "step": 43050 |
| }, |
| { |
| "epoch": 4.638897858142288, |
| "grad_norm": 0.6759820580482483, |
| "learning_rate": 0.00032212261609740323, |
| "loss": 3.3895, |
| "step": 43100 |
| }, |
| { |
| "epoch": 4.6442794101818965, |
| "grad_norm": 0.687537670135498, |
| "learning_rate": 0.0003217993750673419, |
| "loss": 3.3907, |
| "step": 43150 |
| }, |
| { |
| "epoch": 4.649660962221505, |
| "grad_norm": 0.6385945081710815, |
| "learning_rate": 0.00032147613403728047, |
| "loss": 3.3825, |
| "step": 43200 |
| }, |
| { |
| "epoch": 4.655042514261113, |
| "grad_norm": 0.7066798210144043, |
| "learning_rate": 0.000321152893007219, |
| "loss": 3.3945, |
| "step": 43250 |
| }, |
| { |
| "epoch": 4.660424066300721, |
| "grad_norm": 0.7198359966278076, |
| "learning_rate": 0.0003208296519771576, |
| "loss": 3.4057, |
| "step": 43300 |
| }, |
| { |
| "epoch": 4.665805618340329, |
| "grad_norm": 0.6627112627029419, |
| "learning_rate": 0.00032050641094709615, |
| "loss": 3.3895, |
| "step": 43350 |
| }, |
| { |
| "epoch": 4.6711871703799375, |
| "grad_norm": 0.6638057827949524, |
| "learning_rate": 0.00032018316991703474, |
| "loss": 3.3827, |
| "step": 43400 |
| }, |
| { |
| "epoch": 4.676568722419546, |
| "grad_norm": 0.641120970249176, |
| "learning_rate": 0.0003198599288869734, |
| "loss": 3.3974, |
| "step": 43450 |
| }, |
| { |
| "epoch": 4.681950274459154, |
| "grad_norm": 0.6419323682785034, |
| "learning_rate": 0.00031953668785691193, |
| "loss": 3.3961, |
| "step": 43500 |
| }, |
| { |
| "epoch": 4.687331826498762, |
| "grad_norm": 0.7281374335289001, |
| "learning_rate": 0.0003192134468268505, |
| "loss": 3.3956, |
| "step": 43550 |
| }, |
| { |
| "epoch": 4.692713378538371, |
| "grad_norm": 0.6420050859451294, |
| "learning_rate": 0.0003188902057967891, |
| "loss": 3.3683, |
| "step": 43600 |
| }, |
| { |
| "epoch": 4.6980949305779784, |
| "grad_norm": 0.6763567328453064, |
| "learning_rate": 0.00031856696476672766, |
| "loss": 3.3961, |
| "step": 43650 |
| }, |
| { |
| "epoch": 4.703476482617587, |
| "grad_norm": 0.7230156064033508, |
| "learning_rate": 0.0003182437237366663, |
| "loss": 3.4111, |
| "step": 43700 |
| }, |
| { |
| "epoch": 4.7088580346571955, |
| "grad_norm": 0.7006603479385376, |
| "learning_rate": 0.0003179204827066049, |
| "loss": 3.4003, |
| "step": 43750 |
| }, |
| { |
| "epoch": 4.714239586696803, |
| "grad_norm": 0.6924633383750916, |
| "learning_rate": 0.00031759724167654344, |
| "loss": 3.3832, |
| "step": 43800 |
| }, |
| { |
| "epoch": 4.719621138736412, |
| "grad_norm": 0.7065801024436951, |
| "learning_rate": 0.00031727400064648204, |
| "loss": 3.3894, |
| "step": 43850 |
| }, |
| { |
| "epoch": 4.725002690776019, |
| "grad_norm": 0.6304015517234802, |
| "learning_rate": 0.0003169507596164206, |
| "loss": 3.3659, |
| "step": 43900 |
| }, |
| { |
| "epoch": 4.730384242815628, |
| "grad_norm": 0.6763742566108704, |
| "learning_rate": 0.0003166275185863592, |
| "loss": 3.3802, |
| "step": 43950 |
| }, |
| { |
| "epoch": 4.7357657948552365, |
| "grad_norm": 0.6544877886772156, |
| "learning_rate": 0.0003163042775562978, |
| "loss": 3.3892, |
| "step": 44000 |
| }, |
| { |
| "epoch": 4.7357657948552365, |
| "eval_accuracy": 0.3791140465258185, |
| "eval_loss": 3.4197874069213867, |
| "eval_runtime": 184.7556, |
| "eval_samples_per_second": 97.486, |
| "eval_steps_per_second": 6.095, |
| "step": 44000 |
| }, |
| { |
| "epoch": 4.741147346894844, |
| "grad_norm": 0.6808891296386719, |
| "learning_rate": 0.00031598103652623636, |
| "loss": 3.3747, |
| "step": 44050 |
| }, |
| { |
| "epoch": 4.746528898934453, |
| "grad_norm": 0.6510489583015442, |
| "learning_rate": 0.00031565779549617496, |
| "loss": 3.3851, |
| "step": 44100 |
| }, |
| { |
| "epoch": 4.751910450974061, |
| "grad_norm": 0.659172773361206, |
| "learning_rate": 0.00031534101928671476, |
| "loss": 3.3789, |
| "step": 44150 |
| }, |
| { |
| "epoch": 4.757292003013669, |
| "grad_norm": 0.7054234743118286, |
| "learning_rate": 0.00031501777825665336, |
| "loss": 3.3755, |
| "step": 44200 |
| }, |
| { |
| "epoch": 4.7626735550532775, |
| "grad_norm": 0.7176251411437988, |
| "learning_rate": 0.0003146945372265919, |
| "loss": 3.3863, |
| "step": 44250 |
| }, |
| { |
| "epoch": 4.768055107092886, |
| "grad_norm": 0.751799464225769, |
| "learning_rate": 0.0003143712961965305, |
| "loss": 3.3894, |
| "step": 44300 |
| }, |
| { |
| "epoch": 4.773436659132494, |
| "grad_norm": 0.6770383715629578, |
| "learning_rate": 0.0003140480551664691, |
| "loss": 3.3808, |
| "step": 44350 |
| }, |
| { |
| "epoch": 4.778818211172102, |
| "grad_norm": 0.6924847960472107, |
| "learning_rate": 0.0003137248141364077, |
| "loss": 3.3874, |
| "step": 44400 |
| }, |
| { |
| "epoch": 4.78419976321171, |
| "grad_norm": 0.633284330368042, |
| "learning_rate": 0.0003134015731063463, |
| "loss": 3.3971, |
| "step": 44450 |
| }, |
| { |
| "epoch": 4.7895813152513185, |
| "grad_norm": 0.6829878091812134, |
| "learning_rate": 0.0003130783320762849, |
| "loss": 3.3811, |
| "step": 44500 |
| }, |
| { |
| "epoch": 4.794962867290927, |
| "grad_norm": 0.6898048520088196, |
| "learning_rate": 0.0003127550910462234, |
| "loss": 3.3897, |
| "step": 44550 |
| }, |
| { |
| "epoch": 4.800344419330535, |
| "grad_norm": 0.6750717163085938, |
| "learning_rate": 0.000312431850016162, |
| "loss": 3.3971, |
| "step": 44600 |
| }, |
| { |
| "epoch": 4.805725971370143, |
| "grad_norm": 0.6479890942573547, |
| "learning_rate": 0.00031210860898610066, |
| "loss": 3.406, |
| "step": 44650 |
| }, |
| { |
| "epoch": 4.811107523409751, |
| "grad_norm": 0.7591122984886169, |
| "learning_rate": 0.0003117853679560392, |
| "loss": 3.3913, |
| "step": 44700 |
| }, |
| { |
| "epoch": 4.8164890754493594, |
| "grad_norm": 0.6572939157485962, |
| "learning_rate": 0.0003114621269259778, |
| "loss": 3.38, |
| "step": 44750 |
| }, |
| { |
| "epoch": 4.821870627488968, |
| "grad_norm": 0.6863782405853271, |
| "learning_rate": 0.00031113888589591633, |
| "loss": 3.3788, |
| "step": 44800 |
| }, |
| { |
| "epoch": 4.827252179528576, |
| "grad_norm": 0.6886435747146606, |
| "learning_rate": 0.00031081564486585493, |
| "loss": 3.3875, |
| "step": 44850 |
| }, |
| { |
| "epoch": 4.832633731568184, |
| "grad_norm": 0.7485862970352173, |
| "learning_rate": 0.0003104924038357935, |
| "loss": 3.3789, |
| "step": 44900 |
| }, |
| { |
| "epoch": 4.838015283607793, |
| "grad_norm": 0.666218638420105, |
| "learning_rate": 0.0003101691628057321, |
| "loss": 3.3945, |
| "step": 44950 |
| }, |
| { |
| "epoch": 4.8433968356474, |
| "grad_norm": 0.6517705917358398, |
| "learning_rate": 0.0003098459217756707, |
| "loss": 3.401, |
| "step": 45000 |
| }, |
| { |
| "epoch": 4.8433968356474, |
| "eval_accuracy": 0.38019872830487045, |
| "eval_loss": 3.4125325679779053, |
| "eval_runtime": 184.9758, |
| "eval_samples_per_second": 97.369, |
| "eval_steps_per_second": 6.087, |
| "step": 45000 |
| }, |
| { |
| "epoch": 4.848778387687009, |
| "grad_norm": 0.6660953760147095, |
| "learning_rate": 0.0003095226807456093, |
| "loss": 3.3751, |
| "step": 45050 |
| }, |
| { |
| "epoch": 4.8541599397266175, |
| "grad_norm": 0.6609524488449097, |
| "learning_rate": 0.00030919943971554785, |
| "loss": 3.3946, |
| "step": 45100 |
| }, |
| { |
| "epoch": 4.859541491766225, |
| "grad_norm": 0.6775374412536621, |
| "learning_rate": 0.00030887619868548644, |
| "loss": 3.3883, |
| "step": 45150 |
| }, |
| { |
| "epoch": 4.864923043805834, |
| "grad_norm": 0.664930522441864, |
| "learning_rate": 0.000308552957655425, |
| "loss": 3.3975, |
| "step": 45200 |
| }, |
| { |
| "epoch": 4.870304595845441, |
| "grad_norm": 0.6616421341896057, |
| "learning_rate": 0.00030822971662536363, |
| "loss": 3.3899, |
| "step": 45250 |
| }, |
| { |
| "epoch": 4.87568614788505, |
| "grad_norm": 0.7150900959968567, |
| "learning_rate": 0.0003079064755953022, |
| "loss": 3.3785, |
| "step": 45300 |
| }, |
| { |
| "epoch": 4.8810676999246585, |
| "grad_norm": 0.6499496698379517, |
| "learning_rate": 0.00030758323456524077, |
| "loss": 3.3818, |
| "step": 45350 |
| }, |
| { |
| "epoch": 4.886449251964266, |
| "grad_norm": 0.658064603805542, |
| "learning_rate": 0.00030725999353517936, |
| "loss": 3.3796, |
| "step": 45400 |
| }, |
| { |
| "epoch": 4.891830804003875, |
| "grad_norm": 0.6768323183059692, |
| "learning_rate": 0.00030693675250511795, |
| "loss": 3.3858, |
| "step": 45450 |
| }, |
| { |
| "epoch": 4.897212356043483, |
| "grad_norm": 0.679030179977417, |
| "learning_rate": 0.00030661351147505655, |
| "loss": 3.3737, |
| "step": 45500 |
| }, |
| { |
| "epoch": 4.902593908083091, |
| "grad_norm": 0.6560843586921692, |
| "learning_rate": 0.00030629027044499514, |
| "loss": 3.3828, |
| "step": 45550 |
| }, |
| { |
| "epoch": 4.9079754601226995, |
| "grad_norm": 0.6728476881980896, |
| "learning_rate": 0.00030596702941493374, |
| "loss": 3.3804, |
| "step": 45600 |
| }, |
| { |
| "epoch": 4.913357012162308, |
| "grad_norm": 0.736443817615509, |
| "learning_rate": 0.0003056437883848723, |
| "loss": 3.3771, |
| "step": 45650 |
| }, |
| { |
| "epoch": 4.918738564201916, |
| "grad_norm": 0.6769527792930603, |
| "learning_rate": 0.0003053205473548109, |
| "loss": 3.4014, |
| "step": 45700 |
| }, |
| { |
| "epoch": 4.924120116241524, |
| "grad_norm": 0.6532332301139832, |
| "learning_rate": 0.0003049973063247494, |
| "loss": 3.3797, |
| "step": 45750 |
| }, |
| { |
| "epoch": 4.929501668281132, |
| "grad_norm": 0.6099948287010193, |
| "learning_rate": 0.00030467406529468806, |
| "loss": 3.3837, |
| "step": 45800 |
| }, |
| { |
| "epoch": 4.9348832203207404, |
| "grad_norm": 0.6482587456703186, |
| "learning_rate": 0.00030435082426462666, |
| "loss": 3.3776, |
| "step": 45850 |
| }, |
| { |
| "epoch": 4.940264772360349, |
| "grad_norm": 0.6524862051010132, |
| "learning_rate": 0.0003040275832345652, |
| "loss": 3.3893, |
| "step": 45900 |
| }, |
| { |
| "epoch": 4.945646324399957, |
| "grad_norm": 0.6760078072547913, |
| "learning_rate": 0.0003037043422045038, |
| "loss": 3.386, |
| "step": 45950 |
| }, |
| { |
| "epoch": 4.951027876439565, |
| "grad_norm": 0.7402188777923584, |
| "learning_rate": 0.0003033811011744424, |
| "loss": 3.3919, |
| "step": 46000 |
| }, |
| { |
| "epoch": 4.951027876439565, |
| "eval_accuracy": 0.3805924863703101, |
| "eval_loss": 3.4080684185028076, |
| "eval_runtime": 184.4763, |
| "eval_samples_per_second": 97.633, |
| "eval_steps_per_second": 6.104, |
| "step": 46000 |
| }, |
| { |
| "epoch": 4.956409428479174, |
| "grad_norm": 0.7083448767662048, |
| "learning_rate": 0.00030305786014438093, |
| "loss": 3.3806, |
| "step": 46050 |
| }, |
| { |
| "epoch": 4.961790980518781, |
| "grad_norm": 0.7174310684204102, |
| "learning_rate": 0.0003027346191143196, |
| "loss": 3.37, |
| "step": 46100 |
| }, |
| { |
| "epoch": 4.96717253255839, |
| "grad_norm": 0.6411959528923035, |
| "learning_rate": 0.00030241137808425817, |
| "loss": 3.3813, |
| "step": 46150 |
| }, |
| { |
| "epoch": 4.9725540845979985, |
| "grad_norm": 0.6771125197410583, |
| "learning_rate": 0.0003020881370541967, |
| "loss": 3.3759, |
| "step": 46200 |
| }, |
| { |
| "epoch": 4.977935636637606, |
| "grad_norm": 0.6481576561927795, |
| "learning_rate": 0.0003017648960241353, |
| "loss": 3.3786, |
| "step": 46250 |
| }, |
| { |
| "epoch": 4.983317188677215, |
| "grad_norm": 0.6613046526908875, |
| "learning_rate": 0.00030144165499407385, |
| "loss": 3.4187, |
| "step": 46300 |
| }, |
| { |
| "epoch": 4.988698740716822, |
| "grad_norm": 0.6821366548538208, |
| "learning_rate": 0.00030111841396401244, |
| "loss": 3.3862, |
| "step": 46350 |
| }, |
| { |
| "epoch": 4.994080292756431, |
| "grad_norm": 0.7078256607055664, |
| "learning_rate": 0.0003007951729339511, |
| "loss": 3.3917, |
| "step": 46400 |
| }, |
| { |
| "epoch": 4.9994618447960395, |
| "grad_norm": 0.6903404593467712, |
| "learning_rate": 0.00030047193190388963, |
| "loss": 3.3968, |
| "step": 46450 |
| }, |
| { |
| "epoch": 5.004843396835647, |
| "grad_norm": 0.6922981142997742, |
| "learning_rate": 0.0003001486908738282, |
| "loss": 3.3085, |
| "step": 46500 |
| }, |
| { |
| "epoch": 5.010224948875256, |
| "grad_norm": 0.75609290599823, |
| "learning_rate": 0.0002998254498437668, |
| "loss": 3.2916, |
| "step": 46550 |
| }, |
| { |
| "epoch": 5.015606500914864, |
| "grad_norm": 0.7146428823471069, |
| "learning_rate": 0.0002995022088137054, |
| "loss": 3.3115, |
| "step": 46600 |
| }, |
| { |
| "epoch": 5.020988052954472, |
| "grad_norm": 0.7553977966308594, |
| "learning_rate": 0.00029917896778364396, |
| "loss": 3.3028, |
| "step": 46650 |
| }, |
| { |
| "epoch": 5.0263696049940805, |
| "grad_norm": 0.6598968505859375, |
| "learning_rate": 0.00029885572675358255, |
| "loss": 3.2995, |
| "step": 46700 |
| }, |
| { |
| "epoch": 5.031751157033688, |
| "grad_norm": 0.6931416988372803, |
| "learning_rate": 0.00029853248572352114, |
| "loss": 3.2986, |
| "step": 46750 |
| }, |
| { |
| "epoch": 5.037132709073297, |
| "grad_norm": 0.6862365007400513, |
| "learning_rate": 0.00029820924469345974, |
| "loss": 3.3076, |
| "step": 46800 |
| }, |
| { |
| "epoch": 5.042514261112905, |
| "grad_norm": 0.674670398235321, |
| "learning_rate": 0.0002978860036633983, |
| "loss": 3.282, |
| "step": 46850 |
| }, |
| { |
| "epoch": 5.047895813152513, |
| "grad_norm": 0.6725071668624878, |
| "learning_rate": 0.00029756276263333693, |
| "loss": 3.3175, |
| "step": 46900 |
| }, |
| { |
| "epoch": 5.0532773651921215, |
| "grad_norm": 0.715003252029419, |
| "learning_rate": 0.00029723952160327547, |
| "loss": 3.3009, |
| "step": 46950 |
| }, |
| { |
| "epoch": 5.05865891723173, |
| "grad_norm": 0.6437234878540039, |
| "learning_rate": 0.00029691628057321406, |
| "loss": 3.2957, |
| "step": 47000 |
| }, |
| { |
| "epoch": 5.05865891723173, |
| "eval_accuracy": 0.3807873009981615, |
| "eval_loss": 3.412716865539551, |
| "eval_runtime": 184.8748, |
| "eval_samples_per_second": 97.423, |
| "eval_steps_per_second": 6.091, |
| "step": 47000 |
| }, |
| { |
| "epoch": 5.064040469271338, |
| "grad_norm": 0.6628379225730896, |
| "learning_rate": 0.00029659303954315266, |
| "loss": 3.3054, |
| "step": 47050 |
| }, |
| { |
| "epoch": 5.069422021310946, |
| "grad_norm": 0.6656396985054016, |
| "learning_rate": 0.00029626979851309125, |
| "loss": 3.3118, |
| "step": 47100 |
| }, |
| { |
| "epoch": 5.074803573350554, |
| "grad_norm": 0.6756969094276428, |
| "learning_rate": 0.00029594655748302985, |
| "loss": 3.2969, |
| "step": 47150 |
| }, |
| { |
| "epoch": 5.080185125390162, |
| "grad_norm": 0.6976021528244019, |
| "learning_rate": 0.0002956233164529684, |
| "loss": 3.2881, |
| "step": 47200 |
| }, |
| { |
| "epoch": 5.085566677429771, |
| "grad_norm": 0.7710605263710022, |
| "learning_rate": 0.000295300075422907, |
| "loss": 3.3236, |
| "step": 47250 |
| }, |
| { |
| "epoch": 5.090948229469379, |
| "grad_norm": 0.6808165907859802, |
| "learning_rate": 0.0002949768343928456, |
| "loss": 3.3053, |
| "step": 47300 |
| }, |
| { |
| "epoch": 5.096329781508987, |
| "grad_norm": 0.6778217554092407, |
| "learning_rate": 0.00029465359336278417, |
| "loss": 3.3169, |
| "step": 47350 |
| }, |
| { |
| "epoch": 5.101711333548596, |
| "grad_norm": 0.7051342725753784, |
| "learning_rate": 0.0002943303523327227, |
| "loss": 3.3148, |
| "step": 47400 |
| }, |
| { |
| "epoch": 5.107092885588203, |
| "grad_norm": 0.6564488410949707, |
| "learning_rate": 0.00029400711130266136, |
| "loss": 3.2898, |
| "step": 47450 |
| }, |
| { |
| "epoch": 5.112474437627812, |
| "grad_norm": 0.6671553254127502, |
| "learning_rate": 0.0002936838702725999, |
| "loss": 3.3092, |
| "step": 47500 |
| }, |
| { |
| "epoch": 5.1178559896674205, |
| "grad_norm": 0.7125695943832397, |
| "learning_rate": 0.0002933606292425385, |
| "loss": 3.3329, |
| "step": 47550 |
| }, |
| { |
| "epoch": 5.123237541707028, |
| "grad_norm": 0.7285773754119873, |
| "learning_rate": 0.0002930373882124771, |
| "loss": 3.3005, |
| "step": 47600 |
| }, |
| { |
| "epoch": 5.128619093746637, |
| "grad_norm": 0.7182151675224304, |
| "learning_rate": 0.0002927141471824157, |
| "loss": 3.3026, |
| "step": 47650 |
| }, |
| { |
| "epoch": 5.134000645786244, |
| "grad_norm": 0.6936147212982178, |
| "learning_rate": 0.0002923909061523542, |
| "loss": 3.3223, |
| "step": 47700 |
| }, |
| { |
| "epoch": 5.139382197825853, |
| "grad_norm": 0.7076971530914307, |
| "learning_rate": 0.0002920676651222928, |
| "loss": 3.3132, |
| "step": 47750 |
| }, |
| { |
| "epoch": 5.1447637498654615, |
| "grad_norm": 0.7523118257522583, |
| "learning_rate": 0.0002917444240922314, |
| "loss": 3.331, |
| "step": 47800 |
| }, |
| { |
| "epoch": 5.150145301905069, |
| "grad_norm": 0.6461667418479919, |
| "learning_rate": 0.00029142118306216996, |
| "loss": 3.3323, |
| "step": 47850 |
| }, |
| { |
| "epoch": 5.155526853944678, |
| "grad_norm": 0.6895806193351746, |
| "learning_rate": 0.0002910979420321086, |
| "loss": 3.3141, |
| "step": 47900 |
| }, |
| { |
| "epoch": 5.160908405984286, |
| "grad_norm": 0.715032696723938, |
| "learning_rate": 0.00029077470100204715, |
| "loss": 3.3178, |
| "step": 47950 |
| }, |
| { |
| "epoch": 5.166289958023894, |
| "grad_norm": 0.7430562376976013, |
| "learning_rate": 0.00029045145997198574, |
| "loss": 3.3146, |
| "step": 48000 |
| }, |
| { |
| "epoch": 5.166289958023894, |
| "eval_accuracy": 0.38082771987242847, |
| "eval_loss": 3.4105722904205322, |
| "eval_runtime": 184.8541, |
| "eval_samples_per_second": 97.434, |
| "eval_steps_per_second": 6.091, |
| "step": 48000 |
| }, |
| { |
| "epoch": 5.1716715100635025, |
| "grad_norm": 0.6621975302696228, |
| "learning_rate": 0.00029012821894192433, |
| "loss": 3.3294, |
| "step": 48050 |
| }, |
| { |
| "epoch": 5.17705306210311, |
| "grad_norm": 0.6793010830879211, |
| "learning_rate": 0.00028980497791186293, |
| "loss": 3.3091, |
| "step": 48100 |
| }, |
| { |
| "epoch": 5.182434614142719, |
| "grad_norm": 0.7010188698768616, |
| "learning_rate": 0.00028948820170240274, |
| "loss": 3.3182, |
| "step": 48150 |
| }, |
| { |
| "epoch": 5.187816166182327, |
| "grad_norm": 0.6465266942977905, |
| "learning_rate": 0.00028916496067234133, |
| "loss": 3.3094, |
| "step": 48200 |
| }, |
| { |
| "epoch": 5.193197718221935, |
| "grad_norm": 0.7279083132743835, |
| "learning_rate": 0.0002888417196422799, |
| "loss": 3.3092, |
| "step": 48250 |
| }, |
| { |
| "epoch": 5.198579270261543, |
| "grad_norm": 0.6986241936683655, |
| "learning_rate": 0.00028851847861221847, |
| "loss": 3.2993, |
| "step": 48300 |
| }, |
| { |
| "epoch": 5.203960822301152, |
| "grad_norm": 0.6948270797729492, |
| "learning_rate": 0.00028819523758215706, |
| "loss": 3.3012, |
| "step": 48350 |
| }, |
| { |
| "epoch": 5.20934237434076, |
| "grad_norm": 0.6764682531356812, |
| "learning_rate": 0.00028787199655209566, |
| "loss": 3.3296, |
| "step": 48400 |
| }, |
| { |
| "epoch": 5.214723926380368, |
| "grad_norm": 0.7321643233299255, |
| "learning_rate": 0.00028754875552203425, |
| "loss": 3.3373, |
| "step": 48450 |
| }, |
| { |
| "epoch": 5.220105478419977, |
| "grad_norm": 0.666074275970459, |
| "learning_rate": 0.0002872255144919728, |
| "loss": 3.315, |
| "step": 48500 |
| }, |
| { |
| "epoch": 5.225487030459584, |
| "grad_norm": 0.7783872485160828, |
| "learning_rate": 0.00028690227346191144, |
| "loss": 3.3212, |
| "step": 48550 |
| }, |
| { |
| "epoch": 5.230868582499193, |
| "grad_norm": 0.726060152053833, |
| "learning_rate": 0.00028657903243185, |
| "loss": 3.3096, |
| "step": 48600 |
| }, |
| { |
| "epoch": 5.236250134538801, |
| "grad_norm": 0.6934331655502319, |
| "learning_rate": 0.0002862557914017886, |
| "loss": 3.3258, |
| "step": 48650 |
| }, |
| { |
| "epoch": 5.241631686578409, |
| "grad_norm": 0.684074878692627, |
| "learning_rate": 0.00028593255037172717, |
| "loss": 3.3216, |
| "step": 48700 |
| }, |
| { |
| "epoch": 5.247013238618018, |
| "grad_norm": 0.6767908334732056, |
| "learning_rate": 0.00028560930934166576, |
| "loss": 3.3207, |
| "step": 48750 |
| }, |
| { |
| "epoch": 5.252394790657625, |
| "grad_norm": 0.691230058670044, |
| "learning_rate": 0.00028528606831160436, |
| "loss": 3.3434, |
| "step": 48800 |
| }, |
| { |
| "epoch": 5.257776342697234, |
| "grad_norm": 0.7426613569259644, |
| "learning_rate": 0.0002849628272815429, |
| "loss": 3.3263, |
| "step": 48850 |
| }, |
| { |
| "epoch": 5.2631578947368425, |
| "grad_norm": 0.6871113777160645, |
| "learning_rate": 0.0002846395862514815, |
| "loss": 3.3332, |
| "step": 48900 |
| }, |
| { |
| "epoch": 5.26853944677645, |
| "grad_norm": 0.6987447142601013, |
| "learning_rate": 0.0002843228100420213, |
| "loss": 3.3349, |
| "step": 48950 |
| }, |
| { |
| "epoch": 5.273920998816059, |
| "grad_norm": 0.6851251125335693, |
| "learning_rate": 0.0002839995690119599, |
| "loss": 3.3387, |
| "step": 49000 |
| }, |
| { |
| "epoch": 5.273920998816059, |
| "eval_accuracy": 0.3813237203053016, |
| "eval_loss": 3.4056828022003174, |
| "eval_runtime": 184.6477, |
| "eval_samples_per_second": 97.543, |
| "eval_steps_per_second": 6.098, |
| "step": 49000 |
| }, |
| { |
| "epoch": 5.279302550855666, |
| "grad_norm": 0.6929810047149658, |
| "learning_rate": 0.0002836763279818985, |
| "loss": 3.3249, |
| "step": 49050 |
| }, |
| { |
| "epoch": 5.284684102895275, |
| "grad_norm": 0.7261629104614258, |
| "learning_rate": 0.0002833530869518371, |
| "loss": 3.3331, |
| "step": 49100 |
| }, |
| { |
| "epoch": 5.2900656549348835, |
| "grad_norm": 0.6819291114807129, |
| "learning_rate": 0.0002830298459217756, |
| "loss": 3.326, |
| "step": 49150 |
| }, |
| { |
| "epoch": 5.295447206974491, |
| "grad_norm": 0.7193529009819031, |
| "learning_rate": 0.0002827066048917142, |
| "loss": 3.3126, |
| "step": 49200 |
| }, |
| { |
| "epoch": 5.3008287590141, |
| "grad_norm": 0.7201562523841858, |
| "learning_rate": 0.0002823833638616528, |
| "loss": 3.3113, |
| "step": 49250 |
| }, |
| { |
| "epoch": 5.306210311053708, |
| "grad_norm": 0.6952700614929199, |
| "learning_rate": 0.0002820601228315914, |
| "loss": 3.3216, |
| "step": 49300 |
| }, |
| { |
| "epoch": 5.311591863093316, |
| "grad_norm": 0.6874309778213501, |
| "learning_rate": 0.00028173688180153, |
| "loss": 3.3205, |
| "step": 49350 |
| }, |
| { |
| "epoch": 5.316973415132924, |
| "grad_norm": 0.6886963844299316, |
| "learning_rate": 0.00028141364077146854, |
| "loss": 3.3213, |
| "step": 49400 |
| }, |
| { |
| "epoch": 5.322354967172533, |
| "grad_norm": 0.7013046741485596, |
| "learning_rate": 0.00028109039974140714, |
| "loss": 3.344, |
| "step": 49450 |
| }, |
| { |
| "epoch": 5.327736519212141, |
| "grad_norm": 0.73094242811203, |
| "learning_rate": 0.00028076715871134573, |
| "loss": 3.3239, |
| "step": 49500 |
| }, |
| { |
| "epoch": 5.333118071251749, |
| "grad_norm": 0.7266910076141357, |
| "learning_rate": 0.00028044391768128433, |
| "loss": 3.3593, |
| "step": 49550 |
| }, |
| { |
| "epoch": 5.338499623291357, |
| "grad_norm": 0.6710968017578125, |
| "learning_rate": 0.0002801206766512229, |
| "loss": 3.3211, |
| "step": 49600 |
| }, |
| { |
| "epoch": 5.343881175330965, |
| "grad_norm": 0.6896623373031616, |
| "learning_rate": 0.0002797974356211615, |
| "loss": 3.3291, |
| "step": 49650 |
| }, |
| { |
| "epoch": 5.349262727370574, |
| "grad_norm": 0.7244205474853516, |
| "learning_rate": 0.00027947419459110006, |
| "loss": 3.3124, |
| "step": 49700 |
| }, |
| { |
| "epoch": 5.354644279410182, |
| "grad_norm": 0.7222291827201843, |
| "learning_rate": 0.00027915095356103865, |
| "loss": 3.3257, |
| "step": 49750 |
| }, |
| { |
| "epoch": 5.36002583144979, |
| "grad_norm": 0.6412253379821777, |
| "learning_rate": 0.00027882771253097725, |
| "loss": 3.3272, |
| "step": 49800 |
| }, |
| { |
| "epoch": 5.365407383489399, |
| "grad_norm": 0.699478268623352, |
| "learning_rate": 0.00027850447150091584, |
| "loss": 3.335, |
| "step": 49850 |
| }, |
| { |
| "epoch": 5.370788935529006, |
| "grad_norm": 0.6737022399902344, |
| "learning_rate": 0.00027818123047085444, |
| "loss": 3.3194, |
| "step": 49900 |
| }, |
| { |
| "epoch": 5.376170487568615, |
| "grad_norm": 0.6805015802383423, |
| "learning_rate": 0.000277857989440793, |
| "loss": 3.3292, |
| "step": 49950 |
| }, |
| { |
| "epoch": 5.3815520396082235, |
| "grad_norm": 0.7129510641098022, |
| "learning_rate": 0.00027753474841073157, |
| "loss": 3.3388, |
| "step": 50000 |
| }, |
| { |
| "epoch": 5.3815520396082235, |
| "eval_accuracy": 0.3819017536684746, |
| "eval_loss": 3.400902032852173, |
| "eval_runtime": 184.6882, |
| "eval_samples_per_second": 97.521, |
| "eval_steps_per_second": 6.097, |
| "step": 50000 |
| }, |
| { |
| "epoch": 5.386933591647831, |
| "grad_norm": 0.7065452933311462, |
| "learning_rate": 0.00027721150738067017, |
| "loss": 3.3545, |
| "step": 50050 |
| }, |
| { |
| "epoch": 5.39231514368744, |
| "grad_norm": 0.714790403842926, |
| "learning_rate": 0.00027688826635060876, |
| "loss": 3.3329, |
| "step": 50100 |
| }, |
| { |
| "epoch": 5.397696695727047, |
| "grad_norm": 0.7263872027397156, |
| "learning_rate": 0.0002765650253205473, |
| "loss": 3.3431, |
| "step": 50150 |
| }, |
| { |
| "epoch": 5.403078247766656, |
| "grad_norm": 0.6966455578804016, |
| "learning_rate": 0.00027624178429048595, |
| "loss": 3.3286, |
| "step": 50200 |
| }, |
| { |
| "epoch": 5.4084597998062645, |
| "grad_norm": 0.716153621673584, |
| "learning_rate": 0.0002759185432604245, |
| "loss": 3.3326, |
| "step": 50250 |
| }, |
| { |
| "epoch": 5.413841351845872, |
| "grad_norm": 0.7463889718055725, |
| "learning_rate": 0.0002755953022303631, |
| "loss": 3.3212, |
| "step": 50300 |
| }, |
| { |
| "epoch": 5.419222903885481, |
| "grad_norm": 0.6929644346237183, |
| "learning_rate": 0.0002752720612003017, |
| "loss": 3.3268, |
| "step": 50350 |
| }, |
| { |
| "epoch": 5.424604455925088, |
| "grad_norm": 0.7609820365905762, |
| "learning_rate": 0.0002749488201702403, |
| "loss": 3.3446, |
| "step": 50400 |
| }, |
| { |
| "epoch": 5.429986007964697, |
| "grad_norm": 0.721838116645813, |
| "learning_rate": 0.0002746255791401788, |
| "loss": 3.3271, |
| "step": 50450 |
| }, |
| { |
| "epoch": 5.435367560004305, |
| "grad_norm": 0.7337953448295593, |
| "learning_rate": 0.0002743023381101174, |
| "loss": 3.3351, |
| "step": 50500 |
| }, |
| { |
| "epoch": 5.440749112043913, |
| "grad_norm": 0.7078254818916321, |
| "learning_rate": 0.000273979097080056, |
| "loss": 3.326, |
| "step": 50550 |
| }, |
| { |
| "epoch": 5.446130664083522, |
| "grad_norm": 0.6884511709213257, |
| "learning_rate": 0.0002736558560499946, |
| "loss": 3.3364, |
| "step": 50600 |
| }, |
| { |
| "epoch": 5.45151221612313, |
| "grad_norm": 0.6717587113380432, |
| "learning_rate": 0.0002733326150199332, |
| "loss": 3.3209, |
| "step": 50650 |
| }, |
| { |
| "epoch": 5.456893768162738, |
| "grad_norm": 0.7340549826622009, |
| "learning_rate": 0.00027300937398987173, |
| "loss": 3.3344, |
| "step": 50700 |
| }, |
| { |
| "epoch": 5.462275320202346, |
| "grad_norm": 0.7046282291412354, |
| "learning_rate": 0.0002726861329598104, |
| "loss": 3.3286, |
| "step": 50750 |
| }, |
| { |
| "epoch": 5.467656872241955, |
| "grad_norm": 0.7176598906517029, |
| "learning_rate": 0.0002723628919297489, |
| "loss": 3.335, |
| "step": 50800 |
| }, |
| { |
| "epoch": 5.473038424281563, |
| "grad_norm": 0.7335805296897888, |
| "learning_rate": 0.0002720396508996875, |
| "loss": 3.3395, |
| "step": 50850 |
| }, |
| { |
| "epoch": 5.478419976321171, |
| "grad_norm": 0.7153794169425964, |
| "learning_rate": 0.0002717164098696261, |
| "loss": 3.3335, |
| "step": 50900 |
| }, |
| { |
| "epoch": 5.483801528360779, |
| "grad_norm": 0.7129613757133484, |
| "learning_rate": 0.00027139316883956465, |
| "loss": 3.34, |
| "step": 50950 |
| }, |
| { |
| "epoch": 5.489183080400387, |
| "grad_norm": 0.701945424079895, |
| "learning_rate": 0.00027106992780950325, |
| "loss": 3.3306, |
| "step": 51000 |
| }, |
| { |
| "epoch": 5.489183080400387, |
| "eval_accuracy": 0.3824516459337037, |
| "eval_loss": 3.3981103897094727, |
| "eval_runtime": 184.7536, |
| "eval_samples_per_second": 97.487, |
| "eval_steps_per_second": 6.095, |
| "step": 51000 |
| }, |
| { |
| "epoch": 5.494564632439996, |
| "grad_norm": 0.6667730212211609, |
| "learning_rate": 0.00027074668677944184, |
| "loss": 3.3239, |
| "step": 51050 |
| }, |
| { |
| "epoch": 5.499946184479604, |
| "grad_norm": 0.6833287477493286, |
| "learning_rate": 0.00027042344574938044, |
| "loss": 3.33, |
| "step": 51100 |
| }, |
| { |
| "epoch": 5.505327736519212, |
| "grad_norm": 0.7144169211387634, |
| "learning_rate": 0.000270100204719319, |
| "loss": 3.3382, |
| "step": 51150 |
| }, |
| { |
| "epoch": 5.510709288558821, |
| "grad_norm": 0.7473929524421692, |
| "learning_rate": 0.0002697769636892576, |
| "loss": 3.3129, |
| "step": 51200 |
| }, |
| { |
| "epoch": 5.516090840598428, |
| "grad_norm": 0.7221124172210693, |
| "learning_rate": 0.00026945372265919617, |
| "loss": 3.3577, |
| "step": 51250 |
| }, |
| { |
| "epoch": 5.521472392638037, |
| "grad_norm": 0.7008888721466064, |
| "learning_rate": 0.00026913048162913476, |
| "loss": 3.315, |
| "step": 51300 |
| }, |
| { |
| "epoch": 5.5268539446776455, |
| "grad_norm": 0.7081114649772644, |
| "learning_rate": 0.00026880724059907336, |
| "loss": 3.3402, |
| "step": 51350 |
| }, |
| { |
| "epoch": 5.532235496717253, |
| "grad_norm": 0.7221799492835999, |
| "learning_rate": 0.00026848399956901195, |
| "loss": 3.3182, |
| "step": 51400 |
| }, |
| { |
| "epoch": 5.537617048756862, |
| "grad_norm": 0.6946343779563904, |
| "learning_rate": 0.0002681607585389505, |
| "loss": 3.3361, |
| "step": 51450 |
| }, |
| { |
| "epoch": 5.542998600796469, |
| "grad_norm": 0.702681303024292, |
| "learning_rate": 0.0002678375175088891, |
| "loss": 3.3399, |
| "step": 51500 |
| }, |
| { |
| "epoch": 5.548380152836078, |
| "grad_norm": 0.7525026798248291, |
| "learning_rate": 0.0002675142764788277, |
| "loss": 3.3242, |
| "step": 51550 |
| }, |
| { |
| "epoch": 5.553761704875686, |
| "grad_norm": 0.772919774055481, |
| "learning_rate": 0.0002671910354487663, |
| "loss": 3.322, |
| "step": 51600 |
| }, |
| { |
| "epoch": 5.559143256915294, |
| "grad_norm": 0.6999807953834534, |
| "learning_rate": 0.00026686779441870487, |
| "loss": 3.3294, |
| "step": 51650 |
| }, |
| { |
| "epoch": 5.564524808954903, |
| "grad_norm": 0.7455286979675293, |
| "learning_rate": 0.0002665445533886434, |
| "loss": 3.3344, |
| "step": 51700 |
| }, |
| { |
| "epoch": 5.569906360994511, |
| "grad_norm": 0.7175178527832031, |
| "learning_rate": 0.00026622131235858206, |
| "loss": 3.3322, |
| "step": 51750 |
| }, |
| { |
| "epoch": 5.575287913034119, |
| "grad_norm": 0.7250377535820007, |
| "learning_rate": 0.0002658980713285206, |
| "loss": 3.3416, |
| "step": 51800 |
| }, |
| { |
| "epoch": 5.580669465073727, |
| "grad_norm": 0.7095009088516235, |
| "learning_rate": 0.0002655748302984592, |
| "loss": 3.3336, |
| "step": 51850 |
| }, |
| { |
| "epoch": 5.586051017113336, |
| "grad_norm": 0.757713258266449, |
| "learning_rate": 0.0002652515892683978, |
| "loss": 3.3384, |
| "step": 51900 |
| }, |
| { |
| "epoch": 5.591432569152944, |
| "grad_norm": 0.7264611124992371, |
| "learning_rate": 0.0002649283482383364, |
| "loss": 3.3279, |
| "step": 51950 |
| }, |
| { |
| "epoch": 5.596814121192552, |
| "grad_norm": 0.7121990323066711, |
| "learning_rate": 0.0002646051072082749, |
| "loss": 3.3177, |
| "step": 52000 |
| }, |
| { |
| "epoch": 5.596814121192552, |
| "eval_accuracy": 0.3827728238700833, |
| "eval_loss": 3.3929855823516846, |
| "eval_runtime": 184.7294, |
| "eval_samples_per_second": 97.499, |
| "eval_steps_per_second": 6.095, |
| "step": 52000 |
| }, |
| { |
| "epoch": 5.60219567323216, |
| "grad_norm": 0.7471087574958801, |
| "learning_rate": 0.0002642818661782135, |
| "loss": 3.3679, |
| "step": 52050 |
| }, |
| { |
| "epoch": 5.607577225271768, |
| "grad_norm": 0.7511977553367615, |
| "learning_rate": 0.0002639586251481521, |
| "loss": 3.3404, |
| "step": 52100 |
| }, |
| { |
| "epoch": 5.612958777311377, |
| "grad_norm": 0.6835429072380066, |
| "learning_rate": 0.0002636353841180907, |
| "loss": 3.3509, |
| "step": 52150 |
| }, |
| { |
| "epoch": 5.618340329350985, |
| "grad_norm": 0.7123720645904541, |
| "learning_rate": 0.0002633121430880293, |
| "loss": 3.3175, |
| "step": 52200 |
| }, |
| { |
| "epoch": 5.623721881390593, |
| "grad_norm": 0.7043327689170837, |
| "learning_rate": 0.00026298890205796784, |
| "loss": 3.3321, |
| "step": 52250 |
| }, |
| { |
| "epoch": 5.629103433430201, |
| "grad_norm": 0.7442822456359863, |
| "learning_rate": 0.00026266566102790644, |
| "loss": 3.3466, |
| "step": 52300 |
| }, |
| { |
| "epoch": 5.634484985469809, |
| "grad_norm": 0.6664136648178101, |
| "learning_rate": 0.00026234241999784503, |
| "loss": 3.3312, |
| "step": 52350 |
| }, |
| { |
| "epoch": 5.639866537509418, |
| "grad_norm": 0.7649520039558411, |
| "learning_rate": 0.0002620191789677836, |
| "loss": 3.3195, |
| "step": 52400 |
| }, |
| { |
| "epoch": 5.645248089549026, |
| "grad_norm": 0.717771053314209, |
| "learning_rate": 0.00026169593793772217, |
| "loss": 3.3188, |
| "step": 52450 |
| }, |
| { |
| "epoch": 5.650629641588634, |
| "grad_norm": 0.7109053134918213, |
| "learning_rate": 0.0002613726969076608, |
| "loss": 3.3444, |
| "step": 52500 |
| }, |
| { |
| "epoch": 5.656011193628243, |
| "grad_norm": 0.6929636001586914, |
| "learning_rate": 0.00026104945587759936, |
| "loss": 3.3419, |
| "step": 52550 |
| }, |
| { |
| "epoch": 5.66139274566785, |
| "grad_norm": 0.7861093282699585, |
| "learning_rate": 0.00026072621484753795, |
| "loss": 3.3373, |
| "step": 52600 |
| }, |
| { |
| "epoch": 5.666774297707459, |
| "grad_norm": 0.7142403721809387, |
| "learning_rate": 0.00026040297381747655, |
| "loss": 3.3326, |
| "step": 52650 |
| }, |
| { |
| "epoch": 5.672155849747067, |
| "grad_norm": 0.7610068321228027, |
| "learning_rate": 0.00026007973278741514, |
| "loss": 3.3319, |
| "step": 52700 |
| }, |
| { |
| "epoch": 5.677537401786675, |
| "grad_norm": 0.7404228448867798, |
| "learning_rate": 0.00025975649175735373, |
| "loss": 3.3378, |
| "step": 52750 |
| }, |
| { |
| "epoch": 5.682918953826284, |
| "grad_norm": 0.6821990013122559, |
| "learning_rate": 0.0002594332507272923, |
| "loss": 3.3357, |
| "step": 52800 |
| }, |
| { |
| "epoch": 5.688300505865891, |
| "grad_norm": 0.7214856743812561, |
| "learning_rate": 0.00025911000969723087, |
| "loss": 3.3244, |
| "step": 52850 |
| }, |
| { |
| "epoch": 5.6936820579055, |
| "grad_norm": 0.7556411623954773, |
| "learning_rate": 0.00025878676866716946, |
| "loss": 3.3249, |
| "step": 52900 |
| }, |
| { |
| "epoch": 5.699063609945108, |
| "grad_norm": 0.7074944376945496, |
| "learning_rate": 0.00025846352763710806, |
| "loss": 3.3231, |
| "step": 52950 |
| }, |
| { |
| "epoch": 5.704445161984716, |
| "grad_norm": 0.7696301341056824, |
| "learning_rate": 0.00025814675142764787, |
| "loss": 3.3337, |
| "step": 53000 |
| }, |
| { |
| "epoch": 5.704445161984716, |
| "eval_accuracy": 0.38316332234888856, |
| "eval_loss": 3.3885035514831543, |
| "eval_runtime": 184.6186, |
| "eval_samples_per_second": 97.558, |
| "eval_steps_per_second": 6.099, |
| "step": 53000 |
| }, |
| { |
| "epoch": 5.709826714024325, |
| "grad_norm": 0.7107107043266296, |
| "learning_rate": 0.00025782351039758646, |
| "loss": 3.3093, |
| "step": 53050 |
| }, |
| { |
| "epoch": 5.715208266063933, |
| "grad_norm": 0.7684431672096252, |
| "learning_rate": 0.000257500269367525, |
| "loss": 3.3285, |
| "step": 53100 |
| }, |
| { |
| "epoch": 5.720589818103541, |
| "grad_norm": 0.7184401154518127, |
| "learning_rate": 0.0002571770283374636, |
| "loss": 3.3186, |
| "step": 53150 |
| }, |
| { |
| "epoch": 5.725971370143149, |
| "grad_norm": 0.6955456733703613, |
| "learning_rate": 0.0002568537873074022, |
| "loss": 3.3399, |
| "step": 53200 |
| }, |
| { |
| "epoch": 5.731352922182758, |
| "grad_norm": 0.7292144298553467, |
| "learning_rate": 0.0002565305462773408, |
| "loss": 3.3194, |
| "step": 53250 |
| }, |
| { |
| "epoch": 5.736734474222366, |
| "grad_norm": 0.7374510169029236, |
| "learning_rate": 0.0002562073052472794, |
| "loss": 3.3199, |
| "step": 53300 |
| }, |
| { |
| "epoch": 5.742116026261974, |
| "grad_norm": 0.6997143030166626, |
| "learning_rate": 0.0002558840642172179, |
| "loss": 3.3359, |
| "step": 53350 |
| }, |
| { |
| "epoch": 5.747497578301582, |
| "grad_norm": 0.755856454372406, |
| "learning_rate": 0.00025556082318715657, |
| "loss": 3.3198, |
| "step": 53400 |
| }, |
| { |
| "epoch": 5.75287913034119, |
| "grad_norm": 0.7310744524002075, |
| "learning_rate": 0.0002552375821570951, |
| "loss": 3.3347, |
| "step": 53450 |
| }, |
| { |
| "epoch": 5.758260682380799, |
| "grad_norm": 0.767074704170227, |
| "learning_rate": 0.0002549143411270337, |
| "loss": 3.3416, |
| "step": 53500 |
| }, |
| { |
| "epoch": 5.763642234420407, |
| "grad_norm": 0.7051040530204773, |
| "learning_rate": 0.0002545911000969723, |
| "loss": 3.3338, |
| "step": 53550 |
| }, |
| { |
| "epoch": 5.769023786460015, |
| "grad_norm": 0.7256135940551758, |
| "learning_rate": 0.0002542678590669109, |
| "loss": 3.3486, |
| "step": 53600 |
| }, |
| { |
| "epoch": 5.774405338499624, |
| "grad_norm": 0.7255988121032715, |
| "learning_rate": 0.00025394461803684943, |
| "loss": 3.3406, |
| "step": 53650 |
| }, |
| { |
| "epoch": 5.779786890539231, |
| "grad_norm": 0.7210702300071716, |
| "learning_rate": 0.00025362137700678803, |
| "loss": 3.3282, |
| "step": 53700 |
| }, |
| { |
| "epoch": 5.78516844257884, |
| "grad_norm": 0.7039930820465088, |
| "learning_rate": 0.0002532981359767266, |
| "loss": 3.3353, |
| "step": 53750 |
| }, |
| { |
| "epoch": 5.790549994618448, |
| "grad_norm": 0.7262842059135437, |
| "learning_rate": 0.0002529748949466652, |
| "loss": 3.3549, |
| "step": 53800 |
| }, |
| { |
| "epoch": 5.795931546658056, |
| "grad_norm": 0.718593180179596, |
| "learning_rate": 0.0002526516539166038, |
| "loss": 3.3341, |
| "step": 53850 |
| }, |
| { |
| "epoch": 5.801313098697665, |
| "grad_norm": 0.7095365524291992, |
| "learning_rate": 0.00025232841288654235, |
| "loss": 3.3327, |
| "step": 53900 |
| }, |
| { |
| "epoch": 5.806694650737272, |
| "grad_norm": 0.7695627212524414, |
| "learning_rate": 0.00025200517185648095, |
| "loss": 3.3315, |
| "step": 53950 |
| }, |
| { |
| "epoch": 5.812076202776881, |
| "grad_norm": 0.699380099773407, |
| "learning_rate": 0.00025168193082641954, |
| "loss": 3.344, |
| "step": 54000 |
| }, |
| { |
| "epoch": 5.812076202776881, |
| "eval_accuracy": 0.38343777954350794, |
| "eval_loss": 3.3852038383483887, |
| "eval_runtime": 184.9031, |
| "eval_samples_per_second": 97.408, |
| "eval_steps_per_second": 6.09, |
| "step": 54000 |
| }, |
| { |
| "epoch": 5.817457754816489, |
| "grad_norm": 0.7718612551689148, |
| "learning_rate": 0.00025135868979635814, |
| "loss": 3.3475, |
| "step": 54050 |
| }, |
| { |
| "epoch": 5.822839306856097, |
| "grad_norm": 0.7421673536300659, |
| "learning_rate": 0.00025104191358689794, |
| "loss": 3.3234, |
| "step": 54100 |
| }, |
| { |
| "epoch": 5.828220858895706, |
| "grad_norm": 0.728031575679779, |
| "learning_rate": 0.00025071867255683654, |
| "loss": 3.3411, |
| "step": 54150 |
| }, |
| { |
| "epoch": 5.833602410935313, |
| "grad_norm": 0.7653467655181885, |
| "learning_rate": 0.00025039543152677513, |
| "loss": 3.3354, |
| "step": 54200 |
| }, |
| { |
| "epoch": 5.838983962974922, |
| "grad_norm": 0.6954116821289062, |
| "learning_rate": 0.0002500721904967137, |
| "loss": 3.3503, |
| "step": 54250 |
| }, |
| { |
| "epoch": 5.84436551501453, |
| "grad_norm": 0.7203173637390137, |
| "learning_rate": 0.00024974894946665227, |
| "loss": 3.3408, |
| "step": 54300 |
| }, |
| { |
| "epoch": 5.849747067054138, |
| "grad_norm": 0.7083039879798889, |
| "learning_rate": 0.00024942570843659086, |
| "loss": 3.3285, |
| "step": 54350 |
| }, |
| { |
| "epoch": 5.855128619093747, |
| "grad_norm": 0.8395054936408997, |
| "learning_rate": 0.00024910246740652946, |
| "loss": 3.3423, |
| "step": 54400 |
| }, |
| { |
| "epoch": 5.860510171133355, |
| "grad_norm": 0.7595872282981873, |
| "learning_rate": 0.000248779226376468, |
| "loss": 3.3395, |
| "step": 54450 |
| }, |
| { |
| "epoch": 5.865891723172963, |
| "grad_norm": 0.7334976196289062, |
| "learning_rate": 0.00024845598534640665, |
| "loss": 3.3299, |
| "step": 54500 |
| }, |
| { |
| "epoch": 5.871273275212571, |
| "grad_norm": 0.7502398490905762, |
| "learning_rate": 0.0002481327443163452, |
| "loss": 3.3213, |
| "step": 54550 |
| }, |
| { |
| "epoch": 5.87665482725218, |
| "grad_norm": 0.8335617184638977, |
| "learning_rate": 0.0002478095032862838, |
| "loss": 3.3317, |
| "step": 54600 |
| }, |
| { |
| "epoch": 5.882036379291788, |
| "grad_norm": 0.7220059037208557, |
| "learning_rate": 0.0002474862622562224, |
| "loss": 3.3367, |
| "step": 54650 |
| }, |
| { |
| "epoch": 5.887417931331396, |
| "grad_norm": 0.7250782251358032, |
| "learning_rate": 0.00024716302122616097, |
| "loss": 3.3122, |
| "step": 54700 |
| }, |
| { |
| "epoch": 5.892799483371004, |
| "grad_norm": 0.7305442690849304, |
| "learning_rate": 0.0002468397801960995, |
| "loss": 3.3405, |
| "step": 54750 |
| }, |
| { |
| "epoch": 5.898181035410612, |
| "grad_norm": 0.6805377006530762, |
| "learning_rate": 0.0002465165391660381, |
| "loss": 3.3371, |
| "step": 54800 |
| }, |
| { |
| "epoch": 5.903562587450221, |
| "grad_norm": 0.7381418347358704, |
| "learning_rate": 0.0002461932981359767, |
| "loss": 3.3307, |
| "step": 54850 |
| }, |
| { |
| "epoch": 5.9089441394898286, |
| "grad_norm": 0.7628136873245239, |
| "learning_rate": 0.0002458700571059153, |
| "loss": 3.3325, |
| "step": 54900 |
| }, |
| { |
| "epoch": 5.914325691529437, |
| "grad_norm": 0.6967856884002686, |
| "learning_rate": 0.0002455468160758539, |
| "loss": 3.3264, |
| "step": 54950 |
| }, |
| { |
| "epoch": 5.919707243569046, |
| "grad_norm": 0.7036657333374023, |
| "learning_rate": 0.00024522357504579243, |
| "loss": 3.3345, |
| "step": 55000 |
| }, |
| { |
| "epoch": 5.919707243569046, |
| "eval_accuracy": 0.3839816958999072, |
| "eval_loss": 3.3790087699890137, |
| "eval_runtime": 184.9878, |
| "eval_samples_per_second": 97.363, |
| "eval_steps_per_second": 6.087, |
| "step": 55000 |
| }, |
| { |
| "epoch": 5.925088795608653, |
| "grad_norm": 0.7337253093719482, |
| "learning_rate": 0.000244900334015731, |
| "loss": 3.333, |
| "step": 55050 |
| }, |
| { |
| "epoch": 5.930470347648262, |
| "grad_norm": 0.7193291783332825, |
| "learning_rate": 0.0002445770929856696, |
| "loss": 3.3383, |
| "step": 55100 |
| }, |
| { |
| "epoch": 5.93585189968787, |
| "grad_norm": 0.6923545598983765, |
| "learning_rate": 0.0002442538519556082, |
| "loss": 3.3076, |
| "step": 55150 |
| }, |
| { |
| "epoch": 5.941233451727478, |
| "grad_norm": 0.7571631073951721, |
| "learning_rate": 0.0002439306109255468, |
| "loss": 3.3281, |
| "step": 55200 |
| }, |
| { |
| "epoch": 5.946615003767087, |
| "grad_norm": 0.7402598857879639, |
| "learning_rate": 0.00024360736989548538, |
| "loss": 3.333, |
| "step": 55250 |
| }, |
| { |
| "epoch": 5.951996555806694, |
| "grad_norm": 0.7690083384513855, |
| "learning_rate": 0.00024328412886542394, |
| "loss": 3.3345, |
| "step": 55300 |
| }, |
| { |
| "epoch": 5.957378107846303, |
| "grad_norm": 0.6994602084159851, |
| "learning_rate": 0.00024296088783536257, |
| "loss": 3.3188, |
| "step": 55350 |
| }, |
| { |
| "epoch": 5.962759659885911, |
| "grad_norm": 0.7294467091560364, |
| "learning_rate": 0.00024263764680530113, |
| "loss": 3.3436, |
| "step": 55400 |
| }, |
| { |
| "epoch": 5.968141211925519, |
| "grad_norm": 0.7167765498161316, |
| "learning_rate": 0.0002423144057752397, |
| "loss": 3.3152, |
| "step": 55450 |
| }, |
| { |
| "epoch": 5.973522763965128, |
| "grad_norm": 0.733696699142456, |
| "learning_rate": 0.00024199116474517832, |
| "loss": 3.3195, |
| "step": 55500 |
| }, |
| { |
| "epoch": 5.978904316004736, |
| "grad_norm": 0.7189485430717468, |
| "learning_rate": 0.0002416679237151169, |
| "loss": 3.3247, |
| "step": 55550 |
| }, |
| { |
| "epoch": 5.984285868044344, |
| "grad_norm": 0.7302901148796082, |
| "learning_rate": 0.00024134468268505546, |
| "loss": 3.3269, |
| "step": 55600 |
| }, |
| { |
| "epoch": 5.989667420083952, |
| "grad_norm": 0.7239100337028503, |
| "learning_rate": 0.00024102144165499405, |
| "loss": 3.3427, |
| "step": 55650 |
| }, |
| { |
| "epoch": 5.995048972123561, |
| "grad_norm": 0.7289806604385376, |
| "learning_rate": 0.00024069820062493265, |
| "loss": 3.3162, |
| "step": 55700 |
| }, |
| { |
| "epoch": 6.000430524163169, |
| "grad_norm": 0.7002869248390198, |
| "learning_rate": 0.00024037495959487121, |
| "loss": 3.3247, |
| "step": 55750 |
| }, |
| { |
| "epoch": 6.005812076202777, |
| "grad_norm": 0.7519763708114624, |
| "learning_rate": 0.0002400517185648098, |
| "loss": 3.2449, |
| "step": 55800 |
| }, |
| { |
| "epoch": 6.011193628242385, |
| "grad_norm": 0.7565599679946899, |
| "learning_rate": 0.00023972847753474838, |
| "loss": 3.2444, |
| "step": 55850 |
| }, |
| { |
| "epoch": 6.016575180281993, |
| "grad_norm": 0.7228876352310181, |
| "learning_rate": 0.00023940523650468697, |
| "loss": 3.2454, |
| "step": 55900 |
| }, |
| { |
| "epoch": 6.021956732321602, |
| "grad_norm": 0.7430625557899475, |
| "learning_rate": 0.00023908199547462557, |
| "loss": 3.26, |
| "step": 55950 |
| }, |
| { |
| "epoch": 6.0273382843612096, |
| "grad_norm": 0.7767698764801025, |
| "learning_rate": 0.00023875875444456413, |
| "loss": 3.2586, |
| "step": 56000 |
| }, |
| { |
| "epoch": 6.0273382843612096, |
| "eval_accuracy": 0.3844594426476275, |
| "eval_loss": 3.383514642715454, |
| "eval_runtime": 184.7551, |
| "eval_samples_per_second": 97.486, |
| "eval_steps_per_second": 6.095, |
| "step": 56000 |
| }, |
| { |
| "epoch": 6.032719836400818, |
| "grad_norm": 0.7343913316726685, |
| "learning_rate": 0.0002384355134145027, |
| "loss": 3.2529, |
| "step": 56050 |
| }, |
| { |
| "epoch": 6.038101388440427, |
| "grad_norm": 0.745909571647644, |
| "learning_rate": 0.00023811227238444132, |
| "loss": 3.242, |
| "step": 56100 |
| }, |
| { |
| "epoch": 6.043482940480034, |
| "grad_norm": 0.7055720090866089, |
| "learning_rate": 0.0002377890313543799, |
| "loss": 3.2403, |
| "step": 56150 |
| }, |
| { |
| "epoch": 6.048864492519643, |
| "grad_norm": 0.7497826218605042, |
| "learning_rate": 0.00023746579032431849, |
| "loss": 3.243, |
| "step": 56200 |
| }, |
| { |
| "epoch": 6.0542460445592505, |
| "grad_norm": 0.7048562169075012, |
| "learning_rate": 0.00023714254929425708, |
| "loss": 3.242, |
| "step": 56250 |
| }, |
| { |
| "epoch": 6.059627596598859, |
| "grad_norm": 0.7660420536994934, |
| "learning_rate": 0.00023681930826419565, |
| "loss": 3.2648, |
| "step": 56300 |
| }, |
| { |
| "epoch": 6.065009148638468, |
| "grad_norm": 0.7854742407798767, |
| "learning_rate": 0.00023649606723413424, |
| "loss": 3.2551, |
| "step": 56350 |
| }, |
| { |
| "epoch": 6.070390700678075, |
| "grad_norm": 0.7162970900535583, |
| "learning_rate": 0.0002361728262040728, |
| "loss": 3.2563, |
| "step": 56400 |
| }, |
| { |
| "epoch": 6.075772252717684, |
| "grad_norm": 0.7376888990402222, |
| "learning_rate": 0.00023584958517401138, |
| "loss": 3.2507, |
| "step": 56450 |
| }, |
| { |
| "epoch": 6.081153804757292, |
| "grad_norm": 0.7727888226509094, |
| "learning_rate": 0.00023552634414395, |
| "loss": 3.2685, |
| "step": 56500 |
| }, |
| { |
| "epoch": 6.0865353567969, |
| "grad_norm": 0.7515780925750732, |
| "learning_rate": 0.00023520310311388857, |
| "loss": 3.2594, |
| "step": 56550 |
| }, |
| { |
| "epoch": 6.091916908836509, |
| "grad_norm": 0.721759557723999, |
| "learning_rate": 0.00023487986208382713, |
| "loss": 3.2608, |
| "step": 56600 |
| }, |
| { |
| "epoch": 6.097298460876116, |
| "grad_norm": 0.6812422871589661, |
| "learning_rate": 0.00023455662105376576, |
| "loss": 3.2503, |
| "step": 56650 |
| }, |
| { |
| "epoch": 6.102680012915725, |
| "grad_norm": 0.7772680521011353, |
| "learning_rate": 0.00023423338002370432, |
| "loss": 3.2606, |
| "step": 56700 |
| }, |
| { |
| "epoch": 6.108061564955333, |
| "grad_norm": 0.7657935619354248, |
| "learning_rate": 0.0002339101389936429, |
| "loss": 3.2543, |
| "step": 56750 |
| }, |
| { |
| "epoch": 6.113443116994941, |
| "grad_norm": 0.7089388370513916, |
| "learning_rate": 0.00023358689796358149, |
| "loss": 3.2707, |
| "step": 56800 |
| }, |
| { |
| "epoch": 6.11882466903455, |
| "grad_norm": 0.7366516590118408, |
| "learning_rate": 0.00023326365693352008, |
| "loss": 3.2698, |
| "step": 56850 |
| }, |
| { |
| "epoch": 6.124206221074158, |
| "grad_norm": 0.7500792145729065, |
| "learning_rate": 0.00023294041590345865, |
| "loss": 3.2517, |
| "step": 56900 |
| }, |
| { |
| "epoch": 6.129587773113766, |
| "grad_norm": 0.7379442453384399, |
| "learning_rate": 0.00023261717487339724, |
| "loss": 3.2377, |
| "step": 56950 |
| }, |
| { |
| "epoch": 6.134969325153374, |
| "grad_norm": 0.7361008524894714, |
| "learning_rate": 0.0002322939338433358, |
| "loss": 3.2657, |
| "step": 57000 |
| }, |
| { |
| "epoch": 6.134969325153374, |
| "eval_accuracy": 0.3848917724882413, |
| "eval_loss": 3.3805084228515625, |
| "eval_runtime": 184.8323, |
| "eval_samples_per_second": 97.445, |
| "eval_steps_per_second": 6.092, |
| "step": 57000 |
| }, |
| { |
| "epoch": 6.140350877192983, |
| "grad_norm": 0.7512038350105286, |
| "learning_rate": 0.0002319706928132744, |
| "loss": 3.2544, |
| "step": 57050 |
| }, |
| { |
| "epoch": 6.1457324292325906, |
| "grad_norm": 0.7466521859169006, |
| "learning_rate": 0.000231647451783213, |
| "loss": 3.2559, |
| "step": 57100 |
| }, |
| { |
| "epoch": 6.151113981272199, |
| "grad_norm": 0.7655845880508423, |
| "learning_rate": 0.00023132421075315157, |
| "loss": 3.2716, |
| "step": 57150 |
| }, |
| { |
| "epoch": 6.156495533311807, |
| "grad_norm": 0.7715071439743042, |
| "learning_rate": 0.0002310009697230902, |
| "loss": 3.2597, |
| "step": 57200 |
| }, |
| { |
| "epoch": 6.161877085351415, |
| "grad_norm": 0.7462551593780518, |
| "learning_rate": 0.00023067772869302876, |
| "loss": 3.2619, |
| "step": 57250 |
| }, |
| { |
| "epoch": 6.167258637391024, |
| "grad_norm": 0.6912649273872375, |
| "learning_rate": 0.00023035448766296732, |
| "loss": 3.254, |
| "step": 57300 |
| }, |
| { |
| "epoch": 6.1726401894306315, |
| "grad_norm": 0.7506750822067261, |
| "learning_rate": 0.00023003124663290592, |
| "loss": 3.2706, |
| "step": 57350 |
| }, |
| { |
| "epoch": 6.17802174147024, |
| "grad_norm": 0.7650833129882812, |
| "learning_rate": 0.0002297080056028445, |
| "loss": 3.2581, |
| "step": 57400 |
| }, |
| { |
| "epoch": 6.183403293509849, |
| "grad_norm": 0.7462422251701355, |
| "learning_rate": 0.00022938476457278308, |
| "loss": 3.2885, |
| "step": 57450 |
| }, |
| { |
| "epoch": 6.188784845549456, |
| "grad_norm": 0.7575925588607788, |
| "learning_rate": 0.00022906152354272168, |
| "loss": 3.2697, |
| "step": 57500 |
| }, |
| { |
| "epoch": 6.194166397589065, |
| "grad_norm": 0.7644687294960022, |
| "learning_rate": 0.00022873828251266024, |
| "loss": 3.2552, |
| "step": 57550 |
| }, |
| { |
| "epoch": 6.1995479496286725, |
| "grad_norm": 0.7716279625892639, |
| "learning_rate": 0.00022841504148259884, |
| "loss": 3.2557, |
| "step": 57600 |
| }, |
| { |
| "epoch": 6.204929501668281, |
| "grad_norm": 0.7543048858642578, |
| "learning_rate": 0.00022809180045253743, |
| "loss": 3.2624, |
| "step": 57650 |
| }, |
| { |
| "epoch": 6.21031105370789, |
| "grad_norm": 0.7387022376060486, |
| "learning_rate": 0.000227768559422476, |
| "loss": 3.2646, |
| "step": 57700 |
| }, |
| { |
| "epoch": 6.215692605747497, |
| "grad_norm": 0.8519375324249268, |
| "learning_rate": 0.00022744531839241457, |
| "loss": 3.2812, |
| "step": 57750 |
| }, |
| { |
| "epoch": 6.221074157787106, |
| "grad_norm": 0.7521229982376099, |
| "learning_rate": 0.0002271220773623532, |
| "loss": 3.2558, |
| "step": 57800 |
| }, |
| { |
| "epoch": 6.226455709826714, |
| "grad_norm": 0.7615373134613037, |
| "learning_rate": 0.00022679883633229176, |
| "loss": 3.2676, |
| "step": 57850 |
| }, |
| { |
| "epoch": 6.231837261866322, |
| "grad_norm": 0.7787977457046509, |
| "learning_rate": 0.00022647559530223032, |
| "loss": 3.2631, |
| "step": 57900 |
| }, |
| { |
| "epoch": 6.237218813905931, |
| "grad_norm": 0.7316073179244995, |
| "learning_rate": 0.00022615235427216895, |
| "loss": 3.2677, |
| "step": 57950 |
| }, |
| { |
| "epoch": 6.242600365945538, |
| "grad_norm": 0.7252430319786072, |
| "learning_rate": 0.0002258291132421075, |
| "loss": 3.2792, |
| "step": 58000 |
| }, |
| { |
| "epoch": 6.242600365945538, |
| "eval_accuracy": 0.38463469975567227, |
| "eval_loss": 3.379333019256592, |
| "eval_runtime": 184.8261, |
| "eval_samples_per_second": 97.448, |
| "eval_steps_per_second": 6.092, |
| "step": 58000 |
| }, |
| { |
| "epoch": 6.247981917985147, |
| "grad_norm": 0.7599756121635437, |
| "learning_rate": 0.0002255058722120461, |
| "loss": 3.2763, |
| "step": 58050 |
| }, |
| { |
| "epoch": 6.253363470024755, |
| "grad_norm": 0.717863142490387, |
| "learning_rate": 0.0002251890960025859, |
| "loss": 3.2631, |
| "step": 58100 |
| }, |
| { |
| "epoch": 6.258745022064363, |
| "grad_norm": 0.7755172848701477, |
| "learning_rate": 0.0002248658549725245, |
| "loss": 3.2734, |
| "step": 58150 |
| }, |
| { |
| "epoch": 6.264126574103972, |
| "grad_norm": 0.737862229347229, |
| "learning_rate": 0.00022454261394246308, |
| "loss": 3.2671, |
| "step": 58200 |
| }, |
| { |
| "epoch": 6.26950812614358, |
| "grad_norm": 0.7289722561836243, |
| "learning_rate": 0.00022421937291240164, |
| "loss": 3.2787, |
| "step": 58250 |
| }, |
| { |
| "epoch": 6.274889678183188, |
| "grad_norm": 0.737337052822113, |
| "learning_rate": 0.00022389613188234027, |
| "loss": 3.2799, |
| "step": 58300 |
| }, |
| { |
| "epoch": 6.280271230222796, |
| "grad_norm": 0.7309353351593018, |
| "learning_rate": 0.00022357289085227883, |
| "loss": 3.2639, |
| "step": 58350 |
| }, |
| { |
| "epoch": 6.285652782262405, |
| "grad_norm": 0.7572712302207947, |
| "learning_rate": 0.0002232496498222174, |
| "loss": 3.2858, |
| "step": 58400 |
| }, |
| { |
| "epoch": 6.2910343343020125, |
| "grad_norm": 0.7290258407592773, |
| "learning_rate": 0.000222926408792156, |
| "loss": 3.2798, |
| "step": 58450 |
| }, |
| { |
| "epoch": 6.296415886341621, |
| "grad_norm": 0.7298064827919006, |
| "learning_rate": 0.0002226031677620946, |
| "loss": 3.2858, |
| "step": 58500 |
| }, |
| { |
| "epoch": 6.301797438381229, |
| "grad_norm": 0.7554293274879456, |
| "learning_rate": 0.00022227992673203316, |
| "loss": 3.2744, |
| "step": 58550 |
| }, |
| { |
| "epoch": 6.307178990420837, |
| "grad_norm": 0.7588570713996887, |
| "learning_rate": 0.00022195668570197175, |
| "loss": 3.2767, |
| "step": 58600 |
| }, |
| { |
| "epoch": 6.312560542460446, |
| "grad_norm": 0.7721754312515259, |
| "learning_rate": 0.00022163344467191032, |
| "loss": 3.26, |
| "step": 58650 |
| }, |
| { |
| "epoch": 6.3179420945000535, |
| "grad_norm": 0.7513089179992676, |
| "learning_rate": 0.00022131020364184891, |
| "loss": 3.2749, |
| "step": 58700 |
| }, |
| { |
| "epoch": 6.323323646539662, |
| "grad_norm": 0.8285422921180725, |
| "learning_rate": 0.0002209869626117875, |
| "loss": 3.281, |
| "step": 58750 |
| }, |
| { |
| "epoch": 6.328705198579271, |
| "grad_norm": 0.7624416947364807, |
| "learning_rate": 0.00022066372158172608, |
| "loss": 3.2768, |
| "step": 58800 |
| }, |
| { |
| "epoch": 6.334086750618878, |
| "grad_norm": 0.7431687712669373, |
| "learning_rate": 0.0002203469453722659, |
| "loss": 3.2844, |
| "step": 58850 |
| }, |
| { |
| "epoch": 6.339468302658487, |
| "grad_norm": 0.7476500272750854, |
| "learning_rate": 0.00022002370434220448, |
| "loss": 3.2766, |
| "step": 58900 |
| }, |
| { |
| "epoch": 6.344849854698095, |
| "grad_norm": 0.7640836238861084, |
| "learning_rate": 0.00021970046331214307, |
| "loss": 3.2819, |
| "step": 58950 |
| }, |
| { |
| "epoch": 6.350231406737703, |
| "grad_norm": 0.8359128832817078, |
| "learning_rate": 0.00021937722228208167, |
| "loss": 3.2714, |
| "step": 59000 |
| }, |
| { |
| "epoch": 6.350231406737703, |
| "eval_accuracy": 0.38533931373314917, |
| "eval_loss": 3.3757359981536865, |
| "eval_runtime": 184.9919, |
| "eval_samples_per_second": 97.361, |
| "eval_steps_per_second": 6.087, |
| "step": 59000 |
| }, |
| { |
| "epoch": 6.355612958777312, |
| "grad_norm": 0.7550880312919617, |
| "learning_rate": 0.00021905398125202024, |
| "loss": 3.2664, |
| "step": 59050 |
| }, |
| { |
| "epoch": 6.360994510816919, |
| "grad_norm": 0.7995261549949646, |
| "learning_rate": 0.00021873074022195883, |
| "loss": 3.2856, |
| "step": 59100 |
| }, |
| { |
| "epoch": 6.366376062856528, |
| "grad_norm": 0.7748396992683411, |
| "learning_rate": 0.0002184074991918974, |
| "loss": 3.2692, |
| "step": 59150 |
| }, |
| { |
| "epoch": 6.371757614896136, |
| "grad_norm": 0.7827137112617493, |
| "learning_rate": 0.00021808425816183597, |
| "loss": 3.298, |
| "step": 59200 |
| }, |
| { |
| "epoch": 6.377139166935744, |
| "grad_norm": 0.7864527702331543, |
| "learning_rate": 0.0002177610171317746, |
| "loss": 3.2718, |
| "step": 59250 |
| }, |
| { |
| "epoch": 6.382520718975353, |
| "grad_norm": 0.739658772945404, |
| "learning_rate": 0.00021743777610171315, |
| "loss": 3.2979, |
| "step": 59300 |
| }, |
| { |
| "epoch": 6.387902271014961, |
| "grad_norm": 0.7770798206329346, |
| "learning_rate": 0.00021711453507165172, |
| "loss": 3.2887, |
| "step": 59350 |
| }, |
| { |
| "epoch": 6.393283823054569, |
| "grad_norm": 0.7680968046188354, |
| "learning_rate": 0.00021679129404159034, |
| "loss": 3.2854, |
| "step": 59400 |
| }, |
| { |
| "epoch": 6.398665375094177, |
| "grad_norm": 0.7551103830337524, |
| "learning_rate": 0.0002164680530115289, |
| "loss": 3.2648, |
| "step": 59450 |
| }, |
| { |
| "epoch": 6.404046927133785, |
| "grad_norm": 0.7820691466331482, |
| "learning_rate": 0.00021614481198146748, |
| "loss": 3.2598, |
| "step": 59500 |
| }, |
| { |
| "epoch": 6.4094284791733935, |
| "grad_norm": 0.7943058013916016, |
| "learning_rate": 0.00021582157095140607, |
| "loss": 3.2879, |
| "step": 59550 |
| }, |
| { |
| "epoch": 6.414810031213002, |
| "grad_norm": 0.7810237407684326, |
| "learning_rate": 0.00021549832992134467, |
| "loss": 3.2768, |
| "step": 59600 |
| }, |
| { |
| "epoch": 6.42019158325261, |
| "grad_norm": 0.7318100333213806, |
| "learning_rate": 0.00021517508889128324, |
| "loss": 3.3002, |
| "step": 59650 |
| }, |
| { |
| "epoch": 6.425573135292218, |
| "grad_norm": 0.7224478721618652, |
| "learning_rate": 0.00021485184786122183, |
| "loss": 3.2621, |
| "step": 59700 |
| }, |
| { |
| "epoch": 6.430954687331827, |
| "grad_norm": 0.7693201303482056, |
| "learning_rate": 0.0002145286068311604, |
| "loss": 3.2572, |
| "step": 59750 |
| }, |
| { |
| "epoch": 6.4363362393714345, |
| "grad_norm": 0.7094773650169373, |
| "learning_rate": 0.00021420536580109902, |
| "loss": 3.2818, |
| "step": 59800 |
| }, |
| { |
| "epoch": 6.441717791411043, |
| "grad_norm": 0.761139988899231, |
| "learning_rate": 0.0002138821247710376, |
| "loss": 3.2627, |
| "step": 59850 |
| }, |
| { |
| "epoch": 6.447099343450651, |
| "grad_norm": 0.7221077084541321, |
| "learning_rate": 0.00021355888374097615, |
| "loss": 3.2619, |
| "step": 59900 |
| }, |
| { |
| "epoch": 6.452480895490259, |
| "grad_norm": 0.7425982356071472, |
| "learning_rate": 0.00021323564271091478, |
| "loss": 3.297, |
| "step": 59950 |
| }, |
| { |
| "epoch": 6.457862447529868, |
| "grad_norm": 0.8136957287788391, |
| "learning_rate": 0.00021291240168085334, |
| "loss": 3.2827, |
| "step": 60000 |
| }, |
| { |
| "epoch": 6.457862447529868, |
| "eval_accuracy": 0.3858433544797205, |
| "eval_loss": 3.369577169418335, |
| "eval_runtime": 184.5983, |
| "eval_samples_per_second": 97.569, |
| "eval_steps_per_second": 6.1, |
| "step": 60000 |
| }, |
| { |
| "epoch": 6.4632439995694755, |
| "grad_norm": 0.7648407220840454, |
| "learning_rate": 0.0002125891606507919, |
| "loss": 3.2689, |
| "step": 60050 |
| }, |
| { |
| "epoch": 6.468625551609084, |
| "grad_norm": 0.7560406923294067, |
| "learning_rate": 0.0002122659196207305, |
| "loss": 3.2828, |
| "step": 60100 |
| }, |
| { |
| "epoch": 6.474007103648693, |
| "grad_norm": 0.7538021206855774, |
| "learning_rate": 0.0002119426785906691, |
| "loss": 3.2751, |
| "step": 60150 |
| }, |
| { |
| "epoch": 6.4793886556883, |
| "grad_norm": 0.796912670135498, |
| "learning_rate": 0.00021161943756060767, |
| "loss": 3.2776, |
| "step": 60200 |
| }, |
| { |
| "epoch": 6.484770207727909, |
| "grad_norm": 0.7616314888000488, |
| "learning_rate": 0.00021129619653054626, |
| "loss": 3.2964, |
| "step": 60250 |
| }, |
| { |
| "epoch": 6.490151759767517, |
| "grad_norm": 0.7025566101074219, |
| "learning_rate": 0.00021097295550048483, |
| "loss": 3.2876, |
| "step": 60300 |
| }, |
| { |
| "epoch": 6.495533311807125, |
| "grad_norm": 0.775004506111145, |
| "learning_rate": 0.00021064971447042343, |
| "loss": 3.2618, |
| "step": 60350 |
| }, |
| { |
| "epoch": 6.500914863846734, |
| "grad_norm": 0.7972927689552307, |
| "learning_rate": 0.00021032647344036202, |
| "loss": 3.2837, |
| "step": 60400 |
| }, |
| { |
| "epoch": 6.506296415886341, |
| "grad_norm": 0.7656959295272827, |
| "learning_rate": 0.0002100032324103006, |
| "loss": 3.2853, |
| "step": 60450 |
| }, |
| { |
| "epoch": 6.51167796792595, |
| "grad_norm": 0.7854245901107788, |
| "learning_rate": 0.00020967999138023916, |
| "loss": 3.2795, |
| "step": 60500 |
| }, |
| { |
| "epoch": 6.517059519965558, |
| "grad_norm": 0.7886183857917786, |
| "learning_rate": 0.00020935675035017778, |
| "loss": 3.2849, |
| "step": 60550 |
| }, |
| { |
| "epoch": 6.522441072005166, |
| "grad_norm": 0.8237834572792053, |
| "learning_rate": 0.00020903350932011634, |
| "loss": 3.285, |
| "step": 60600 |
| }, |
| { |
| "epoch": 6.5278226240447745, |
| "grad_norm": 0.7524319291114807, |
| "learning_rate": 0.0002087102682900549, |
| "loss": 3.2721, |
| "step": 60650 |
| }, |
| { |
| "epoch": 6.533204176084383, |
| "grad_norm": 0.7918885946273804, |
| "learning_rate": 0.00020838702725999353, |
| "loss": 3.2733, |
| "step": 60700 |
| }, |
| { |
| "epoch": 6.538585728123991, |
| "grad_norm": 0.742572546005249, |
| "learning_rate": 0.0002080637862299321, |
| "loss": 3.2713, |
| "step": 60750 |
| }, |
| { |
| "epoch": 6.543967280163599, |
| "grad_norm": 0.7656800150871277, |
| "learning_rate": 0.0002077405451998707, |
| "loss": 3.2906, |
| "step": 60800 |
| }, |
| { |
| "epoch": 6.549348832203208, |
| "grad_norm": 0.7438012361526489, |
| "learning_rate": 0.00020741730416980926, |
| "loss": 3.2745, |
| "step": 60850 |
| }, |
| { |
| "epoch": 6.5547303842428155, |
| "grad_norm": 0.793135404586792, |
| "learning_rate": 0.00020709406313974786, |
| "loss": 3.2778, |
| "step": 60900 |
| }, |
| { |
| "epoch": 6.560111936282424, |
| "grad_norm": 0.7506826519966125, |
| "learning_rate": 0.00020677082210968645, |
| "loss": 3.2682, |
| "step": 60950 |
| }, |
| { |
| "epoch": 6.565493488322032, |
| "grad_norm": 0.7645418047904968, |
| "learning_rate": 0.00020644758107962502, |
| "loss": 3.2671, |
| "step": 61000 |
| }, |
| { |
| "epoch": 6.565493488322032, |
| "eval_accuracy": 0.386022305785951, |
| "eval_loss": 3.368492603302002, |
| "eval_runtime": 185.0491, |
| "eval_samples_per_second": 97.331, |
| "eval_steps_per_second": 6.085, |
| "step": 61000 |
| }, |
| { |
| "epoch": 6.57087504036164, |
| "grad_norm": 0.7325201630592346, |
| "learning_rate": 0.0002061243400495636, |
| "loss": 3.2796, |
| "step": 61050 |
| }, |
| { |
| "epoch": 6.576256592401249, |
| "grad_norm": 0.7918336987495422, |
| "learning_rate": 0.0002058010990195022, |
| "loss": 3.2781, |
| "step": 61100 |
| }, |
| { |
| "epoch": 6.5816381444408565, |
| "grad_norm": 0.7214813232421875, |
| "learning_rate": 0.00020547785798944078, |
| "loss": 3.2881, |
| "step": 61150 |
| }, |
| { |
| "epoch": 6.587019696480465, |
| "grad_norm": 0.7659515738487244, |
| "learning_rate": 0.00020515461695937934, |
| "loss": 3.2731, |
| "step": 61200 |
| }, |
| { |
| "epoch": 6.592401248520073, |
| "grad_norm": 0.7384321093559265, |
| "learning_rate": 0.00020483137592931797, |
| "loss": 3.2826, |
| "step": 61250 |
| }, |
| { |
| "epoch": 6.597782800559681, |
| "grad_norm": 0.7732715010643005, |
| "learning_rate": 0.00020450813489925653, |
| "loss": 3.2796, |
| "step": 61300 |
| }, |
| { |
| "epoch": 6.60316435259929, |
| "grad_norm": 0.7654476165771484, |
| "learning_rate": 0.0002041848938691951, |
| "loss": 3.2582, |
| "step": 61350 |
| }, |
| { |
| "epoch": 6.608545904638898, |
| "grad_norm": 0.7931333184242249, |
| "learning_rate": 0.0002038616528391337, |
| "loss": 3.2684, |
| "step": 61400 |
| }, |
| { |
| "epoch": 6.613927456678506, |
| "grad_norm": 0.7446388602256775, |
| "learning_rate": 0.00020353841180907226, |
| "loss": 3.2826, |
| "step": 61450 |
| }, |
| { |
| "epoch": 6.619309008718115, |
| "grad_norm": 0.7642561197280884, |
| "learning_rate": 0.00020321517077901086, |
| "loss": 3.2918, |
| "step": 61500 |
| }, |
| { |
| "epoch": 6.624690560757722, |
| "grad_norm": 0.8336501717567444, |
| "learning_rate": 0.00020289192974894945, |
| "loss": 3.2695, |
| "step": 61550 |
| }, |
| { |
| "epoch": 6.630072112797331, |
| "grad_norm": 0.7792569994926453, |
| "learning_rate": 0.00020256868871888802, |
| "loss": 3.285, |
| "step": 61600 |
| }, |
| { |
| "epoch": 6.635453664836939, |
| "grad_norm": 0.8366368412971497, |
| "learning_rate": 0.00020224544768882664, |
| "loss": 3.2801, |
| "step": 61650 |
| }, |
| { |
| "epoch": 6.640835216876547, |
| "grad_norm": 0.7799500226974487, |
| "learning_rate": 0.0002019222066587652, |
| "loss": 3.2867, |
| "step": 61700 |
| }, |
| { |
| "epoch": 6.6462167689161555, |
| "grad_norm": 0.8103165626525879, |
| "learning_rate": 0.00020159896562870378, |
| "loss": 3.2709, |
| "step": 61750 |
| }, |
| { |
| "epoch": 6.651598320955763, |
| "grad_norm": 0.7403290867805481, |
| "learning_rate": 0.00020127572459864237, |
| "loss": 3.2901, |
| "step": 61800 |
| }, |
| { |
| "epoch": 6.656979872995372, |
| "grad_norm": 0.7185531854629517, |
| "learning_rate": 0.00020095248356858097, |
| "loss": 3.2843, |
| "step": 61850 |
| }, |
| { |
| "epoch": 6.66236142503498, |
| "grad_norm": 0.7514901161193848, |
| "learning_rate": 0.00020062924253851953, |
| "loss": 3.2866, |
| "step": 61900 |
| }, |
| { |
| "epoch": 6.667742977074588, |
| "grad_norm": 0.7954921126365662, |
| "learning_rate": 0.00020030600150845813, |
| "loss": 3.2607, |
| "step": 61950 |
| }, |
| { |
| "epoch": 6.6731245291141965, |
| "grad_norm": 0.7863759398460388, |
| "learning_rate": 0.0001999827604783967, |
| "loss": 3.3058, |
| "step": 62000 |
| }, |
| { |
| "epoch": 6.6731245291141965, |
| "eval_accuracy": 0.3863287069295878, |
| "eval_loss": 3.3633809089660645, |
| "eval_runtime": 184.8165, |
| "eval_samples_per_second": 97.453, |
| "eval_steps_per_second": 6.093, |
| "step": 62000 |
| }, |
| { |
| "epoch": 6.678506081153805, |
| "grad_norm": 0.7923064827919006, |
| "learning_rate": 0.0001996595194483353, |
| "loss": 3.3203, |
| "step": 62050 |
| }, |
| { |
| "epoch": 6.683887633193413, |
| "grad_norm": 0.7636295557022095, |
| "learning_rate": 0.00019933627841827389, |
| "loss": 3.297, |
| "step": 62100 |
| }, |
| { |
| "epoch": 6.689269185233021, |
| "grad_norm": 0.740822970867157, |
| "learning_rate": 0.00019901303738821245, |
| "loss": 3.2925, |
| "step": 62150 |
| }, |
| { |
| "epoch": 6.69465073727263, |
| "grad_norm": 0.7824040651321411, |
| "learning_rate": 0.00019868979635815102, |
| "loss": 3.2845, |
| "step": 62200 |
| }, |
| { |
| "epoch": 6.7000322893122375, |
| "grad_norm": 0.7348474860191345, |
| "learning_rate": 0.00019836655532808964, |
| "loss": 3.2755, |
| "step": 62250 |
| }, |
| { |
| "epoch": 6.705413841351846, |
| "grad_norm": 0.8276709318161011, |
| "learning_rate": 0.0001980433142980282, |
| "loss": 3.302, |
| "step": 62300 |
| }, |
| { |
| "epoch": 6.710795393391454, |
| "grad_norm": 0.7462751865386963, |
| "learning_rate": 0.00019772007326796678, |
| "loss": 3.2734, |
| "step": 62350 |
| }, |
| { |
| "epoch": 6.716176945431062, |
| "grad_norm": 0.8004351258277893, |
| "learning_rate": 0.0001973968322379054, |
| "loss": 3.278, |
| "step": 62400 |
| }, |
| { |
| "epoch": 6.721558497470671, |
| "grad_norm": 0.7743335366249084, |
| "learning_rate": 0.00019707359120784397, |
| "loss": 3.2841, |
| "step": 62450 |
| }, |
| { |
| "epoch": 6.7269400495102785, |
| "grad_norm": 0.7511734366416931, |
| "learning_rate": 0.00019675035017778253, |
| "loss": 3.277, |
| "step": 62500 |
| }, |
| { |
| "epoch": 6.732321601549887, |
| "grad_norm": 0.7555059194564819, |
| "learning_rate": 0.00019642710914772113, |
| "loss": 3.2914, |
| "step": 62550 |
| }, |
| { |
| "epoch": 6.737703153589496, |
| "grad_norm": 0.7607969641685486, |
| "learning_rate": 0.00019610386811765972, |
| "loss": 3.2923, |
| "step": 62600 |
| }, |
| { |
| "epoch": 6.743084705629103, |
| "grad_norm": 0.7745279669761658, |
| "learning_rate": 0.00019578062708759832, |
| "loss": 3.2707, |
| "step": 62650 |
| }, |
| { |
| "epoch": 6.748466257668712, |
| "grad_norm": 0.7459644675254822, |
| "learning_rate": 0.00019545738605753689, |
| "loss": 3.2656, |
| "step": 62700 |
| }, |
| { |
| "epoch": 6.75384780970832, |
| "grad_norm": 0.7880685329437256, |
| "learning_rate": 0.00019513414502747545, |
| "loss": 3.271, |
| "step": 62750 |
| }, |
| { |
| "epoch": 6.759229361747928, |
| "grad_norm": 0.7434306740760803, |
| "learning_rate": 0.00019481090399741408, |
| "loss": 3.2812, |
| "step": 62800 |
| }, |
| { |
| "epoch": 6.7646109137875365, |
| "grad_norm": 0.8286969065666199, |
| "learning_rate": 0.0001945005926085551, |
| "loss": 3.2948, |
| "step": 62850 |
| }, |
| { |
| "epoch": 6.769992465827144, |
| "grad_norm": 0.7807492613792419, |
| "learning_rate": 0.0001941773515784937, |
| "loss": 3.296, |
| "step": 62900 |
| }, |
| { |
| "epoch": 6.775374017866753, |
| "grad_norm": 0.8009621500968933, |
| "learning_rate": 0.00019385411054843226, |
| "loss": 3.2822, |
| "step": 62950 |
| }, |
| { |
| "epoch": 6.780755569906361, |
| "grad_norm": 0.7488981485366821, |
| "learning_rate": 0.00019353086951837085, |
| "loss": 3.2775, |
| "step": 63000 |
| }, |
| { |
| "epoch": 6.780755569906361, |
| "eval_accuracy": 0.3868306832712907, |
| "eval_loss": 3.3595683574676514, |
| "eval_runtime": 184.9791, |
| "eval_samples_per_second": 97.368, |
| "eval_steps_per_second": 6.087, |
| "step": 63000 |
| }, |
| { |
| "epoch": 6.786137121945969, |
| "grad_norm": 0.7897948026657104, |
| "learning_rate": 0.00019320762848830942, |
| "loss": 3.277, |
| "step": 63050 |
| }, |
| { |
| "epoch": 6.7915186739855775, |
| "grad_norm": 0.7589372992515564, |
| "learning_rate": 0.000192884387458248, |
| "loss": 3.2875, |
| "step": 63100 |
| }, |
| { |
| "epoch": 6.796900226025185, |
| "grad_norm": 0.7196517586708069, |
| "learning_rate": 0.0001925611464281866, |
| "loss": 3.2844, |
| "step": 63150 |
| }, |
| { |
| "epoch": 6.802281778064794, |
| "grad_norm": 0.7708859443664551, |
| "learning_rate": 0.00019223790539812518, |
| "loss": 3.2796, |
| "step": 63200 |
| }, |
| { |
| "epoch": 6.807663330104402, |
| "grad_norm": 0.7655634880065918, |
| "learning_rate": 0.00019191466436806374, |
| "loss": 3.2928, |
| "step": 63250 |
| }, |
| { |
| "epoch": 6.813044882144011, |
| "grad_norm": 0.8020054697990417, |
| "learning_rate": 0.00019159142333800236, |
| "loss": 3.259, |
| "step": 63300 |
| }, |
| { |
| "epoch": 6.8184264341836185, |
| "grad_norm": 0.7741644382476807, |
| "learning_rate": 0.00019126818230794093, |
| "loss": 3.2918, |
| "step": 63350 |
| }, |
| { |
| "epoch": 6.823807986223227, |
| "grad_norm": 0.7251269221305847, |
| "learning_rate": 0.00019094494127787953, |
| "loss": 3.2822, |
| "step": 63400 |
| }, |
| { |
| "epoch": 6.829189538262835, |
| "grad_norm": 0.7983183860778809, |
| "learning_rate": 0.00019062170024781812, |
| "loss": 3.3032, |
| "step": 63450 |
| }, |
| { |
| "epoch": 6.834571090302443, |
| "grad_norm": 0.7949510216712952, |
| "learning_rate": 0.0001902984592177567, |
| "loss": 3.284, |
| "step": 63500 |
| }, |
| { |
| "epoch": 6.839952642342052, |
| "grad_norm": 0.788966715335846, |
| "learning_rate": 0.00018997521818769528, |
| "loss": 3.2741, |
| "step": 63550 |
| }, |
| { |
| "epoch": 6.8453341943816595, |
| "grad_norm": 0.7868644595146179, |
| "learning_rate": 0.00018965197715763385, |
| "loss": 3.2781, |
| "step": 63600 |
| }, |
| { |
| "epoch": 6.850715746421268, |
| "grad_norm": 0.7467278242111206, |
| "learning_rate": 0.00018932873612757245, |
| "loss": 3.2761, |
| "step": 63650 |
| }, |
| { |
| "epoch": 6.856097298460876, |
| "grad_norm": 0.7820574641227722, |
| "learning_rate": 0.00018900549509751104, |
| "loss": 3.2763, |
| "step": 63700 |
| }, |
| { |
| "epoch": 6.861478850500484, |
| "grad_norm": 0.7491395473480225, |
| "learning_rate": 0.0001886822540674496, |
| "loss": 3.2862, |
| "step": 63750 |
| }, |
| { |
| "epoch": 6.866860402540093, |
| "grad_norm": 0.8236872553825378, |
| "learning_rate": 0.00018835901303738818, |
| "loss": 3.2784, |
| "step": 63800 |
| }, |
| { |
| "epoch": 6.8722419545797, |
| "grad_norm": 0.7589198350906372, |
| "learning_rate": 0.0001880357720073268, |
| "loss": 3.2835, |
| "step": 63850 |
| }, |
| { |
| "epoch": 6.877623506619309, |
| "grad_norm": 0.7768234014511108, |
| "learning_rate": 0.00018771253097726537, |
| "loss": 3.2868, |
| "step": 63900 |
| }, |
| { |
| "epoch": 6.8830050586589175, |
| "grad_norm": 0.7730334997177124, |
| "learning_rate": 0.00018738928994720393, |
| "loss": 3.292, |
| "step": 63950 |
| }, |
| { |
| "epoch": 6.888386610698525, |
| "grad_norm": 0.7769656181335449, |
| "learning_rate": 0.00018706604891714255, |
| "loss": 3.2919, |
| "step": 64000 |
| }, |
| { |
| "epoch": 6.888386610698525, |
| "eval_accuracy": 0.3873528690501271, |
| "eval_loss": 3.354077100753784, |
| "eval_runtime": 185.2307, |
| "eval_samples_per_second": 97.236, |
| "eval_steps_per_second": 6.079, |
| "step": 64000 |
| }, |
| { |
| "epoch": 6.893768162738134, |
| "grad_norm": 0.8213059902191162, |
| "learning_rate": 0.00018674280788708112, |
| "loss": 3.2788, |
| "step": 64050 |
| }, |
| { |
| "epoch": 6.899149714777742, |
| "grad_norm": 0.7441644668579102, |
| "learning_rate": 0.0001864195668570197, |
| "loss": 3.2803, |
| "step": 64100 |
| }, |
| { |
| "epoch": 6.90453126681735, |
| "grad_norm": 0.7904473543167114, |
| "learning_rate": 0.00018609632582695828, |
| "loss": 3.2874, |
| "step": 64150 |
| }, |
| { |
| "epoch": 6.9099128188569585, |
| "grad_norm": 0.7749623656272888, |
| "learning_rate": 0.00018577308479689685, |
| "loss": 3.2675, |
| "step": 64200 |
| }, |
| { |
| "epoch": 6.915294370896566, |
| "grad_norm": 0.7651856541633606, |
| "learning_rate": 0.00018544984376683545, |
| "loss": 3.2799, |
| "step": 64250 |
| }, |
| { |
| "epoch": 6.920675922936175, |
| "grad_norm": 0.8032562136650085, |
| "learning_rate": 0.00018512660273677404, |
| "loss": 3.2993, |
| "step": 64300 |
| }, |
| { |
| "epoch": 6.926057474975783, |
| "grad_norm": 0.7830508351325989, |
| "learning_rate": 0.0001848033617067126, |
| "loss": 3.2769, |
| "step": 64350 |
| }, |
| { |
| "epoch": 6.931439027015391, |
| "grad_norm": 0.778607964515686, |
| "learning_rate": 0.00018448012067665123, |
| "loss": 3.2785, |
| "step": 64400 |
| }, |
| { |
| "epoch": 6.9368205790549995, |
| "grad_norm": 0.7424008846282959, |
| "learning_rate": 0.0001841568796465898, |
| "loss": 3.2746, |
| "step": 64450 |
| }, |
| { |
| "epoch": 6.942202131094608, |
| "grad_norm": 0.7957499027252197, |
| "learning_rate": 0.00018383363861652837, |
| "loss": 3.2933, |
| "step": 64500 |
| }, |
| { |
| "epoch": 6.947583683134216, |
| "grad_norm": 0.8147248029708862, |
| "learning_rate": 0.00018351039758646696, |
| "loss": 3.273, |
| "step": 64550 |
| }, |
| { |
| "epoch": 6.952965235173824, |
| "grad_norm": 0.7838407158851624, |
| "learning_rate": 0.00018318715655640555, |
| "loss": 3.2845, |
| "step": 64600 |
| }, |
| { |
| "epoch": 6.958346787213433, |
| "grad_norm": 0.7947826385498047, |
| "learning_rate": 0.00018286391552634412, |
| "loss": 3.299, |
| "step": 64650 |
| }, |
| { |
| "epoch": 6.9637283392530405, |
| "grad_norm": 0.8972591161727905, |
| "learning_rate": 0.00018254067449628272, |
| "loss": 3.3021, |
| "step": 64700 |
| }, |
| { |
| "epoch": 6.969109891292649, |
| "grad_norm": 0.7764172554016113, |
| "learning_rate": 0.00018221743346622128, |
| "loss": 3.2696, |
| "step": 64750 |
| }, |
| { |
| "epoch": 6.974491443332257, |
| "grad_norm": 0.8290959000587463, |
| "learning_rate": 0.00018189419243615988, |
| "loss": 3.2747, |
| "step": 64800 |
| }, |
| { |
| "epoch": 6.979872995371865, |
| "grad_norm": 0.751587450504303, |
| "learning_rate": 0.00018157095140609847, |
| "loss": 3.2917, |
| "step": 64850 |
| }, |
| { |
| "epoch": 6.985254547411474, |
| "grad_norm": 0.7895004153251648, |
| "learning_rate": 0.00018124771037603704, |
| "loss": 3.2852, |
| "step": 64900 |
| }, |
| { |
| "epoch": 6.990636099451081, |
| "grad_norm": 0.7497310638427734, |
| "learning_rate": 0.0001809244693459756, |
| "loss": 3.3, |
| "step": 64950 |
| }, |
| { |
| "epoch": 6.99601765149069, |
| "grad_norm": 0.763469398021698, |
| "learning_rate": 0.00018060122831591423, |
| "loss": 3.2603, |
| "step": 65000 |
| }, |
| { |
| "epoch": 6.99601765149069, |
| "eval_accuracy": 0.38770675150573886, |
| "eval_loss": 3.351874828338623, |
| "eval_runtime": 184.7786, |
| "eval_samples_per_second": 97.473, |
| "eval_steps_per_second": 6.094, |
| "step": 65000 |
| }, |
| { |
| "epoch": 7.0013992035302985, |
| "grad_norm": 0.7597163915634155, |
| "learning_rate": 0.0001802779872858528, |
| "loss": 3.2679, |
| "step": 65050 |
| }, |
| { |
| "epoch": 7.006780755569906, |
| "grad_norm": 0.7525449395179749, |
| "learning_rate": 0.00017995474625579137, |
| "loss": 3.2098, |
| "step": 65100 |
| }, |
| { |
| "epoch": 7.012162307609515, |
| "grad_norm": 0.8813150525093079, |
| "learning_rate": 0.00017963150522573, |
| "loss": 3.1881, |
| "step": 65150 |
| }, |
| { |
| "epoch": 7.017543859649122, |
| "grad_norm": 0.787375271320343, |
| "learning_rate": 0.0001793147290162698, |
| "loss": 3.1957, |
| "step": 65200 |
| }, |
| { |
| "epoch": 7.022925411688731, |
| "grad_norm": 0.7959186434745789, |
| "learning_rate": 0.00017899148798620836, |
| "loss": 3.2153, |
| "step": 65250 |
| }, |
| { |
| "epoch": 7.0283069637283395, |
| "grad_norm": 0.7243605256080627, |
| "learning_rate": 0.00017866824695614696, |
| "loss": 3.1987, |
| "step": 65300 |
| }, |
| { |
| "epoch": 7.033688515767947, |
| "grad_norm": 0.7697280049324036, |
| "learning_rate": 0.00017834500592608555, |
| "loss": 3.2116, |
| "step": 65350 |
| }, |
| { |
| "epoch": 7.039070067807556, |
| "grad_norm": 0.781091034412384, |
| "learning_rate": 0.00017802176489602412, |
| "loss": 3.192, |
| "step": 65400 |
| }, |
| { |
| "epoch": 7.044451619847164, |
| "grad_norm": 0.8070541620254517, |
| "learning_rate": 0.00017769852386596269, |
| "loss": 3.2159, |
| "step": 65450 |
| }, |
| { |
| "epoch": 7.049833171886772, |
| "grad_norm": 0.7928311228752136, |
| "learning_rate": 0.0001773752828359013, |
| "loss": 3.2199, |
| "step": 65500 |
| }, |
| { |
| "epoch": 7.0552147239263805, |
| "grad_norm": 0.8367716670036316, |
| "learning_rate": 0.00017705204180583988, |
| "loss": 3.1962, |
| "step": 65550 |
| }, |
| { |
| "epoch": 7.060596275965988, |
| "grad_norm": 0.8304590582847595, |
| "learning_rate": 0.00017672880077577844, |
| "loss": 3.2124, |
| "step": 65600 |
| }, |
| { |
| "epoch": 7.065977828005597, |
| "grad_norm": 0.789740800857544, |
| "learning_rate": 0.00017640555974571704, |
| "loss": 3.1983, |
| "step": 65650 |
| }, |
| { |
| "epoch": 7.071359380045205, |
| "grad_norm": 0.7952477335929871, |
| "learning_rate": 0.00017608231871565563, |
| "loss": 3.2149, |
| "step": 65700 |
| }, |
| { |
| "epoch": 7.076740932084813, |
| "grad_norm": 0.8178505897521973, |
| "learning_rate": 0.0001757590776855942, |
| "loss": 3.2005, |
| "step": 65750 |
| }, |
| { |
| "epoch": 7.0821224841244215, |
| "grad_norm": 0.8267757892608643, |
| "learning_rate": 0.0001754358366555328, |
| "loss": 3.2145, |
| "step": 65800 |
| }, |
| { |
| "epoch": 7.08750403616403, |
| "grad_norm": 0.7804736495018005, |
| "learning_rate": 0.00017511259562547136, |
| "loss": 3.204, |
| "step": 65850 |
| }, |
| { |
| "epoch": 7.092885588203638, |
| "grad_norm": 0.8235890865325928, |
| "learning_rate": 0.00017478935459540996, |
| "loss": 3.2238, |
| "step": 65900 |
| }, |
| { |
| "epoch": 7.098267140243246, |
| "grad_norm": 0.8137307167053223, |
| "learning_rate": 0.00017446611356534855, |
| "loss": 3.2068, |
| "step": 65950 |
| }, |
| { |
| "epoch": 7.103648692282855, |
| "grad_norm": 0.7665497660636902, |
| "learning_rate": 0.00017414287253528712, |
| "loss": 3.1971, |
| "step": 66000 |
| }, |
| { |
| "epoch": 7.103648692282855, |
| "eval_accuracy": 0.3873983946101214, |
| "eval_loss": 3.3571224212646484, |
| "eval_runtime": 185.0933, |
| "eval_samples_per_second": 97.308, |
| "eval_steps_per_second": 6.083, |
| "step": 66000 |
| }, |
| { |
| "epoch": 7.109030244322462, |
| "grad_norm": 0.8523070812225342, |
| "learning_rate": 0.00017381963150522569, |
| "loss": 3.2132, |
| "step": 66050 |
| }, |
| { |
| "epoch": 7.114411796362071, |
| "grad_norm": 0.8501766920089722, |
| "learning_rate": 0.0001734963904751643, |
| "loss": 3.2196, |
| "step": 66100 |
| }, |
| { |
| "epoch": 7.119793348401679, |
| "grad_norm": 0.7479291558265686, |
| "learning_rate": 0.00017317314944510288, |
| "loss": 3.2021, |
| "step": 66150 |
| }, |
| { |
| "epoch": 7.125174900441287, |
| "grad_norm": 0.8581606149673462, |
| "learning_rate": 0.00017284990841504147, |
| "loss": 3.238, |
| "step": 66200 |
| }, |
| { |
| "epoch": 7.130556452480896, |
| "grad_norm": 0.8038374185562134, |
| "learning_rate": 0.00017252666738498007, |
| "loss": 3.2067, |
| "step": 66250 |
| }, |
| { |
| "epoch": 7.135938004520503, |
| "grad_norm": 0.7955670356750488, |
| "learning_rate": 0.00017220342635491863, |
| "loss": 3.2062, |
| "step": 66300 |
| }, |
| { |
| "epoch": 7.141319556560112, |
| "grad_norm": 0.8133190274238586, |
| "learning_rate": 0.00017188018532485723, |
| "loss": 3.2189, |
| "step": 66350 |
| }, |
| { |
| "epoch": 7.1467011085997205, |
| "grad_norm": 0.7808161377906799, |
| "learning_rate": 0.0001715569442947958, |
| "loss": 3.2181, |
| "step": 66400 |
| }, |
| { |
| "epoch": 7.152082660639328, |
| "grad_norm": 0.7812925577163696, |
| "learning_rate": 0.0001712337032647344, |
| "loss": 3.21, |
| "step": 66450 |
| }, |
| { |
| "epoch": 7.157464212678937, |
| "grad_norm": 0.7803530097007751, |
| "learning_rate": 0.00017091046223467298, |
| "loss": 3.221, |
| "step": 66500 |
| }, |
| { |
| "epoch": 7.162845764718545, |
| "grad_norm": 0.8198121786117554, |
| "learning_rate": 0.00017058722120461155, |
| "loss": 3.2165, |
| "step": 66550 |
| }, |
| { |
| "epoch": 7.168227316758153, |
| "grad_norm": 0.777133047580719, |
| "learning_rate": 0.00017026398017455012, |
| "loss": 3.2101, |
| "step": 66600 |
| }, |
| { |
| "epoch": 7.1736088687977615, |
| "grad_norm": 0.7378299236297607, |
| "learning_rate": 0.00016994073914448874, |
| "loss": 3.2223, |
| "step": 66650 |
| }, |
| { |
| "epoch": 7.178990420837369, |
| "grad_norm": 0.8564599752426147, |
| "learning_rate": 0.0001696174981144273, |
| "loss": 3.2103, |
| "step": 66700 |
| }, |
| { |
| "epoch": 7.184371972876978, |
| "grad_norm": 0.8352597951889038, |
| "learning_rate": 0.00016929425708436588, |
| "loss": 3.2395, |
| "step": 66750 |
| }, |
| { |
| "epoch": 7.189753524916586, |
| "grad_norm": 0.7732496857643127, |
| "learning_rate": 0.0001689710160543045, |
| "loss": 3.209, |
| "step": 66800 |
| }, |
| { |
| "epoch": 7.195135076956194, |
| "grad_norm": 0.7928017377853394, |
| "learning_rate": 0.00016864777502424307, |
| "loss": 3.208, |
| "step": 66850 |
| }, |
| { |
| "epoch": 7.2005166289958025, |
| "grad_norm": 0.8274809122085571, |
| "learning_rate": 0.00016832453399418163, |
| "loss": 3.2147, |
| "step": 66900 |
| }, |
| { |
| "epoch": 7.205898181035411, |
| "grad_norm": 0.812453031539917, |
| "learning_rate": 0.00016800129296412023, |
| "loss": 3.2238, |
| "step": 66950 |
| }, |
| { |
| "epoch": 7.211279733075019, |
| "grad_norm": 0.7714659571647644, |
| "learning_rate": 0.00016767805193405882, |
| "loss": 3.2284, |
| "step": 67000 |
| }, |
| { |
| "epoch": 7.211279733075019, |
| "eval_accuracy": 0.38780638620386476, |
| "eval_loss": 3.356039047241211, |
| "eval_runtime": 185.0026, |
| "eval_samples_per_second": 97.355, |
| "eval_steps_per_second": 6.086, |
| "step": 67000 |
| }, |
| { |
| "epoch": 7.216661285114627, |
| "grad_norm": 0.7873409390449524, |
| "learning_rate": 0.0001673548109039974, |
| "loss": 3.2216, |
| "step": 67050 |
| }, |
| { |
| "epoch": 7.222042837154235, |
| "grad_norm": 0.8289926052093506, |
| "learning_rate": 0.00016703156987393598, |
| "loss": 3.2315, |
| "step": 67100 |
| }, |
| { |
| "epoch": 7.2274243891938434, |
| "grad_norm": 0.7620259523391724, |
| "learning_rate": 0.00016670832884387455, |
| "loss": 3.2078, |
| "step": 67150 |
| }, |
| { |
| "epoch": 7.232805941233452, |
| "grad_norm": 0.8179631233215332, |
| "learning_rate": 0.00016638508781381317, |
| "loss": 3.2387, |
| "step": 67200 |
| }, |
| { |
| "epoch": 7.23818749327306, |
| "grad_norm": 0.8092691898345947, |
| "learning_rate": 0.00016606184678375174, |
| "loss": 3.2218, |
| "step": 67250 |
| }, |
| { |
| "epoch": 7.243569045312668, |
| "grad_norm": 0.8265263438224792, |
| "learning_rate": 0.0001657386057536903, |
| "loss": 3.2141, |
| "step": 67300 |
| }, |
| { |
| "epoch": 7.248950597352277, |
| "grad_norm": 0.7724214196205139, |
| "learning_rate": 0.00016541536472362893, |
| "loss": 3.2062, |
| "step": 67350 |
| }, |
| { |
| "epoch": 7.254332149391884, |
| "grad_norm": 0.8382948040962219, |
| "learning_rate": 0.0001650921236935675, |
| "loss": 3.2272, |
| "step": 67400 |
| }, |
| { |
| "epoch": 7.259713701431493, |
| "grad_norm": 0.7633907794952393, |
| "learning_rate": 0.00016476888266350607, |
| "loss": 3.2307, |
| "step": 67450 |
| }, |
| { |
| "epoch": 7.265095253471101, |
| "grad_norm": 0.8241746425628662, |
| "learning_rate": 0.00016444564163344466, |
| "loss": 3.2082, |
| "step": 67500 |
| }, |
| { |
| "epoch": 7.270476805510709, |
| "grad_norm": 0.8180881142616272, |
| "learning_rate": 0.00016412240060338326, |
| "loss": 3.2256, |
| "step": 67550 |
| }, |
| { |
| "epoch": 7.275858357550318, |
| "grad_norm": 0.8148139119148254, |
| "learning_rate": 0.00016379915957332182, |
| "loss": 3.2394, |
| "step": 67600 |
| }, |
| { |
| "epoch": 7.281239909589925, |
| "grad_norm": 0.8224004507064819, |
| "learning_rate": 0.00016348238336386163, |
| "loss": 3.2298, |
| "step": 67650 |
| }, |
| { |
| "epoch": 7.286621461629534, |
| "grad_norm": 0.7899991273880005, |
| "learning_rate": 0.0001631591423338002, |
| "loss": 3.2128, |
| "step": 67700 |
| }, |
| { |
| "epoch": 7.2920030136691425, |
| "grad_norm": 0.8135297298431396, |
| "learning_rate": 0.00016283590130373882, |
| "loss": 3.2192, |
| "step": 67750 |
| }, |
| { |
| "epoch": 7.29738456570875, |
| "grad_norm": 0.8687593340873718, |
| "learning_rate": 0.00016251266027367739, |
| "loss": 3.2169, |
| "step": 67800 |
| }, |
| { |
| "epoch": 7.302766117748359, |
| "grad_norm": 0.8100031018257141, |
| "learning_rate": 0.00016218941924361595, |
| "loss": 3.2052, |
| "step": 67850 |
| }, |
| { |
| "epoch": 7.308147669787967, |
| "grad_norm": 0.7895699739456177, |
| "learning_rate": 0.00016186617821355458, |
| "loss": 3.2334, |
| "step": 67900 |
| }, |
| { |
| "epoch": 7.313529221827575, |
| "grad_norm": 0.7895455956459045, |
| "learning_rate": 0.00016154293718349314, |
| "loss": 3.2231, |
| "step": 67950 |
| }, |
| { |
| "epoch": 7.3189107738671835, |
| "grad_norm": 0.8295309543609619, |
| "learning_rate": 0.00016121969615343174, |
| "loss": 3.2006, |
| "step": 68000 |
| }, |
| { |
| "epoch": 7.3189107738671835, |
| "eval_accuracy": 0.3882477342341672, |
| "eval_loss": 3.351390838623047, |
| "eval_runtime": 184.5883, |
| "eval_samples_per_second": 97.574, |
| "eval_steps_per_second": 6.1, |
| "step": 68000 |
| }, |
| { |
| "epoch": 7.324292325906791, |
| "grad_norm": 0.8855544328689575, |
| "learning_rate": 0.0001608964551233703, |
| "loss": 3.214, |
| "step": 68050 |
| }, |
| { |
| "epoch": 7.3296738779464, |
| "grad_norm": 0.8429924249649048, |
| "learning_rate": 0.0001605732140933089, |
| "loss": 3.2306, |
| "step": 68100 |
| }, |
| { |
| "epoch": 7.335055429986008, |
| "grad_norm": 0.8152256011962891, |
| "learning_rate": 0.0001602499730632475, |
| "loss": 3.2301, |
| "step": 68150 |
| }, |
| { |
| "epoch": 7.340436982025616, |
| "grad_norm": 0.8925390243530273, |
| "learning_rate": 0.00015992673203318606, |
| "loss": 3.2047, |
| "step": 68200 |
| }, |
| { |
| "epoch": 7.3458185340652244, |
| "grad_norm": 0.8430379033088684, |
| "learning_rate": 0.00015960349100312463, |
| "loss": 3.2294, |
| "step": 68250 |
| }, |
| { |
| "epoch": 7.351200086104833, |
| "grad_norm": 0.8633614778518677, |
| "learning_rate": 0.00015928024997306325, |
| "loss": 3.2202, |
| "step": 68300 |
| }, |
| { |
| "epoch": 7.356581638144441, |
| "grad_norm": 0.8025208711624146, |
| "learning_rate": 0.00015895700894300182, |
| "loss": 3.2208, |
| "step": 68350 |
| }, |
| { |
| "epoch": 7.361963190184049, |
| "grad_norm": 0.828989565372467, |
| "learning_rate": 0.00015863376791294039, |
| "loss": 3.2162, |
| "step": 68400 |
| }, |
| { |
| "epoch": 7.367344742223658, |
| "grad_norm": 0.8749890923500061, |
| "learning_rate": 0.000158310526882879, |
| "loss": 3.2363, |
| "step": 68450 |
| }, |
| { |
| "epoch": 7.372726294263265, |
| "grad_norm": 0.8234397768974304, |
| "learning_rate": 0.00015798728585281758, |
| "loss": 3.2358, |
| "step": 68500 |
| }, |
| { |
| "epoch": 7.378107846302874, |
| "grad_norm": 0.8228161931037903, |
| "learning_rate": 0.00015766404482275614, |
| "loss": 3.2272, |
| "step": 68550 |
| }, |
| { |
| "epoch": 7.383489398342482, |
| "grad_norm": 0.8303384780883789, |
| "learning_rate": 0.00015734080379269474, |
| "loss": 3.2314, |
| "step": 68600 |
| }, |
| { |
| "epoch": 7.38887095038209, |
| "grad_norm": 0.8005957007408142, |
| "learning_rate": 0.00015701756276263333, |
| "loss": 3.2291, |
| "step": 68650 |
| }, |
| { |
| "epoch": 7.394252502421699, |
| "grad_norm": 0.8635926842689514, |
| "learning_rate": 0.0001566943217325719, |
| "loss": 3.2199, |
| "step": 68700 |
| }, |
| { |
| "epoch": 7.399634054461306, |
| "grad_norm": 0.7820467352867126, |
| "learning_rate": 0.0001563710807025105, |
| "loss": 3.2154, |
| "step": 68750 |
| }, |
| { |
| "epoch": 7.405015606500915, |
| "grad_norm": 0.8372554779052734, |
| "learning_rate": 0.00015604783967244906, |
| "loss": 3.246, |
| "step": 68800 |
| }, |
| { |
| "epoch": 7.4103971585405235, |
| "grad_norm": 0.8614478707313538, |
| "learning_rate": 0.00015572459864238763, |
| "loss": 3.2373, |
| "step": 68850 |
| }, |
| { |
| "epoch": 7.415778710580131, |
| "grad_norm": 0.7786409854888916, |
| "learning_rate": 0.00015540135761232625, |
| "loss": 3.2196, |
| "step": 68900 |
| }, |
| { |
| "epoch": 7.42116026261974, |
| "grad_norm": 0.80152827501297, |
| "learning_rate": 0.00015507811658226482, |
| "loss": 3.2348, |
| "step": 68950 |
| }, |
| { |
| "epoch": 7.426541814659347, |
| "grad_norm": 0.8448081612586975, |
| "learning_rate": 0.00015475487555220344, |
| "loss": 3.2225, |
| "step": 69000 |
| }, |
| { |
| "epoch": 7.426541814659347, |
| "eval_accuracy": 0.38895625971560543, |
| "eval_loss": 3.3472721576690674, |
| "eval_runtime": 184.8693, |
| "eval_samples_per_second": 97.426, |
| "eval_steps_per_second": 6.091, |
| "step": 69000 |
| }, |
| { |
| "epoch": 7.431923366698956, |
| "grad_norm": 0.7952991127967834, |
| "learning_rate": 0.000154431634522142, |
| "loss": 3.2336, |
| "step": 69050 |
| }, |
| { |
| "epoch": 7.4373049187385645, |
| "grad_norm": 0.8257370591163635, |
| "learning_rate": 0.00015410839349208058, |
| "loss": 3.2352, |
| "step": 69100 |
| }, |
| { |
| "epoch": 7.442686470778172, |
| "grad_norm": 0.7818418741226196, |
| "learning_rate": 0.00015378515246201917, |
| "loss": 3.2127, |
| "step": 69150 |
| }, |
| { |
| "epoch": 7.448068022817781, |
| "grad_norm": 0.7481926083564758, |
| "learning_rate": 0.00015346191143195774, |
| "loss": 3.2302, |
| "step": 69200 |
| }, |
| { |
| "epoch": 7.453449574857389, |
| "grad_norm": 0.7871079444885254, |
| "learning_rate": 0.00015313867040189633, |
| "loss": 3.2131, |
| "step": 69250 |
| }, |
| { |
| "epoch": 7.458831126896997, |
| "grad_norm": 0.8238528370857239, |
| "learning_rate": 0.00015281542937183493, |
| "loss": 3.2312, |
| "step": 69300 |
| }, |
| { |
| "epoch": 7.4642126789366054, |
| "grad_norm": 0.8295047283172607, |
| "learning_rate": 0.0001524921883417735, |
| "loss": 3.2295, |
| "step": 69350 |
| }, |
| { |
| "epoch": 7.469594230976213, |
| "grad_norm": 0.8163410425186157, |
| "learning_rate": 0.00015216894731171206, |
| "loss": 3.2072, |
| "step": 69400 |
| }, |
| { |
| "epoch": 7.474975783015822, |
| "grad_norm": 0.8015903830528259, |
| "learning_rate": 0.00015184570628165068, |
| "loss": 3.2306, |
| "step": 69450 |
| }, |
| { |
| "epoch": 7.48035733505543, |
| "grad_norm": 0.8221493363380432, |
| "learning_rate": 0.00015152246525158925, |
| "loss": 3.2241, |
| "step": 69500 |
| }, |
| { |
| "epoch": 7.485738887095038, |
| "grad_norm": 0.8415564298629761, |
| "learning_rate": 0.00015119922422152782, |
| "loss": 3.2263, |
| "step": 69550 |
| }, |
| { |
| "epoch": 7.491120439134646, |
| "grad_norm": 0.8357605338096619, |
| "learning_rate": 0.00015087598319146644, |
| "loss": 3.2443, |
| "step": 69600 |
| }, |
| { |
| "epoch": 7.496501991174255, |
| "grad_norm": 0.8570162057876587, |
| "learning_rate": 0.000150552742161405, |
| "loss": 3.2303, |
| "step": 69650 |
| }, |
| { |
| "epoch": 7.501883543213863, |
| "grad_norm": 0.8457155227661133, |
| "learning_rate": 0.00015022950113134358, |
| "loss": 3.2263, |
| "step": 69700 |
| }, |
| { |
| "epoch": 7.507265095253471, |
| "grad_norm": 0.8195136785507202, |
| "learning_rate": 0.00014990626010128217, |
| "loss": 3.246, |
| "step": 69750 |
| }, |
| { |
| "epoch": 7.51264664729308, |
| "grad_norm": 0.9330834150314331, |
| "learning_rate": 0.00014958301907122077, |
| "loss": 3.2381, |
| "step": 69800 |
| }, |
| { |
| "epoch": 7.518028199332687, |
| "grad_norm": 0.836077094078064, |
| "learning_rate": 0.00014925977804115933, |
| "loss": 3.2522, |
| "step": 69850 |
| }, |
| { |
| "epoch": 7.523409751372296, |
| "grad_norm": 0.8031020760536194, |
| "learning_rate": 0.00014893653701109793, |
| "loss": 3.237, |
| "step": 69900 |
| }, |
| { |
| "epoch": 7.528791303411904, |
| "grad_norm": 0.8233877420425415, |
| "learning_rate": 0.0001486132959810365, |
| "loss": 3.229, |
| "step": 69950 |
| }, |
| { |
| "epoch": 7.534172855451512, |
| "grad_norm": 0.8217124342918396, |
| "learning_rate": 0.0001482900549509751, |
| "loss": 3.2258, |
| "step": 70000 |
| }, |
| { |
| "epoch": 7.534172855451512, |
| "eval_accuracy": 0.3891678068881802, |
| "eval_loss": 3.345587968826294, |
| "eval_runtime": 184.9082, |
| "eval_samples_per_second": 97.405, |
| "eval_steps_per_second": 6.09, |
| "step": 70000 |
| }, |
| { |
| "epoch": 7.539554407491121, |
| "grad_norm": 0.8223305344581604, |
| "learning_rate": 0.00014796681392091368, |
| "loss": 3.2386, |
| "step": 70050 |
| }, |
| { |
| "epoch": 7.544935959530728, |
| "grad_norm": 0.8422094583511353, |
| "learning_rate": 0.00014764357289085228, |
| "loss": 3.2385, |
| "step": 70100 |
| }, |
| { |
| "epoch": 7.550317511570337, |
| "grad_norm": 0.8534138798713684, |
| "learning_rate": 0.00014732033186079085, |
| "loss": 3.2214, |
| "step": 70150 |
| }, |
| { |
| "epoch": 7.5556990636099455, |
| "grad_norm": 0.8637493252754211, |
| "learning_rate": 0.00014699709083072944, |
| "loss": 3.2428, |
| "step": 70200 |
| }, |
| { |
| "epoch": 7.561080615649553, |
| "grad_norm": 0.831741452217102, |
| "learning_rate": 0.00014667384980066804, |
| "loss": 3.2154, |
| "step": 70250 |
| }, |
| { |
| "epoch": 7.566462167689162, |
| "grad_norm": 0.8678784370422363, |
| "learning_rate": 0.00014635707359120784, |
| "loss": 3.2371, |
| "step": 70300 |
| }, |
| { |
| "epoch": 7.57184371972877, |
| "grad_norm": 0.8292410373687744, |
| "learning_rate": 0.0001460338325611464, |
| "loss": 3.2373, |
| "step": 70350 |
| }, |
| { |
| "epoch": 7.577225271768378, |
| "grad_norm": 0.7928509712219238, |
| "learning_rate": 0.00014571059153108498, |
| "loss": 3.2306, |
| "step": 70400 |
| }, |
| { |
| "epoch": 7.5826068238079865, |
| "grad_norm": 0.8370128870010376, |
| "learning_rate": 0.00014538735050102357, |
| "loss": 3.2222, |
| "step": 70450 |
| }, |
| { |
| "epoch": 7.587988375847594, |
| "grad_norm": 0.801749587059021, |
| "learning_rate": 0.00014506410947096217, |
| "loss": 3.2312, |
| "step": 70500 |
| }, |
| { |
| "epoch": 7.593369927887203, |
| "grad_norm": 0.8322213292121887, |
| "learning_rate": 0.00014474086844090076, |
| "loss": 3.2293, |
| "step": 70550 |
| }, |
| { |
| "epoch": 7.598751479926811, |
| "grad_norm": 0.8021792769432068, |
| "learning_rate": 0.00014441762741083933, |
| "loss": 3.2364, |
| "step": 70600 |
| }, |
| { |
| "epoch": 7.604133031966419, |
| "grad_norm": 0.8352303504943848, |
| "learning_rate": 0.00014409438638077792, |
| "loss": 3.2124, |
| "step": 70650 |
| }, |
| { |
| "epoch": 7.609514584006027, |
| "grad_norm": 0.850394070148468, |
| "learning_rate": 0.00014377114535071652, |
| "loss": 3.2298, |
| "step": 70700 |
| }, |
| { |
| "epoch": 7.614896136045635, |
| "grad_norm": 0.8542826771736145, |
| "learning_rate": 0.0001434479043206551, |
| "loss": 3.2264, |
| "step": 70750 |
| }, |
| { |
| "epoch": 7.620277688085244, |
| "grad_norm": 0.8750932216644287, |
| "learning_rate": 0.00014312466329059368, |
| "loss": 3.2408, |
| "step": 70800 |
| }, |
| { |
| "epoch": 7.625659240124852, |
| "grad_norm": 0.8510986566543579, |
| "learning_rate": 0.00014280142226053225, |
| "loss": 3.21, |
| "step": 70850 |
| }, |
| { |
| "epoch": 7.63104079216446, |
| "grad_norm": 0.8305497765541077, |
| "learning_rate": 0.00014247818123047084, |
| "loss": 3.2402, |
| "step": 70900 |
| }, |
| { |
| "epoch": 7.636422344204068, |
| "grad_norm": 0.8636519908905029, |
| "learning_rate": 0.0001421549402004094, |
| "loss": 3.2141, |
| "step": 70950 |
| }, |
| { |
| "epoch": 7.641803896243677, |
| "grad_norm": 0.871688723564148, |
| "learning_rate": 0.000141831699170348, |
| "loss": 3.2407, |
| "step": 71000 |
| }, |
| { |
| "epoch": 7.641803896243677, |
| "eval_accuracy": 0.3893689233835248, |
| "eval_loss": 3.3400113582611084, |
| "eval_runtime": 185.1188, |
| "eval_samples_per_second": 97.294, |
| "eval_steps_per_second": 6.083, |
| "step": 71000 |
| }, |
| { |
| "epoch": 7.647185448283285, |
| "grad_norm": 0.8559573888778687, |
| "learning_rate": 0.0001415084581402866, |
| "loss": 3.2194, |
| "step": 71050 |
| }, |
| { |
| "epoch": 7.652567000322893, |
| "grad_norm": 0.8053033947944641, |
| "learning_rate": 0.00014118521711022517, |
| "loss": 3.2275, |
| "step": 71100 |
| }, |
| { |
| "epoch": 7.657948552362502, |
| "grad_norm": 0.872286856174469, |
| "learning_rate": 0.00014086197608016376, |
| "loss": 3.2224, |
| "step": 71150 |
| }, |
| { |
| "epoch": 7.663330104402109, |
| "grad_norm": 0.7740647792816162, |
| "learning_rate": 0.00014053873505010236, |
| "loss": 3.2056, |
| "step": 71200 |
| }, |
| { |
| "epoch": 7.668711656441718, |
| "grad_norm": 0.858696699142456, |
| "learning_rate": 0.00014021549402004092, |
| "loss": 3.2429, |
| "step": 71250 |
| }, |
| { |
| "epoch": 7.674093208481326, |
| "grad_norm": 0.825717031955719, |
| "learning_rate": 0.00013989225298997952, |
| "loss": 3.24, |
| "step": 71300 |
| }, |
| { |
| "epoch": 7.679474760520934, |
| "grad_norm": 0.8377315402030945, |
| "learning_rate": 0.00013956901195991811, |
| "loss": 3.2464, |
| "step": 71350 |
| }, |
| { |
| "epoch": 7.684856312560543, |
| "grad_norm": 0.8150701522827148, |
| "learning_rate": 0.00013924577092985668, |
| "loss": 3.2356, |
| "step": 71400 |
| }, |
| { |
| "epoch": 7.69023786460015, |
| "grad_norm": 0.8369122743606567, |
| "learning_rate": 0.00013892252989979528, |
| "loss": 3.2431, |
| "step": 71450 |
| }, |
| { |
| "epoch": 7.695619416639759, |
| "grad_norm": 0.8841993808746338, |
| "learning_rate": 0.00013859928886973384, |
| "loss": 3.2415, |
| "step": 71500 |
| }, |
| { |
| "epoch": 7.7010009686793675, |
| "grad_norm": 0.8426228165626526, |
| "learning_rate": 0.00013827604783967244, |
| "loss": 3.2186, |
| "step": 71550 |
| }, |
| { |
| "epoch": 7.706382520718975, |
| "grad_norm": 0.819513738155365, |
| "learning_rate": 0.000137952806809611, |
| "loss": 3.2303, |
| "step": 71600 |
| }, |
| { |
| "epoch": 7.711764072758584, |
| "grad_norm": 0.8620902895927429, |
| "learning_rate": 0.0001376295657795496, |
| "loss": 3.247, |
| "step": 71650 |
| }, |
| { |
| "epoch": 7.717145624798192, |
| "grad_norm": 0.8760289549827576, |
| "learning_rate": 0.0001373063247494882, |
| "loss": 3.2234, |
| "step": 71700 |
| }, |
| { |
| "epoch": 7.7225271768378, |
| "grad_norm": 0.8032657504081726, |
| "learning_rate": 0.00013698308371942676, |
| "loss": 3.2231, |
| "step": 71750 |
| }, |
| { |
| "epoch": 7.727908728877408, |
| "grad_norm": 0.8196533918380737, |
| "learning_rate": 0.00013665984268936536, |
| "loss": 3.2375, |
| "step": 71800 |
| }, |
| { |
| "epoch": 7.733290280917016, |
| "grad_norm": 0.795331597328186, |
| "learning_rate": 0.00013633660165930395, |
| "loss": 3.2346, |
| "step": 71850 |
| }, |
| { |
| "epoch": 7.738671832956625, |
| "grad_norm": 0.8460344076156616, |
| "learning_rate": 0.00013601336062924255, |
| "loss": 3.2446, |
| "step": 71900 |
| }, |
| { |
| "epoch": 7.744053384996233, |
| "grad_norm": 0.862134575843811, |
| "learning_rate": 0.00013569011959918111, |
| "loss": 3.2438, |
| "step": 71950 |
| }, |
| { |
| "epoch": 7.749434937035841, |
| "grad_norm": 0.8252710700035095, |
| "learning_rate": 0.0001353668785691197, |
| "loss": 3.228, |
| "step": 72000 |
| }, |
| { |
| "epoch": 7.749434937035841, |
| "eval_accuracy": 0.3897550757468175, |
| "eval_loss": 3.3368613719940186, |
| "eval_runtime": 184.6906, |
| "eval_samples_per_second": 97.52, |
| "eval_steps_per_second": 6.097, |
| "step": 72000 |
| }, |
| { |
| "epoch": 7.754816489075449, |
| "grad_norm": 0.8688374161720276, |
| "learning_rate": 0.00013504363753905828, |
| "loss": 3.232, |
| "step": 72050 |
| }, |
| { |
| "epoch": 7.760198041115058, |
| "grad_norm": 0.8163436055183411, |
| "learning_rate": 0.00013472039650899687, |
| "loss": 3.2337, |
| "step": 72100 |
| }, |
| { |
| "epoch": 7.765579593154666, |
| "grad_norm": 0.7999975085258484, |
| "learning_rate": 0.00013439715547893544, |
| "loss": 3.2324, |
| "step": 72150 |
| }, |
| { |
| "epoch": 7.770961145194274, |
| "grad_norm": 0.8432734608650208, |
| "learning_rate": 0.00013407391444887403, |
| "loss": 3.2204, |
| "step": 72200 |
| }, |
| { |
| "epoch": 7.776342697233883, |
| "grad_norm": 0.8608858585357666, |
| "learning_rate": 0.0001337506734188126, |
| "loss": 3.2419, |
| "step": 72250 |
| }, |
| { |
| "epoch": 7.78172424927349, |
| "grad_norm": 0.8750514388084412, |
| "learning_rate": 0.0001334274323887512, |
| "loss": 3.2275, |
| "step": 72300 |
| }, |
| { |
| "epoch": 7.787105801313099, |
| "grad_norm": 0.8133453726768494, |
| "learning_rate": 0.0001331041913586898, |
| "loss": 3.2265, |
| "step": 72350 |
| }, |
| { |
| "epoch": 7.792487353352707, |
| "grad_norm": 0.8550586700439453, |
| "learning_rate": 0.00013278095032862838, |
| "loss": 3.2262, |
| "step": 72400 |
| }, |
| { |
| "epoch": 7.797868905392315, |
| "grad_norm": 0.8691797256469727, |
| "learning_rate": 0.00013245770929856695, |
| "loss": 3.243, |
| "step": 72450 |
| }, |
| { |
| "epoch": 7.803250457431924, |
| "grad_norm": 0.883783757686615, |
| "learning_rate": 0.00013213446826850555, |
| "loss": 3.2444, |
| "step": 72500 |
| }, |
| { |
| "epoch": 7.808632009471531, |
| "grad_norm": 0.8034313917160034, |
| "learning_rate": 0.00013181122723844411, |
| "loss": 3.2254, |
| "step": 72550 |
| }, |
| { |
| "epoch": 7.81401356151114, |
| "grad_norm": 0.830434262752533, |
| "learning_rate": 0.0001314879862083827, |
| "loss": 3.235, |
| "step": 72600 |
| }, |
| { |
| "epoch": 7.819395113550748, |
| "grad_norm": 0.8352016806602478, |
| "learning_rate": 0.00013116474517832128, |
| "loss": 3.2293, |
| "step": 72650 |
| }, |
| { |
| "epoch": 7.824776665590356, |
| "grad_norm": 0.8240856528282166, |
| "learning_rate": 0.00013084150414825987, |
| "loss": 3.2342, |
| "step": 72700 |
| }, |
| { |
| "epoch": 7.830158217629965, |
| "grad_norm": 0.7969643473625183, |
| "learning_rate": 0.00013051826311819844, |
| "loss": 3.2313, |
| "step": 72750 |
| }, |
| { |
| "epoch": 7.835539769669572, |
| "grad_norm": 0.8742566108703613, |
| "learning_rate": 0.00013019502208813703, |
| "loss": 3.2344, |
| "step": 72800 |
| }, |
| { |
| "epoch": 7.840921321709181, |
| "grad_norm": 0.8692363500595093, |
| "learning_rate": 0.00012987178105807563, |
| "loss": 3.2332, |
| "step": 72850 |
| }, |
| { |
| "epoch": 7.846302873748789, |
| "grad_norm": 0.8365722298622131, |
| "learning_rate": 0.00012954854002801422, |
| "loss": 3.2264, |
| "step": 72900 |
| }, |
| { |
| "epoch": 7.851684425788397, |
| "grad_norm": 0.828915536403656, |
| "learning_rate": 0.0001292252989979528, |
| "loss": 3.2234, |
| "step": 72950 |
| }, |
| { |
| "epoch": 7.857065977828006, |
| "grad_norm": 0.8495892882347107, |
| "learning_rate": 0.00012890205796789139, |
| "loss": 3.2466, |
| "step": 73000 |
| }, |
| { |
| "epoch": 7.857065977828006, |
| "eval_accuracy": 0.3902502069565881, |
| "eval_loss": 3.332028388977051, |
| "eval_runtime": 184.8344, |
| "eval_samples_per_second": 97.444, |
| "eval_steps_per_second": 6.092, |
| "step": 73000 |
| }, |
| { |
| "epoch": 7.862447529867614, |
| "grad_norm": 0.8604885339736938, |
| "learning_rate": 0.00012857881693782998, |
| "loss": 3.228, |
| "step": 73050 |
| }, |
| { |
| "epoch": 7.867829081907222, |
| "grad_norm": 0.8913674354553223, |
| "learning_rate": 0.00012825557590776855, |
| "loss": 3.2537, |
| "step": 73100 |
| }, |
| { |
| "epoch": 7.87321063394683, |
| "grad_norm": 0.8491577506065369, |
| "learning_rate": 0.00012793233487770714, |
| "loss": 3.222, |
| "step": 73150 |
| }, |
| { |
| "epoch": 7.878592185986438, |
| "grad_norm": 0.9810826182365417, |
| "learning_rate": 0.0001276090938476457, |
| "loss": 3.2323, |
| "step": 73200 |
| }, |
| { |
| "epoch": 7.883973738026047, |
| "grad_norm": 0.8327536582946777, |
| "learning_rate": 0.0001272858528175843, |
| "loss": 3.235, |
| "step": 73250 |
| }, |
| { |
| "epoch": 7.889355290065655, |
| "grad_norm": 0.8036714196205139, |
| "learning_rate": 0.00012696261178752287, |
| "loss": 3.2236, |
| "step": 73300 |
| }, |
| { |
| "epoch": 7.894736842105263, |
| "grad_norm": 0.8222293853759766, |
| "learning_rate": 0.00012663937075746147, |
| "loss": 3.2249, |
| "step": 73350 |
| }, |
| { |
| "epoch": 7.900118394144871, |
| "grad_norm": 0.8565043210983276, |
| "learning_rate": 0.00012631612972740006, |
| "loss": 3.2353, |
| "step": 73400 |
| }, |
| { |
| "epoch": 7.90549994618448, |
| "grad_norm": 0.8443336486816406, |
| "learning_rate": 0.00012599288869733863, |
| "loss": 3.2214, |
| "step": 73450 |
| }, |
| { |
| "epoch": 7.910881498224088, |
| "grad_norm": 0.9020589590072632, |
| "learning_rate": 0.00012566964766727722, |
| "loss": 3.2482, |
| "step": 73500 |
| }, |
| { |
| "epoch": 7.916263050263696, |
| "grad_norm": 0.8561992049217224, |
| "learning_rate": 0.00012534640663721582, |
| "loss": 3.2166, |
| "step": 73550 |
| }, |
| { |
| "epoch": 7.921644602303305, |
| "grad_norm": 0.8745976090431213, |
| "learning_rate": 0.00012502316560715439, |
| "loss": 3.2524, |
| "step": 73600 |
| }, |
| { |
| "epoch": 7.927026154342912, |
| "grad_norm": 0.8272269368171692, |
| "learning_rate": 0.00012469992457709298, |
| "loss": 3.2354, |
| "step": 73650 |
| }, |
| { |
| "epoch": 7.932407706382521, |
| "grad_norm": 0.9126244783401489, |
| "learning_rate": 0.00012437668354703158, |
| "loss": 3.2354, |
| "step": 73700 |
| }, |
| { |
| "epoch": 7.937789258422129, |
| "grad_norm": 0.8359974026679993, |
| "learning_rate": 0.00012405344251697014, |
| "loss": 3.2412, |
| "step": 73750 |
| }, |
| { |
| "epoch": 7.943170810461737, |
| "grad_norm": 0.8640374541282654, |
| "learning_rate": 0.00012373020148690874, |
| "loss": 3.2426, |
| "step": 73800 |
| }, |
| { |
| "epoch": 7.948552362501346, |
| "grad_norm": 0.878259003162384, |
| "learning_rate": 0.0001234069604568473, |
| "loss": 3.223, |
| "step": 73850 |
| }, |
| { |
| "epoch": 7.953933914540953, |
| "grad_norm": 0.9081090688705444, |
| "learning_rate": 0.0001230837194267859, |
| "loss": 3.2481, |
| "step": 73900 |
| }, |
| { |
| "epoch": 7.959315466580562, |
| "grad_norm": 0.8142951130867004, |
| "learning_rate": 0.00012276047839672447, |
| "loss": 3.2287, |
| "step": 73950 |
| }, |
| { |
| "epoch": 7.96469701862017, |
| "grad_norm": 0.8311217427253723, |
| "learning_rate": 0.00012243723736666306, |
| "loss": 3.2126, |
| "step": 74000 |
| }, |
| { |
| "epoch": 7.96469701862017, |
| "eval_accuracy": 0.39052531606853436, |
| "eval_loss": 3.3307554721832275, |
| "eval_runtime": 184.6893, |
| "eval_samples_per_second": 97.521, |
| "eval_steps_per_second": 6.097, |
| "step": 74000 |
| }, |
| { |
| "epoch": 7.970078570659778, |
| "grad_norm": 0.8098016381263733, |
| "learning_rate": 0.00012211399633660166, |
| "loss": 3.2278, |
| "step": 74050 |
| }, |
| { |
| "epoch": 7.975460122699387, |
| "grad_norm": 0.8477680683135986, |
| "learning_rate": 0.00012179075530654022, |
| "loss": 3.2304, |
| "step": 74100 |
| }, |
| { |
| "epoch": 7.980841674738995, |
| "grad_norm": 0.825594961643219, |
| "learning_rate": 0.00012146751427647882, |
| "loss": 3.2386, |
| "step": 74150 |
| }, |
| { |
| "epoch": 7.986223226778603, |
| "grad_norm": 0.8377440571784973, |
| "learning_rate": 0.0001211442732464174, |
| "loss": 3.2335, |
| "step": 74200 |
| }, |
| { |
| "epoch": 7.991604778818211, |
| "grad_norm": 0.8370752930641174, |
| "learning_rate": 0.00012082103221635598, |
| "loss": 3.2272, |
| "step": 74250 |
| }, |
| { |
| "epoch": 7.996986330857819, |
| "grad_norm": 0.8993654251098633, |
| "learning_rate": 0.00012049779118629456, |
| "loss": 3.2465, |
| "step": 74300 |
| }, |
| { |
| "epoch": 8.002367882897428, |
| "grad_norm": 0.8433480858802795, |
| "learning_rate": 0.0001201810149768344, |
| "loss": 3.1976, |
| "step": 74350 |
| }, |
| { |
| "epoch": 8.007749434937036, |
| "grad_norm": 0.8284233808517456, |
| "learning_rate": 0.00011985777394677296, |
| "loss": 3.1514, |
| "step": 74400 |
| }, |
| { |
| "epoch": 8.013130986976645, |
| "grad_norm": 0.8333739638328552, |
| "learning_rate": 0.00011953453291671154, |
| "loss": 3.1506, |
| "step": 74450 |
| }, |
| { |
| "epoch": 8.018512539016251, |
| "grad_norm": 0.8498620986938477, |
| "learning_rate": 0.00011921129188665014, |
| "loss": 3.1347, |
| "step": 74500 |
| }, |
| { |
| "epoch": 8.02389409105586, |
| "grad_norm": 0.8060201406478882, |
| "learning_rate": 0.0001188880508565887, |
| "loss": 3.1446, |
| "step": 74550 |
| }, |
| { |
| "epoch": 8.029275643095469, |
| "grad_norm": 0.8369296789169312, |
| "learning_rate": 0.0001185648098265273, |
| "loss": 3.1747, |
| "step": 74600 |
| }, |
| { |
| "epoch": 8.034657195135077, |
| "grad_norm": 0.8029821515083313, |
| "learning_rate": 0.0001182415687964659, |
| "loss": 3.1674, |
| "step": 74650 |
| }, |
| { |
| "epoch": 8.040038747174686, |
| "grad_norm": 0.8536189198493958, |
| "learning_rate": 0.00011791832776640448, |
| "loss": 3.1492, |
| "step": 74700 |
| }, |
| { |
| "epoch": 8.045420299214294, |
| "grad_norm": 0.8600338101387024, |
| "learning_rate": 0.00011759508673634306, |
| "loss": 3.1523, |
| "step": 74750 |
| }, |
| { |
| "epoch": 8.050801851253901, |
| "grad_norm": 0.838543713092804, |
| "learning_rate": 0.00011727184570628164, |
| "loss": 3.1805, |
| "step": 74800 |
| }, |
| { |
| "epoch": 8.05618340329351, |
| "grad_norm": 0.7800434827804565, |
| "learning_rate": 0.00011694860467622023, |
| "loss": 3.172, |
| "step": 74850 |
| }, |
| { |
| "epoch": 8.061564955333118, |
| "grad_norm": 0.8202197551727295, |
| "learning_rate": 0.0001166253636461588, |
| "loss": 3.1607, |
| "step": 74900 |
| }, |
| { |
| "epoch": 8.066946507372727, |
| "grad_norm": 0.8486565947532654, |
| "learning_rate": 0.0001163021226160974, |
| "loss": 3.1748, |
| "step": 74950 |
| }, |
| { |
| "epoch": 8.072328059412335, |
| "grad_norm": 0.9214633107185364, |
| "learning_rate": 0.00011597888158603598, |
| "loss": 3.1488, |
| "step": 75000 |
| }, |
| { |
| "epoch": 8.072328059412335, |
| "eval_accuracy": 0.3903842846201511, |
| "eval_loss": 3.3340110778808594, |
| "eval_runtime": 185.0586, |
| "eval_samples_per_second": 97.326, |
| "eval_steps_per_second": 6.085, |
| "step": 75000 |
| }, |
| { |
| "epoch": 8.077709611451942, |
| "grad_norm": 0.868916392326355, |
| "learning_rate": 0.00011565564055597456, |
| "loss": 3.1697, |
| "step": 75050 |
| }, |
| { |
| "epoch": 8.08309116349155, |
| "grad_norm": 0.8408175706863403, |
| "learning_rate": 0.00011533239952591314, |
| "loss": 3.1719, |
| "step": 75100 |
| }, |
| { |
| "epoch": 8.088472715531159, |
| "grad_norm": 0.8719428777694702, |
| "learning_rate": 0.00011500915849585173, |
| "loss": 3.182, |
| "step": 75150 |
| }, |
| { |
| "epoch": 8.093854267570768, |
| "grad_norm": 0.7724485397338867, |
| "learning_rate": 0.00011468591746579033, |
| "loss": 3.1634, |
| "step": 75200 |
| }, |
| { |
| "epoch": 8.099235819610376, |
| "grad_norm": 0.8824598789215088, |
| "learning_rate": 0.0001143626764357289, |
| "loss": 3.1758, |
| "step": 75250 |
| }, |
| { |
| "epoch": 8.104617371649983, |
| "grad_norm": 0.8987236022949219, |
| "learning_rate": 0.00011403943540566749, |
| "loss": 3.1593, |
| "step": 75300 |
| }, |
| { |
| "epoch": 8.109998923689592, |
| "grad_norm": 0.8450270295143127, |
| "learning_rate": 0.00011371619437560607, |
| "loss": 3.164, |
| "step": 75350 |
| }, |
| { |
| "epoch": 8.1153804757292, |
| "grad_norm": 0.8858851790428162, |
| "learning_rate": 0.00011339295334554464, |
| "loss": 3.1648, |
| "step": 75400 |
| }, |
| { |
| "epoch": 8.120762027768809, |
| "grad_norm": 0.83014315366745, |
| "learning_rate": 0.00011306971231548323, |
| "loss": 3.1557, |
| "step": 75450 |
| }, |
| { |
| "epoch": 8.126143579808417, |
| "grad_norm": 0.8151319622993469, |
| "learning_rate": 0.00011274647128542183, |
| "loss": 3.1569, |
| "step": 75500 |
| }, |
| { |
| "epoch": 8.131525131848026, |
| "grad_norm": 0.8844127655029297, |
| "learning_rate": 0.0001124232302553604, |
| "loss": 3.1876, |
| "step": 75550 |
| }, |
| { |
| "epoch": 8.136906683887632, |
| "grad_norm": 0.8928391933441162, |
| "learning_rate": 0.00011209998922529899, |
| "loss": 3.1632, |
| "step": 75600 |
| }, |
| { |
| "epoch": 8.142288235927241, |
| "grad_norm": 0.844195544719696, |
| "learning_rate": 0.00011177674819523757, |
| "loss": 3.1593, |
| "step": 75650 |
| }, |
| { |
| "epoch": 8.14766978796685, |
| "grad_norm": 0.8816542625427246, |
| "learning_rate": 0.00011145350716517617, |
| "loss": 3.1543, |
| "step": 75700 |
| }, |
| { |
| "epoch": 8.153051340006458, |
| "grad_norm": 0.917953372001648, |
| "learning_rate": 0.00011113026613511473, |
| "loss": 3.1802, |
| "step": 75750 |
| }, |
| { |
| "epoch": 8.158432892046067, |
| "grad_norm": 0.8676404356956482, |
| "learning_rate": 0.00011081348992565455, |
| "loss": 3.1598, |
| "step": 75800 |
| }, |
| { |
| "epoch": 8.163814444085673, |
| "grad_norm": 0.8764878511428833, |
| "learning_rate": 0.00011049024889559314, |
| "loss": 3.1617, |
| "step": 75850 |
| }, |
| { |
| "epoch": 8.169195996125282, |
| "grad_norm": 0.8242963552474976, |
| "learning_rate": 0.00011016700786553172, |
| "loss": 3.1673, |
| "step": 75900 |
| }, |
| { |
| "epoch": 8.17457754816489, |
| "grad_norm": 0.8988597989082336, |
| "learning_rate": 0.00010984376683547031, |
| "loss": 3.181, |
| "step": 75950 |
| }, |
| { |
| "epoch": 8.1799591002045, |
| "grad_norm": 0.8427569270133972, |
| "learning_rate": 0.00010952052580540889, |
| "loss": 3.1626, |
| "step": 76000 |
| }, |
| { |
| "epoch": 8.1799591002045, |
| "eval_accuracy": 0.3904491503941764, |
| "eval_loss": 3.3352701663970947, |
| "eval_runtime": 184.661, |
| "eval_samples_per_second": 97.535, |
| "eval_steps_per_second": 6.098, |
| "step": 76000 |
| }, |
| { |
| "epoch": 8.185340652244108, |
| "grad_norm": 0.8113105297088623, |
| "learning_rate": 0.00010919728477534747, |
| "loss": 3.1803, |
| "step": 76050 |
| }, |
| { |
| "epoch": 8.190722204283716, |
| "grad_norm": 0.8857055902481079, |
| "learning_rate": 0.00010887404374528605, |
| "loss": 3.1951, |
| "step": 76100 |
| }, |
| { |
| "epoch": 8.196103756323323, |
| "grad_norm": 0.8819203972816467, |
| "learning_rate": 0.00010855080271522465, |
| "loss": 3.157, |
| "step": 76150 |
| }, |
| { |
| "epoch": 8.201485308362932, |
| "grad_norm": 0.920045793056488, |
| "learning_rate": 0.00010822756168516322, |
| "loss": 3.1773, |
| "step": 76200 |
| }, |
| { |
| "epoch": 8.20686686040254, |
| "grad_norm": 0.9266934990882874, |
| "learning_rate": 0.00010790432065510181, |
| "loss": 3.18, |
| "step": 76250 |
| }, |
| { |
| "epoch": 8.212248412442149, |
| "grad_norm": 0.8361415266990662, |
| "learning_rate": 0.0001075810796250404, |
| "loss": 3.1652, |
| "step": 76300 |
| }, |
| { |
| "epoch": 8.217629964481757, |
| "grad_norm": 0.9125852584838867, |
| "learning_rate": 0.00010725783859497897, |
| "loss": 3.1847, |
| "step": 76350 |
| }, |
| { |
| "epoch": 8.223011516521364, |
| "grad_norm": 0.8211091160774231, |
| "learning_rate": 0.00010693459756491757, |
| "loss": 3.1481, |
| "step": 76400 |
| }, |
| { |
| "epoch": 8.228393068560973, |
| "grad_norm": 0.8206071257591248, |
| "learning_rate": 0.00010661135653485615, |
| "loss": 3.1628, |
| "step": 76450 |
| }, |
| { |
| "epoch": 8.233774620600581, |
| "grad_norm": 0.8912805318832397, |
| "learning_rate": 0.00010628811550479474, |
| "loss": 3.1706, |
| "step": 76500 |
| }, |
| { |
| "epoch": 8.23915617264019, |
| "grad_norm": 0.8347760438919067, |
| "learning_rate": 0.00010596487447473331, |
| "loss": 3.1822, |
| "step": 76550 |
| }, |
| { |
| "epoch": 8.244537724679798, |
| "grad_norm": 0.8456274271011353, |
| "learning_rate": 0.0001056416334446719, |
| "loss": 3.1729, |
| "step": 76600 |
| }, |
| { |
| "epoch": 8.249919276719407, |
| "grad_norm": 0.8533112406730652, |
| "learning_rate": 0.00010531839241461049, |
| "loss": 3.1695, |
| "step": 76650 |
| }, |
| { |
| "epoch": 8.255300828759013, |
| "grad_norm": 0.8624753952026367, |
| "learning_rate": 0.00010499515138454907, |
| "loss": 3.1648, |
| "step": 76700 |
| }, |
| { |
| "epoch": 8.260682380798622, |
| "grad_norm": 0.8519622683525085, |
| "learning_rate": 0.00010467191035448765, |
| "loss": 3.1798, |
| "step": 76750 |
| }, |
| { |
| "epoch": 8.26606393283823, |
| "grad_norm": 0.838147759437561, |
| "learning_rate": 0.00010434866932442624, |
| "loss": 3.1683, |
| "step": 76800 |
| }, |
| { |
| "epoch": 8.27144548487784, |
| "grad_norm": 0.903648853302002, |
| "learning_rate": 0.00010402542829436481, |
| "loss": 3.1834, |
| "step": 76850 |
| }, |
| { |
| "epoch": 8.276827036917448, |
| "grad_norm": 0.8623703718185425, |
| "learning_rate": 0.0001037021872643034, |
| "loss": 3.1707, |
| "step": 76900 |
| }, |
| { |
| "epoch": 8.282208588957054, |
| "grad_norm": 0.8603115081787109, |
| "learning_rate": 0.00010337894623424199, |
| "loss": 3.1824, |
| "step": 76950 |
| }, |
| { |
| "epoch": 8.287590140996663, |
| "grad_norm": 0.8683320879936218, |
| "learning_rate": 0.00010305570520418058, |
| "loss": 3.1706, |
| "step": 77000 |
| }, |
| { |
| "epoch": 8.287590140996663, |
| "eval_accuracy": 0.39092265967927187, |
| "eval_loss": 3.3312289714813232, |
| "eval_runtime": 184.7891, |
| "eval_samples_per_second": 97.468, |
| "eval_steps_per_second": 6.093, |
| "step": 77000 |
| }, |
| { |
| "epoch": 8.292971693036272, |
| "grad_norm": 0.8868846893310547, |
| "learning_rate": 0.00010273246417411915, |
| "loss": 3.1657, |
| "step": 77050 |
| }, |
| { |
| "epoch": 8.29835324507588, |
| "grad_norm": 0.835442304611206, |
| "learning_rate": 0.00010240922314405774, |
| "loss": 3.1762, |
| "step": 77100 |
| }, |
| { |
| "epoch": 8.303734797115489, |
| "grad_norm": 0.8506765961647034, |
| "learning_rate": 0.00010208598211399634, |
| "loss": 3.1836, |
| "step": 77150 |
| }, |
| { |
| "epoch": 8.309116349155097, |
| "grad_norm": 0.9037638306617737, |
| "learning_rate": 0.0001017627410839349, |
| "loss": 3.1595, |
| "step": 77200 |
| }, |
| { |
| "epoch": 8.314497901194704, |
| "grad_norm": 0.8438506722450256, |
| "learning_rate": 0.0001014395000538735, |
| "loss": 3.1774, |
| "step": 77250 |
| }, |
| { |
| "epoch": 8.319879453234313, |
| "grad_norm": 0.8813534379005432, |
| "learning_rate": 0.00010111625902381208, |
| "loss": 3.1947, |
| "step": 77300 |
| }, |
| { |
| "epoch": 8.325261005273921, |
| "grad_norm": 0.8885173797607422, |
| "learning_rate": 0.00010079301799375066, |
| "loss": 3.1824, |
| "step": 77350 |
| }, |
| { |
| "epoch": 8.33064255731353, |
| "grad_norm": 0.8622186183929443, |
| "learning_rate": 0.00010046977696368924, |
| "loss": 3.1645, |
| "step": 77400 |
| }, |
| { |
| "epoch": 8.336024109353138, |
| "grad_norm": 0.891852080821991, |
| "learning_rate": 0.00010014653593362784, |
| "loss": 3.1819, |
| "step": 77450 |
| }, |
| { |
| "epoch": 8.341405661392745, |
| "grad_norm": 0.8418218493461609, |
| "learning_rate": 9.982329490356642e-05, |
| "loss": 3.1684, |
| "step": 77500 |
| }, |
| { |
| "epoch": 8.346787213432354, |
| "grad_norm": 0.8554936051368713, |
| "learning_rate": 9.9500053873505e-05, |
| "loss": 3.1878, |
| "step": 77550 |
| }, |
| { |
| "epoch": 8.352168765471962, |
| "grad_norm": 0.854648232460022, |
| "learning_rate": 9.917681284344358e-05, |
| "loss": 3.1779, |
| "step": 77600 |
| }, |
| { |
| "epoch": 8.35755031751157, |
| "grad_norm": 0.8960241079330444, |
| "learning_rate": 9.885357181338218e-05, |
| "loss": 3.1708, |
| "step": 77650 |
| }, |
| { |
| "epoch": 8.36293186955118, |
| "grad_norm": 0.9255419969558716, |
| "learning_rate": 9.853033078332074e-05, |
| "loss": 3.1909, |
| "step": 77700 |
| }, |
| { |
| "epoch": 8.368313421590786, |
| "grad_norm": 0.8990132808685303, |
| "learning_rate": 9.820708975325934e-05, |
| "loss": 3.1713, |
| "step": 77750 |
| }, |
| { |
| "epoch": 8.373694973630395, |
| "grad_norm": 0.85789954662323, |
| "learning_rate": 9.788384872319793e-05, |
| "loss": 3.1605, |
| "step": 77800 |
| }, |
| { |
| "epoch": 8.379076525670003, |
| "grad_norm": 0.858098030090332, |
| "learning_rate": 9.75606076931365e-05, |
| "loss": 3.1687, |
| "step": 77850 |
| }, |
| { |
| "epoch": 8.384458077709612, |
| "grad_norm": 0.8912584781646729, |
| "learning_rate": 9.723736666307508e-05, |
| "loss": 3.1633, |
| "step": 77900 |
| }, |
| { |
| "epoch": 8.38983962974922, |
| "grad_norm": 0.8760279417037964, |
| "learning_rate": 9.691412563301368e-05, |
| "loss": 3.163, |
| "step": 77950 |
| }, |
| { |
| "epoch": 8.395221181788829, |
| "grad_norm": 0.8484280705451965, |
| "learning_rate": 9.659088460295227e-05, |
| "loss": 3.1875, |
| "step": 78000 |
| }, |
| { |
| "epoch": 8.395221181788829, |
| "eval_accuracy": 0.39122645315360116, |
| "eval_loss": 3.326995372772217, |
| "eval_runtime": 184.8092, |
| "eval_samples_per_second": 97.457, |
| "eval_steps_per_second": 6.093, |
| "step": 78000 |
| }, |
| { |
| "epoch": 8.400602733828435, |
| "grad_norm": 0.8364370465278625, |
| "learning_rate": 9.626764357289084e-05, |
| "loss": 3.1871, |
| "step": 78050 |
| }, |
| { |
| "epoch": 8.405984285868044, |
| "grad_norm": 0.8873591423034668, |
| "learning_rate": 9.594440254282943e-05, |
| "loss": 3.1791, |
| "step": 78100 |
| }, |
| { |
| "epoch": 8.411365837907653, |
| "grad_norm": 0.8371385335922241, |
| "learning_rate": 9.562116151276802e-05, |
| "loss": 3.1628, |
| "step": 78150 |
| }, |
| { |
| "epoch": 8.416747389947261, |
| "grad_norm": 0.8212653994560242, |
| "learning_rate": 9.52979204827066e-05, |
| "loss": 3.1585, |
| "step": 78200 |
| }, |
| { |
| "epoch": 8.42212894198687, |
| "grad_norm": 0.8428384065628052, |
| "learning_rate": 9.497467945264518e-05, |
| "loss": 3.2135, |
| "step": 78250 |
| }, |
| { |
| "epoch": 8.427510494026476, |
| "grad_norm": 0.8416067361831665, |
| "learning_rate": 9.465143842258377e-05, |
| "loss": 3.1811, |
| "step": 78300 |
| }, |
| { |
| "epoch": 8.432892046066085, |
| "grad_norm": 0.8841109871864319, |
| "learning_rate": 9.432819739252234e-05, |
| "loss": 3.1834, |
| "step": 78350 |
| }, |
| { |
| "epoch": 8.438273598105694, |
| "grad_norm": 0.8793404698371887, |
| "learning_rate": 9.400495636246093e-05, |
| "loss": 3.1823, |
| "step": 78400 |
| }, |
| { |
| "epoch": 8.443655150145302, |
| "grad_norm": 0.8999808430671692, |
| "learning_rate": 9.368171533239952e-05, |
| "loss": 3.1813, |
| "step": 78450 |
| }, |
| { |
| "epoch": 8.44903670218491, |
| "grad_norm": 0.832278847694397, |
| "learning_rate": 9.335847430233811e-05, |
| "loss": 3.1914, |
| "step": 78500 |
| }, |
| { |
| "epoch": 8.45441825422452, |
| "grad_norm": 0.8846303820610046, |
| "learning_rate": 9.303523327227668e-05, |
| "loss": 3.1763, |
| "step": 78550 |
| }, |
| { |
| "epoch": 8.459799806264126, |
| "grad_norm": 0.8395057320594788, |
| "learning_rate": 9.271199224221527e-05, |
| "loss": 3.1892, |
| "step": 78600 |
| }, |
| { |
| "epoch": 8.465181358303735, |
| "grad_norm": 0.8631083369255066, |
| "learning_rate": 9.238875121215387e-05, |
| "loss": 3.1991, |
| "step": 78650 |
| }, |
| { |
| "epoch": 8.470562910343343, |
| "grad_norm": 0.8669479489326477, |
| "learning_rate": 9.206551018209243e-05, |
| "loss": 3.1691, |
| "step": 78700 |
| }, |
| { |
| "epoch": 8.475944462382952, |
| "grad_norm": 0.8604099154472351, |
| "learning_rate": 9.174226915203103e-05, |
| "loss": 3.1691, |
| "step": 78750 |
| }, |
| { |
| "epoch": 8.48132601442256, |
| "grad_norm": 0.8545402884483337, |
| "learning_rate": 9.141902812196961e-05, |
| "loss": 3.1751, |
| "step": 78800 |
| }, |
| { |
| "epoch": 8.486707566462167, |
| "grad_norm": 0.8492422699928284, |
| "learning_rate": 9.109578709190818e-05, |
| "loss": 3.1837, |
| "step": 78850 |
| }, |
| { |
| "epoch": 8.492089118501776, |
| "grad_norm": 0.8209715485572815, |
| "learning_rate": 9.077254606184677e-05, |
| "loss": 3.1882, |
| "step": 78900 |
| }, |
| { |
| "epoch": 8.497470670541384, |
| "grad_norm": 0.876168429851532, |
| "learning_rate": 9.044930503178537e-05, |
| "loss": 3.1854, |
| "step": 78950 |
| }, |
| { |
| "epoch": 8.502852222580993, |
| "grad_norm": 0.9051874876022339, |
| "learning_rate": 9.012606400172395e-05, |
| "loss": 3.1783, |
| "step": 79000 |
| }, |
| { |
| "epoch": 8.502852222580993, |
| "eval_accuracy": 0.3914579924575338, |
| "eval_loss": 3.3256032466888428, |
| "eval_runtime": 185.3855, |
| "eval_samples_per_second": 97.154, |
| "eval_steps_per_second": 6.074, |
| "step": 79000 |
| }, |
| { |
| "epoch": 8.508233774620601, |
| "grad_norm": 0.9354413747787476, |
| "learning_rate": 8.980282297166253e-05, |
| "loss": 3.1785, |
| "step": 79050 |
| }, |
| { |
| "epoch": 8.513615326660208, |
| "grad_norm": 0.8846203684806824, |
| "learning_rate": 8.947958194160111e-05, |
| "loss": 3.2052, |
| "step": 79100 |
| }, |
| { |
| "epoch": 8.518996878699816, |
| "grad_norm": 0.8579129576683044, |
| "learning_rate": 8.91563409115397e-05, |
| "loss": 3.1818, |
| "step": 79150 |
| }, |
| { |
| "epoch": 8.524378430739425, |
| "grad_norm": 0.8579406142234802, |
| "learning_rate": 8.883309988147827e-05, |
| "loss": 3.1827, |
| "step": 79200 |
| }, |
| { |
| "epoch": 8.529759982779034, |
| "grad_norm": 0.8814583420753479, |
| "learning_rate": 8.850985885141687e-05, |
| "loss": 3.1749, |
| "step": 79250 |
| }, |
| { |
| "epoch": 8.535141534818642, |
| "grad_norm": 0.8940314054489136, |
| "learning_rate": 8.818661782135545e-05, |
| "loss": 3.1611, |
| "step": 79300 |
| }, |
| { |
| "epoch": 8.54052308685825, |
| "grad_norm": 0.9121882319450378, |
| "learning_rate": 8.786337679129403e-05, |
| "loss": 3.1679, |
| "step": 79350 |
| }, |
| { |
| "epoch": 8.545904638897857, |
| "grad_norm": 0.9019620418548584, |
| "learning_rate": 8.754013576123261e-05, |
| "loss": 3.1741, |
| "step": 79400 |
| }, |
| { |
| "epoch": 8.551286190937466, |
| "grad_norm": 0.9139887690544128, |
| "learning_rate": 8.72168947311712e-05, |
| "loss": 3.1732, |
| "step": 79450 |
| }, |
| { |
| "epoch": 8.556667742977075, |
| "grad_norm": 0.8666207790374756, |
| "learning_rate": 8.68936537011098e-05, |
| "loss": 3.1821, |
| "step": 79500 |
| }, |
| { |
| "epoch": 8.562049295016683, |
| "grad_norm": 0.8577825427055359, |
| "learning_rate": 8.657041267104837e-05, |
| "loss": 3.1826, |
| "step": 79550 |
| }, |
| { |
| "epoch": 8.567430847056292, |
| "grad_norm": 0.8668467402458191, |
| "learning_rate": 8.624717164098696e-05, |
| "loss": 3.1725, |
| "step": 79600 |
| }, |
| { |
| "epoch": 8.572812399095898, |
| "grad_norm": 0.8373755216598511, |
| "learning_rate": 8.592393061092554e-05, |
| "loss": 3.1797, |
| "step": 79650 |
| }, |
| { |
| "epoch": 8.578193951135507, |
| "grad_norm": 0.8314027786254883, |
| "learning_rate": 8.560068958086412e-05, |
| "loss": 3.1849, |
| "step": 79700 |
| }, |
| { |
| "epoch": 8.583575503175116, |
| "grad_norm": 0.9058263897895813, |
| "learning_rate": 8.52774485508027e-05, |
| "loss": 3.176, |
| "step": 79750 |
| }, |
| { |
| "epoch": 8.588957055214724, |
| "grad_norm": 0.9133097529411316, |
| "learning_rate": 8.49542075207413e-05, |
| "loss": 3.1968, |
| "step": 79800 |
| }, |
| { |
| "epoch": 8.594338607254333, |
| "grad_norm": 0.8769819140434265, |
| "learning_rate": 8.46374313112811e-05, |
| "loss": 3.1829, |
| "step": 79850 |
| }, |
| { |
| "epoch": 8.599720159293941, |
| "grad_norm": 1.0664840936660767, |
| "learning_rate": 8.431419028121969e-05, |
| "loss": 3.1828, |
| "step": 79900 |
| }, |
| { |
| "epoch": 8.605101711333548, |
| "grad_norm": 0.8446348905563354, |
| "learning_rate": 8.399094925115828e-05, |
| "loss": 3.1563, |
| "step": 79950 |
| }, |
| { |
| "epoch": 8.610483263373157, |
| "grad_norm": 0.8764998912811279, |
| "learning_rate": 8.366770822109685e-05, |
| "loss": 3.1571, |
| "step": 80000 |
| }, |
| { |
| "epoch": 8.610483263373157, |
| "eval_accuracy": 0.39208394174423306, |
| "eval_loss": 3.321202039718628, |
| "eval_runtime": 184.6735, |
| "eval_samples_per_second": 97.529, |
| "eval_steps_per_second": 6.097, |
| "step": 80000 |
| }, |
| { |
| "epoch": 8.615864815412765, |
| "grad_norm": 0.8480128049850464, |
| "learning_rate": 8.334446719103544e-05, |
| "loss": 3.189, |
| "step": 80050 |
| }, |
| { |
| "epoch": 8.621246367452374, |
| "grad_norm": 0.8969175219535828, |
| "learning_rate": 8.302122616097403e-05, |
| "loss": 3.1913, |
| "step": 80100 |
| }, |
| { |
| "epoch": 8.626627919491982, |
| "grad_norm": 0.8639907240867615, |
| "learning_rate": 8.269798513091261e-05, |
| "loss": 3.1644, |
| "step": 80150 |
| }, |
| { |
| "epoch": 8.632009471531589, |
| "grad_norm": 0.8794361352920532, |
| "learning_rate": 8.237474410085119e-05, |
| "loss": 3.1668, |
| "step": 80200 |
| }, |
| { |
| "epoch": 8.637391023571197, |
| "grad_norm": 0.885933518409729, |
| "learning_rate": 8.205150307078978e-05, |
| "loss": 3.1819, |
| "step": 80250 |
| }, |
| { |
| "epoch": 8.642772575610806, |
| "grad_norm": 0.8281596302986145, |
| "learning_rate": 8.172826204072838e-05, |
| "loss": 3.1691, |
| "step": 80300 |
| }, |
| { |
| "epoch": 8.648154127650415, |
| "grad_norm": 0.9188771843910217, |
| "learning_rate": 8.140502101066694e-05, |
| "loss": 3.1883, |
| "step": 80350 |
| }, |
| { |
| "epoch": 8.653535679690023, |
| "grad_norm": 0.9139700531959534, |
| "learning_rate": 8.108177998060553e-05, |
| "loss": 3.1712, |
| "step": 80400 |
| }, |
| { |
| "epoch": 8.658917231729632, |
| "grad_norm": 0.8853664398193359, |
| "learning_rate": 8.075853895054412e-05, |
| "loss": 3.1819, |
| "step": 80450 |
| }, |
| { |
| "epoch": 8.664298783769238, |
| "grad_norm": 0.8718395233154297, |
| "learning_rate": 8.043529792048269e-05, |
| "loss": 3.1864, |
| "step": 80500 |
| }, |
| { |
| "epoch": 8.669680335808847, |
| "grad_norm": 0.8565669655799866, |
| "learning_rate": 8.011205689042128e-05, |
| "loss": 3.1692, |
| "step": 80550 |
| }, |
| { |
| "epoch": 8.675061887848456, |
| "grad_norm": 0.8765687942504883, |
| "learning_rate": 7.978881586035988e-05, |
| "loss": 3.2026, |
| "step": 80600 |
| }, |
| { |
| "epoch": 8.680443439888064, |
| "grad_norm": 0.8425050973892212, |
| "learning_rate": 7.946557483029845e-05, |
| "loss": 3.1826, |
| "step": 80650 |
| }, |
| { |
| "epoch": 8.685824991927673, |
| "grad_norm": 0.8891403079032898, |
| "learning_rate": 7.914233380023704e-05, |
| "loss": 3.1744, |
| "step": 80700 |
| }, |
| { |
| "epoch": 8.69120654396728, |
| "grad_norm": 0.8497437834739685, |
| "learning_rate": 7.881909277017562e-05, |
| "loss": 3.1793, |
| "step": 80750 |
| }, |
| { |
| "epoch": 8.696588096006888, |
| "grad_norm": 0.8760660886764526, |
| "learning_rate": 7.849585174011422e-05, |
| "loss": 3.1883, |
| "step": 80800 |
| }, |
| { |
| "epoch": 8.701969648046497, |
| "grad_norm": 0.9156056642532349, |
| "learning_rate": 7.817261071005278e-05, |
| "loss": 3.1886, |
| "step": 80850 |
| }, |
| { |
| "epoch": 8.707351200086105, |
| "grad_norm": 0.9590579867362976, |
| "learning_rate": 7.784936967999138e-05, |
| "loss": 3.1879, |
| "step": 80900 |
| }, |
| { |
| "epoch": 8.712732752125714, |
| "grad_norm": 0.8654018044471741, |
| "learning_rate": 7.752612864992996e-05, |
| "loss": 3.1743, |
| "step": 80950 |
| }, |
| { |
| "epoch": 8.718114304165322, |
| "grad_norm": 1.0002062320709229, |
| "learning_rate": 7.720288761986854e-05, |
| "loss": 3.2066, |
| "step": 81000 |
| }, |
| { |
| "epoch": 8.718114304165322, |
| "eval_accuracy": 0.39225344024922365, |
| "eval_loss": 3.317610025405884, |
| "eval_runtime": 184.756, |
| "eval_samples_per_second": 97.485, |
| "eval_steps_per_second": 6.095, |
| "step": 81000 |
| }, |
| { |
| "epoch": 8.723495856204929, |
| "grad_norm": 0.8658928871154785, |
| "learning_rate": 7.687964658980712e-05, |
| "loss": 3.1837, |
| "step": 81050 |
| }, |
| { |
| "epoch": 8.728877408244538, |
| "grad_norm": 0.8824866414070129, |
| "learning_rate": 7.655640555974572e-05, |
| "loss": 3.1794, |
| "step": 81100 |
| }, |
| { |
| "epoch": 8.734258960284146, |
| "grad_norm": 0.9138555526733398, |
| "learning_rate": 7.623316452968428e-05, |
| "loss": 3.1763, |
| "step": 81150 |
| }, |
| { |
| "epoch": 8.739640512323755, |
| "grad_norm": 0.8968687653541565, |
| "learning_rate": 7.590992349962288e-05, |
| "loss": 3.1867, |
| "step": 81200 |
| }, |
| { |
| "epoch": 8.745022064363363, |
| "grad_norm": 0.8379364609718323, |
| "learning_rate": 7.558668246956147e-05, |
| "loss": 3.1788, |
| "step": 81250 |
| }, |
| { |
| "epoch": 8.75040361640297, |
| "grad_norm": 0.8813756108283997, |
| "learning_rate": 7.526344143950005e-05, |
| "loss": 3.1934, |
| "step": 81300 |
| }, |
| { |
| "epoch": 8.755785168442578, |
| "grad_norm": 0.8551928400993347, |
| "learning_rate": 7.494020040943862e-05, |
| "loss": 3.1963, |
| "step": 81350 |
| }, |
| { |
| "epoch": 8.761166720482187, |
| "grad_norm": 0.9404810667037964, |
| "learning_rate": 7.461695937937722e-05, |
| "loss": 3.1712, |
| "step": 81400 |
| }, |
| { |
| "epoch": 8.766548272521796, |
| "grad_norm": 0.9553894400596619, |
| "learning_rate": 7.42937183493158e-05, |
| "loss": 3.1956, |
| "step": 81450 |
| }, |
| { |
| "epoch": 8.771929824561404, |
| "grad_norm": 0.8532766103744507, |
| "learning_rate": 7.397047731925439e-05, |
| "loss": 3.1838, |
| "step": 81500 |
| }, |
| { |
| "epoch": 8.777311376601011, |
| "grad_norm": 0.8783697485923767, |
| "learning_rate": 7.364723628919297e-05, |
| "loss": 3.1803, |
| "step": 81550 |
| }, |
| { |
| "epoch": 8.78269292864062, |
| "grad_norm": 0.8748445510864258, |
| "learning_rate": 7.332399525913155e-05, |
| "loss": 3.1718, |
| "step": 81600 |
| }, |
| { |
| "epoch": 8.788074480680228, |
| "grad_norm": 0.8972441554069519, |
| "learning_rate": 7.300075422907013e-05, |
| "loss": 3.1777, |
| "step": 81650 |
| }, |
| { |
| "epoch": 8.793456032719837, |
| "grad_norm": 0.8829938173294067, |
| "learning_rate": 7.267751319900872e-05, |
| "loss": 3.1816, |
| "step": 81700 |
| }, |
| { |
| "epoch": 8.798837584759445, |
| "grad_norm": 0.8357025980949402, |
| "learning_rate": 7.235427216894731e-05, |
| "loss": 3.1809, |
| "step": 81750 |
| }, |
| { |
| "epoch": 8.804219136799054, |
| "grad_norm": 0.8655660152435303, |
| "learning_rate": 7.203103113888589e-05, |
| "loss": 3.1801, |
| "step": 81800 |
| }, |
| { |
| "epoch": 8.80960068883866, |
| "grad_norm": 0.8427842855453491, |
| "learning_rate": 7.17142549294257e-05, |
| "loss": 3.1963, |
| "step": 81850 |
| }, |
| { |
| "epoch": 8.814982240878269, |
| "grad_norm": 0.9213821887969971, |
| "learning_rate": 7.139101389936428e-05, |
| "loss": 3.1815, |
| "step": 81900 |
| }, |
| { |
| "epoch": 8.820363792917878, |
| "grad_norm": 0.8830085396766663, |
| "learning_rate": 7.10742376899041e-05, |
| "loss": 3.1861, |
| "step": 81950 |
| }, |
| { |
| "epoch": 8.825745344957486, |
| "grad_norm": 0.8858305811882019, |
| "learning_rate": 7.075099665984268e-05, |
| "loss": 3.1893, |
| "step": 82000 |
| }, |
| { |
| "epoch": 8.825745344957486, |
| "eval_accuracy": 0.3927499839465358, |
| "eval_loss": 3.313309669494629, |
| "eval_runtime": 184.9366, |
| "eval_samples_per_second": 97.39, |
| "eval_steps_per_second": 6.089, |
| "step": 82000 |
| }, |
| { |
| "epoch": 8.831126896997095, |
| "grad_norm": 0.898823082447052, |
| "learning_rate": 7.042775562978126e-05, |
| "loss": 3.1939, |
| "step": 82050 |
| }, |
| { |
| "epoch": 8.836508449036701, |
| "grad_norm": 0.8878447413444519, |
| "learning_rate": 7.010451459971986e-05, |
| "loss": 3.1949, |
| "step": 82100 |
| }, |
| { |
| "epoch": 8.84189000107631, |
| "grad_norm": 0.853378415107727, |
| "learning_rate": 6.978127356965844e-05, |
| "loss": 3.1804, |
| "step": 82150 |
| }, |
| { |
| "epoch": 8.847271553115919, |
| "grad_norm": 0.8771511316299438, |
| "learning_rate": 6.945803253959702e-05, |
| "loss": 3.1889, |
| "step": 82200 |
| }, |
| { |
| "epoch": 8.852653105155527, |
| "grad_norm": 0.8539415597915649, |
| "learning_rate": 6.91347915095356e-05, |
| "loss": 3.1983, |
| "step": 82250 |
| }, |
| { |
| "epoch": 8.858034657195136, |
| "grad_norm": 0.8989981412887573, |
| "learning_rate": 6.881155047947418e-05, |
| "loss": 3.1973, |
| "step": 82300 |
| }, |
| { |
| "epoch": 8.863416209234742, |
| "grad_norm": 0.8486732840538025, |
| "learning_rate": 6.848830944941278e-05, |
| "loss": 3.1764, |
| "step": 82350 |
| }, |
| { |
| "epoch": 8.868797761274351, |
| "grad_norm": 0.862515389919281, |
| "learning_rate": 6.816506841935136e-05, |
| "loss": 3.1813, |
| "step": 82400 |
| }, |
| { |
| "epoch": 8.87417931331396, |
| "grad_norm": 0.8676425814628601, |
| "learning_rate": 6.784182738928994e-05, |
| "loss": 3.1833, |
| "step": 82450 |
| }, |
| { |
| "epoch": 8.879560865353568, |
| "grad_norm": 0.8382507562637329, |
| "learning_rate": 6.751858635922853e-05, |
| "loss": 3.1977, |
| "step": 82500 |
| }, |
| { |
| "epoch": 8.884942417393177, |
| "grad_norm": 0.9167499542236328, |
| "learning_rate": 6.719534532916711e-05, |
| "loss": 3.2017, |
| "step": 82550 |
| }, |
| { |
| "epoch": 8.890323969432785, |
| "grad_norm": 0.9321362972259521, |
| "learning_rate": 6.68721042991057e-05, |
| "loss": 3.1856, |
| "step": 82600 |
| }, |
| { |
| "epoch": 8.895705521472392, |
| "grad_norm": 0.8884369134902954, |
| "learning_rate": 6.654886326904428e-05, |
| "loss": 3.166, |
| "step": 82650 |
| }, |
| { |
| "epoch": 8.901087073512, |
| "grad_norm": 0.8399180769920349, |
| "learning_rate": 6.622562223898286e-05, |
| "loss": 3.1608, |
| "step": 82700 |
| }, |
| { |
| "epoch": 8.906468625551609, |
| "grad_norm": 0.9197646379470825, |
| "learning_rate": 6.590238120892145e-05, |
| "loss": 3.19, |
| "step": 82750 |
| }, |
| { |
| "epoch": 8.911850177591218, |
| "grad_norm": 0.8913666605949402, |
| "learning_rate": 6.557914017886003e-05, |
| "loss": 3.1812, |
| "step": 82800 |
| }, |
| { |
| "epoch": 8.917231729630826, |
| "grad_norm": 0.9225286841392517, |
| "learning_rate": 6.525589914879861e-05, |
| "loss": 3.1826, |
| "step": 82850 |
| }, |
| { |
| "epoch": 8.922613281670433, |
| "grad_norm": 0.8732419610023499, |
| "learning_rate": 6.49326581187372e-05, |
| "loss": 3.1804, |
| "step": 82900 |
| }, |
| { |
| "epoch": 8.927994833710041, |
| "grad_norm": 0.9099151492118835, |
| "learning_rate": 6.460941708867578e-05, |
| "loss": 3.1793, |
| "step": 82950 |
| }, |
| { |
| "epoch": 8.93337638574965, |
| "grad_norm": 0.8944827318191528, |
| "learning_rate": 6.428617605861437e-05, |
| "loss": 3.197, |
| "step": 83000 |
| }, |
| { |
| "epoch": 8.93337638574965, |
| "eval_accuracy": 0.39288395295721107, |
| "eval_loss": 3.311122417449951, |
| "eval_runtime": 185.1062, |
| "eval_samples_per_second": 97.301, |
| "eval_steps_per_second": 6.083, |
| "step": 83000 |
| }, |
| { |
| "epoch": 8.938757937789259, |
| "grad_norm": 0.9626566171646118, |
| "learning_rate": 6.396293502855295e-05, |
| "loss": 3.1786, |
| "step": 83050 |
| }, |
| { |
| "epoch": 8.944139489828867, |
| "grad_norm": 0.8564572334289551, |
| "learning_rate": 6.363969399849153e-05, |
| "loss": 3.1572, |
| "step": 83100 |
| }, |
| { |
| "epoch": 8.949521041868476, |
| "grad_norm": 0.8902326822280884, |
| "learning_rate": 6.331645296843011e-05, |
| "loss": 3.1862, |
| "step": 83150 |
| }, |
| { |
| "epoch": 8.954902593908082, |
| "grad_norm": 0.8501969575881958, |
| "learning_rate": 6.29932119383687e-05, |
| "loss": 3.1809, |
| "step": 83200 |
| }, |
| { |
| "epoch": 8.960284145947691, |
| "grad_norm": 0.8566688299179077, |
| "learning_rate": 6.266997090830729e-05, |
| "loss": 3.1817, |
| "step": 83250 |
| }, |
| { |
| "epoch": 8.9656656979873, |
| "grad_norm": 0.8682833909988403, |
| "learning_rate": 6.234672987824587e-05, |
| "loss": 3.1946, |
| "step": 83300 |
| }, |
| { |
| "epoch": 8.971047250026908, |
| "grad_norm": 0.9486375451087952, |
| "learning_rate": 6.202348884818447e-05, |
| "loss": 3.1855, |
| "step": 83350 |
| }, |
| { |
| "epoch": 8.976428802066517, |
| "grad_norm": 0.8489530086517334, |
| "learning_rate": 6.170024781812305e-05, |
| "loss": 3.181, |
| "step": 83400 |
| }, |
| { |
| "epoch": 8.981810354106123, |
| "grad_norm": 0.8610159158706665, |
| "learning_rate": 6.137700678806163e-05, |
| "loss": 3.1787, |
| "step": 83450 |
| }, |
| { |
| "epoch": 8.987191906145732, |
| "grad_norm": 0.926662802696228, |
| "learning_rate": 6.105376575800021e-05, |
| "loss": 3.1692, |
| "step": 83500 |
| }, |
| { |
| "epoch": 8.99257345818534, |
| "grad_norm": 0.9315507411956787, |
| "learning_rate": 6.073052472793879e-05, |
| "loss": 3.1682, |
| "step": 83550 |
| }, |
| { |
| "epoch": 8.997955010224949, |
| "grad_norm": 0.9029734134674072, |
| "learning_rate": 6.0407283697877384e-05, |
| "loss": 3.1876, |
| "step": 83600 |
| }, |
| { |
| "epoch": 9.003336562264558, |
| "grad_norm": 0.9033173322677612, |
| "learning_rate": 6.0084042667815966e-05, |
| "loss": 3.1469, |
| "step": 83650 |
| }, |
| { |
| "epoch": 9.008718114304166, |
| "grad_norm": 0.8773213028907776, |
| "learning_rate": 5.976080163775455e-05, |
| "loss": 3.1173, |
| "step": 83700 |
| }, |
| { |
| "epoch": 9.014099666343773, |
| "grad_norm": 0.9068469405174255, |
| "learning_rate": 5.9437560607693135e-05, |
| "loss": 3.1375, |
| "step": 83750 |
| }, |
| { |
| "epoch": 9.019481218383381, |
| "grad_norm": 0.8826611042022705, |
| "learning_rate": 5.9114319577631716e-05, |
| "loss": 3.0867, |
| "step": 83800 |
| }, |
| { |
| "epoch": 9.02486277042299, |
| "grad_norm": 0.8884468674659729, |
| "learning_rate": 5.8791078547570304e-05, |
| "loss": 3.1317, |
| "step": 83850 |
| }, |
| { |
| "epoch": 9.030244322462599, |
| "grad_norm": 0.8935588002204895, |
| "learning_rate": 5.8467837517508885e-05, |
| "loss": 3.1211, |
| "step": 83900 |
| }, |
| { |
| "epoch": 9.035625874502207, |
| "grad_norm": 0.8488547205924988, |
| "learning_rate": 5.8144596487447466e-05, |
| "loss": 3.1253, |
| "step": 83950 |
| }, |
| { |
| "epoch": 9.041007426541814, |
| "grad_norm": 0.9119958877563477, |
| "learning_rate": 5.7821355457386054e-05, |
| "loss": 3.1188, |
| "step": 84000 |
| }, |
| { |
| "epoch": 9.041007426541814, |
| "eval_accuracy": 0.3928714578751124, |
| "eval_loss": 3.3146016597747803, |
| "eval_runtime": 184.5536, |
| "eval_samples_per_second": 97.592, |
| "eval_steps_per_second": 6.101, |
| "step": 84000 |
| }, |
| { |
| "epoch": 9.046388978581422, |
| "grad_norm": 0.8722310662269592, |
| "learning_rate": 5.7498114427324635e-05, |
| "loss": 3.1142, |
| "step": 84050 |
| }, |
| { |
| "epoch": 9.051770530621031, |
| "grad_norm": 0.8786686062812805, |
| "learning_rate": 5.717487339726322e-05, |
| "loss": 3.1196, |
| "step": 84100 |
| }, |
| { |
| "epoch": 9.05715208266064, |
| "grad_norm": 0.873644232749939, |
| "learning_rate": 5.6851632367201804e-05, |
| "loss": 3.1298, |
| "step": 84150 |
| }, |
| { |
| "epoch": 9.062533634700248, |
| "grad_norm": 0.8933513760566711, |
| "learning_rate": 5.6528391337140385e-05, |
| "loss": 3.1089, |
| "step": 84200 |
| }, |
| { |
| "epoch": 9.067915186739857, |
| "grad_norm": 0.8709551095962524, |
| "learning_rate": 5.620515030707897e-05, |
| "loss": 3.106, |
| "step": 84250 |
| }, |
| { |
| "epoch": 9.073296738779463, |
| "grad_norm": 0.8717238306999207, |
| "learning_rate": 5.5881909277017554e-05, |
| "loss": 3.1345, |
| "step": 84300 |
| }, |
| { |
| "epoch": 9.078678290819072, |
| "grad_norm": 0.858155369758606, |
| "learning_rate": 5.555866824695614e-05, |
| "loss": 3.1032, |
| "step": 84350 |
| }, |
| { |
| "epoch": 9.08405984285868, |
| "grad_norm": 0.8939986824989319, |
| "learning_rate": 5.523542721689472e-05, |
| "loss": 3.1245, |
| "step": 84400 |
| }, |
| { |
| "epoch": 9.089441394898289, |
| "grad_norm": 0.9245389103889465, |
| "learning_rate": 5.4912186186833304e-05, |
| "loss": 3.1343, |
| "step": 84450 |
| }, |
| { |
| "epoch": 9.094822946937898, |
| "grad_norm": 0.9213153719902039, |
| "learning_rate": 5.45889451567719e-05, |
| "loss": 3.1191, |
| "step": 84500 |
| }, |
| { |
| "epoch": 9.100204498977504, |
| "grad_norm": 0.8970081210136414, |
| "learning_rate": 5.426570412671048e-05, |
| "loss": 3.1257, |
| "step": 84550 |
| }, |
| { |
| "epoch": 9.105586051017113, |
| "grad_norm": 0.9398916959762573, |
| "learning_rate": 5.394246309664907e-05, |
| "loss": 3.1261, |
| "step": 84600 |
| }, |
| { |
| "epoch": 9.110967603056721, |
| "grad_norm": 0.8600006103515625, |
| "learning_rate": 5.361922206658765e-05, |
| "loss": 3.1337, |
| "step": 84650 |
| }, |
| { |
| "epoch": 9.11634915509633, |
| "grad_norm": 0.926282525062561, |
| "learning_rate": 5.329598103652623e-05, |
| "loss": 3.1305, |
| "step": 84700 |
| }, |
| { |
| "epoch": 9.121730707135939, |
| "grad_norm": 0.8645695447921753, |
| "learning_rate": 5.297274000646482e-05, |
| "loss": 3.1233, |
| "step": 84750 |
| }, |
| { |
| "epoch": 9.127112259175545, |
| "grad_norm": 0.8621381521224976, |
| "learning_rate": 5.26494989764034e-05, |
| "loss": 3.121, |
| "step": 84800 |
| }, |
| { |
| "epoch": 9.132493811215154, |
| "grad_norm": 0.8733987212181091, |
| "learning_rate": 5.232625794634199e-05, |
| "loss": 3.1233, |
| "step": 84850 |
| }, |
| { |
| "epoch": 9.137875363254762, |
| "grad_norm": 0.940946102142334, |
| "learning_rate": 5.200301691628057e-05, |
| "loss": 3.1294, |
| "step": 84900 |
| }, |
| { |
| "epoch": 9.143256915294371, |
| "grad_norm": 0.8964011073112488, |
| "learning_rate": 5.167977588621915e-05, |
| "loss": 3.1282, |
| "step": 84950 |
| }, |
| { |
| "epoch": 9.14863846733398, |
| "grad_norm": 0.9262023568153381, |
| "learning_rate": 5.135653485615774e-05, |
| "loss": 3.1232, |
| "step": 85000 |
| }, |
| { |
| "epoch": 9.14863846733398, |
| "eval_accuracy": 0.3931999155549756, |
| "eval_loss": 3.31473970413208, |
| "eval_runtime": 184.9896, |
| "eval_samples_per_second": 97.362, |
| "eval_steps_per_second": 6.087, |
| "step": 85000 |
| }, |
| { |
| "epoch": 9.154020019373588, |
| "grad_norm": 0.860278844833374, |
| "learning_rate": 5.103329382609632e-05, |
| "loss": 3.1191, |
| "step": 85050 |
| }, |
| { |
| "epoch": 9.159401571413195, |
| "grad_norm": 0.9099411368370056, |
| "learning_rate": 5.0710052796034906e-05, |
| "loss": 3.1229, |
| "step": 85100 |
| }, |
| { |
| "epoch": 9.164783123452803, |
| "grad_norm": 0.8791196942329407, |
| "learning_rate": 5.038681176597349e-05, |
| "loss": 3.0978, |
| "step": 85150 |
| }, |
| { |
| "epoch": 9.170164675492412, |
| "grad_norm": 1.0391086339950562, |
| "learning_rate": 5.006357073591207e-05, |
| "loss": 3.1267, |
| "step": 85200 |
| }, |
| { |
| "epoch": 9.17554622753202, |
| "grad_norm": 0.8792301416397095, |
| "learning_rate": 4.974032970585066e-05, |
| "loss": 3.1279, |
| "step": 85250 |
| }, |
| { |
| "epoch": 9.180927779571629, |
| "grad_norm": 0.8521240949630737, |
| "learning_rate": 4.9417088675789244e-05, |
| "loss": 3.118, |
| "step": 85300 |
| }, |
| { |
| "epoch": 9.186309331611236, |
| "grad_norm": 0.912763237953186, |
| "learning_rate": 4.909384764572783e-05, |
| "loss": 3.121, |
| "step": 85350 |
| }, |
| { |
| "epoch": 9.191690883650844, |
| "grad_norm": 0.8790477514266968, |
| "learning_rate": 4.877060661566641e-05, |
| "loss": 3.129, |
| "step": 85400 |
| }, |
| { |
| "epoch": 9.197072435690453, |
| "grad_norm": 0.8999446630477905, |
| "learning_rate": 4.8447365585604994e-05, |
| "loss": 3.1379, |
| "step": 85450 |
| }, |
| { |
| "epoch": 9.202453987730062, |
| "grad_norm": 0.8669745326042175, |
| "learning_rate": 4.812412455554358e-05, |
| "loss": 3.1175, |
| "step": 85500 |
| }, |
| { |
| "epoch": 9.20783553976967, |
| "grad_norm": 0.9326590895652771, |
| "learning_rate": 4.780088352548216e-05, |
| "loss": 3.1152, |
| "step": 85550 |
| }, |
| { |
| "epoch": 9.213217091809279, |
| "grad_norm": 0.8711249828338623, |
| "learning_rate": 4.747764249542075e-05, |
| "loss": 3.1119, |
| "step": 85600 |
| }, |
| { |
| "epoch": 9.218598643848885, |
| "grad_norm": 0.9170059561729431, |
| "learning_rate": 4.715440146535933e-05, |
| "loss": 3.1528, |
| "step": 85650 |
| }, |
| { |
| "epoch": 9.223980195888494, |
| "grad_norm": 0.8670192360877991, |
| "learning_rate": 4.683116043529791e-05, |
| "loss": 3.1398, |
| "step": 85700 |
| }, |
| { |
| "epoch": 9.229361747928102, |
| "grad_norm": 0.9062800407409668, |
| "learning_rate": 4.65079194052365e-05, |
| "loss": 3.1122, |
| "step": 85750 |
| }, |
| { |
| "epoch": 9.234743299967711, |
| "grad_norm": 0.8952690362930298, |
| "learning_rate": 4.618467837517508e-05, |
| "loss": 3.1347, |
| "step": 85800 |
| }, |
| { |
| "epoch": 9.24012485200732, |
| "grad_norm": 0.9159784317016602, |
| "learning_rate": 4.586143734511367e-05, |
| "loss": 3.1263, |
| "step": 85850 |
| }, |
| { |
| "epoch": 9.245506404046926, |
| "grad_norm": 0.9201834797859192, |
| "learning_rate": 4.553819631505225e-05, |
| "loss": 3.1292, |
| "step": 85900 |
| }, |
| { |
| "epoch": 9.250887956086535, |
| "grad_norm": 0.8721088767051697, |
| "learning_rate": 4.521495528499083e-05, |
| "loss": 3.126, |
| "step": 85950 |
| }, |
| { |
| "epoch": 9.256269508126143, |
| "grad_norm": 0.9054554104804993, |
| "learning_rate": 4.4898179075530645e-05, |
| "loss": 3.1126, |
| "step": 86000 |
| }, |
| { |
| "epoch": 9.256269508126143, |
| "eval_accuracy": 0.3934344971397671, |
| "eval_loss": 3.3119852542877197, |
| "eval_runtime": 184.8448, |
| "eval_samples_per_second": 97.438, |
| "eval_steps_per_second": 6.092, |
| "step": 86000 |
| }, |
| { |
| "epoch": 9.261651060165752, |
| "grad_norm": 0.8648436665534973, |
| "learning_rate": 4.457493804546923e-05, |
| "loss": 3.1399, |
| "step": 86050 |
| }, |
| { |
| "epoch": 9.26703261220536, |
| "grad_norm": 0.8899005651473999, |
| "learning_rate": 4.4251697015407814e-05, |
| "loss": 3.1283, |
| "step": 86100 |
| }, |
| { |
| "epoch": 9.272414164244967, |
| "grad_norm": 0.8872169852256775, |
| "learning_rate": 4.392845598534641e-05, |
| "loss": 3.127, |
| "step": 86150 |
| }, |
| { |
| "epoch": 9.277795716284576, |
| "grad_norm": 0.8724729418754578, |
| "learning_rate": 4.360521495528499e-05, |
| "loss": 3.126, |
| "step": 86200 |
| }, |
| { |
| "epoch": 9.283177268324184, |
| "grad_norm": 0.8950654864311218, |
| "learning_rate": 4.3281973925223564e-05, |
| "loss": 3.1202, |
| "step": 86250 |
| }, |
| { |
| "epoch": 9.288558820363793, |
| "grad_norm": 0.9006926417350769, |
| "learning_rate": 4.295873289516216e-05, |
| "loss": 3.1234, |
| "step": 86300 |
| }, |
| { |
| "epoch": 9.293940372403402, |
| "grad_norm": 0.8863205313682556, |
| "learning_rate": 4.263549186510074e-05, |
| "loss": 3.112, |
| "step": 86350 |
| }, |
| { |
| "epoch": 9.29932192444301, |
| "grad_norm": 0.8906213045120239, |
| "learning_rate": 4.231225083503933e-05, |
| "loss": 3.1287, |
| "step": 86400 |
| }, |
| { |
| "epoch": 9.304703476482617, |
| "grad_norm": 0.8871445655822754, |
| "learning_rate": 4.198900980497791e-05, |
| "loss": 3.1298, |
| "step": 86450 |
| }, |
| { |
| "epoch": 9.310085028522225, |
| "grad_norm": 0.9487957954406738, |
| "learning_rate": 4.166576877491649e-05, |
| "loss": 3.1195, |
| "step": 86500 |
| }, |
| { |
| "epoch": 9.315466580561834, |
| "grad_norm": 0.8825381398200989, |
| "learning_rate": 4.134252774485508e-05, |
| "loss": 3.1436, |
| "step": 86550 |
| }, |
| { |
| "epoch": 9.320848132601443, |
| "grad_norm": 0.8810291886329651, |
| "learning_rate": 4.101928671479366e-05, |
| "loss": 3.1323, |
| "step": 86600 |
| }, |
| { |
| "epoch": 9.326229684641051, |
| "grad_norm": 0.9148359298706055, |
| "learning_rate": 4.069604568473225e-05, |
| "loss": 3.1344, |
| "step": 86650 |
| }, |
| { |
| "epoch": 9.331611236680658, |
| "grad_norm": 0.8992313146591187, |
| "learning_rate": 4.037280465467083e-05, |
| "loss": 3.1286, |
| "step": 86700 |
| }, |
| { |
| "epoch": 9.336992788720266, |
| "grad_norm": 0.9089197516441345, |
| "learning_rate": 4.004956362460941e-05, |
| "loss": 3.1332, |
| "step": 86750 |
| }, |
| { |
| "epoch": 9.342374340759875, |
| "grad_norm": 0.868971586227417, |
| "learning_rate": 3.9726322594548e-05, |
| "loss": 3.1276, |
| "step": 86800 |
| }, |
| { |
| "epoch": 9.347755892799483, |
| "grad_norm": 0.921055257320404, |
| "learning_rate": 3.940308156448658e-05, |
| "loss": 3.1217, |
| "step": 86850 |
| }, |
| { |
| "epoch": 9.353137444839092, |
| "grad_norm": 0.959754228591919, |
| "learning_rate": 3.9079840534425166e-05, |
| "loss": 3.1177, |
| "step": 86900 |
| }, |
| { |
| "epoch": 9.3585189968787, |
| "grad_norm": 0.892400324344635, |
| "learning_rate": 3.875659950436375e-05, |
| "loss": 3.1181, |
| "step": 86950 |
| }, |
| { |
| "epoch": 9.363900548918307, |
| "grad_norm": 0.9151771068572998, |
| "learning_rate": 3.843335847430233e-05, |
| "loss": 3.1101, |
| "step": 87000 |
| }, |
| { |
| "epoch": 9.363900548918307, |
| "eval_accuracy": 0.3936111667353534, |
| "eval_loss": 3.3083672523498535, |
| "eval_runtime": 185.0906, |
| "eval_samples_per_second": 97.309, |
| "eval_steps_per_second": 6.084, |
| "step": 87000 |
| }, |
| { |
| "epoch": 9.369282100957916, |
| "grad_norm": 0.8925448656082153, |
| "learning_rate": 3.811011744424092e-05, |
| "loss": 3.1359, |
| "step": 87050 |
| }, |
| { |
| "epoch": 9.374663652997524, |
| "grad_norm": 0.8919752240180969, |
| "learning_rate": 3.7786876414179504e-05, |
| "loss": 3.1191, |
| "step": 87100 |
| }, |
| { |
| "epoch": 9.380045205037133, |
| "grad_norm": 0.9180679321289062, |
| "learning_rate": 3.7463635384118085e-05, |
| "loss": 3.1496, |
| "step": 87150 |
| }, |
| { |
| "epoch": 9.385426757076742, |
| "grad_norm": 0.8926573991775513, |
| "learning_rate": 3.714039435405667e-05, |
| "loss": 3.1319, |
| "step": 87200 |
| }, |
| { |
| "epoch": 9.390808309116348, |
| "grad_norm": 0.9110729098320007, |
| "learning_rate": 3.6817153323995254e-05, |
| "loss": 3.1211, |
| "step": 87250 |
| }, |
| { |
| "epoch": 9.396189861155957, |
| "grad_norm": 0.9294565320014954, |
| "learning_rate": 3.649391229393384e-05, |
| "loss": 3.1246, |
| "step": 87300 |
| }, |
| { |
| "epoch": 9.401571413195565, |
| "grad_norm": 0.88547283411026, |
| "learning_rate": 3.617067126387242e-05, |
| "loss": 3.137, |
| "step": 87350 |
| }, |
| { |
| "epoch": 9.406952965235174, |
| "grad_norm": 0.8936682343482971, |
| "learning_rate": 3.5847430233811004e-05, |
| "loss": 3.1245, |
| "step": 87400 |
| }, |
| { |
| "epoch": 9.412334517274783, |
| "grad_norm": 0.9498528838157654, |
| "learning_rate": 3.552418920374959e-05, |
| "loss": 3.1319, |
| "step": 87450 |
| }, |
| { |
| "epoch": 9.417716069314391, |
| "grad_norm": 0.949592113494873, |
| "learning_rate": 3.520094817368818e-05, |
| "loss": 3.1445, |
| "step": 87500 |
| }, |
| { |
| "epoch": 9.423097621353998, |
| "grad_norm": 0.8684675097465515, |
| "learning_rate": 3.487770714362676e-05, |
| "loss": 3.1472, |
| "step": 87550 |
| }, |
| { |
| "epoch": 9.428479173393606, |
| "grad_norm": 0.950598418712616, |
| "learning_rate": 3.455446611356535e-05, |
| "loss": 3.1463, |
| "step": 87600 |
| }, |
| { |
| "epoch": 9.433860725433215, |
| "grad_norm": 0.9288617968559265, |
| "learning_rate": 3.423122508350393e-05, |
| "loss": 3.1354, |
| "step": 87650 |
| }, |
| { |
| "epoch": 9.439242277472824, |
| "grad_norm": 0.9261043071746826, |
| "learning_rate": 3.390798405344251e-05, |
| "loss": 3.1414, |
| "step": 87700 |
| }, |
| { |
| "epoch": 9.444623829512432, |
| "grad_norm": 0.8761609196662903, |
| "learning_rate": 3.35847430233811e-05, |
| "loss": 3.1457, |
| "step": 87750 |
| }, |
| { |
| "epoch": 9.450005381552039, |
| "grad_norm": 0.9005210995674133, |
| "learning_rate": 3.326150199331968e-05, |
| "loss": 3.1285, |
| "step": 87800 |
| }, |
| { |
| "epoch": 9.455386933591647, |
| "grad_norm": 0.9317635297775269, |
| "learning_rate": 3.293826096325827e-05, |
| "loss": 3.1293, |
| "step": 87850 |
| }, |
| { |
| "epoch": 9.460768485631256, |
| "grad_norm": 0.886011004447937, |
| "learning_rate": 3.261501993319685e-05, |
| "loss": 3.1344, |
| "step": 87900 |
| }, |
| { |
| "epoch": 9.466150037670864, |
| "grad_norm": 0.9412506818771362, |
| "learning_rate": 3.229177890313544e-05, |
| "loss": 3.136, |
| "step": 87950 |
| }, |
| { |
| "epoch": 9.471531589710473, |
| "grad_norm": 0.9001096487045288, |
| "learning_rate": 3.197500269367525e-05, |
| "loss": 3.1343, |
| "step": 88000 |
| }, |
| { |
| "epoch": 9.471531589710473, |
| "eval_accuracy": 0.39403382646895174, |
| "eval_loss": 3.306755781173706, |
| "eval_runtime": 184.7543, |
| "eval_samples_per_second": 97.486, |
| "eval_steps_per_second": 6.095, |
| "step": 88000 |
| }, |
| { |
| "epoch": 9.476913141750082, |
| "grad_norm": 0.9205313920974731, |
| "learning_rate": 3.165176166361383e-05, |
| "loss": 3.1289, |
| "step": 88050 |
| }, |
| { |
| "epoch": 9.482294693789688, |
| "grad_norm": 0.9257974624633789, |
| "learning_rate": 3.132852063355242e-05, |
| "loss": 3.1247, |
| "step": 88100 |
| }, |
| { |
| "epoch": 9.487676245829297, |
| "grad_norm": 0.9012815952301025, |
| "learning_rate": 3.1005279603491e-05, |
| "loss": 3.1413, |
| "step": 88150 |
| }, |
| { |
| "epoch": 9.493057797868905, |
| "grad_norm": 0.9336722493171692, |
| "learning_rate": 3.068203857342958e-05, |
| "loss": 3.1185, |
| "step": 88200 |
| }, |
| { |
| "epoch": 9.498439349908514, |
| "grad_norm": 0.9232977628707886, |
| "learning_rate": 3.035879754336817e-05, |
| "loss": 3.1124, |
| "step": 88250 |
| }, |
| { |
| "epoch": 9.503820901948123, |
| "grad_norm": 0.8725977540016174, |
| "learning_rate": 3.0035556513306754e-05, |
| "loss": 3.1293, |
| "step": 88300 |
| }, |
| { |
| "epoch": 9.50920245398773, |
| "grad_norm": 0.9048357009887695, |
| "learning_rate": 2.971231548324534e-05, |
| "loss": 3.1483, |
| "step": 88350 |
| }, |
| { |
| "epoch": 9.514584006027338, |
| "grad_norm": 0.9110617637634277, |
| "learning_rate": 2.938907445318392e-05, |
| "loss": 3.1158, |
| "step": 88400 |
| }, |
| { |
| "epoch": 9.519965558066946, |
| "grad_norm": 0.8577834963798523, |
| "learning_rate": 2.9065833423122504e-05, |
| "loss": 3.1259, |
| "step": 88450 |
| }, |
| { |
| "epoch": 9.525347110106555, |
| "grad_norm": 0.8744284510612488, |
| "learning_rate": 2.874259239306109e-05, |
| "loss": 3.1197, |
| "step": 88500 |
| }, |
| { |
| "epoch": 9.530728662146164, |
| "grad_norm": 0.8887792229652405, |
| "learning_rate": 2.8419351362999673e-05, |
| "loss": 3.1283, |
| "step": 88550 |
| }, |
| { |
| "epoch": 9.536110214185772, |
| "grad_norm": 0.9559165835380554, |
| "learning_rate": 2.809611033293826e-05, |
| "loss": 3.1234, |
| "step": 88600 |
| }, |
| { |
| "epoch": 9.541491766225379, |
| "grad_norm": 0.8760658502578735, |
| "learning_rate": 2.7772869302876845e-05, |
| "loss": 3.118, |
| "step": 88650 |
| }, |
| { |
| "epoch": 9.546873318264987, |
| "grad_norm": 0.885046660900116, |
| "learning_rate": 2.7449628272815427e-05, |
| "loss": 3.1529, |
| "step": 88700 |
| }, |
| { |
| "epoch": 9.552254870304596, |
| "grad_norm": 0.8885508179664612, |
| "learning_rate": 2.712638724275401e-05, |
| "loss": 3.1341, |
| "step": 88750 |
| }, |
| { |
| "epoch": 9.557636422344205, |
| "grad_norm": 0.90824294090271, |
| "learning_rate": 2.6803146212692596e-05, |
| "loss": 3.1306, |
| "step": 88800 |
| }, |
| { |
| "epoch": 9.563017974383813, |
| "grad_norm": 0.9239144921302795, |
| "learning_rate": 2.647990518263118e-05, |
| "loss": 3.1522, |
| "step": 88850 |
| }, |
| { |
| "epoch": 9.56839952642342, |
| "grad_norm": 0.927269458770752, |
| "learning_rate": 2.6156664152569765e-05, |
| "loss": 3.1294, |
| "step": 88900 |
| }, |
| { |
| "epoch": 9.573781078463028, |
| "grad_norm": 0.8974427580833435, |
| "learning_rate": 2.5833423122508346e-05, |
| "loss": 3.1244, |
| "step": 88950 |
| }, |
| { |
| "epoch": 9.579162630502637, |
| "grad_norm": 0.844535231590271, |
| "learning_rate": 2.5510182092446934e-05, |
| "loss": 3.1383, |
| "step": 89000 |
| }, |
| { |
| "epoch": 9.579162630502637, |
| "eval_accuracy": 0.39431393361373746, |
| "eval_loss": 3.303724765777588, |
| "eval_runtime": 184.8828, |
| "eval_samples_per_second": 97.418, |
| "eval_steps_per_second": 6.09, |
| "step": 89000 |
| }, |
| { |
| "epoch": 9.584544182542245, |
| "grad_norm": 0.8630052804946899, |
| "learning_rate": 2.5186941062385518e-05, |
| "loss": 3.1265, |
| "step": 89050 |
| }, |
| { |
| "epoch": 9.589925734581854, |
| "grad_norm": 0.8742738366127014, |
| "learning_rate": 2.4863700032324103e-05, |
| "loss": 3.144, |
| "step": 89100 |
| }, |
| { |
| "epoch": 9.59530728662146, |
| "grad_norm": 0.9079467058181763, |
| "learning_rate": 2.4540459002262687e-05, |
| "loss": 3.1231, |
| "step": 89150 |
| }, |
| { |
| "epoch": 9.60068883866107, |
| "grad_norm": 0.8864733576774597, |
| "learning_rate": 2.4217217972201268e-05, |
| "loss": 3.1178, |
| "step": 89200 |
| }, |
| { |
| "epoch": 9.606070390700678, |
| "grad_norm": 0.9138379096984863, |
| "learning_rate": 2.3893976942139853e-05, |
| "loss": 3.1156, |
| "step": 89250 |
| }, |
| { |
| "epoch": 9.611451942740286, |
| "grad_norm": 0.9160833954811096, |
| "learning_rate": 2.3570735912078437e-05, |
| "loss": 3.1396, |
| "step": 89300 |
| }, |
| { |
| "epoch": 9.616833494779895, |
| "grad_norm": 0.9601799249649048, |
| "learning_rate": 2.324749488201702e-05, |
| "loss": 3.1191, |
| "step": 89350 |
| }, |
| { |
| "epoch": 9.622215046819504, |
| "grad_norm": 0.9554757475852966, |
| "learning_rate": 2.292425385195561e-05, |
| "loss": 3.1543, |
| "step": 89400 |
| }, |
| { |
| "epoch": 9.62759659885911, |
| "grad_norm": 0.9189677238464355, |
| "learning_rate": 2.260101282189419e-05, |
| "loss": 3.1164, |
| "step": 89450 |
| }, |
| { |
| "epoch": 9.632978150898719, |
| "grad_norm": 0.888224184513092, |
| "learning_rate": 2.2277771791832775e-05, |
| "loss": 3.1328, |
| "step": 89500 |
| }, |
| { |
| "epoch": 9.638359702938327, |
| "grad_norm": 0.8766180872917175, |
| "learning_rate": 2.195453076177136e-05, |
| "loss": 3.1125, |
| "step": 89550 |
| }, |
| { |
| "epoch": 9.643741254977936, |
| "grad_norm": 0.9146680235862732, |
| "learning_rate": 2.1631289731709944e-05, |
| "loss": 3.121, |
| "step": 89600 |
| }, |
| { |
| "epoch": 9.649122807017545, |
| "grad_norm": 0.9508671760559082, |
| "learning_rate": 2.130804870164853e-05, |
| "loss": 3.1331, |
| "step": 89650 |
| }, |
| { |
| "epoch": 9.654504359057151, |
| "grad_norm": 0.8772718906402588, |
| "learning_rate": 2.098480767158711e-05, |
| "loss": 3.1178, |
| "step": 89700 |
| }, |
| { |
| "epoch": 9.65988591109676, |
| "grad_norm": 0.9173164963722229, |
| "learning_rate": 2.0661566641525694e-05, |
| "loss": 3.1261, |
| "step": 89750 |
| }, |
| { |
| "epoch": 9.665267463136368, |
| "grad_norm": 0.8904097676277161, |
| "learning_rate": 2.0338325611464282e-05, |
| "loss": 3.1378, |
| "step": 89800 |
| }, |
| { |
| "epoch": 9.670649015175977, |
| "grad_norm": 0.9122865200042725, |
| "learning_rate": 2.0015084581402867e-05, |
| "loss": 3.1482, |
| "step": 89850 |
| }, |
| { |
| "epoch": 9.676030567215586, |
| "grad_norm": 0.9031899571418762, |
| "learning_rate": 1.969184355134145e-05, |
| "loss": 3.1317, |
| "step": 89900 |
| }, |
| { |
| "epoch": 9.681412119255192, |
| "grad_norm": 0.9516944885253906, |
| "learning_rate": 1.9368602521280032e-05, |
| "loss": 3.1399, |
| "step": 89950 |
| }, |
| { |
| "epoch": 9.6867936712948, |
| "grad_norm": 0.8837878108024597, |
| "learning_rate": 1.9045361491218617e-05, |
| "loss": 3.1249, |
| "step": 90000 |
| }, |
| { |
| "epoch": 9.6867936712948, |
| "eval_accuracy": 0.39447756486278607, |
| "eval_loss": 3.3018171787261963, |
| "eval_runtime": 185.0594, |
| "eval_samples_per_second": 97.326, |
| "eval_steps_per_second": 6.085, |
| "step": 90000 |
| }, |
| { |
| "epoch": 9.69217522333441, |
| "grad_norm": 0.9284664988517761, |
| "learning_rate": 1.872858528175843e-05, |
| "loss": 3.1494, |
| "step": 90050 |
| }, |
| { |
| "epoch": 9.697556775374018, |
| "grad_norm": 0.9753976464271545, |
| "learning_rate": 1.8405344251697014e-05, |
| "loss": 3.1438, |
| "step": 90100 |
| }, |
| { |
| "epoch": 9.702938327413626, |
| "grad_norm": 0.9227730631828308, |
| "learning_rate": 1.80821032216356e-05, |
| "loss": 3.1289, |
| "step": 90150 |
| }, |
| { |
| "epoch": 9.708319879453235, |
| "grad_norm": 0.8707805275917053, |
| "learning_rate": 1.7758862191574183e-05, |
| "loss": 3.1446, |
| "step": 90200 |
| }, |
| { |
| "epoch": 9.713701431492842, |
| "grad_norm": 0.8503866791725159, |
| "learning_rate": 1.7435621161512768e-05, |
| "loss": 3.1292, |
| "step": 90250 |
| }, |
| { |
| "epoch": 9.71908298353245, |
| "grad_norm": 0.8727654218673706, |
| "learning_rate": 1.711238013145135e-05, |
| "loss": 3.121, |
| "step": 90300 |
| }, |
| { |
| "epoch": 9.724464535572059, |
| "grad_norm": 0.9083965420722961, |
| "learning_rate": 1.6789139101389937e-05, |
| "loss": 3.1278, |
| "step": 90350 |
| }, |
| { |
| "epoch": 9.729846087611667, |
| "grad_norm": 0.865218997001648, |
| "learning_rate": 1.6472362891929747e-05, |
| "loss": 3.1158, |
| "step": 90400 |
| }, |
| { |
| "epoch": 9.735227639651276, |
| "grad_norm": 0.9163516759872437, |
| "learning_rate": 1.614912186186833e-05, |
| "loss": 3.1345, |
| "step": 90450 |
| }, |
| { |
| "epoch": 9.740609191690883, |
| "grad_norm": 0.8571822047233582, |
| "learning_rate": 1.5825880831806916e-05, |
| "loss": 3.1355, |
| "step": 90500 |
| }, |
| { |
| "epoch": 9.745990743730491, |
| "grad_norm": 0.8619289398193359, |
| "learning_rate": 1.55026398017455e-05, |
| "loss": 3.113, |
| "step": 90550 |
| }, |
| { |
| "epoch": 9.7513722957701, |
| "grad_norm": 0.8706497550010681, |
| "learning_rate": 1.5179398771684085e-05, |
| "loss": 3.1324, |
| "step": 90600 |
| }, |
| { |
| "epoch": 9.756753847809708, |
| "grad_norm": 0.9034489989280701, |
| "learning_rate": 1.485615774162267e-05, |
| "loss": 3.1221, |
| "step": 90650 |
| }, |
| { |
| "epoch": 9.762135399849317, |
| "grad_norm": 0.8666117191314697, |
| "learning_rate": 1.4532916711561252e-05, |
| "loss": 3.1205, |
| "step": 90700 |
| }, |
| { |
| "epoch": 9.767516951888926, |
| "grad_norm": 0.9398617148399353, |
| "learning_rate": 1.4209675681499837e-05, |
| "loss": 3.1403, |
| "step": 90750 |
| }, |
| { |
| "epoch": 9.772898503928532, |
| "grad_norm": 0.9415978789329529, |
| "learning_rate": 1.3886434651438423e-05, |
| "loss": 3.1329, |
| "step": 90800 |
| }, |
| { |
| "epoch": 9.77828005596814, |
| "grad_norm": 0.8788616061210632, |
| "learning_rate": 1.3563193621377006e-05, |
| "loss": 3.1208, |
| "step": 90850 |
| }, |
| { |
| "epoch": 9.78366160800775, |
| "grad_norm": 0.8646774888038635, |
| "learning_rate": 1.323995259131559e-05, |
| "loss": 3.1319, |
| "step": 90900 |
| }, |
| { |
| "epoch": 9.789043160047358, |
| "grad_norm": 0.9247779846191406, |
| "learning_rate": 1.2916711561254173e-05, |
| "loss": 3.1319, |
| "step": 90950 |
| }, |
| { |
| "epoch": 9.794424712086967, |
| "grad_norm": 0.9060809016227722, |
| "learning_rate": 1.2593470531192759e-05, |
| "loss": 3.1263, |
| "step": 91000 |
| }, |
| { |
| "epoch": 9.794424712086967, |
| "eval_accuracy": 0.39467879001101847, |
| "eval_loss": 3.299553394317627, |
| "eval_runtime": 184.8969, |
| "eval_samples_per_second": 97.411, |
| "eval_steps_per_second": 6.09, |
| "step": 91000 |
| }, |
| { |
| "epoch": 9.799806264126573, |
| "grad_norm": 0.8818645477294922, |
| "learning_rate": 1.2270229501131344e-05, |
| "loss": 3.1265, |
| "step": 91050 |
| }, |
| { |
| "epoch": 9.805187816166182, |
| "grad_norm": 0.917174220085144, |
| "learning_rate": 1.1946988471069926e-05, |
| "loss": 3.1284, |
| "step": 91100 |
| }, |
| { |
| "epoch": 9.81056936820579, |
| "grad_norm": 0.9325860738754272, |
| "learning_rate": 1.162374744100851e-05, |
| "loss": 3.1433, |
| "step": 91150 |
| }, |
| { |
| "epoch": 9.815950920245399, |
| "grad_norm": 0.9602986574172974, |
| "learning_rate": 1.1300506410947095e-05, |
| "loss": 3.1445, |
| "step": 91200 |
| }, |
| { |
| "epoch": 9.821332472285007, |
| "grad_norm": 0.9245926737785339, |
| "learning_rate": 1.097726538088568e-05, |
| "loss": 3.1266, |
| "step": 91250 |
| }, |
| { |
| "epoch": 9.826714024324616, |
| "grad_norm": 0.9178900718688965, |
| "learning_rate": 1.0654024350824264e-05, |
| "loss": 3.1296, |
| "step": 91300 |
| }, |
| { |
| "epoch": 9.832095576364223, |
| "grad_norm": 0.8924757838249207, |
| "learning_rate": 1.0330783320762847e-05, |
| "loss": 3.1316, |
| "step": 91350 |
| }, |
| { |
| "epoch": 9.837477128403831, |
| "grad_norm": 0.8745411038398743, |
| "learning_rate": 1.0007542290701433e-05, |
| "loss": 3.1478, |
| "step": 91400 |
| }, |
| { |
| "epoch": 9.84285868044344, |
| "grad_norm": 0.9549648761749268, |
| "learning_rate": 9.684301260640016e-06, |
| "loss": 3.1255, |
| "step": 91450 |
| }, |
| { |
| "epoch": 9.848240232483048, |
| "grad_norm": 0.8879762291908264, |
| "learning_rate": 9.3610602305786e-06, |
| "loss": 3.1295, |
| "step": 91500 |
| }, |
| { |
| "epoch": 9.853621784522657, |
| "grad_norm": 0.8747987747192383, |
| "learning_rate": 9.037819200517185e-06, |
| "loss": 3.1141, |
| "step": 91550 |
| }, |
| { |
| "epoch": 9.859003336562264, |
| "grad_norm": 0.9374912977218628, |
| "learning_rate": 8.71457817045577e-06, |
| "loss": 3.133, |
| "step": 91600 |
| }, |
| { |
| "epoch": 9.864384888601872, |
| "grad_norm": 0.8693592548370361, |
| "learning_rate": 8.391337140394354e-06, |
| "loss": 3.1176, |
| "step": 91650 |
| }, |
| { |
| "epoch": 9.869766440641481, |
| "grad_norm": 0.9023551344871521, |
| "learning_rate": 8.068096110332939e-06, |
| "loss": 3.116, |
| "step": 91700 |
| }, |
| { |
| "epoch": 9.87514799268109, |
| "grad_norm": 0.8818048238754272, |
| "learning_rate": 7.744855080271521e-06, |
| "loss": 3.134, |
| "step": 91750 |
| }, |
| { |
| "epoch": 9.880529544720698, |
| "grad_norm": 0.9004733562469482, |
| "learning_rate": 7.421614050210106e-06, |
| "loss": 3.1342, |
| "step": 91800 |
| }, |
| { |
| "epoch": 9.885911096760307, |
| "grad_norm": 0.9453480243682861, |
| "learning_rate": 7.09837302014869e-06, |
| "loss": 3.1202, |
| "step": 91850 |
| }, |
| { |
| "epoch": 9.891292648799913, |
| "grad_norm": 0.8884382843971252, |
| "learning_rate": 6.775131990087275e-06, |
| "loss": 3.1268, |
| "step": 91900 |
| }, |
| { |
| "epoch": 9.896674200839522, |
| "grad_norm": 0.9106020927429199, |
| "learning_rate": 6.451890960025859e-06, |
| "loss": 3.1243, |
| "step": 91950 |
| }, |
| { |
| "epoch": 9.90205575287913, |
| "grad_norm": 0.9381679892539978, |
| "learning_rate": 6.128649929964443e-06, |
| "loss": 3.1164, |
| "step": 92000 |
| }, |
| { |
| "epoch": 9.90205575287913, |
| "eval_accuracy": 0.39488544780364165, |
| "eval_loss": 3.2985033988952637, |
| "eval_runtime": 184.9441, |
| "eval_samples_per_second": 97.386, |
| "eval_steps_per_second": 6.088, |
| "step": 92000 |
| }, |
| { |
| "epoch": 9.907437304918739, |
| "grad_norm": 0.9050652384757996, |
| "learning_rate": 5.805408899903027e-06, |
| "loss": 3.1365, |
| "step": 92050 |
| }, |
| { |
| "epoch": 9.912818856958348, |
| "grad_norm": 0.924476146697998, |
| "learning_rate": 5.482167869841611e-06, |
| "loss": 3.1275, |
| "step": 92100 |
| }, |
| { |
| "epoch": 9.918200408997954, |
| "grad_norm": 0.8838419914245605, |
| "learning_rate": 5.1589268397801965e-06, |
| "loss": 3.1385, |
| "step": 92150 |
| }, |
| { |
| "epoch": 9.923581961037563, |
| "grad_norm": 0.8708315491676331, |
| "learning_rate": 4.83568580971878e-06, |
| "loss": 3.1325, |
| "step": 92200 |
| }, |
| { |
| "epoch": 9.928963513077171, |
| "grad_norm": 0.9714722633361816, |
| "learning_rate": 4.512444779657364e-06, |
| "loss": 3.1291, |
| "step": 92250 |
| }, |
| { |
| "epoch": 9.93434506511678, |
| "grad_norm": 0.8961527943611145, |
| "learning_rate": 4.189203749595948e-06, |
| "loss": 3.1339, |
| "step": 92300 |
| }, |
| { |
| "epoch": 9.939726617156388, |
| "grad_norm": 0.9741976261138916, |
| "learning_rate": 3.865962719534533e-06, |
| "loss": 3.1428, |
| "step": 92350 |
| }, |
| { |
| "epoch": 9.945108169195997, |
| "grad_norm": 0.8602545261383057, |
| "learning_rate": 3.542721689473117e-06, |
| "loss": 3.1453, |
| "step": 92400 |
| }, |
| { |
| "epoch": 9.950489721235604, |
| "grad_norm": 0.8766176700592041, |
| "learning_rate": 3.2194806594117013e-06, |
| "loss": 3.1286, |
| "step": 92450 |
| }, |
| { |
| "epoch": 9.955871273275212, |
| "grad_norm": 0.9090676307678223, |
| "learning_rate": 2.8962396293502854e-06, |
| "loss": 3.1159, |
| "step": 92500 |
| }, |
| { |
| "epoch": 9.961252825314821, |
| "grad_norm": 0.8845322132110596, |
| "learning_rate": 2.5729985992888694e-06, |
| "loss": 3.1353, |
| "step": 92550 |
| }, |
| { |
| "epoch": 9.96663437735443, |
| "grad_norm": 0.924277663230896, |
| "learning_rate": 2.249757569227454e-06, |
| "loss": 3.1229, |
| "step": 92600 |
| }, |
| { |
| "epoch": 9.972015929394038, |
| "grad_norm": 0.8867099285125732, |
| "learning_rate": 1.926516539166038e-06, |
| "loss": 3.1248, |
| "step": 92650 |
| }, |
| { |
| "epoch": 9.977397481433645, |
| "grad_norm": 0.9389845132827759, |
| "learning_rate": 1.603275509104622e-06, |
| "loss": 3.1412, |
| "step": 92700 |
| }, |
| { |
| "epoch": 9.982779033473253, |
| "grad_norm": 0.9016231894493103, |
| "learning_rate": 1.2800344790432064e-06, |
| "loss": 3.1243, |
| "step": 92750 |
| }, |
| { |
| "epoch": 9.988160585512862, |
| "grad_norm": 0.8891977071762085, |
| "learning_rate": 9.567934489817906e-07, |
| "loss": 3.112, |
| "step": 92800 |
| }, |
| { |
| "epoch": 9.99354213755247, |
| "grad_norm": 0.896614134311676, |
| "learning_rate": 6.335524189203748e-07, |
| "loss": 3.126, |
| "step": 92850 |
| }, |
| { |
| "epoch": 9.998923689592079, |
| "grad_norm": 0.9039609432220459, |
| "learning_rate": 3.103113888589591e-07, |
| "loss": 3.1234, |
| "step": 92900 |
| }, |
| { |
| "epoch": 10.0, |
| "step": 92910, |
| "total_flos": 7.76821211136e+17, |
| "train_loss": 3.4538463850168144, |
| "train_runtime": 79850.848, |
| "train_samples_per_second": 37.232, |
| "train_steps_per_second": 1.164 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 92910, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 10000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 7.76821211136e+17, |
| "train_batch_size": 32, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|