Invalid JSON: Unexpected token 'I', ..."ad_norm": Infinity,
"... is not valid JSON
| { | |
| "best_metric": 3.304586172103882, | |
| "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_634/checkpoint-90000", | |
| "epoch": 10.0, | |
| "eval_steps": 1000, | |
| "global_step": 92910, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.005381552039608223, | |
| "grad_norm": 2.7091472148895264, | |
| "learning_rate": 0.0003, | |
| "loss": 8.4865, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.010763104079216447, | |
| "grad_norm": 1.967639446258545, | |
| "learning_rate": 0.0006, | |
| "loss": 6.9044, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.01614465611882467, | |
| "grad_norm": 2.5172040462493896, | |
| "learning_rate": 0.0005996767589699385, | |
| "loss": 6.4803, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.021526208158432893, | |
| "grad_norm": 1.3199275732040405, | |
| "learning_rate": 0.0005993535179398771, | |
| "loss": 6.2354, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.026907760198041114, | |
| "grad_norm": 1.2347278594970703, | |
| "learning_rate": 0.0005990302769098158, | |
| "loss": 6.0754, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.03228931223764934, | |
| "grad_norm": 1.4885425567626953, | |
| "learning_rate": 0.0005987070358797543, | |
| "loss": 5.9329, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.03767086427725756, | |
| "grad_norm": 1.6232192516326904, | |
| "learning_rate": 0.0005983837948496929, | |
| "loss": 5.8614, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.04305241631686579, | |
| "grad_norm": 1.598788857460022, | |
| "learning_rate": 0.0005980605538196314, | |
| "loss": 5.7913, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.048433968356474004, | |
| "grad_norm": 2.481018543243408, | |
| "learning_rate": 0.0005977373127895701, | |
| "loss": 5.7337, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.05381552039608223, | |
| "grad_norm": 1.4101097583770752, | |
| "learning_rate": 0.0005974140717595086, | |
| "loss": 5.6182, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.05919707243569045, | |
| "grad_norm": 1.6082617044448853, | |
| "learning_rate": 0.0005970908307294472, | |
| "loss": 5.5589, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.06457862447529868, | |
| "grad_norm": 0.9049023389816284, | |
| "learning_rate": 0.0005967675896993858, | |
| "loss": 5.5049, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.0699601765149069, | |
| "grad_norm": 1.3293204307556152, | |
| "learning_rate": 0.0005964443486693243, | |
| "loss": 5.4266, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.07534172855451512, | |
| "grad_norm": 1.7935289144515991, | |
| "learning_rate": 0.000596121107639263, | |
| "loss": 5.3506, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.08072328059412334, | |
| "grad_norm": 1.6690304279327393, | |
| "learning_rate": 0.0005957978666092015, | |
| "loss": 5.3179, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.08610483263373157, | |
| "grad_norm": 1.4270386695861816, | |
| "learning_rate": 0.0005954746255791401, | |
| "loss": 5.2698, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.09148638467333979, | |
| "grad_norm": 1.5141105651855469, | |
| "learning_rate": 0.0005951513845490787, | |
| "loss": 5.1827, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.09686793671294801, | |
| "grad_norm": 1.6831254959106445, | |
| "learning_rate": 0.0005948281435190174, | |
| "loss": 5.1898, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.10224948875255624, | |
| "grad_norm": 1.1933035850524902, | |
| "learning_rate": 0.0005945049024889559, | |
| "loss": 5.1376, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.10763104079216446, | |
| "grad_norm": 1.2600319385528564, | |
| "learning_rate": 0.0005941816614588944, | |
| "loss": 5.0756, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.10763104079216446, | |
| "eval_accuracy": 0.22684951933592223, | |
| "eval_loss": 5.017084121704102, | |
| "eval_runtime": 185.5208, | |
| "eval_samples_per_second": 97.083, | |
| "eval_steps_per_second": 6.069, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.11301259283177269, | |
| "grad_norm": 1.1238996982574463, | |
| "learning_rate": 0.000593858420428833, | |
| "loss": 5.0471, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.1183941448713809, | |
| "grad_norm": 1.0484765768051147, | |
| "learning_rate": 0.0005935351793987716, | |
| "loss": 5.0295, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.12377569691098914, | |
| "grad_norm": 1.4683005809783936, | |
| "learning_rate": 0.0005932119383687103, | |
| "loss": 4.9958, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.12915724895059735, | |
| "grad_norm": 1.0093375444412231, | |
| "learning_rate": 0.0005928886973386488, | |
| "loss": 4.9523, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.13453880099020557, | |
| "grad_norm": 1.2035621404647827, | |
| "learning_rate": 0.0005925654563085874, | |
| "loss": 4.8916, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.1399203530298138, | |
| "grad_norm": 1.0326842069625854, | |
| "learning_rate": 0.000592242215278526, | |
| "loss": 4.8852, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.14530190506942203, | |
| "grad_norm": 0.9776148200035095, | |
| "learning_rate": 0.0005919189742484645, | |
| "loss": 4.8661, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.15068345710903025, | |
| "grad_norm": 1.0485109090805054, | |
| "learning_rate": 0.0005915957332184032, | |
| "loss": 4.8206, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.15606500914863847, | |
| "grad_norm": 0.931476891040802, | |
| "learning_rate": 0.0005912724921883417, | |
| "loss": 4.8047, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.16144656118824668, | |
| "grad_norm": 1.7174251079559326, | |
| "learning_rate": 0.0005909492511582803, | |
| "loss": 4.8148, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.1668281132278549, | |
| "grad_norm": 0.9541407823562622, | |
| "learning_rate": 0.0005906260101282189, | |
| "loss": 4.7619, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.17220966526746315, | |
| "grad_norm": 0.8106948733329773, | |
| "learning_rate": 0.0005903027690981575, | |
| "loss": 4.7593, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.17759121730707136, | |
| "grad_norm": 0.8909491896629333, | |
| "learning_rate": 0.000589979528068096, | |
| "loss": 4.7116, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.18297276934667958, | |
| "grad_norm": 0.911844789981842, | |
| "learning_rate": 0.0005896562870380347, | |
| "loss": 4.6914, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.1883543213862878, | |
| "grad_norm": 1.3206818103790283, | |
| "learning_rate": 0.0005893330460079732, | |
| "loss": 4.6934, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.19373587342589602, | |
| "grad_norm": 0.8717551231384277, | |
| "learning_rate": 0.0005890098049779118, | |
| "loss": 4.6662, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.19911742546550426, | |
| "grad_norm": 0.8639872670173645, | |
| "learning_rate": 0.0005886865639478504, | |
| "loss": 4.6283, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.20449897750511248, | |
| "grad_norm": 1.275295615196228, | |
| "learning_rate": 0.0005883633229177889, | |
| "loss": 4.6063, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.2098805295447207, | |
| "grad_norm": 0.9782142043113708, | |
| "learning_rate": 0.0005880400818877276, | |
| "loss": 4.6013, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.2152620815843289, | |
| "grad_norm": 0.7876592874526978, | |
| "learning_rate": 0.0005877168408576662, | |
| "loss": 4.5802, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.2152620815843289, | |
| "eval_accuracy": 0.27130006765815323, | |
| "eval_loss": 4.498872756958008, | |
| "eval_runtime": 185.1447, | |
| "eval_samples_per_second": 97.281, | |
| "eval_steps_per_second": 6.082, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.22064363362393713, | |
| "grad_norm": 0.9489641785621643, | |
| "learning_rate": 0.0005873935998276048, | |
| "loss": 4.576, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.22602518566354537, | |
| "grad_norm": 0.8717491030693054, | |
| "learning_rate": 0.0005870703587975433, | |
| "loss": 4.5329, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.2314067377031536, | |
| "grad_norm": 0.6650089025497437, | |
| "learning_rate": 0.0005867471177674818, | |
| "loss": 4.5276, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.2367882897427618, | |
| "grad_norm": 0.9660583138465881, | |
| "learning_rate": 0.0005864238767374205, | |
| "loss": 4.5157, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.24216984178237003, | |
| "grad_norm": 0.8200618028640747, | |
| "learning_rate": 0.0005861006357073591, | |
| "loss": 4.4902, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.24755139382197827, | |
| "grad_norm": 0.832756519317627, | |
| "learning_rate": 0.0005857773946772977, | |
| "loss": 4.4648, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.2529329458615865, | |
| "grad_norm": 1.736657977104187, | |
| "learning_rate": 0.0005854541536472362, | |
| "loss": 4.451, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.2583144979011947, | |
| "grad_norm": 0.8690645694732666, | |
| "learning_rate": 0.0005851309126171749, | |
| "loss": 4.442, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.2636960499408029, | |
| "grad_norm": 0.9058188199996948, | |
| "learning_rate": 0.0005848076715871134, | |
| "loss": 4.4145, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.26907760198041114, | |
| "grad_norm": 0.7224371433258057, | |
| "learning_rate": 0.000584484430557052, | |
| "loss": 4.4354, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.27445915402001936, | |
| "grad_norm": 0.9763414263725281, | |
| "learning_rate": 0.0005841611895269906, | |
| "loss": 4.3947, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.2798407060596276, | |
| "grad_norm": 0.8446633219718933, | |
| "learning_rate": 0.0005838379484969291, | |
| "loss": 4.3857, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.2852222580992358, | |
| "grad_norm": 0.7831347584724426, | |
| "learning_rate": 0.0005835147074668678, | |
| "loss": 4.3803, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.29060381013884407, | |
| "grad_norm": 0.9835968613624573, | |
| "learning_rate": 0.0005831914664368063, | |
| "loss": 4.3489, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.2959853621784523, | |
| "grad_norm": 0.7985646724700928, | |
| "learning_rate": 0.0005828682254067449, | |
| "loss": 4.3591, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.3013669142180605, | |
| "grad_norm": 0.760099470615387, | |
| "learning_rate": 0.0005825449843766835, | |
| "loss": 4.3337, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.3067484662576687, | |
| "grad_norm": 0.7309588193893433, | |
| "learning_rate": 0.0005822217433466221, | |
| "loss": 4.3136, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.31213001829727693, | |
| "grad_norm": 0.749220609664917, | |
| "learning_rate": 0.0005818985023165607, | |
| "loss": 4.3327, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.31751157033688515, | |
| "grad_norm": 0.7987422943115234, | |
| "learning_rate": 0.0005815752612864992, | |
| "loss": 4.3193, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.32289312237649337, | |
| "grad_norm": 0.9373783469200134, | |
| "learning_rate": 0.0005812520202564378, | |
| "loss": 4.2854, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.32289312237649337, | |
| "eval_accuracy": 0.2990188318271689, | |
| "eval_loss": 4.231717109680176, | |
| "eval_runtime": 187.2992, | |
| "eval_samples_per_second": 96.162, | |
| "eval_steps_per_second": 6.012, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.3282746744161016, | |
| "grad_norm": 0.967170774936676, | |
| "learning_rate": 0.0005809287792263764, | |
| "loss": 4.286, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.3336562264557098, | |
| "grad_norm": 0.8400319218635559, | |
| "learning_rate": 0.0005806055381963151, | |
| "loss": 4.2771, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.3390377784953181, | |
| "grad_norm": 0.7334919571876526, | |
| "learning_rate": 0.0005802822971662536, | |
| "loss": 4.2781, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.3444193305349263, | |
| "grad_norm": 0.744314432144165, | |
| "learning_rate": 0.0005799590561361922, | |
| "loss": 4.279, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.3498008825745345, | |
| "grad_norm": 0.8924934267997742, | |
| "learning_rate": 0.0005796358151061307, | |
| "loss": 4.2584, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.35518243461414273, | |
| "grad_norm": 0.6779358983039856, | |
| "learning_rate": 0.0005793125740760694, | |
| "loss": 4.2424, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.36056398665375095, | |
| "grad_norm": 0.6709272861480713, | |
| "learning_rate": 0.0005789893330460079, | |
| "loss": 4.233, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.36594553869335916, | |
| "grad_norm": 0.7489773035049438, | |
| "learning_rate": 0.0005786660920159465, | |
| "loss": 4.2351, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.3713270907329674, | |
| "grad_norm": 0.8585236668586731, | |
| "learning_rate": 0.0005783428509858851, | |
| "loss": 4.2377, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.3767086427725756, | |
| "grad_norm": 0.7472572922706604, | |
| "learning_rate": 0.0005780196099558237, | |
| "loss": 4.2177, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.3820901948121838, | |
| "grad_norm": 0.6559500694274902, | |
| "learning_rate": 0.0005776963689257623, | |
| "loss": 4.2103, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.38747174685179203, | |
| "grad_norm": 0.8734597563743591, | |
| "learning_rate": 0.0005773731278957008, | |
| "loss": 4.2007, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.3928532988914003, | |
| "grad_norm": 0.8117654323577881, | |
| "learning_rate": 0.0005770498868656394, | |
| "loss": 4.194, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.3982348509310085, | |
| "grad_norm": 0.8064318895339966, | |
| "learning_rate": 0.000576726645835578, | |
| "loss": 4.1933, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.40361640297061674, | |
| "grad_norm": 0.6744760274887085, | |
| "learning_rate": 0.0005764034048055167, | |
| "loss": 4.1869, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.40899795501022496, | |
| "grad_norm": 0.7638756036758423, | |
| "learning_rate": 0.0005760801637754552, | |
| "loss": 4.1967, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.4143795070498332, | |
| "grad_norm": 0.8815849423408508, | |
| "learning_rate": 0.0005757569227453937, | |
| "loss": 4.162, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.4197610590894414, | |
| "grad_norm": 0.7192302346229553, | |
| "learning_rate": 0.0005754336817153324, | |
| "loss": 4.1899, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.4251426111290496, | |
| "grad_norm": 0.5958099365234375, | |
| "learning_rate": 0.0005751104406852709, | |
| "loss": 4.1567, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.4305241631686578, | |
| "grad_norm": 0.7568697333335876, | |
| "learning_rate": 0.0005747871996552096, | |
| "loss": 4.167, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.4305241631686578, | |
| "eval_accuracy": 0.31295736754044956, | |
| "eval_loss": 4.088932514190674, | |
| "eval_runtime": 185.3099, | |
| "eval_samples_per_second": 97.194, | |
| "eval_steps_per_second": 6.076, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.43590571520826604, | |
| "grad_norm": 0.8409259915351868, | |
| "learning_rate": 0.0005744639586251481, | |
| "loss": 4.1443, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.44128726724787426, | |
| "grad_norm": 0.630840003490448, | |
| "learning_rate": 0.0005741407175950867, | |
| "loss": 4.1604, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.44666881928748253, | |
| "grad_norm": 0.5622649788856506, | |
| "learning_rate": 0.0005738174765650253, | |
| "loss": 4.1458, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.45205037132709075, | |
| "grad_norm": 0.681736171245575, | |
| "learning_rate": 0.0005734942355349638, | |
| "loss": 4.1354, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.45743192336669897, | |
| "grad_norm": 0.7544794082641602, | |
| "learning_rate": 0.0005731709945049025, | |
| "loss": 4.1296, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.4628134754063072, | |
| "grad_norm": 0.7159223556518555, | |
| "learning_rate": 0.000572847753474841, | |
| "loss": 4.1233, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.4681950274459154, | |
| "grad_norm": 0.7304403781890869, | |
| "learning_rate": 0.0005725245124447796, | |
| "loss": 4.1195, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.4735765794855236, | |
| "grad_norm": 0.6582966446876526, | |
| "learning_rate": 0.0005722012714147182, | |
| "loss": 4.122, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.47895813152513184, | |
| "grad_norm": 0.6851973533630371, | |
| "learning_rate": 0.0005718780303846568, | |
| "loss": 4.1075, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.48433968356474005, | |
| "grad_norm": 0.629666805267334, | |
| "learning_rate": 0.0005715547893545953, | |
| "loss": 4.1247, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.48972123560434827, | |
| "grad_norm": 0.6585856080055237, | |
| "learning_rate": 0.000571231548324534, | |
| "loss": 4.089, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.49510278764395654, | |
| "grad_norm": 0.7275299429893494, | |
| "learning_rate": 0.0005709083072944725, | |
| "loss": 4.102, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.5004843396835648, | |
| "grad_norm": 0.7020848393440247, | |
| "learning_rate": 0.0005705850662644111, | |
| "loss": 4.0871, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.505865891723173, | |
| "grad_norm": 0.7129533290863037, | |
| "learning_rate": 0.0005702618252343497, | |
| "loss": 4.0816, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.5112474437627812, | |
| "grad_norm": 0.7143609523773193, | |
| "learning_rate": 0.0005699385842042882, | |
| "loss": 4.0851, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.5166289958023894, | |
| "grad_norm": 0.5908857583999634, | |
| "learning_rate": 0.0005696153431742269, | |
| "loss": 4.0737, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.5220105478419976, | |
| "grad_norm": 0.6978116035461426, | |
| "learning_rate": 0.0005692921021441655, | |
| "loss": 4.0704, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.5273920998816058, | |
| "grad_norm": 0.5921167731285095, | |
| "learning_rate": 0.0005689688611141041, | |
| "loss": 4.0677, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.5327736519212141, | |
| "grad_norm": 0.6112943887710571, | |
| "learning_rate": 0.0005686456200840426, | |
| "loss": 4.0591, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.5381552039608223, | |
| "grad_norm": 0.7539244890213013, | |
| "learning_rate": 0.0005683223790539811, | |
| "loss": 4.0613, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.5381552039608223, | |
| "eval_accuracy": 0.32120564286599806, | |
| "eval_loss": 3.9930038452148438, | |
| "eval_runtime": 185.4107, | |
| "eval_samples_per_second": 97.141, | |
| "eval_steps_per_second": 6.073, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.5435367560004305, | |
| "grad_norm": 0.6740825176239014, | |
| "learning_rate": 0.0005679991380239198, | |
| "loss": 4.047, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.5489183080400387, | |
| "grad_norm": 0.755363941192627, | |
| "learning_rate": 0.0005676758969938584, | |
| "loss": 4.0381, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.5542998600796469, | |
| "grad_norm": 0.5839517116546631, | |
| "learning_rate": 0.000567352655963797, | |
| "loss": 4.0421, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.5596814121192552, | |
| "grad_norm": 0.7238830924034119, | |
| "learning_rate": 0.0005670294149337355, | |
| "loss": 4.0415, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.5650629641588634, | |
| "grad_norm": 0.6979734301567078, | |
| "learning_rate": 0.0005667061739036742, | |
| "loss": 4.0334, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.5704445161984716, | |
| "grad_norm": 0.8152750134468079, | |
| "learning_rate": 0.0005663829328736127, | |
| "loss": 4.0427, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.5758260682380799, | |
| "grad_norm": 0.6354012489318848, | |
| "learning_rate": 0.0005660596918435512, | |
| "loss": 4.0428, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.5812076202776881, | |
| "grad_norm": 0.7258121371269226, | |
| "learning_rate": 0.0005657364508134899, | |
| "loss": 4.0439, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.5865891723172963, | |
| "grad_norm": 0.5233718752861023, | |
| "learning_rate": 0.0005654132097834284, | |
| "loss": 4.0143, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.5919707243569046, | |
| "grad_norm": 0.578235924243927, | |
| "learning_rate": 0.0005650899687533671, | |
| "loss": 4.0224, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.5973522763965128, | |
| "grad_norm": 0.5696157217025757, | |
| "learning_rate": 0.0005647667277233056, | |
| "loss": 4.012, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.602733828436121, | |
| "grad_norm": 0.7277674674987793, | |
| "learning_rate": 0.0005644434866932442, | |
| "loss": 4.0144, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.6081153804757292, | |
| "grad_norm": 0.760840117931366, | |
| "learning_rate": 0.0005641202456631828, | |
| "loss": 4.0055, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.6134969325153374, | |
| "grad_norm": 0.6378624439239502, | |
| "learning_rate": 0.0005637970046331214, | |
| "loss": 4.0246, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.6188784845549457, | |
| "grad_norm": 0.6083015203475952, | |
| "learning_rate": 0.00056347376360306, | |
| "loss": 4.0077, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.6242600365945539, | |
| "grad_norm": 0.6895670890808105, | |
| "learning_rate": 0.0005631505225729985, | |
| "loss": 3.9889, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.6296415886341621, | |
| "grad_norm": 0.7159281969070435, | |
| "learning_rate": 0.0005628272815429371, | |
| "loss": 3.9946, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.6350231406737703, | |
| "grad_norm": 0.7747655510902405, | |
| "learning_rate": 0.0005625040405128757, | |
| "loss": 3.9819, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.6404046927133785, | |
| "grad_norm": 0.6201274991035461, | |
| "learning_rate": 0.0005621807994828143, | |
| "loss": 3.9892, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.6457862447529867, | |
| "grad_norm": 0.5920886397361755, | |
| "learning_rate": 0.0005618575584527529, | |
| "loss": 3.9942, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.6457862447529867, | |
| "eval_accuracy": 0.3285595962719456, | |
| "eval_loss": 3.9178080558776855, | |
| "eval_runtime": 185.1703, | |
| "eval_samples_per_second": 97.267, | |
| "eval_steps_per_second": 6.081, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.651167796792595, | |
| "grad_norm": 0.6048468351364136, | |
| "learning_rate": 0.0005615343174226915, | |
| "loss": 3.9991, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.6565493488322032, | |
| "grad_norm": 0.6038753986358643, | |
| "learning_rate": 0.00056121107639263, | |
| "loss": 3.971, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.6619309008718114, | |
| "grad_norm": 0.6248626708984375, | |
| "learning_rate": 0.0005608878353625687, | |
| "loss": 3.9687, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.6673124529114196, | |
| "grad_norm": 0.7638702988624573, | |
| "learning_rate": 0.0005605645943325072, | |
| "loss": 3.9686, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.6726940049510278, | |
| "grad_norm": 0.5385346412658691, | |
| "learning_rate": 0.0005602413533024458, | |
| "loss": 3.9689, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.6780755569906362, | |
| "grad_norm": 0.6341848373413086, | |
| "learning_rate": 0.0005599181122723844, | |
| "loss": 3.9662, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.6834571090302444, | |
| "grad_norm": 0.5963843464851379, | |
| "learning_rate": 0.000559594871242323, | |
| "loss": 3.9602, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 0.6888386610698526, | |
| "grad_norm": 0.5414633750915527, | |
| "learning_rate": 0.0005592716302122616, | |
| "loss": 3.9664, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.6942202131094608, | |
| "grad_norm": 0.581794023513794, | |
| "learning_rate": 0.0005589483891822001, | |
| "loss": 3.9706, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 0.699601765149069, | |
| "grad_norm": 0.588293731212616, | |
| "learning_rate": 0.0005586251481521387, | |
| "loss": 3.9601, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.7049833171886772, | |
| "grad_norm": 0.6920768618583679, | |
| "learning_rate": 0.0005583019071220773, | |
| "loss": 3.9606, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 0.7103648692282855, | |
| "grad_norm": 0.6583566665649414, | |
| "learning_rate": 0.000557978666092016, | |
| "loss": 3.9327, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.7157464212678937, | |
| "grad_norm": 0.5119813680648804, | |
| "learning_rate": 0.0005576554250619545, | |
| "loss": 3.9518, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 0.7211279733075019, | |
| "grad_norm": 0.7030820250511169, | |
| "learning_rate": 0.000557332184031893, | |
| "loss": 3.9553, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.7265095253471101, | |
| "grad_norm": 0.640282928943634, | |
| "learning_rate": 0.0005570089430018317, | |
| "loss": 3.944, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 0.7318910773867183, | |
| "grad_norm": 0.6724056005477905, | |
| "learning_rate": 0.0005566857019717702, | |
| "loss": 3.9515, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.7372726294263265, | |
| "grad_norm": 0.5751603245735168, | |
| "learning_rate": 0.0005563624609417089, | |
| "loss": 3.955, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 0.7426541814659348, | |
| "grad_norm": 0.5616862177848816, | |
| "learning_rate": 0.0005560456847322487, | |
| "loss": 3.9535, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.748035733505543, | |
| "grad_norm": 0.5367492437362671, | |
| "learning_rate": 0.0005557224437021872, | |
| "loss": 3.9248, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 0.7534172855451512, | |
| "grad_norm": 0.5820096135139465, | |
| "learning_rate": 0.0005553992026721258, | |
| "loss": 3.9229, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.7534172855451512, | |
| "eval_accuracy": 0.33421975980975316, | |
| "eval_loss": 3.8591833114624023, | |
| "eval_runtime": 185.366, | |
| "eval_samples_per_second": 97.165, | |
| "eval_steps_per_second": 6.074, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.7587988375847594, | |
| "grad_norm": 0.6244399547576904, | |
| "learning_rate": 0.0005550759616420644, | |
| "loss": 3.9139, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 0.7641803896243676, | |
| "grad_norm": 0.5721721649169922, | |
| "learning_rate": 0.000554752720612003, | |
| "loss": 3.9179, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.7695619416639758, | |
| "grad_norm": 0.5274028182029724, | |
| "learning_rate": 0.0005544294795819415, | |
| "loss": 3.939, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 0.7749434937035841, | |
| "grad_norm": 0.6537976861000061, | |
| "learning_rate": 0.0005541062385518801, | |
| "loss": 3.9104, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.7803250457431924, | |
| "grad_norm": 0.5737007260322571, | |
| "learning_rate": 0.0005537829975218188, | |
| "loss": 3.8995, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 0.7857065977828006, | |
| "grad_norm": 0.655914306640625, | |
| "learning_rate": 0.0005534597564917573, | |
| "loss": 3.9133, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.7910881498224088, | |
| "grad_norm": 0.5855588912963867, | |
| "learning_rate": 0.0005531365154616959, | |
| "loss": 3.8965, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 0.796469701862017, | |
| "grad_norm": 0.5390224456787109, | |
| "learning_rate": 0.0005528132744316344, | |
| "loss": 3.9107, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.8018512539016253, | |
| "grad_norm": 0.739325225353241, | |
| "learning_rate": 0.0005524900334015731, | |
| "loss": 3.9033, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 0.8072328059412335, | |
| "grad_norm": 0.5827202796936035, | |
| "learning_rate": 0.0005521667923715117, | |
| "loss": 3.9216, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.8126143579808417, | |
| "grad_norm": 0.5299261212348938, | |
| "learning_rate": 0.0005518435513414502, | |
| "loss": 3.9062, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 0.8179959100204499, | |
| "grad_norm": 0.7677913308143616, | |
| "learning_rate": 0.0005515203103113888, | |
| "loss": 3.8891, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.8233774620600581, | |
| "grad_norm": 0.5630691647529602, | |
| "learning_rate": 0.0005511970692813274, | |
| "loss": 3.8984, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 0.8287590140996663, | |
| "grad_norm": 0.6013116240501404, | |
| "learning_rate": 0.000550873828251266, | |
| "loss": 3.8819, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.8341405661392746, | |
| "grad_norm": 0.6272456049919128, | |
| "learning_rate": 0.0005505505872212045, | |
| "loss": 3.8984, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 0.8395221181788828, | |
| "grad_norm": 0.6356287002563477, | |
| "learning_rate": 0.0005502273461911432, | |
| "loss": 3.9032, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.844903670218491, | |
| "grad_norm": 0.650209367275238, | |
| "learning_rate": 0.0005499041051610817, | |
| "loss": 3.8769, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 0.8502852222580992, | |
| "grad_norm": 0.5765166878700256, | |
| "learning_rate": 0.0005495808641310204, | |
| "loss": 3.8887, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.8556667742977074, | |
| "grad_norm": 0.6685440540313721, | |
| "learning_rate": 0.0005492576231009589, | |
| "loss": 3.9028, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 0.8610483263373157, | |
| "grad_norm": 0.5335162281990051, | |
| "learning_rate": 0.0005489343820708974, | |
| "loss": 3.8881, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.8610483263373157, | |
| "eval_accuracy": 0.3379374270192134, | |
| "eval_loss": 3.8148393630981445, | |
| "eval_runtime": 184.805, | |
| "eval_samples_per_second": 97.459, | |
| "eval_steps_per_second": 6.093, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.8664298783769239, | |
| "grad_norm": 0.5469226241111755, | |
| "learning_rate": 0.0005486111410408361, | |
| "loss": 3.8857, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 0.8718114304165321, | |
| "grad_norm": 0.5885823369026184, | |
| "learning_rate": 0.0005482879000107746, | |
| "loss": 3.8642, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.8771929824561403, | |
| "grad_norm": 0.5075781941413879, | |
| "learning_rate": 0.0005479646589807133, | |
| "loss": 3.8996, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 0.8825745344957485, | |
| "grad_norm": 0.5800351500511169, | |
| "learning_rate": 0.0005476414179506518, | |
| "loss": 3.8807, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.8879560865353568, | |
| "grad_norm": 0.5522460341453552, | |
| "learning_rate": 0.0005473181769205904, | |
| "loss": 3.8747, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 0.8933376385749651, | |
| "grad_norm": 0.5287204384803772, | |
| "learning_rate": 0.000546994935890529, | |
| "loss": 3.861, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.8987191906145733, | |
| "grad_norm": 0.5514733791351318, | |
| "learning_rate": 0.0005466716948604677, | |
| "loss": 3.8641, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 0.9041007426541815, | |
| "grad_norm": 0.504178524017334, | |
| "learning_rate": 0.0005463484538304062, | |
| "loss": 3.862, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.9094822946937897, | |
| "grad_norm": 0.5593713521957397, | |
| "learning_rate": 0.0005460252128003447, | |
| "loss": 3.8712, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 0.9148638467333979, | |
| "grad_norm": 0.6000622510910034, | |
| "learning_rate": 0.0005457019717702833, | |
| "loss": 3.8649, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.9202453987730062, | |
| "grad_norm": 0.5310674905776978, | |
| "learning_rate": 0.0005453787307402219, | |
| "loss": 3.8597, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 0.9256269508126144, | |
| "grad_norm": 0.5901429057121277, | |
| "learning_rate": 0.0005450554897101605, | |
| "loss": 3.8501, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.9310085028522226, | |
| "grad_norm": 0.5769229531288147, | |
| "learning_rate": 0.0005447322486800991, | |
| "loss": 3.8616, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 0.9363900548918308, | |
| "grad_norm": 0.5501662492752075, | |
| "learning_rate": 0.0005444090076500377, | |
| "loss": 3.8685, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.941771606931439, | |
| "grad_norm": 0.5846860408782959, | |
| "learning_rate": 0.0005440857666199763, | |
| "loss": 3.8416, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 0.9471531589710472, | |
| "grad_norm": 0.5883833765983582, | |
| "learning_rate": 0.0005437625255899148, | |
| "loss": 3.8386, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.9525347110106555, | |
| "grad_norm": 0.5724109411239624, | |
| "learning_rate": 0.0005434392845598534, | |
| "loss": 3.8617, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 0.9579162630502637, | |
| "grad_norm": 0.5604748129844666, | |
| "learning_rate": 0.000543116043529792, | |
| "loss": 3.8494, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.9632978150898719, | |
| "grad_norm": 0.6778799295425415, | |
| "learning_rate": 0.0005427928024997306, | |
| "loss": 3.85, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 0.9686793671294801, | |
| "grad_norm": 0.5963719487190247, | |
| "learning_rate": 0.0005424695614696692, | |
| "loss": 3.8573, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.9686793671294801, | |
| "eval_accuracy": 0.34179547375973, | |
| "eval_loss": 3.7812108993530273, | |
| "eval_runtime": 184.5862, | |
| "eval_samples_per_second": 97.575, | |
| "eval_steps_per_second": 6.1, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.9740609191690883, | |
| "grad_norm": 0.5469352006912231, | |
| "learning_rate": 0.0005421463204396078, | |
| "loss": 3.8486, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 0.9794424712086965, | |
| "grad_norm": 0.6023728251457214, | |
| "learning_rate": 0.0005418230794095463, | |
| "loss": 3.8536, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.9848240232483048, | |
| "grad_norm": 0.5906918048858643, | |
| "learning_rate": 0.000541499838379485, | |
| "loss": 3.8355, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 0.9902055752879131, | |
| "grad_norm": 0.5566977858543396, | |
| "learning_rate": 0.0005411765973494235, | |
| "loss": 3.8161, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.9955871273275213, | |
| "grad_norm": 0.6707938313484192, | |
| "learning_rate": 0.0005408533563193621, | |
| "loss": 3.8442, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 1.0009686793671295, | |
| "grad_norm": 0.5935998558998108, | |
| "learning_rate": 0.0005405301152893007, | |
| "loss": 3.8425, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 1.0063502314067376, | |
| "grad_norm": 0.7090808153152466, | |
| "learning_rate": 0.0005402068742592392, | |
| "loss": 3.7753, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 1.011731783446346, | |
| "grad_norm": 0.5787277221679688, | |
| "learning_rate": 0.0005398836332291779, | |
| "loss": 3.7816, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 1.017113335485954, | |
| "grad_norm": 0.5593772530555725, | |
| "learning_rate": 0.0005395603921991164, | |
| "loss": 3.7644, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 1.0224948875255624, | |
| "grad_norm": 0.616612434387207, | |
| "learning_rate": 0.0005392371511690551, | |
| "loss": 3.7622, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.0278764395651705, | |
| "grad_norm": 0.6132011413574219, | |
| "learning_rate": 0.0005389139101389936, | |
| "loss": 3.7698, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 1.0332579916047788, | |
| "grad_norm": 0.583960771560669, | |
| "learning_rate": 0.0005385906691089321, | |
| "loss": 3.7711, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 1.0386395436443872, | |
| "grad_norm": 0.5626878142356873, | |
| "learning_rate": 0.0005382674280788708, | |
| "loss": 3.7675, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 1.0440210956839953, | |
| "grad_norm": 0.6028419733047485, | |
| "learning_rate": 0.0005379441870488093, | |
| "loss": 3.7545, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 1.0494026477236036, | |
| "grad_norm": 0.5416905283927917, | |
| "learning_rate": 0.0005376209460187479, | |
| "loss": 3.7607, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 1.0547841997632117, | |
| "grad_norm": 0.571316123008728, | |
| "learning_rate": 0.0005372977049886865, | |
| "loss": 3.7879, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 1.06016575180282, | |
| "grad_norm": 0.6111052632331848, | |
| "learning_rate": 0.0005369744639586251, | |
| "loss": 3.7666, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 1.0655473038424281, | |
| "grad_norm": 0.7614707350730896, | |
| "learning_rate": 0.0005366512229285637, | |
| "loss": 3.7689, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 1.0709288558820365, | |
| "grad_norm": 0.541038990020752, | |
| "learning_rate": 0.0005363279818985022, | |
| "loss": 3.756, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 1.0763104079216446, | |
| "grad_norm": 0.5469908714294434, | |
| "learning_rate": 0.0005360047408684408, | |
| "loss": 3.7641, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.0763104079216446, | |
| "eval_accuracy": 0.3449987781982765, | |
| "eval_loss": 3.7476563453674316, | |
| "eval_runtime": 184.5322, | |
| "eval_samples_per_second": 97.604, | |
| "eval_steps_per_second": 6.102, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.081691959961253, | |
| "grad_norm": 0.5756956338882446, | |
| "learning_rate": 0.0005356814998383794, | |
| "loss": 3.7808, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 1.087073512000861, | |
| "grad_norm": 0.5653090476989746, | |
| "learning_rate": 0.0005353582588083181, | |
| "loss": 3.7728, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 1.0924550640404693, | |
| "grad_norm": 0.5604647994041443, | |
| "learning_rate": 0.0005350350177782566, | |
| "loss": 3.7572, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 1.0978366160800774, | |
| "grad_norm": 0.6427847743034363, | |
| "learning_rate": 0.0005347117767481952, | |
| "loss": 3.7671, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 1.1032181681196858, | |
| "grad_norm": 0.5688785910606384, | |
| "learning_rate": 0.000534395000538735, | |
| "loss": 3.7718, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 1.1085997201592939, | |
| "grad_norm": 0.6266538500785828, | |
| "learning_rate": 0.0005340717595086736, | |
| "loss": 3.7582, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 1.1139812721989022, | |
| "grad_norm": 0.6398711800575256, | |
| "learning_rate": 0.0005337485184786122, | |
| "loss": 3.7626, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 1.1193628242385103, | |
| "grad_norm": 0.5167144536972046, | |
| "learning_rate": 0.0005334252774485507, | |
| "loss": 3.7692, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 1.1247443762781186, | |
| "grad_norm": 0.5785194039344788, | |
| "learning_rate": 0.0005331020364184894, | |
| "loss": 3.7633, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 1.1301259283177267, | |
| "grad_norm": 0.5897461175918579, | |
| "learning_rate": 0.0005327787953884279, | |
| "loss": 3.7521, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.135507480357335, | |
| "grad_norm": 0.5663477182388306, | |
| "learning_rate": 0.0005324555543583665, | |
| "loss": 3.7543, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 1.1408890323969434, | |
| "grad_norm": 0.565779983997345, | |
| "learning_rate": 0.0005321323133283051, | |
| "loss": 3.7576, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 1.1462705844365515, | |
| "grad_norm": 0.5521111488342285, | |
| "learning_rate": 0.0005318090722982436, | |
| "loss": 3.7509, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 1.1516521364761596, | |
| "grad_norm": 0.5536676645278931, | |
| "learning_rate": 0.0005314858312681823, | |
| "loss": 3.7664, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 1.157033688515768, | |
| "grad_norm": 0.5922389030456543, | |
| "learning_rate": 0.0005311625902381209, | |
| "loss": 3.7486, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 1.1624152405553763, | |
| "grad_norm": 0.5422095656394958, | |
| "learning_rate": 0.0005308393492080595, | |
| "loss": 3.7454, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 1.1677967925949844, | |
| "grad_norm": 0.5556425452232361, | |
| "learning_rate": 0.000530516108177998, | |
| "loss": 3.7493, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 1.1731783446345927, | |
| "grad_norm": 0.589709997177124, | |
| "learning_rate": 0.0005301928671479365, | |
| "loss": 3.7442, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 1.1785598966742008, | |
| "grad_norm": 0.6388732194900513, | |
| "learning_rate": 0.0005298696261178752, | |
| "loss": 3.7656, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 1.1839414487138091, | |
| "grad_norm": 0.6585466265678406, | |
| "learning_rate": 0.0005295463850878138, | |
| "loss": 3.754, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.1839414487138091, | |
| "eval_accuracy": 0.3477161869225167, | |
| "eval_loss": 3.722252130508423, | |
| "eval_runtime": 184.2851, | |
| "eval_samples_per_second": 97.734, | |
| "eval_steps_per_second": 6.11, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.1893230007534172, | |
| "grad_norm": 0.5404320359230042, | |
| "learning_rate": 0.0005292231440577524, | |
| "loss": 3.7624, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 1.1947045527930256, | |
| "grad_norm": 0.5776370167732239, | |
| "learning_rate": 0.0005288999030276909, | |
| "loss": 3.7492, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 1.2000861048326337, | |
| "grad_norm": 0.6025657653808594, | |
| "learning_rate": 0.0005285766619976295, | |
| "loss": 3.7407, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 1.205467656872242, | |
| "grad_norm": 0.5973086357116699, | |
| "learning_rate": 0.0005282534209675681, | |
| "loss": 3.736, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 1.21084920891185, | |
| "grad_norm": 0.5211036801338196, | |
| "learning_rate": 0.0005279301799375066, | |
| "loss": 3.7491, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 1.2162307609514584, | |
| "grad_norm": 0.5768703818321228, | |
| "learning_rate": 0.0005276069389074453, | |
| "loss": 3.7408, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 1.2216123129910665, | |
| "grad_norm": 0.5568981766700745, | |
| "learning_rate": 0.0005272836978773838, | |
| "loss": 3.7406, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 1.2269938650306749, | |
| "grad_norm": 0.5531343221664429, | |
| "learning_rate": 0.0005269604568473225, | |
| "loss": 3.7345, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 1.232375417070283, | |
| "grad_norm": 0.6032938957214355, | |
| "learning_rate": 0.000526637215817261, | |
| "loss": 3.7292, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 1.2377569691098913, | |
| "grad_norm": 0.5232219099998474, | |
| "learning_rate": 0.0005263139747871996, | |
| "loss": 3.7334, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 1.2431385211494996, | |
| "grad_norm": 0.5911790728569031, | |
| "learning_rate": 0.0005259907337571381, | |
| "loss": 3.7533, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 1.2485200731891077, | |
| "grad_norm": 0.5355823636054993, | |
| "learning_rate": 0.000525673957547678, | |
| "loss": 3.7246, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 1.2539016252287158, | |
| "grad_norm": 0.4860752522945404, | |
| "learning_rate": 0.0005253507165176167, | |
| "loss": 3.752, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 1.2592831772683242, | |
| "grad_norm": 0.5607491731643677, | |
| "learning_rate": 0.0005250274754875552, | |
| "loss": 3.731, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 1.2646647293079325, | |
| "grad_norm": 0.5975137948989868, | |
| "learning_rate": 0.0005247042344574938, | |
| "loss": 3.7511, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 1.2700462813475406, | |
| "grad_norm": 0.5998445749282837, | |
| "learning_rate": 0.0005243809934274323, | |
| "loss": 3.7465, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 1.275427833387149, | |
| "grad_norm": 0.6529081463813782, | |
| "learning_rate": 0.0005240577523973709, | |
| "loss": 3.7262, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 1.280809385426757, | |
| "grad_norm": 0.5797929167747498, | |
| "learning_rate": 0.0005237345113673095, | |
| "loss": 3.7591, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 1.2861909374663654, | |
| "grad_norm": 0.5823287963867188, | |
| "learning_rate": 0.0005234112703372481, | |
| "loss": 3.722, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 1.2915724895059735, | |
| "grad_norm": 0.7930464148521423, | |
| "learning_rate": 0.0005230880293071867, | |
| "loss": 3.7378, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 1.2915724895059735, | |
| "eval_accuracy": 0.34940867295600286, | |
| "eval_loss": 3.700629234313965, | |
| "eval_runtime": 184.3929, | |
| "eval_samples_per_second": 97.677, | |
| "eval_steps_per_second": 6.107, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 1.2969540415455818, | |
| "grad_norm": 0.49873632192611694, | |
| "learning_rate": 0.0005227647882771253, | |
| "loss": 3.7239, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 1.30233559358519, | |
| "grad_norm": 0.5773414373397827, | |
| "learning_rate": 0.0005224415472470639, | |
| "loss": 3.7392, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 1.3077171456247982, | |
| "grad_norm": 0.6267489194869995, | |
| "learning_rate": 0.0005221183062170024, | |
| "loss": 3.7183, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 1.3130986976644063, | |
| "grad_norm": 0.5878105759620667, | |
| "learning_rate": 0.0005217950651869409, | |
| "loss": 3.7342, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 1.3184802497040147, | |
| "grad_norm": 0.5103281736373901, | |
| "learning_rate": 0.0005214718241568796, | |
| "loss": 3.7391, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 1.3238618017436228, | |
| "grad_norm": 0.5685731768608093, | |
| "learning_rate": 0.0005211485831268182, | |
| "loss": 3.7023, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 1.329243353783231, | |
| "grad_norm": 0.5740435123443604, | |
| "learning_rate": 0.0005208253420967568, | |
| "loss": 3.7269, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 1.3346249058228392, | |
| "grad_norm": 0.5537181496620178, | |
| "learning_rate": 0.0005205021010666953, | |
| "loss": 3.7228, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 1.3400064578624475, | |
| "grad_norm": 0.6004145741462708, | |
| "learning_rate": 0.0005201788600366339, | |
| "loss": 3.7306, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 1.3453880099020559, | |
| "grad_norm": 0.5789647698402405, | |
| "learning_rate": 0.0005198556190065725, | |
| "loss": 3.7216, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 1.350769561941664, | |
| "grad_norm": 0.5887240767478943, | |
| "learning_rate": 0.0005195323779765112, | |
| "loss": 3.7198, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 1.356151113981272, | |
| "grad_norm": 0.5683675408363342, | |
| "learning_rate": 0.0005192091369464497, | |
| "loss": 3.7304, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 1.3615326660208804, | |
| "grad_norm": 0.5594485402107239, | |
| "learning_rate": 0.0005188858959163882, | |
| "loss": 3.7167, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 1.3669142180604887, | |
| "grad_norm": 0.6306636333465576, | |
| "learning_rate": 0.0005185626548863269, | |
| "loss": 3.7207, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 1.3722957701000968, | |
| "grad_norm": 0.5611990094184875, | |
| "learning_rate": 0.0005182394138562654, | |
| "loss": 3.723, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 1.3776773221397052, | |
| "grad_norm": 0.5362935662269592, | |
| "learning_rate": 0.0005179161728262041, | |
| "loss": 3.7171, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 1.3830588741793133, | |
| "grad_norm": 0.5498529672622681, | |
| "learning_rate": 0.0005175929317961426, | |
| "loss": 3.72, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 1.3884404262189216, | |
| "grad_norm": 0.5442519783973694, | |
| "learning_rate": 0.0005172696907660812, | |
| "loss": 3.7128, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 1.3938219782585297, | |
| "grad_norm": 0.5849413871765137, | |
| "learning_rate": 0.0005169464497360198, | |
| "loss": 3.7118, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 1.399203530298138, | |
| "grad_norm": 0.5864673852920532, | |
| "learning_rate": 0.0005166232087059583, | |
| "loss": 3.7056, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 1.399203530298138, | |
| "eval_accuracy": 0.3516231274625498, | |
| "eval_loss": 3.67989444732666, | |
| "eval_runtime": 184.826, | |
| "eval_samples_per_second": 97.448, | |
| "eval_steps_per_second": 6.092, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 1.4045850823377461, | |
| "grad_norm": 0.5450125336647034, | |
| "learning_rate": 0.0005162999676758969, | |
| "loss": 3.7177, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 1.4099666343773545, | |
| "grad_norm": 0.5438733100891113, | |
| "learning_rate": 0.0005159767266458355, | |
| "loss": 3.7222, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 1.4153481864169626, | |
| "grad_norm": 0.5453888773918152, | |
| "learning_rate": 0.0005156534856157741, | |
| "loss": 3.708, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 1.420729738456571, | |
| "grad_norm": 0.572364330291748, | |
| "learning_rate": 0.0005153302445857127, | |
| "loss": 3.7001, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 1.426111290496179, | |
| "grad_norm": 0.5122117400169373, | |
| "learning_rate": 0.0005150070035556513, | |
| "loss": 3.7032, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 1.4314928425357873, | |
| "grad_norm": 0.566405713558197, | |
| "learning_rate": 0.0005146837625255898, | |
| "loss": 3.7114, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 1.4368743945753955, | |
| "grad_norm": 0.5422384142875671, | |
| "learning_rate": 0.0005143605214955285, | |
| "loss": 3.7076, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 1.4422559466150038, | |
| "grad_norm": 0.5637435913085938, | |
| "learning_rate": 0.0005140372804654671, | |
| "loss": 3.6984, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 1.447637498654612, | |
| "grad_norm": 0.5512182712554932, | |
| "learning_rate": 0.0005137140394354056, | |
| "loss": 3.6969, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 1.4530190506942202, | |
| "grad_norm": 0.5623869299888611, | |
| "learning_rate": 0.0005133907984053442, | |
| "loss": 3.6991, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 1.4584006027338283, | |
| "grad_norm": 0.5506566762924194, | |
| "learning_rate": 0.0005130675573752827, | |
| "loss": 3.7273, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 1.4637821547734367, | |
| "grad_norm": 0.5795466303825378, | |
| "learning_rate": 0.0005127443163452214, | |
| "loss": 3.7172, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 1.469163706813045, | |
| "grad_norm": 0.5493975877761841, | |
| "learning_rate": 0.00051242107531516, | |
| "loss": 3.6948, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 1.474545258852653, | |
| "grad_norm": 0.5260751247406006, | |
| "learning_rate": 0.0005120978342850986, | |
| "loss": 3.7049, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 1.4799268108922612, | |
| "grad_norm": 0.5563033223152161, | |
| "learning_rate": 0.0005117745932550371, | |
| "loss": 3.7076, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 1.4853083629318695, | |
| "grad_norm": 0.5787496566772461, | |
| "learning_rate": 0.0005114513522249758, | |
| "loss": 3.71, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 1.4906899149714778, | |
| "grad_norm": 0.5059001445770264, | |
| "learning_rate": 0.0005111281111949143, | |
| "loss": 3.684, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 1.496071467011086, | |
| "grad_norm": 0.5558372735977173, | |
| "learning_rate": 0.0005108048701648528, | |
| "loss": 3.7065, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 1.501453019050694, | |
| "grad_norm": 0.5342027544975281, | |
| "learning_rate": 0.0005104816291347915, | |
| "loss": 3.6999, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 1.5068345710903024, | |
| "grad_norm": 0.5543688535690308, | |
| "learning_rate": 0.00051015838810473, | |
| "loss": 3.7016, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 1.5068345710903024, | |
| "eval_accuracy": 0.35396144626120524, | |
| "eval_loss": 3.656766176223755, | |
| "eval_runtime": 184.2793, | |
| "eval_samples_per_second": 97.737, | |
| "eval_steps_per_second": 6.11, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 1.5122161231299107, | |
| "grad_norm": 0.5722943544387817, | |
| "learning_rate": 0.0005098351470746687, | |
| "loss": 3.6839, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 1.5175976751695188, | |
| "grad_norm": 0.5564660429954529, | |
| "learning_rate": 0.0005095119060446072, | |
| "loss": 3.6961, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 1.5229792272091272, | |
| "grad_norm": 0.5130824446678162, | |
| "learning_rate": 0.0005091886650145458, | |
| "loss": 3.6933, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 1.5283607792487355, | |
| "grad_norm": 0.5731927156448364, | |
| "learning_rate": 0.0005088654239844844, | |
| "loss": 3.6865, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 1.5337423312883436, | |
| "grad_norm": 0.5487149357795715, | |
| "learning_rate": 0.0005085421829544229, | |
| "loss": 3.675, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 1.5391238833279517, | |
| "grad_norm": 0.5173348188400269, | |
| "learning_rate": 0.0005082189419243616, | |
| "loss": 3.7095, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 1.54450543536756, | |
| "grad_norm": 0.5780784487724304, | |
| "learning_rate": 0.0005078957008943001, | |
| "loss": 3.6955, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 1.5498869874071683, | |
| "grad_norm": 0.5526981949806213, | |
| "learning_rate": 0.0005075724598642387, | |
| "loss": 3.6764, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 1.5552685394467765, | |
| "grad_norm": 0.550183117389679, | |
| "learning_rate": 0.0005072492188341773, | |
| "loss": 3.7052, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 1.5606500914863846, | |
| "grad_norm": 0.5323998332023621, | |
| "learning_rate": 0.000506925977804116, | |
| "loss": 3.6719, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 1.566031643525993, | |
| "grad_norm": 0.5508608818054199, | |
| "learning_rate": 0.0005066027367740545, | |
| "loss": 3.6832, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 1.5714131955656012, | |
| "grad_norm": 0.5756356716156006, | |
| "learning_rate": 0.000506279495743993, | |
| "loss": 3.6806, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 1.5767947476052093, | |
| "grad_norm": 0.5558910369873047, | |
| "learning_rate": 0.0005059562547139316, | |
| "loss": 3.6743, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 1.5821762996448174, | |
| "grad_norm": 0.5734641551971436, | |
| "learning_rate": 0.0005056330136838702, | |
| "loss": 3.6983, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 1.5875578516844258, | |
| "grad_norm": 0.5970582962036133, | |
| "learning_rate": 0.0005053097726538088, | |
| "loss": 3.698, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 1.592939403724034, | |
| "grad_norm": 0.5290464162826538, | |
| "learning_rate": 0.0005049865316237474, | |
| "loss": 3.6823, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 1.5983209557636422, | |
| "grad_norm": 0.5535150766372681, | |
| "learning_rate": 0.000504663290593686, | |
| "loss": 3.69, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 1.6037025078032503, | |
| "grad_norm": 0.5767305493354797, | |
| "learning_rate": 0.0005043400495636246, | |
| "loss": 3.6766, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 1.6090840598428586, | |
| "grad_norm": 0.5387904047966003, | |
| "learning_rate": 0.0005040168085335632, | |
| "loss": 3.6813, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 1.614465611882467, | |
| "grad_norm": 0.5717957019805908, | |
| "learning_rate": 0.0005036935675035017, | |
| "loss": 3.6979, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 1.614465611882467, | |
| "eval_accuracy": 0.35539251344660977, | |
| "eval_loss": 3.63720703125, | |
| "eval_runtime": 184.8407, | |
| "eval_samples_per_second": 97.441, | |
| "eval_steps_per_second": 6.092, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 1.619847163922075, | |
| "grad_norm": 0.5999611020088196, | |
| "learning_rate": 0.0005033703264734402, | |
| "loss": 3.6837, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 1.6252287159616834, | |
| "grad_norm": 0.5812547206878662, | |
| "learning_rate": 0.0005030470854433789, | |
| "loss": 3.688, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 1.6306102680012917, | |
| "grad_norm": 0.5843793749809265, | |
| "learning_rate": 0.0005027238444133175, | |
| "loss": 3.6731, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 1.6359918200408998, | |
| "grad_norm": 0.5559870600700378, | |
| "learning_rate": 0.0005024006033832561, | |
| "loss": 3.6801, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 1.641373372080508, | |
| "grad_norm": 0.5399496555328369, | |
| "learning_rate": 0.0005020773623531946, | |
| "loss": 3.6836, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 1.6467549241201163, | |
| "grad_norm": 0.5161964297294617, | |
| "learning_rate": 0.0005017541213231333, | |
| "loss": 3.6662, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 1.6521364761597246, | |
| "grad_norm": 0.571781575679779, | |
| "learning_rate": 0.0005014308802930718, | |
| "loss": 3.6923, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 1.6575180281993327, | |
| "grad_norm": 0.54901522397995, | |
| "learning_rate": 0.0005011076392630105, | |
| "loss": 3.6763, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 1.6628995802389408, | |
| "grad_norm": 0.5774824619293213, | |
| "learning_rate": 0.000500784398232949, | |
| "loss": 3.675, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 1.6682811322785491, | |
| "grad_norm": 0.5709084868431091, | |
| "learning_rate": 0.0005004611572028875, | |
| "loss": 3.6655, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 1.6736626843181575, | |
| "grad_norm": 0.5834586024284363, | |
| "learning_rate": 0.0005001379161728262, | |
| "loss": 3.6644, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 1.6790442363577656, | |
| "grad_norm": 0.5875089764595032, | |
| "learning_rate": 0.0004998146751427647, | |
| "loss": 3.6837, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 1.6844257883973737, | |
| "grad_norm": 0.5730432868003845, | |
| "learning_rate": 0.0004994914341127034, | |
| "loss": 3.6606, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 1.689807340436982, | |
| "grad_norm": 0.598292887210846, | |
| "learning_rate": 0.0004991681930826419, | |
| "loss": 3.6657, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 1.6951888924765903, | |
| "grad_norm": 0.5262405872344971, | |
| "learning_rate": 0.0004988449520525805, | |
| "loss": 3.6675, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 1.7005704445161984, | |
| "grad_norm": 0.562152624130249, | |
| "learning_rate": 0.0004985217110225191, | |
| "loss": 3.661, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 1.7059519965558065, | |
| "grad_norm": 0.5154584646224976, | |
| "learning_rate": 0.0004981984699924576, | |
| "loss": 3.6626, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 1.7113335485954149, | |
| "grad_norm": 0.5356432795524597, | |
| "learning_rate": 0.0004978752289623962, | |
| "loss": 3.652, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 1.7167151006350232, | |
| "grad_norm": 0.5371306538581848, | |
| "learning_rate": 0.0004975519879323348, | |
| "loss": 3.6717, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 1.7220966526746313, | |
| "grad_norm": 0.5752100944519043, | |
| "learning_rate": 0.0004972352117228746, | |
| "loss": 3.6631, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 1.7220966526746313, | |
| "eval_accuracy": 0.35704088640764325, | |
| "eval_loss": 3.6225626468658447, | |
| "eval_runtime": 183.8276, | |
| "eval_samples_per_second": 97.978, | |
| "eval_steps_per_second": 6.125, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 1.7274782047142396, | |
| "grad_norm": 0.49519824981689453, | |
| "learning_rate": 0.0004969119706928133, | |
| "loss": 3.649, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 1.732859756753848, | |
| "grad_norm": 0.5395165085792542, | |
| "learning_rate": 0.0004965887296627518, | |
| "loss": 3.6648, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 1.738241308793456, | |
| "grad_norm": 0.5537828207015991, | |
| "learning_rate": 0.0004962654886326904, | |
| "loss": 3.6808, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 1.7436228608330642, | |
| "grad_norm": 0.5962070226669312, | |
| "learning_rate": 0.000495942247602629, | |
| "loss": 3.6507, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 1.7490044128726725, | |
| "grad_norm": 0.6224369406700134, | |
| "learning_rate": 0.0004956190065725676, | |
| "loss": 3.6441, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 1.7543859649122808, | |
| "grad_norm": 0.5751168727874756, | |
| "learning_rate": 0.0004952957655425062, | |
| "loss": 3.6686, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 1.759767516951889, | |
| "grad_norm": 0.6183793544769287, | |
| "learning_rate": 0.0004949725245124448, | |
| "loss": 3.6619, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 1.765149068991497, | |
| "grad_norm": 0.5693207383155823, | |
| "learning_rate": 0.0004946492834823833, | |
| "loss": 3.6391, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 1.7705306210311054, | |
| "grad_norm": 0.8416994214057922, | |
| "learning_rate": 0.0004943260424523219, | |
| "loss": 3.6685, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 1.7759121730707137, | |
| "grad_norm": 0.5473074913024902, | |
| "learning_rate": 0.0004940028014222605, | |
| "loss": 3.651, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 1.7812937251103218, | |
| "grad_norm": 0.6257918477058411, | |
| "learning_rate": 0.000493679560392199, | |
| "loss": 3.647, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 1.78667527714993, | |
| "grad_norm": 0.6103613972663879, | |
| "learning_rate": 0.0004933563193621377, | |
| "loss": 3.6511, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 1.7920568291895382, | |
| "grad_norm": 0.53249591588974, | |
| "learning_rate": 0.0004930330783320762, | |
| "loss": 3.6511, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 1.7974383812291466, | |
| "grad_norm": 0.5363080501556396, | |
| "learning_rate": 0.0004927098373020149, | |
| "loss": 3.6559, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 1.8028199332687547, | |
| "grad_norm": 0.5322283506393433, | |
| "learning_rate": 0.0004923865962719534, | |
| "loss": 3.6635, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 1.8082014853083628, | |
| "grad_norm": 0.5365887880325317, | |
| "learning_rate": 0.0004920633552418919, | |
| "loss": 3.653, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 1.813583037347971, | |
| "grad_norm": 0.5999295711517334, | |
| "learning_rate": 0.0004917401142118306, | |
| "loss": 3.6537, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 1.8189645893875794, | |
| "grad_norm": 0.5953883528709412, | |
| "learning_rate": 0.0004914168731817692, | |
| "loss": 3.6539, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 1.8243461414271875, | |
| "grad_norm": 0.597101628780365, | |
| "learning_rate": 0.0004910936321517078, | |
| "loss": 3.6471, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 1.8297276934667959, | |
| "grad_norm": 0.5295618772506714, | |
| "learning_rate": 0.0004907703911216463, | |
| "loss": 3.6326, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.8297276934667959, | |
| "eval_accuracy": 0.35910942008585534, | |
| "eval_loss": 3.6050846576690674, | |
| "eval_runtime": 184.1833, | |
| "eval_samples_per_second": 97.788, | |
| "eval_steps_per_second": 6.113, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.8351092455064042, | |
| "grad_norm": 0.6159751415252686, | |
| "learning_rate": 0.0004904471500915849, | |
| "loss": 3.6271, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 1.8404907975460123, | |
| "grad_norm": 0.6205544471740723, | |
| "learning_rate": 0.0004901239090615235, | |
| "loss": 3.6651, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 1.8458723495856204, | |
| "grad_norm": 0.5399280786514282, | |
| "learning_rate": 0.000489800668031462, | |
| "loss": 3.6452, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 1.8512539016252287, | |
| "grad_norm": 0.5837388038635254, | |
| "learning_rate": 0.0004894774270014007, | |
| "loss": 3.6575, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 1.856635453664837, | |
| "grad_norm": 0.5471201539039612, | |
| "learning_rate": 0.0004891541859713392, | |
| "loss": 3.6555, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 1.8620170057044452, | |
| "grad_norm": 0.5392946600914001, | |
| "learning_rate": 0.0004888309449412779, | |
| "loss": 3.6453, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 1.8673985577440533, | |
| "grad_norm": 0.5226132869720459, | |
| "learning_rate": 0.0004885077039112164, | |
| "loss": 3.6449, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 1.8727801097836616, | |
| "grad_norm": 0.5569941401481628, | |
| "learning_rate": 0.00048818446288115497, | |
| "loss": 3.629, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 1.87816166182327, | |
| "grad_norm": 0.5523909330368042, | |
| "learning_rate": 0.0004878612218510936, | |
| "loss": 3.6612, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 1.883543213862878, | |
| "grad_norm": 0.5485296845436096, | |
| "learning_rate": 0.0004875379808210322, | |
| "loss": 3.6438, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 1.8889247659024861, | |
| "grad_norm": 0.5691823959350586, | |
| "learning_rate": 0.00048721473979097075, | |
| "loss": 3.6349, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 1.8943063179420945, | |
| "grad_norm": 0.5617777705192566, | |
| "learning_rate": 0.00048689149876090935, | |
| "loss": 3.6613, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 1.8996878699817028, | |
| "grad_norm": 0.6149376034736633, | |
| "learning_rate": 0.0004865682577308479, | |
| "loss": 3.6477, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 1.905069422021311, | |
| "grad_norm": 0.5364314913749695, | |
| "learning_rate": 0.0004862450167007865, | |
| "loss": 3.6471, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 1.910450974060919, | |
| "grad_norm": 0.5560828447341919, | |
| "learning_rate": 0.00048592177567072513, | |
| "loss": 3.6449, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 1.9158325261005273, | |
| "grad_norm": 0.562412440776825, | |
| "learning_rate": 0.00048559853464066367, | |
| "loss": 3.6439, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 1.9212140781401357, | |
| "grad_norm": 0.5733065605163574, | |
| "learning_rate": 0.00048527529361060227, | |
| "loss": 3.6473, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 1.9265956301797438, | |
| "grad_norm": 0.590917706489563, | |
| "learning_rate": 0.00048495205258054086, | |
| "loss": 3.6377, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 1.931977182219352, | |
| "grad_norm": 0.6305859684944153, | |
| "learning_rate": 0.0004846288115504794, | |
| "loss": 3.6313, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 1.9373587342589604, | |
| "grad_norm": 0.5712413787841797, | |
| "learning_rate": 0.000484305570520418, | |
| "loss": 3.6246, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.9373587342589604, | |
| "eval_accuracy": 0.36066337368737805, | |
| "eval_loss": 3.5890228748321533, | |
| "eval_runtime": 184.3111, | |
| "eval_samples_per_second": 97.721, | |
| "eval_steps_per_second": 6.109, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.9427402862985685, | |
| "grad_norm": 0.5944366455078125, | |
| "learning_rate": 0.00048398232949035665, | |
| "loss": 3.6456, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 1.9481218383381766, | |
| "grad_norm": 0.5790172815322876, | |
| "learning_rate": 0.0004836590884602952, | |
| "loss": 3.6318, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 1.953503390377785, | |
| "grad_norm": 0.5425114631652832, | |
| "learning_rate": 0.000483342312250835, | |
| "loss": 3.6534, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 1.9588849424173933, | |
| "grad_norm": 0.5681806802749634, | |
| "learning_rate": 0.0004830190712207736, | |
| "loss": 3.6524, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 1.9642664944570014, | |
| "grad_norm": 0.6699956655502319, | |
| "learning_rate": 0.0004826958301907122, | |
| "loss": 3.6431, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 1.9696480464966095, | |
| "grad_norm": 0.5629168152809143, | |
| "learning_rate": 0.0004823725891606507, | |
| "loss": 3.6236, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 1.9750295985362178, | |
| "grad_norm": 0.5990884900093079, | |
| "learning_rate": 0.0004820493481305893, | |
| "loss": 3.6318, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 1.9804111505758262, | |
| "grad_norm": 0.560914158821106, | |
| "learning_rate": 0.00048172610710052797, | |
| "loss": 3.6473, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 1.9857927026154343, | |
| "grad_norm": 0.6116918325424194, | |
| "learning_rate": 0.0004814028660704665, | |
| "loss": 3.6316, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 1.9911742546550424, | |
| "grad_norm": 0.5204751491546631, | |
| "learning_rate": 0.0004810796250404051, | |
| "loss": 3.6317, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 1.9965558066946507, | |
| "grad_norm": 0.5705627799034119, | |
| "learning_rate": 0.00048075638401034364, | |
| "loss": 3.6266, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 2.001937358734259, | |
| "grad_norm": 0.5522881746292114, | |
| "learning_rate": 0.00048043314298028224, | |
| "loss": 3.6035, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 2.007318910773867, | |
| "grad_norm": 0.5946189165115356, | |
| "learning_rate": 0.00048010990195022083, | |
| "loss": 3.5188, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 2.0127004628134753, | |
| "grad_norm": 0.5753241777420044, | |
| "learning_rate": 0.0004797866609201594, | |
| "loss": 3.5602, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 2.018082014853084, | |
| "grad_norm": 0.5759482979774475, | |
| "learning_rate": 0.000479463419890098, | |
| "loss": 3.5473, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 2.023463566892692, | |
| "grad_norm": 0.5465207695960999, | |
| "learning_rate": 0.0004791401788600366, | |
| "loss": 3.5444, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 2.0288451189323, | |
| "grad_norm": 0.5923919081687927, | |
| "learning_rate": 0.00047881693782997515, | |
| "loss": 3.5464, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 2.034226670971908, | |
| "grad_norm": 0.5646787881851196, | |
| "learning_rate": 0.00047849369679991375, | |
| "loss": 3.5536, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 2.0396082230115167, | |
| "grad_norm": 0.5978553295135498, | |
| "learning_rate": 0.0004781704557698523, | |
| "loss": 3.5624, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 2.044989775051125, | |
| "grad_norm": 0.5860423445701599, | |
| "learning_rate": 0.00047784721473979094, | |
| "loss": 3.5539, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 2.044989775051125, | |
| "eval_accuracy": 0.36169546746872777, | |
| "eval_loss": 3.579106092453003, | |
| "eval_runtime": 184.1309, | |
| "eval_samples_per_second": 97.816, | |
| "eval_steps_per_second": 6.115, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 2.050371327090733, | |
| "grad_norm": 0.5495768785476685, | |
| "learning_rate": 0.00047752397370972953, | |
| "loss": 3.5449, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 2.055752879130341, | |
| "grad_norm": 0.6042999625205994, | |
| "learning_rate": 0.0004772007326796681, | |
| "loss": 3.554, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 2.0611344311699495, | |
| "grad_norm": 0.6665642857551575, | |
| "learning_rate": 0.00047687749164960667, | |
| "loss": 3.5502, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 2.0665159832095576, | |
| "grad_norm": 0.5724649429321289, | |
| "learning_rate": 0.00047655425061954526, | |
| "loss": 3.5659, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 2.0718975352491658, | |
| "grad_norm": 0.5169804692268372, | |
| "learning_rate": 0.00047623100958948386, | |
| "loss": 3.548, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 2.0772790872887743, | |
| "grad_norm": 0.5063441395759583, | |
| "learning_rate": 0.00047590776855942245, | |
| "loss": 3.5655, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 2.0826606393283824, | |
| "grad_norm": 0.5674535632133484, | |
| "learning_rate": 0.00047558452752936105, | |
| "loss": 3.5459, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 2.0880421913679905, | |
| "grad_norm": 0.6405752301216125, | |
| "learning_rate": 0.0004752612864992996, | |
| "loss": 3.5313, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 2.0934237434075986, | |
| "grad_norm": 0.586518406867981, | |
| "learning_rate": 0.0004749380454692382, | |
| "loss": 3.5814, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 2.098805295447207, | |
| "grad_norm": 0.597720742225647, | |
| "learning_rate": 0.0004746148044391767, | |
| "loss": 3.5498, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 2.1041868474868153, | |
| "grad_norm": 0.6323477625846863, | |
| "learning_rate": 0.00047429156340911537, | |
| "loss": 3.5417, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 2.1095683995264234, | |
| "grad_norm": 0.6271188259124756, | |
| "learning_rate": 0.00047396832237905397, | |
| "loss": 3.5692, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 2.1149499515660315, | |
| "grad_norm": 0.5441372394561768, | |
| "learning_rate": 0.0004736450813489925, | |
| "loss": 3.5634, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 2.12033150360564, | |
| "grad_norm": 0.5787696838378906, | |
| "learning_rate": 0.0004733218403189311, | |
| "loss": 3.5525, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 2.125713055645248, | |
| "grad_norm": 0.6129936575889587, | |
| "learning_rate": 0.0004729985992888697, | |
| "loss": 3.5492, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 2.1310946076848563, | |
| "grad_norm": 0.585528552532196, | |
| "learning_rate": 0.00047267535825880824, | |
| "loss": 3.5742, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 2.1364761597244644, | |
| "grad_norm": 0.8197336792945862, | |
| "learning_rate": 0.0004723521172287469, | |
| "loss": 3.5695, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 2.141857711764073, | |
| "grad_norm": 0.5523197054862976, | |
| "learning_rate": 0.0004720288761986855, | |
| "loss": 3.5365, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 2.147239263803681, | |
| "grad_norm": 0.5677591562271118, | |
| "learning_rate": 0.000471705635168624, | |
| "loss": 3.5599, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 2.152620815843289, | |
| "grad_norm": 0.5868083238601685, | |
| "learning_rate": 0.0004713823941385626, | |
| "loss": 3.562, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 2.152620815843289, | |
| "eval_accuracy": 0.3632018311053878, | |
| "eval_loss": 3.570688009262085, | |
| "eval_runtime": 184.0571, | |
| "eval_samples_per_second": 97.856, | |
| "eval_steps_per_second": 6.118, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 2.1580023678828972, | |
| "grad_norm": 0.561229944229126, | |
| "learning_rate": 0.00047105915310850116, | |
| "loss": 3.5775, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 2.163383919922506, | |
| "grad_norm": 0.5729883909225464, | |
| "learning_rate": 0.0004707359120784398, | |
| "loss": 3.5575, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 2.168765471962114, | |
| "grad_norm": 0.6092100739479065, | |
| "learning_rate": 0.0004704126710483784, | |
| "loss": 3.5663, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 2.174147024001722, | |
| "grad_norm": 0.5391753911972046, | |
| "learning_rate": 0.0004700958948389182, | |
| "loss": 3.5662, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 2.1795285760413305, | |
| "grad_norm": 0.5216943621635437, | |
| "learning_rate": 0.0004697726538088568, | |
| "loss": 3.5703, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 2.1849101280809387, | |
| "grad_norm": 0.5354718565940857, | |
| "learning_rate": 0.00046944941277879534, | |
| "loss": 3.5427, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 2.1902916801205468, | |
| "grad_norm": 0.6368331909179688, | |
| "learning_rate": 0.00046912617174873394, | |
| "loss": 3.5627, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 2.195673232160155, | |
| "grad_norm": 0.5471709370613098, | |
| "learning_rate": 0.0004688029307186725, | |
| "loss": 3.5501, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 2.2010547841997634, | |
| "grad_norm": 0.5695568919181824, | |
| "learning_rate": 0.00046847968968861107, | |
| "loss": 3.5601, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 2.2064363362393715, | |
| "grad_norm": 0.6059097647666931, | |
| "learning_rate": 0.0004681564486585497, | |
| "loss": 3.5678, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 2.2118178882789796, | |
| "grad_norm": 0.5702983140945435, | |
| "learning_rate": 0.00046783320762848826, | |
| "loss": 3.5725, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 2.2171994403185877, | |
| "grad_norm": 0.5607845783233643, | |
| "learning_rate": 0.00046750996659842685, | |
| "loss": 3.5553, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 2.2225809923581963, | |
| "grad_norm": 0.5489804148674011, | |
| "learning_rate": 0.00046718672556836545, | |
| "loss": 3.5429, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 2.2279625443978044, | |
| "grad_norm": 0.5686173439025879, | |
| "learning_rate": 0.000466863484538304, | |
| "loss": 3.5843, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 2.2333440964374125, | |
| "grad_norm": 0.6201581358909607, | |
| "learning_rate": 0.0004665402435082426, | |
| "loss": 3.5601, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 2.2387256484770206, | |
| "grad_norm": 0.5640190243721008, | |
| "learning_rate": 0.00046621700247818123, | |
| "loss": 3.5754, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 2.244107200516629, | |
| "grad_norm": 0.6988780498504639, | |
| "learning_rate": 0.0004658937614481198, | |
| "loss": 3.5475, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 2.2494887525562373, | |
| "grad_norm": 0.616712749004364, | |
| "learning_rate": 0.00046557052041805837, | |
| "loss": 3.5727, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 2.2548703045958454, | |
| "grad_norm": 0.5751016139984131, | |
| "learning_rate": 0.0004652472793879969, | |
| "loss": 3.5648, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 2.2602518566354535, | |
| "grad_norm": 0.551199734210968, | |
| "learning_rate": 0.0004649240383579355, | |
| "loss": 3.5603, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 2.2602518566354535, | |
| "eval_accuracy": 0.36398217614567135, | |
| "eval_loss": 3.5630135536193848, | |
| "eval_runtime": 184.4108, | |
| "eval_samples_per_second": 97.668, | |
| "eval_steps_per_second": 6.106, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 2.265633408675062, | |
| "grad_norm": 0.6206423044204712, | |
| "learning_rate": 0.00046460079732787415, | |
| "loss": 3.5794, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 2.27101496071467, | |
| "grad_norm": 0.6502292156219482, | |
| "learning_rate": 0.0004642775562978127, | |
| "loss": 3.5543, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 2.2763965127542782, | |
| "grad_norm": 0.5624255537986755, | |
| "learning_rate": 0.0004639543152677513, | |
| "loss": 3.5674, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 2.281778064793887, | |
| "grad_norm": 0.6220043897628784, | |
| "learning_rate": 0.0004636310742376899, | |
| "loss": 3.5522, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 2.287159616833495, | |
| "grad_norm": 0.5274709463119507, | |
| "learning_rate": 0.0004633078332076284, | |
| "loss": 3.5407, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 2.292541168873103, | |
| "grad_norm": 0.5348024368286133, | |
| "learning_rate": 0.000462984592177567, | |
| "loss": 3.5461, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 2.297922720912711, | |
| "grad_norm": 0.5884036421775818, | |
| "learning_rate": 0.00046266135114750567, | |
| "loss": 3.5643, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 2.303304272952319, | |
| "grad_norm": 0.5820002555847168, | |
| "learning_rate": 0.0004623381101174442, | |
| "loss": 3.5638, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 2.3086858249919278, | |
| "grad_norm": 0.6022721529006958, | |
| "learning_rate": 0.0004620148690873828, | |
| "loss": 3.5642, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 2.314067377031536, | |
| "grad_norm": 0.5931360125541687, | |
| "learning_rate": 0.00046169162805732134, | |
| "loss": 3.5376, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 2.319448929071144, | |
| "grad_norm": 0.6036872863769531, | |
| "learning_rate": 0.00046136838702725994, | |
| "loss": 3.5702, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 2.3248304811107525, | |
| "grad_norm": 0.5146020650863647, | |
| "learning_rate": 0.00046104514599719853, | |
| "loss": 3.5693, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 2.3302120331503606, | |
| "grad_norm": 0.6073507070541382, | |
| "learning_rate": 0.0004607219049671371, | |
| "loss": 3.5649, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 2.3355935851899687, | |
| "grad_norm": 0.57147216796875, | |
| "learning_rate": 0.0004603986639370757, | |
| "loss": 3.5451, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 2.340975137229577, | |
| "grad_norm": 0.6783694624900818, | |
| "learning_rate": 0.0004600754229070143, | |
| "loss": 3.5448, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 2.3463566892691854, | |
| "grad_norm": 0.5610417723655701, | |
| "learning_rate": 0.00045975218187695286, | |
| "loss": 3.5461, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 2.3517382413087935, | |
| "grad_norm": 0.5262538194656372, | |
| "learning_rate": 0.00045942894084689145, | |
| "loss": 3.5627, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 2.3571197933484016, | |
| "grad_norm": 0.5298727750778198, | |
| "learning_rate": 0.0004591056998168301, | |
| "loss": 3.5745, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 2.3625013453880097, | |
| "grad_norm": 0.5683722496032715, | |
| "learning_rate": 0.00045878245878676864, | |
| "loss": 3.5601, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 2.3678828974276183, | |
| "grad_norm": 0.5847147703170776, | |
| "learning_rate": 0.00045845921775670723, | |
| "loss": 3.546, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 2.3678828974276183, | |
| "eval_accuracy": 0.3645442375343357, | |
| "eval_loss": 3.5517847537994385, | |
| "eval_runtime": 183.8946, | |
| "eval_samples_per_second": 97.942, | |
| "eval_steps_per_second": 6.123, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 2.3732644494672264, | |
| "grad_norm": 0.5403057932853699, | |
| "learning_rate": 0.0004581359767266458, | |
| "loss": 3.5433, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 2.3786460015068345, | |
| "grad_norm": 0.5787127614021301, | |
| "learning_rate": 0.00045781273569658437, | |
| "loss": 3.5425, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 2.384027553546443, | |
| "grad_norm": 0.5755758881568909, | |
| "learning_rate": 0.00045748949466652296, | |
| "loss": 3.5561, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 2.389409105586051, | |
| "grad_norm": 0.6487693786621094, | |
| "learning_rate": 0.00045716625363646156, | |
| "loss": 3.5523, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 2.3947906576256592, | |
| "grad_norm": 0.531506359577179, | |
| "learning_rate": 0.00045684301260640015, | |
| "loss": 3.5629, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 2.4001722096652673, | |
| "grad_norm": 0.6475794315338135, | |
| "learning_rate": 0.00045651977157633875, | |
| "loss": 3.569, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 2.4055537617048754, | |
| "grad_norm": 0.5457855463027954, | |
| "learning_rate": 0.0004561965305462773, | |
| "loss": 3.5683, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 2.410935313744484, | |
| "grad_norm": 0.5556653141975403, | |
| "learning_rate": 0.0004558732895162159, | |
| "loss": 3.5537, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 2.416316865784092, | |
| "grad_norm": 0.596947968006134, | |
| "learning_rate": 0.0004555565133067557, | |
| "loss": 3.552, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 2.4216984178237, | |
| "grad_norm": 0.552823543548584, | |
| "learning_rate": 0.0004552332722766943, | |
| "loss": 3.5577, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 2.4270799698633088, | |
| "grad_norm": 0.5624712705612183, | |
| "learning_rate": 0.0004549100312466328, | |
| "loss": 3.5463, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 2.432461521902917, | |
| "grad_norm": 0.5931119322776794, | |
| "learning_rate": 0.0004545867902165715, | |
| "loss": 3.5592, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 2.437843073942525, | |
| "grad_norm": 0.6037808656692505, | |
| "learning_rate": 0.00045426354918651007, | |
| "loss": 3.547, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 2.443224625982133, | |
| "grad_norm": 0.5643433928489685, | |
| "learning_rate": 0.0004539403081564486, | |
| "loss": 3.5637, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 2.4486061780217416, | |
| "grad_norm": 0.5612767934799194, | |
| "learning_rate": 0.0004536170671263872, | |
| "loss": 3.5429, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 2.4539877300613497, | |
| "grad_norm": 0.5697504878044128, | |
| "learning_rate": 0.00045329382609632574, | |
| "loss": 3.5538, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 2.459369282100958, | |
| "grad_norm": 0.5525127053260803, | |
| "learning_rate": 0.0004529705850662644, | |
| "loss": 3.5529, | |
| "step": 22850 | |
| }, | |
| { | |
| "epoch": 2.464750834140566, | |
| "grad_norm": 0.6191689968109131, | |
| "learning_rate": 0.000452647344036203, | |
| "loss": 3.5386, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 2.4701323861801745, | |
| "grad_norm": 0.5639016032218933, | |
| "learning_rate": 0.00045232410300614153, | |
| "loss": 3.5754, | |
| "step": 22950 | |
| }, | |
| { | |
| "epoch": 2.4755139382197826, | |
| "grad_norm": 0.6116729378700256, | |
| "learning_rate": 0.0004520008619760801, | |
| "loss": 3.5561, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 2.4755139382197826, | |
| "eval_accuracy": 0.3660388666591117, | |
| "eval_loss": 3.5395596027374268, | |
| "eval_runtime": 184.1301, | |
| "eval_samples_per_second": 97.817, | |
| "eval_steps_per_second": 6.115, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 2.4808954902593907, | |
| "grad_norm": 0.5503861904144287, | |
| "learning_rate": 0.0004516776209460187, | |
| "loss": 3.5527, | |
| "step": 23050 | |
| }, | |
| { | |
| "epoch": 2.4862770422989993, | |
| "grad_norm": 0.5716922283172607, | |
| "learning_rate": 0.00045135437991595726, | |
| "loss": 3.5307, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 2.4916585943386074, | |
| "grad_norm": 0.5459892153739929, | |
| "learning_rate": 0.0004510311388858959, | |
| "loss": 3.5607, | |
| "step": 23150 | |
| }, | |
| { | |
| "epoch": 2.4970401463782155, | |
| "grad_norm": 0.6022793054580688, | |
| "learning_rate": 0.0004507078978558345, | |
| "loss": 3.5287, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 2.5024216984178236, | |
| "grad_norm": 0.586172342300415, | |
| "learning_rate": 0.00045038465682577304, | |
| "loss": 3.5393, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 2.5078032504574317, | |
| "grad_norm": 0.5649908185005188, | |
| "learning_rate": 0.00045006141579571164, | |
| "loss": 3.5369, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 2.5131848024970402, | |
| "grad_norm": 0.6035097241401672, | |
| "learning_rate": 0.0004497381747656502, | |
| "loss": 3.5708, | |
| "step": 23350 | |
| }, | |
| { | |
| "epoch": 2.5185663545366483, | |
| "grad_norm": 0.5504043698310852, | |
| "learning_rate": 0.00044941493373558877, | |
| "loss": 3.5433, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 2.5239479065762565, | |
| "grad_norm": 0.5592088103294373, | |
| "learning_rate": 0.0004490916927055274, | |
| "loss": 3.5428, | |
| "step": 23450 | |
| }, | |
| { | |
| "epoch": 2.529329458615865, | |
| "grad_norm": 0.5921136140823364, | |
| "learning_rate": 0.00044876845167546596, | |
| "loss": 3.5478, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 2.534711010655473, | |
| "grad_norm": 0.5895003080368042, | |
| "learning_rate": 0.00044844521064540455, | |
| "loss": 3.5392, | |
| "step": 23550 | |
| }, | |
| { | |
| "epoch": 2.540092562695081, | |
| "grad_norm": 0.5717202425003052, | |
| "learning_rate": 0.00044812196961534315, | |
| "loss": 3.5425, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 2.5454741147346893, | |
| "grad_norm": 0.5879724025726318, | |
| "learning_rate": 0.0004477987285852817, | |
| "loss": 3.5676, | |
| "step": 23650 | |
| }, | |
| { | |
| "epoch": 2.550855666774298, | |
| "grad_norm": 0.5525007247924805, | |
| "learning_rate": 0.00044747548755522034, | |
| "loss": 3.5493, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 2.556237218813906, | |
| "grad_norm": 0.6007914543151855, | |
| "learning_rate": 0.00044715224652515893, | |
| "loss": 3.5563, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 2.561618770853514, | |
| "grad_norm": 0.5593198537826538, | |
| "learning_rate": 0.0004468290054950975, | |
| "loss": 3.5568, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 2.567000322893122, | |
| "grad_norm": 0.5726816058158875, | |
| "learning_rate": 0.00044650576446503607, | |
| "loss": 3.5613, | |
| "step": 23850 | |
| }, | |
| { | |
| "epoch": 2.5723818749327307, | |
| "grad_norm": 0.5541634559631348, | |
| "learning_rate": 0.0004461825234349746, | |
| "loss": 3.5451, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 2.577763426972339, | |
| "grad_norm": 0.545291543006897, | |
| "learning_rate": 0.0004458592824049132, | |
| "loss": 3.5541, | |
| "step": 23950 | |
| }, | |
| { | |
| "epoch": 2.583144979011947, | |
| "grad_norm": 0.5981674194335938, | |
| "learning_rate": 0.00044553604137485185, | |
| "loss": 3.5528, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 2.583144979011947, | |
| "eval_accuracy": 0.36709540734021967, | |
| "eval_loss": 3.530630588531494, | |
| "eval_runtime": 184.0185, | |
| "eval_samples_per_second": 97.876, | |
| "eval_steps_per_second": 6.119, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 2.5885265310515555, | |
| "grad_norm": 0.6674214005470276, | |
| "learning_rate": 0.0004452128003447904, | |
| "loss": 3.5395, | |
| "step": 24050 | |
| }, | |
| { | |
| "epoch": 2.5939080830911636, | |
| "grad_norm": 0.5705569982528687, | |
| "learning_rate": 0.000444889559314729, | |
| "loss": 3.5585, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 2.5992896351307717, | |
| "grad_norm": 0.5629080533981323, | |
| "learning_rate": 0.0004445663182846676, | |
| "loss": 3.5464, | |
| "step": 24150 | |
| }, | |
| { | |
| "epoch": 2.60467118717038, | |
| "grad_norm": 0.5345271229743958, | |
| "learning_rate": 0.0004442430772546061, | |
| "loss": 3.547, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 2.610052739209988, | |
| "grad_norm": 0.6864102482795715, | |
| "learning_rate": 0.0004439198362245447, | |
| "loss": 3.5398, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 2.6154342912495965, | |
| "grad_norm": 0.5583524107933044, | |
| "learning_rate": 0.00044359659519448337, | |
| "loss": 3.534, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 2.6208158432892046, | |
| "grad_norm": 0.5800455808639526, | |
| "learning_rate": 0.0004432733541644219, | |
| "loss": 3.5389, | |
| "step": 24350 | |
| }, | |
| { | |
| "epoch": 2.6261973953288127, | |
| "grad_norm": 0.577061653137207, | |
| "learning_rate": 0.0004429501131343605, | |
| "loss": 3.5386, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 2.6315789473684212, | |
| "grad_norm": 0.5940808653831482, | |
| "learning_rate": 0.00044262687210429904, | |
| "loss": 3.5421, | |
| "step": 24450 | |
| }, | |
| { | |
| "epoch": 2.6369604994080293, | |
| "grad_norm": 0.55911785364151, | |
| "learning_rate": 0.00044230363107423764, | |
| "loss": 3.5452, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 2.6423420514476375, | |
| "grad_norm": 0.6087802648544312, | |
| "learning_rate": 0.0004419803900441762, | |
| "loss": 3.5462, | |
| "step": 24550 | |
| }, | |
| { | |
| "epoch": 2.6477236034872456, | |
| "grad_norm": 0.5640580654144287, | |
| "learning_rate": 0.0004416571490141148, | |
| "loss": 3.5278, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 2.653105155526854, | |
| "grad_norm": 0.6379339098930359, | |
| "learning_rate": 0.0004413339079840534, | |
| "loss": 3.5637, | |
| "step": 24650 | |
| }, | |
| { | |
| "epoch": 2.658486707566462, | |
| "grad_norm": 0.5877749919891357, | |
| "learning_rate": 0.00044101066695399196, | |
| "loss": 3.5535, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 2.6638682596060703, | |
| "grad_norm": 0.5544137954711914, | |
| "learning_rate": 0.0004406938907445318, | |
| "loss": 3.5375, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 2.6692498116456784, | |
| "grad_norm": 0.5193669199943542, | |
| "learning_rate": 0.00044037064971447036, | |
| "loss": 3.5324, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 2.674631363685287, | |
| "grad_norm": 0.5993558764457703, | |
| "learning_rate": 0.00044004740868440896, | |
| "loss": 3.5171, | |
| "step": 24850 | |
| }, | |
| { | |
| "epoch": 2.680012915724895, | |
| "grad_norm": 0.606824517250061, | |
| "learning_rate": 0.00043972416765434755, | |
| "loss": 3.5454, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 2.685394467764503, | |
| "grad_norm": 0.5386514067649841, | |
| "learning_rate": 0.00043940092662428615, | |
| "loss": 3.5478, | |
| "step": 24950 | |
| }, | |
| { | |
| "epoch": 2.6907760198041117, | |
| "grad_norm": 0.5610313415527344, | |
| "learning_rate": 0.00043907768559422474, | |
| "loss": 3.5447, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 2.6907760198041117, | |
| "eval_accuracy": 0.36763606410998456, | |
| "eval_loss": 3.5236897468566895, | |
| "eval_runtime": 184.5139, | |
| "eval_samples_per_second": 97.613, | |
| "eval_steps_per_second": 6.103, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 2.69615757184372, | |
| "grad_norm": 0.5900009274482727, | |
| "learning_rate": 0.00043875444456416334, | |
| "loss": 3.5374, | |
| "step": 25050 | |
| }, | |
| { | |
| "epoch": 2.701539123883328, | |
| "grad_norm": 0.5528219938278198, | |
| "learning_rate": 0.0004384312035341019, | |
| "loss": 3.5508, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 2.706920675922936, | |
| "grad_norm": 0.5879670977592468, | |
| "learning_rate": 0.00043810796250404047, | |
| "loss": 3.5379, | |
| "step": 25150 | |
| }, | |
| { | |
| "epoch": 2.712302227962544, | |
| "grad_norm": 0.6659033894538879, | |
| "learning_rate": 0.000437784721473979, | |
| "loss": 3.539, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 2.7176837800021527, | |
| "grad_norm": 0.5466210246086121, | |
| "learning_rate": 0.00043746148044391766, | |
| "loss": 3.5477, | |
| "step": 25250 | |
| }, | |
| { | |
| "epoch": 2.723065332041761, | |
| "grad_norm": 0.5327200889587402, | |
| "learning_rate": 0.00043713823941385625, | |
| "loss": 3.546, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 2.728446884081369, | |
| "grad_norm": 0.5748312473297119, | |
| "learning_rate": 0.0004368149983837948, | |
| "loss": 3.5431, | |
| "step": 25350 | |
| }, | |
| { | |
| "epoch": 2.7338284361209775, | |
| "grad_norm": 0.5589819550514221, | |
| "learning_rate": 0.0004364917573537334, | |
| "loss": 3.5408, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 2.7392099881605856, | |
| "grad_norm": 0.5818286538124084, | |
| "learning_rate": 0.00043616851632367193, | |
| "loss": 3.541, | |
| "step": 25450 | |
| }, | |
| { | |
| "epoch": 2.7445915402001937, | |
| "grad_norm": 0.6057955026626587, | |
| "learning_rate": 0.0004358452752936106, | |
| "loss": 3.5448, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 2.749973092239802, | |
| "grad_norm": 0.5754573345184326, | |
| "learning_rate": 0.0004355220342635492, | |
| "loss": 3.5289, | |
| "step": 25550 | |
| }, | |
| { | |
| "epoch": 2.7553546442794103, | |
| "grad_norm": 0.5715060830116272, | |
| "learning_rate": 0.00043519879323348777, | |
| "loss": 3.5369, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 2.7607361963190185, | |
| "grad_norm": 0.6356561779975891, | |
| "learning_rate": 0.0004348755522034263, | |
| "loss": 3.5337, | |
| "step": 25650 | |
| }, | |
| { | |
| "epoch": 2.7661177483586266, | |
| "grad_norm": 0.5628809928894043, | |
| "learning_rate": 0.0004345523111733649, | |
| "loss": 3.5237, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 2.7714993003982347, | |
| "grad_norm": 0.5270907282829285, | |
| "learning_rate": 0.00043422907014330344, | |
| "loss": 3.5477, | |
| "step": 25750 | |
| }, | |
| { | |
| "epoch": 2.776880852437843, | |
| "grad_norm": 0.6290378570556641, | |
| "learning_rate": 0.0004339058291132421, | |
| "loss": 3.5433, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 2.7822624044774513, | |
| "grad_norm": 0.5788700580596924, | |
| "learning_rate": 0.0004335825880831807, | |
| "loss": 3.527, | |
| "step": 25850 | |
| }, | |
| { | |
| "epoch": 2.7876439565170594, | |
| "grad_norm": 0.5836464166641235, | |
| "learning_rate": 0.00043325934705311923, | |
| "loss": 3.5346, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 2.793025508556668, | |
| "grad_norm": 0.5945377349853516, | |
| "learning_rate": 0.0004329361060230578, | |
| "loss": 3.5364, | |
| "step": 25950 | |
| }, | |
| { | |
| "epoch": 2.798407060596276, | |
| "grad_norm": 0.5505304336547852, | |
| "learning_rate": 0.00043261286499299636, | |
| "loss": 3.5386, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 2.798407060596276, | |
| "eval_accuracy": 0.3685904710765469, | |
| "eval_loss": 3.5148777961730957, | |
| "eval_runtime": 184.2338, | |
| "eval_samples_per_second": 97.762, | |
| "eval_steps_per_second": 6.112, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 2.803788612635884, | |
| "grad_norm": 0.6029397249221802, | |
| "learning_rate": 0.00043228962396293496, | |
| "loss": 3.5251, | |
| "step": 26050 | |
| }, | |
| { | |
| "epoch": 2.8091701646754923, | |
| "grad_norm": 0.640298068523407, | |
| "learning_rate": 0.0004319663829328736, | |
| "loss": 3.5283, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 2.8145517167151004, | |
| "grad_norm": 0.6309469938278198, | |
| "learning_rate": 0.00043164314190281215, | |
| "loss": 3.5271, | |
| "step": 26150 | |
| }, | |
| { | |
| "epoch": 2.819933268754709, | |
| "grad_norm": 0.5581336617469788, | |
| "learning_rate": 0.00043131990087275074, | |
| "loss": 3.5351, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 2.825314820794317, | |
| "grad_norm": 0.6591759324073792, | |
| "learning_rate": 0.00043099665984268934, | |
| "loss": 3.5215, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 2.830696372833925, | |
| "grad_norm": 0.5295543074607849, | |
| "learning_rate": 0.0004306734188126279, | |
| "loss": 3.5443, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 2.8360779248735337, | |
| "grad_norm": 0.564895749092102, | |
| "learning_rate": 0.00043035017778256647, | |
| "loss": 3.5219, | |
| "step": 26350 | |
| }, | |
| { | |
| "epoch": 2.841459476913142, | |
| "grad_norm": 0.578912079334259, | |
| "learning_rate": 0.0004300269367525051, | |
| "loss": 3.5432, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 2.84684102895275, | |
| "grad_norm": 1.0079190731048584, | |
| "learning_rate": 0.00042970369572244366, | |
| "loss": 3.5434, | |
| "step": 26450 | |
| }, | |
| { | |
| "epoch": 2.852222580992358, | |
| "grad_norm": 0.5640878081321716, | |
| "learning_rate": 0.00042938045469238226, | |
| "loss": 3.5186, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 2.857604133031966, | |
| "grad_norm": 0.5524346828460693, | |
| "learning_rate": 0.0004290572136623208, | |
| "loss": 3.5352, | |
| "step": 26550 | |
| }, | |
| { | |
| "epoch": 2.8629856850715747, | |
| "grad_norm": 0.568709671497345, | |
| "learning_rate": 0.0004287339726322594, | |
| "loss": 3.5349, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 2.868367237111183, | |
| "grad_norm": 0.533467710018158, | |
| "learning_rate": 0.00042841073160219804, | |
| "loss": 3.5455, | |
| "step": 26650 | |
| }, | |
| { | |
| "epoch": 2.873748789150791, | |
| "grad_norm": 0.5956966280937195, | |
| "learning_rate": 0.0004280874905721366, | |
| "loss": 3.5321, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 2.8791303411903995, | |
| "grad_norm": 0.5749139785766602, | |
| "learning_rate": 0.0004277642495420752, | |
| "loss": 3.5357, | |
| "step": 26750 | |
| }, | |
| { | |
| "epoch": 2.8845118932300076, | |
| "grad_norm": 0.6081565022468567, | |
| "learning_rate": 0.00042744100851201377, | |
| "loss": 3.514, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 2.8898934452696157, | |
| "grad_norm": 0.5851724743843079, | |
| "learning_rate": 0.0004271177674819523, | |
| "loss": 3.5226, | |
| "step": 26850 | |
| }, | |
| { | |
| "epoch": 2.895274997309224, | |
| "grad_norm": 0.6097344160079956, | |
| "learning_rate": 0.00042680099127249217, | |
| "loss": 3.5233, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 2.9006565493488323, | |
| "grad_norm": 0.575410008430481, | |
| "learning_rate": 0.0004264777502424307, | |
| "loss": 3.5413, | |
| "step": 26950 | |
| }, | |
| { | |
| "epoch": 2.9060381013884404, | |
| "grad_norm": 0.6111117601394653, | |
| "learning_rate": 0.0004261545092123693, | |
| "loss": 3.5145, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 2.9060381013884404, | |
| "eval_accuracy": 0.36941308209019036, | |
| "eval_loss": 3.5066769123077393, | |
| "eval_runtime": 184.6661, | |
| "eval_samples_per_second": 97.533, | |
| "eval_steps_per_second": 6.097, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 2.9114196534280485, | |
| "grad_norm": 0.5597397089004517, | |
| "learning_rate": 0.00042583126818230795, | |
| "loss": 3.5104, | |
| "step": 27050 | |
| }, | |
| { | |
| "epoch": 2.9168012054676566, | |
| "grad_norm": 0.5836156010627747, | |
| "learning_rate": 0.0004255080271522465, | |
| "loss": 3.5324, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 2.922182757507265, | |
| "grad_norm": 0.5847307443618774, | |
| "learning_rate": 0.0004251847861221851, | |
| "loss": 3.5126, | |
| "step": 27150 | |
| }, | |
| { | |
| "epoch": 2.9275643095468733, | |
| "grad_norm": 0.5847127437591553, | |
| "learning_rate": 0.00042486154509212363, | |
| "loss": 3.5196, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 2.9329458615864814, | |
| "grad_norm": 0.5771723985671997, | |
| "learning_rate": 0.0004245383040620622, | |
| "loss": 3.5229, | |
| "step": 27250 | |
| }, | |
| { | |
| "epoch": 2.93832741362609, | |
| "grad_norm": 0.5682346224784851, | |
| "learning_rate": 0.0004242150630320009, | |
| "loss": 3.5362, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 2.943708965665698, | |
| "grad_norm": 0.600759744644165, | |
| "learning_rate": 0.0004238918220019394, | |
| "loss": 3.5464, | |
| "step": 27350 | |
| }, | |
| { | |
| "epoch": 2.949090517705306, | |
| "grad_norm": 0.5634221434593201, | |
| "learning_rate": 0.000423568580971878, | |
| "loss": 3.5436, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 2.9544720697449143, | |
| "grad_norm": 0.5330511331558228, | |
| "learning_rate": 0.00042324533994181655, | |
| "loss": 3.5156, | |
| "step": 27450 | |
| }, | |
| { | |
| "epoch": 2.9598536217845224, | |
| "grad_norm": 0.6540337800979614, | |
| "learning_rate": 0.00042292209891175514, | |
| "loss": 3.5174, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 2.965235173824131, | |
| "grad_norm": 0.5485448241233826, | |
| "learning_rate": 0.00042259885788169374, | |
| "loss": 3.5411, | |
| "step": 27550 | |
| }, | |
| { | |
| "epoch": 2.970616725863739, | |
| "grad_norm": 0.5691707134246826, | |
| "learning_rate": 0.00042227561685163233, | |
| "loss": 3.5295, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 2.975998277903347, | |
| "grad_norm": 0.5650212168693542, | |
| "learning_rate": 0.00042195237582157093, | |
| "loss": 3.5074, | |
| "step": 27650 | |
| }, | |
| { | |
| "epoch": 2.9813798299429557, | |
| "grad_norm": 0.6257863640785217, | |
| "learning_rate": 0.0004216291347915095, | |
| "loss": 3.5218, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 2.986761381982564, | |
| "grad_norm": 0.5358322262763977, | |
| "learning_rate": 0.00042130589376144806, | |
| "loss": 3.527, | |
| "step": 27750 | |
| }, | |
| { | |
| "epoch": 2.992142934022172, | |
| "grad_norm": 0.5799550414085388, | |
| "learning_rate": 0.00042098265273138666, | |
| "loss": 3.5273, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 2.9975244860617805, | |
| "grad_norm": 0.5868208408355713, | |
| "learning_rate": 0.0004206594117013252, | |
| "loss": 3.539, | |
| "step": 27850 | |
| }, | |
| { | |
| "epoch": 3.0029060381013886, | |
| "grad_norm": 0.599449872970581, | |
| "learning_rate": 0.00042033617067126385, | |
| "loss": 3.4554, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 3.0082875901409967, | |
| "grad_norm": 0.6563495993614197, | |
| "learning_rate": 0.00042001292964120244, | |
| "loss": 3.4225, | |
| "step": 27950 | |
| }, | |
| { | |
| "epoch": 3.0136691421806048, | |
| "grad_norm": 0.6105541586875916, | |
| "learning_rate": 0.000419689688611141, | |
| "loss": 3.4242, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 3.0136691421806048, | |
| "eval_accuracy": 0.3703614044950352, | |
| "eval_loss": 3.4996602535247803, | |
| "eval_runtime": 184.2896, | |
| "eval_samples_per_second": 97.732, | |
| "eval_steps_per_second": 6.11, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 3.0190506942202133, | |
| "grad_norm": 0.6090134978294373, | |
| "learning_rate": 0.0004193664475810796, | |
| "loss": 3.4247, | |
| "step": 28050 | |
| }, | |
| { | |
| "epoch": 3.0244322462598214, | |
| "grad_norm": 0.6373844742774963, | |
| "learning_rate": 0.00041904320655101817, | |
| "loss": 3.4486, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 3.0298137982994295, | |
| "grad_norm": 0.6366713643074036, | |
| "learning_rate": 0.0004187199655209567, | |
| "loss": 3.4378, | |
| "step": 28150 | |
| }, | |
| { | |
| "epoch": 3.0351953503390376, | |
| "grad_norm": 0.5975115895271301, | |
| "learning_rate": 0.00041839672449089536, | |
| "loss": 3.4535, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 3.040576902378646, | |
| "grad_norm": 0.6184483170509338, | |
| "learning_rate": 0.00041807348346083395, | |
| "loss": 3.4623, | |
| "step": 28250 | |
| }, | |
| { | |
| "epoch": 3.0459584544182543, | |
| "grad_norm": 0.5912527441978455, | |
| "learning_rate": 0.0004177502424307725, | |
| "loss": 3.4439, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 3.0513400064578624, | |
| "grad_norm": 0.6498454809188843, | |
| "learning_rate": 0.0004174270014007111, | |
| "loss": 3.4491, | |
| "step": 28350 | |
| }, | |
| { | |
| "epoch": 3.0567215584974705, | |
| "grad_norm": 0.574679434299469, | |
| "learning_rate": 0.00041710376037064963, | |
| "loss": 3.4435, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 3.062103110537079, | |
| "grad_norm": 0.601494312286377, | |
| "learning_rate": 0.0004167805193405883, | |
| "loss": 3.4275, | |
| "step": 28450 | |
| }, | |
| { | |
| "epoch": 3.067484662576687, | |
| "grad_norm": 0.5695728063583374, | |
| "learning_rate": 0.0004164572783105269, | |
| "loss": 3.4594, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 3.0728662146162953, | |
| "grad_norm": 0.5873692035675049, | |
| "learning_rate": 0.0004161340372804654, | |
| "loss": 3.4718, | |
| "step": 28550 | |
| }, | |
| { | |
| "epoch": 3.0782477666559034, | |
| "grad_norm": 0.5661360025405884, | |
| "learning_rate": 0.000415810796250404, | |
| "loss": 3.4408, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 3.083629318695512, | |
| "grad_norm": 0.6276664137840271, | |
| "learning_rate": 0.0004154875552203426, | |
| "loss": 3.4421, | |
| "step": 28650 | |
| }, | |
| { | |
| "epoch": 3.08901087073512, | |
| "grad_norm": 0.5694606900215149, | |
| "learning_rate": 0.00041516431419028114, | |
| "loss": 3.4527, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 3.094392422774728, | |
| "grad_norm": 0.6117746829986572, | |
| "learning_rate": 0.0004148410731602198, | |
| "loss": 3.4553, | |
| "step": 28750 | |
| }, | |
| { | |
| "epoch": 3.0997739748143363, | |
| "grad_norm": 0.5743106603622437, | |
| "learning_rate": 0.0004145178321301584, | |
| "loss": 3.4538, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 3.105155526853945, | |
| "grad_norm": 0.6104516983032227, | |
| "learning_rate": 0.00041419459110009693, | |
| "loss": 3.4521, | |
| "step": 28850 | |
| }, | |
| { | |
| "epoch": 3.110537078893553, | |
| "grad_norm": 0.6102493405342102, | |
| "learning_rate": 0.0004138713500700355, | |
| "loss": 3.4463, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 3.115918630933161, | |
| "grad_norm": 0.6518812775611877, | |
| "learning_rate": 0.00041354810903997406, | |
| "loss": 3.4416, | |
| "step": 28950 | |
| }, | |
| { | |
| "epoch": 3.121300182972769, | |
| "grad_norm": 0.5711073875427246, | |
| "learning_rate": 0.00041322486800991266, | |
| "loss": 3.4594, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 3.121300182972769, | |
| "eval_accuracy": 0.37099832772340363, | |
| "eval_loss": 3.499441385269165, | |
| "eval_runtime": 184.2527, | |
| "eval_samples_per_second": 97.752, | |
| "eval_steps_per_second": 6.111, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 3.1266817350123777, | |
| "grad_norm": 0.5713046789169312, | |
| "learning_rate": 0.00041290809180045246, | |
| "loss": 3.4588, | |
| "step": 29050 | |
| }, | |
| { | |
| "epoch": 3.132063287051986, | |
| "grad_norm": 0.6167808175086975, | |
| "learning_rate": 0.0004125848507703911, | |
| "loss": 3.4449, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 3.137444839091594, | |
| "grad_norm": 0.5783477425575256, | |
| "learning_rate": 0.0004122616097403297, | |
| "loss": 3.4579, | |
| "step": 29150 | |
| }, | |
| { | |
| "epoch": 3.1428263911312024, | |
| "grad_norm": 0.6156630516052246, | |
| "learning_rate": 0.00041193836871026825, | |
| "loss": 3.4503, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 3.1482079431708105, | |
| "grad_norm": 0.5773206353187561, | |
| "learning_rate": 0.00041161512768020684, | |
| "loss": 3.457, | |
| "step": 29250 | |
| }, | |
| { | |
| "epoch": 3.1535894952104186, | |
| "grad_norm": 0.638312816619873, | |
| "learning_rate": 0.0004112918866501454, | |
| "loss": 3.4327, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 3.1589710472500268, | |
| "grad_norm": 0.5940792560577393, | |
| "learning_rate": 0.000410968645620084, | |
| "loss": 3.4464, | |
| "step": 29350 | |
| }, | |
| { | |
| "epoch": 3.1643525992896353, | |
| "grad_norm": 0.6411643624305725, | |
| "learning_rate": 0.0004106454045900226, | |
| "loss": 3.4502, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 3.1697341513292434, | |
| "grad_norm": 0.627984881401062, | |
| "learning_rate": 0.00041032216355996117, | |
| "loss": 3.4583, | |
| "step": 29450 | |
| }, | |
| { | |
| "epoch": 3.1751157033688515, | |
| "grad_norm": 0.6114353537559509, | |
| "learning_rate": 0.00040999892252989976, | |
| "loss": 3.4434, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 3.1804972554084596, | |
| "grad_norm": 0.6601144671440125, | |
| "learning_rate": 0.00040967568149983836, | |
| "loss": 3.4406, | |
| "step": 29550 | |
| }, | |
| { | |
| "epoch": 3.185878807448068, | |
| "grad_norm": 0.5516364574432373, | |
| "learning_rate": 0.0004093524404697769, | |
| "loss": 3.4536, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 3.1912603594876763, | |
| "grad_norm": 0.5681900382041931, | |
| "learning_rate": 0.0004090291994397155, | |
| "loss": 3.4596, | |
| "step": 29650 | |
| }, | |
| { | |
| "epoch": 3.1966419115272844, | |
| "grad_norm": 0.6558794975280762, | |
| "learning_rate": 0.00040870595840965414, | |
| "loss": 3.4553, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 3.2020234635668925, | |
| "grad_norm": 0.6110433340072632, | |
| "learning_rate": 0.0004083827173795927, | |
| "loss": 3.4507, | |
| "step": 29750 | |
| }, | |
| { | |
| "epoch": 3.207405015606501, | |
| "grad_norm": 0.6141265630722046, | |
| "learning_rate": 0.0004080594763495313, | |
| "loss": 3.4408, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 3.212786567646109, | |
| "grad_norm": 0.6038244962692261, | |
| "learning_rate": 0.0004077362353194698, | |
| "loss": 3.4284, | |
| "step": 29850 | |
| }, | |
| { | |
| "epoch": 3.2181681196857173, | |
| "grad_norm": 0.6001572608947754, | |
| "learning_rate": 0.0004074129942894084, | |
| "loss": 3.4569, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 3.2235496717253254, | |
| "grad_norm": 0.6228058338165283, | |
| "learning_rate": 0.000407089753259347, | |
| "loss": 3.4854, | |
| "step": 29950 | |
| }, | |
| { | |
| "epoch": 3.228931223764934, | |
| "grad_norm": 0.6055977940559387, | |
| "learning_rate": 0.0004067665122292856, | |
| "loss": 3.4337, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 3.228931223764934, | |
| "eval_accuracy": 0.37154919786462304, | |
| "eval_loss": 3.493218421936035, | |
| "eval_runtime": 184.7775, | |
| "eval_samples_per_second": 97.474, | |
| "eval_steps_per_second": 6.094, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 3.234312775804542, | |
| "grad_norm": 0.6261052489280701, | |
| "learning_rate": 0.0004064432711992242, | |
| "loss": 3.4797, | |
| "step": 30050 | |
| }, | |
| { | |
| "epoch": 3.23969432784415, | |
| "grad_norm": 0.5575237274169922, | |
| "learning_rate": 0.0004061200301691628, | |
| "loss": 3.4505, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 3.2450758798837587, | |
| "grad_norm": 0.5761787295341492, | |
| "learning_rate": 0.00040579678913910133, | |
| "loss": 3.4694, | |
| "step": 30150 | |
| }, | |
| { | |
| "epoch": 3.250457431923367, | |
| "grad_norm": 0.5979387760162354, | |
| "learning_rate": 0.0004054735481090399, | |
| "loss": 3.4579, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 3.255838983962975, | |
| "grad_norm": 0.5906158089637756, | |
| "learning_rate": 0.0004051503070789786, | |
| "loss": 3.4587, | |
| "step": 30250 | |
| }, | |
| { | |
| "epoch": 3.261220536002583, | |
| "grad_norm": 0.6434659361839294, | |
| "learning_rate": 0.0004048270660489171, | |
| "loss": 3.4295, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 3.2666020880421915, | |
| "grad_norm": 0.5845122337341309, | |
| "learning_rate": 0.0004045038250188557, | |
| "loss": 3.4527, | |
| "step": 30350 | |
| }, | |
| { | |
| "epoch": 3.2719836400817996, | |
| "grad_norm": 0.608493447303772, | |
| "learning_rate": 0.00040418058398879425, | |
| "loss": 3.4514, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 3.2773651921214078, | |
| "grad_norm": 0.5747212171554565, | |
| "learning_rate": 0.00040385734295873284, | |
| "loss": 3.4804, | |
| "step": 30450 | |
| }, | |
| { | |
| "epoch": 3.282746744161016, | |
| "grad_norm": 0.6562155485153198, | |
| "learning_rate": 0.00040354056674927265, | |
| "loss": 3.4651, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 3.2881282962006244, | |
| "grad_norm": 0.6439652442932129, | |
| "learning_rate": 0.00040321732571921124, | |
| "loss": 3.4652, | |
| "step": 30550 | |
| }, | |
| { | |
| "epoch": 3.2935098482402325, | |
| "grad_norm": 0.6053032279014587, | |
| "learning_rate": 0.0004028940846891498, | |
| "loss": 3.4706, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 3.2988914002798406, | |
| "grad_norm": 0.6160561442375183, | |
| "learning_rate": 0.00040257084365908843, | |
| "loss": 3.447, | |
| "step": 30650 | |
| }, | |
| { | |
| "epoch": 3.304272952319449, | |
| "grad_norm": 0.6740018129348755, | |
| "learning_rate": 0.00040224760262902703, | |
| "loss": 3.4561, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 3.3096545043590573, | |
| "grad_norm": 0.6057961583137512, | |
| "learning_rate": 0.00040192436159896557, | |
| "loss": 3.4598, | |
| "step": 30750 | |
| }, | |
| { | |
| "epoch": 3.3150360563986654, | |
| "grad_norm": 0.5896490216255188, | |
| "learning_rate": 0.00040160112056890416, | |
| "loss": 3.452, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 3.3204176084382735, | |
| "grad_norm": 0.6008713245391846, | |
| "learning_rate": 0.00040127787953884276, | |
| "loss": 3.4669, | |
| "step": 30850 | |
| }, | |
| { | |
| "epoch": 3.3257991604778816, | |
| "grad_norm": 0.6577219367027283, | |
| "learning_rate": 0.00040095463850878135, | |
| "loss": 3.4536, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 3.33118071251749, | |
| "grad_norm": 0.581386923789978, | |
| "learning_rate": 0.00040063139747871995, | |
| "loss": 3.4546, | |
| "step": 30950 | |
| }, | |
| { | |
| "epoch": 3.3365622645570983, | |
| "grad_norm": 0.6249150037765503, | |
| "learning_rate": 0.00040030815644865854, | |
| "loss": 3.4576, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 3.3365622645570983, | |
| "eval_accuracy": 0.3725115364919959, | |
| "eval_loss": 3.485410213470459, | |
| "eval_runtime": 184.3657, | |
| "eval_samples_per_second": 97.692, | |
| "eval_steps_per_second": 6.107, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 3.3419438165967064, | |
| "grad_norm": 0.60781329870224, | |
| "learning_rate": 0.0003999849154185971, | |
| "loss": 3.4657, | |
| "step": 31050 | |
| }, | |
| { | |
| "epoch": 3.347325368636315, | |
| "grad_norm": 0.6397443413734436, | |
| "learning_rate": 0.0003996616743885357, | |
| "loss": 3.4562, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 3.352706920675923, | |
| "grad_norm": 0.6920917630195618, | |
| "learning_rate": 0.0003993384333584742, | |
| "loss": 3.4667, | |
| "step": 31150 | |
| }, | |
| { | |
| "epoch": 3.358088472715531, | |
| "grad_norm": 0.6365469098091125, | |
| "learning_rate": 0.00039901519232841287, | |
| "loss": 3.4531, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 3.3634700247551392, | |
| "grad_norm": 0.6367911696434021, | |
| "learning_rate": 0.00039869195129835146, | |
| "loss": 3.4559, | |
| "step": 31250 | |
| }, | |
| { | |
| "epoch": 3.368851576794748, | |
| "grad_norm": 0.6479049921035767, | |
| "learning_rate": 0.00039836871026829, | |
| "loss": 3.4599, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 3.374233128834356, | |
| "grad_norm": 0.6077846884727478, | |
| "learning_rate": 0.0003980454692382286, | |
| "loss": 3.4647, | |
| "step": 31350 | |
| }, | |
| { | |
| "epoch": 3.379614680873964, | |
| "grad_norm": 0.610712468624115, | |
| "learning_rate": 0.0003977222282081672, | |
| "loss": 3.4588, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 3.384996232913572, | |
| "grad_norm": 0.6273388862609863, | |
| "learning_rate": 0.00039739898717810573, | |
| "loss": 3.4545, | |
| "step": 31450 | |
| }, | |
| { | |
| "epoch": 3.3903777849531807, | |
| "grad_norm": 0.5932088494300842, | |
| "learning_rate": 0.0003970757461480444, | |
| "loss": 3.4425, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 3.3957593369927888, | |
| "grad_norm": 0.5899380445480347, | |
| "learning_rate": 0.000396752505117983, | |
| "loss": 3.4601, | |
| "step": 31550 | |
| }, | |
| { | |
| "epoch": 3.401140889032397, | |
| "grad_norm": 0.6221842765808105, | |
| "learning_rate": 0.0003964292640879215, | |
| "loss": 3.4652, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 3.4065224410720054, | |
| "grad_norm": 0.596818745136261, | |
| "learning_rate": 0.0003961060230578601, | |
| "loss": 3.4464, | |
| "step": 31650 | |
| }, | |
| { | |
| "epoch": 3.4119039931116135, | |
| "grad_norm": 0.6313436627388, | |
| "learning_rate": 0.00039578278202779865, | |
| "loss": 3.4746, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 3.4172855451512216, | |
| "grad_norm": 0.7080163359642029, | |
| "learning_rate": 0.00039545954099773725, | |
| "loss": 3.4722, | |
| "step": 31750 | |
| }, | |
| { | |
| "epoch": 3.4226670971908297, | |
| "grad_norm": 0.5776293277740479, | |
| "learning_rate": 0.0003951362999676759, | |
| "loss": 3.4666, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 3.428048649230438, | |
| "grad_norm": 0.6150648593902588, | |
| "learning_rate": 0.00039481305893761444, | |
| "loss": 3.4449, | |
| "step": 31850 | |
| }, | |
| { | |
| "epoch": 3.4334302012700464, | |
| "grad_norm": 0.654681384563446, | |
| "learning_rate": 0.00039448981790755303, | |
| "loss": 3.4583, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 3.4388117533096545, | |
| "grad_norm": 0.6297449469566345, | |
| "learning_rate": 0.0003941665768774916, | |
| "loss": 3.4802, | |
| "step": 31950 | |
| }, | |
| { | |
| "epoch": 3.4441933053492626, | |
| "grad_norm": 0.5857325792312622, | |
| "learning_rate": 0.00039384333584743016, | |
| "loss": 3.4854, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 3.4441933053492626, | |
| "eval_accuracy": 0.3729826554135595, | |
| "eval_loss": 3.477412700653076, | |
| "eval_runtime": 184.1469, | |
| "eval_samples_per_second": 97.808, | |
| "eval_steps_per_second": 6.115, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 3.449574857388871, | |
| "grad_norm": 0.5666436553001404, | |
| "learning_rate": 0.0003935200948173688, | |
| "loss": 3.4641, | |
| "step": 32050 | |
| }, | |
| { | |
| "epoch": 3.4549564094284793, | |
| "grad_norm": 0.6132357120513916, | |
| "learning_rate": 0.0003931968537873074, | |
| "loss": 3.4594, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 3.4603379614680874, | |
| "grad_norm": 0.6949991583824158, | |
| "learning_rate": 0.00039287361275724595, | |
| "loss": 3.4698, | |
| "step": 32150 | |
| }, | |
| { | |
| "epoch": 3.4657195135076955, | |
| "grad_norm": 0.5902615785598755, | |
| "learning_rate": 0.00039255037172718454, | |
| "loss": 3.4537, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 3.471101065547304, | |
| "grad_norm": 0.6425248980522156, | |
| "learning_rate": 0.0003922271306971231, | |
| "loss": 3.4657, | |
| "step": 32250 | |
| }, | |
| { | |
| "epoch": 3.476482617586912, | |
| "grad_norm": 0.6459230184555054, | |
| "learning_rate": 0.0003919038896670617, | |
| "loss": 3.4617, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 3.4818641696265202, | |
| "grad_norm": 0.8496313691139221, | |
| "learning_rate": 0.00039158064863700033, | |
| "loss": 3.4397, | |
| "step": 32350 | |
| }, | |
| { | |
| "epoch": 3.4872457216661283, | |
| "grad_norm": 0.6361708641052246, | |
| "learning_rate": 0.00039125740760693887, | |
| "loss": 3.4406, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 3.492627273705737, | |
| "grad_norm": 0.6062164902687073, | |
| "learning_rate": 0.00039093416657687746, | |
| "loss": 3.4758, | |
| "step": 32450 | |
| }, | |
| { | |
| "epoch": 3.498008825745345, | |
| "grad_norm": 0.6049349904060364, | |
| "learning_rate": 0.00039061092554681606, | |
| "loss": 3.4733, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 3.503390377784953, | |
| "grad_norm": 0.6517959833145142, | |
| "learning_rate": 0.0003902876845167546, | |
| "loss": 3.4654, | |
| "step": 32550 | |
| }, | |
| { | |
| "epoch": 3.5087719298245617, | |
| "grad_norm": 0.5791226625442505, | |
| "learning_rate": 0.0003899644434866932, | |
| "loss": 3.4739, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 3.5141534818641698, | |
| "grad_norm": 0.6079575419425964, | |
| "learning_rate": 0.00038964120245663184, | |
| "loss": 3.4733, | |
| "step": 32650 | |
| }, | |
| { | |
| "epoch": 3.519535033903778, | |
| "grad_norm": 0.5950577855110168, | |
| "learning_rate": 0.0003893179614265704, | |
| "loss": 3.4655, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 3.524916585943386, | |
| "grad_norm": 0.8927932381629944, | |
| "learning_rate": 0.000388994720396509, | |
| "loss": 3.4671, | |
| "step": 32750 | |
| }, | |
| { | |
| "epoch": 3.530298137982994, | |
| "grad_norm": 0.5713189840316772, | |
| "learning_rate": 0.0003886714793664475, | |
| "loss": 3.4672, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 3.5356796900226026, | |
| "grad_norm": 0.6249322891235352, | |
| "learning_rate": 0.0003883482383363861, | |
| "loss": 3.449, | |
| "step": 32850 | |
| }, | |
| { | |
| "epoch": 3.5410612420622107, | |
| "grad_norm": 0.635231077671051, | |
| "learning_rate": 0.00038802499730632476, | |
| "loss": 3.4726, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 3.546442794101819, | |
| "grad_norm": 0.6587965488433838, | |
| "learning_rate": 0.0003877017562762633, | |
| "loss": 3.4602, | |
| "step": 32950 | |
| }, | |
| { | |
| "epoch": 3.5518243461414274, | |
| "grad_norm": 0.6618245840072632, | |
| "learning_rate": 0.0003873785152462019, | |
| "loss": 3.4504, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 3.5518243461414274, | |
| "eval_accuracy": 0.3737593062556574, | |
| "eval_loss": 3.4713170528411865, | |
| "eval_runtime": 184.6965, | |
| "eval_samples_per_second": 97.517, | |
| "eval_steps_per_second": 6.096, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 3.5572058981810355, | |
| "grad_norm": 0.5909431576728821, | |
| "learning_rate": 0.0003870552742161405, | |
| "loss": 3.4404, | |
| "step": 33050 | |
| }, | |
| { | |
| "epoch": 3.5625874502206436, | |
| "grad_norm": 0.6251736879348755, | |
| "learning_rate": 0.00038673203318607903, | |
| "loss": 3.4581, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 3.5679690022602517, | |
| "grad_norm": 0.6399909853935242, | |
| "learning_rate": 0.0003864087921560176, | |
| "loss": 3.4428, | |
| "step": 33150 | |
| }, | |
| { | |
| "epoch": 3.57335055429986, | |
| "grad_norm": 0.6242807507514954, | |
| "learning_rate": 0.0003860855511259563, | |
| "loss": 3.4589, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 3.5787321063394684, | |
| "grad_norm": 0.6046002507209778, | |
| "learning_rate": 0.0003857623100958948, | |
| "loss": 3.4656, | |
| "step": 33250 | |
| }, | |
| { | |
| "epoch": 3.5841136583790765, | |
| "grad_norm": 0.6714318990707397, | |
| "learning_rate": 0.0003854390690658334, | |
| "loss": 3.4618, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 3.5894952104186846, | |
| "grad_norm": 0.5967142581939697, | |
| "learning_rate": 0.00038511582803577195, | |
| "loss": 3.4555, | |
| "step": 33350 | |
| }, | |
| { | |
| "epoch": 3.594876762458293, | |
| "grad_norm": 0.6154034733772278, | |
| "learning_rate": 0.00038479258700571054, | |
| "loss": 3.4456, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 3.6002583144979012, | |
| "grad_norm": 0.5872713923454285, | |
| "learning_rate": 0.00038446934597564914, | |
| "loss": 3.4508, | |
| "step": 33450 | |
| }, | |
| { | |
| "epoch": 3.6056398665375093, | |
| "grad_norm": 0.6076340079307556, | |
| "learning_rate": 0.00038414610494558773, | |
| "loss": 3.456, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 3.611021418577118, | |
| "grad_norm": 0.5723451972007751, | |
| "learning_rate": 0.00038382286391552633, | |
| "loss": 3.4651, | |
| "step": 33550 | |
| }, | |
| { | |
| "epoch": 3.616402970616726, | |
| "grad_norm": 0.5870938301086426, | |
| "learning_rate": 0.0003834996228854649, | |
| "loss": 3.4636, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 3.621784522656334, | |
| "grad_norm": 0.6072779893875122, | |
| "learning_rate": 0.00038317638185540346, | |
| "loss": 3.4644, | |
| "step": 33650 | |
| }, | |
| { | |
| "epoch": 3.627166074695942, | |
| "grad_norm": 0.6000068187713623, | |
| "learning_rate": 0.00038285960564594327, | |
| "loss": 3.4671, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 3.6325476267355503, | |
| "grad_norm": 0.6160212159156799, | |
| "learning_rate": 0.00038253636461588186, | |
| "loss": 3.463, | |
| "step": 33750 | |
| }, | |
| { | |
| "epoch": 3.637929178775159, | |
| "grad_norm": 0.7063570618629456, | |
| "learning_rate": 0.00038221312358582046, | |
| "loss": 3.4378, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 3.643310730814767, | |
| "grad_norm": 0.6322859525680542, | |
| "learning_rate": 0.00038188988255575905, | |
| "loss": 3.4596, | |
| "step": 33850 | |
| }, | |
| { | |
| "epoch": 3.648692282854375, | |
| "grad_norm": 0.6268704533576965, | |
| "learning_rate": 0.00038156664152569765, | |
| "loss": 3.4784, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 3.6540738348939836, | |
| "grad_norm": 0.6058254837989807, | |
| "learning_rate": 0.00038124340049563624, | |
| "loss": 3.455, | |
| "step": 33950 | |
| }, | |
| { | |
| "epoch": 3.6594553869335917, | |
| "grad_norm": 0.6425801515579224, | |
| "learning_rate": 0.0003809201594655748, | |
| "loss": 3.449, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 3.6594553869335917, | |
| "eval_accuracy": 0.3744735903401498, | |
| "eval_loss": 3.4654529094696045, | |
| "eval_runtime": 184.3767, | |
| "eval_samples_per_second": 97.686, | |
| "eval_steps_per_second": 6.107, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 3.6648369389732, | |
| "grad_norm": 0.6552558541297913, | |
| "learning_rate": 0.0003805969184355134, | |
| "loss": 3.4635, | |
| "step": 34050 | |
| }, | |
| { | |
| "epoch": 3.670218491012808, | |
| "grad_norm": 0.6297048926353455, | |
| "learning_rate": 0.0003802736774054519, | |
| "loss": 3.4479, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 3.675600043052416, | |
| "grad_norm": 0.6090021729469299, | |
| "learning_rate": 0.00037995043637539057, | |
| "loss": 3.4711, | |
| "step": 34150 | |
| }, | |
| { | |
| "epoch": 3.6809815950920246, | |
| "grad_norm": 0.6067349314689636, | |
| "learning_rate": 0.00037962719534532916, | |
| "loss": 3.4607, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 3.6863631471316327, | |
| "grad_norm": 0.6618533730506897, | |
| "learning_rate": 0.0003793039543152677, | |
| "loss": 3.4597, | |
| "step": 34250 | |
| }, | |
| { | |
| "epoch": 3.691744699171241, | |
| "grad_norm": 0.5938194990158081, | |
| "learning_rate": 0.0003789807132852063, | |
| "loss": 3.4435, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 3.6971262512108494, | |
| "grad_norm": 0.6445248126983643, | |
| "learning_rate": 0.0003786574722551449, | |
| "loss": 3.4287, | |
| "step": 34350 | |
| }, | |
| { | |
| "epoch": 3.7025078032504575, | |
| "grad_norm": 0.6069079041481018, | |
| "learning_rate": 0.00037833423122508343, | |
| "loss": 3.4655, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 3.7078893552900656, | |
| "grad_norm": 0.6393512487411499, | |
| "learning_rate": 0.0003780109901950221, | |
| "loss": 3.457, | |
| "step": 34450 | |
| }, | |
| { | |
| "epoch": 3.713270907329674, | |
| "grad_norm": 0.5912173390388489, | |
| "learning_rate": 0.0003776877491649607, | |
| "loss": 3.4475, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 3.7186524593692822, | |
| "grad_norm": 0.6467220187187195, | |
| "learning_rate": 0.0003773645081348992, | |
| "loss": 3.4497, | |
| "step": 34550 | |
| }, | |
| { | |
| "epoch": 3.7240340114088903, | |
| "grad_norm": 0.6172649264335632, | |
| "learning_rate": 0.0003770412671048378, | |
| "loss": 3.4442, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 3.7294155634484984, | |
| "grad_norm": 0.7250891327857971, | |
| "learning_rate": 0.00037671802607477635, | |
| "loss": 3.4512, | |
| "step": 34650 | |
| }, | |
| { | |
| "epoch": 3.7347971154881066, | |
| "grad_norm": 0.6259332299232483, | |
| "learning_rate": 0.000376394785044715, | |
| "loss": 3.4547, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 3.740178667527715, | |
| "grad_norm": 0.6603428721427917, | |
| "learning_rate": 0.0003760715440146536, | |
| "loss": 3.4524, | |
| "step": 34750 | |
| }, | |
| { | |
| "epoch": 3.745560219567323, | |
| "grad_norm": 0.6384559273719788, | |
| "learning_rate": 0.00037574830298459214, | |
| "loss": 3.4597, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 3.7509417716069313, | |
| "grad_norm": 0.5936117172241211, | |
| "learning_rate": 0.00037542506195453073, | |
| "loss": 3.4618, | |
| "step": 34850 | |
| }, | |
| { | |
| "epoch": 3.75632332364654, | |
| "grad_norm": 0.6444550156593323, | |
| "learning_rate": 0.0003751018209244693, | |
| "loss": 3.4451, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 3.761704875686148, | |
| "grad_norm": 0.6175785660743713, | |
| "learning_rate": 0.00037477857989440787, | |
| "loss": 3.4498, | |
| "step": 34950 | |
| }, | |
| { | |
| "epoch": 3.767086427725756, | |
| "grad_norm": 0.6283439993858337, | |
| "learning_rate": 0.0003744553388643465, | |
| "loss": 3.4402, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 3.767086427725756, | |
| "eval_accuracy": 0.37506064189301147, | |
| "eval_loss": 3.4579710960388184, | |
| "eval_runtime": 184.4841, | |
| "eval_samples_per_second": 97.629, | |
| "eval_steps_per_second": 6.104, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 3.772467979765364, | |
| "grad_norm": 0.6437238454818726, | |
| "learning_rate": 0.0003741320978342851, | |
| "loss": 3.45, | |
| "step": 35050 | |
| }, | |
| { | |
| "epoch": 3.7778495318049723, | |
| "grad_norm": 0.6655225157737732, | |
| "learning_rate": 0.00037380885680422365, | |
| "loss": 3.4558, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 3.783231083844581, | |
| "grad_norm": 0.6094281673431396, | |
| "learning_rate": 0.00037348561577416224, | |
| "loss": 3.451, | |
| "step": 35150 | |
| }, | |
| { | |
| "epoch": 3.788612635884189, | |
| "grad_norm": 0.6181848645210266, | |
| "learning_rate": 0.0003731623747441008, | |
| "loss": 3.443, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 3.793994187923797, | |
| "grad_norm": 0.6608075499534607, | |
| "learning_rate": 0.0003728391337140394, | |
| "loss": 3.4426, | |
| "step": 35250 | |
| }, | |
| { | |
| "epoch": 3.7993757399634056, | |
| "grad_norm": 0.6112875938415527, | |
| "learning_rate": 0.00037251589268397803, | |
| "loss": 3.4448, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 3.8047572920030137, | |
| "grad_norm": 0.650355339050293, | |
| "learning_rate": 0.00037219265165391657, | |
| "loss": 3.4504, | |
| "step": 35350 | |
| }, | |
| { | |
| "epoch": 3.810138844042622, | |
| "grad_norm": 0.6517062187194824, | |
| "learning_rate": 0.00037186941062385516, | |
| "loss": 3.4615, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 3.8155203960822304, | |
| "grad_norm": 0.7008894085884094, | |
| "learning_rate": 0.0003715461695937937, | |
| "loss": 3.4503, | |
| "step": 35450 | |
| }, | |
| { | |
| "epoch": 3.8209019481218385, | |
| "grad_norm": 0.6126015186309814, | |
| "learning_rate": 0.0003712229285637323, | |
| "loss": 3.4488, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 3.8262835001614466, | |
| "grad_norm": 0.6550661325454712, | |
| "learning_rate": 0.0003708996875336709, | |
| "loss": 3.4358, | |
| "step": 35550 | |
| }, | |
| { | |
| "epoch": 3.8316650522010547, | |
| "grad_norm": 0.6141082644462585, | |
| "learning_rate": 0.0003705764465036095, | |
| "loss": 3.4613, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 3.837046604240663, | |
| "grad_norm": 0.6304882764816284, | |
| "learning_rate": 0.0003702532054735481, | |
| "loss": 3.4615, | |
| "step": 35650 | |
| }, | |
| { | |
| "epoch": 3.8424281562802713, | |
| "grad_norm": 0.6499120593070984, | |
| "learning_rate": 0.0003699299644434867, | |
| "loss": 3.4667, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 3.8478097083198795, | |
| "grad_norm": 0.6515492796897888, | |
| "learning_rate": 0.0003696067234134252, | |
| "loss": 3.4591, | |
| "step": 35750 | |
| }, | |
| { | |
| "epoch": 3.8531912603594876, | |
| "grad_norm": 0.6420202255249023, | |
| "learning_rate": 0.0003692834823833638, | |
| "loss": 3.4544, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 3.858572812399096, | |
| "grad_norm": 0.5830044150352478, | |
| "learning_rate": 0.00036896024135330246, | |
| "loss": 3.4592, | |
| "step": 35850 | |
| }, | |
| { | |
| "epoch": 3.863954364438704, | |
| "grad_norm": 0.61103355884552, | |
| "learning_rate": 0.000368637000323241, | |
| "loss": 3.4685, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 3.8693359164783123, | |
| "grad_norm": 0.7067725658416748, | |
| "learning_rate": 0.0003683137592931796, | |
| "loss": 3.469, | |
| "step": 35950 | |
| }, | |
| { | |
| "epoch": 3.8747174685179204, | |
| "grad_norm": 0.5974989533424377, | |
| "learning_rate": 0.00036799051826311814, | |
| "loss": 3.4581, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 3.8747174685179204, | |
| "eval_accuracy": 0.37549394960961563, | |
| "eval_loss": 3.4535152912139893, | |
| "eval_runtime": 184.307, | |
| "eval_samples_per_second": 97.723, | |
| "eval_steps_per_second": 6.109, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 3.8800990205575285, | |
| "grad_norm": 0.568252682685852, | |
| "learning_rate": 0.00036766727723305673, | |
| "loss": 3.4568, | |
| "step": 36050 | |
| }, | |
| { | |
| "epoch": 3.885480572597137, | |
| "grad_norm": 0.6623275279998779, | |
| "learning_rate": 0.0003673440362029953, | |
| "loss": 3.4443, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 3.890862124636745, | |
| "grad_norm": 0.6388083696365356, | |
| "learning_rate": 0.0003670207951729339, | |
| "loss": 3.456, | |
| "step": 36150 | |
| }, | |
| { | |
| "epoch": 3.8962436766763533, | |
| "grad_norm": 0.634536862373352, | |
| "learning_rate": 0.0003666975541428725, | |
| "loss": 3.4427, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 3.901625228715962, | |
| "grad_norm": 0.5885773301124573, | |
| "learning_rate": 0.0003663743131128111, | |
| "loss": 3.4392, | |
| "step": 36250 | |
| }, | |
| { | |
| "epoch": 3.90700678075557, | |
| "grad_norm": 0.7272632718086243, | |
| "learning_rate": 0.00036605107208274965, | |
| "loss": 3.4446, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 3.912388332795178, | |
| "grad_norm": 0.590799868106842, | |
| "learning_rate": 0.00036572783105268824, | |
| "loss": 3.4486, | |
| "step": 36350 | |
| }, | |
| { | |
| "epoch": 3.9177698848347866, | |
| "grad_norm": 0.6622004508972168, | |
| "learning_rate": 0.0003654045900226268, | |
| "loss": 3.4538, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 3.9231514368743947, | |
| "grad_norm": 0.6048895120620728, | |
| "learning_rate": 0.00036508134899256543, | |
| "loss": 3.4494, | |
| "step": 36450 | |
| }, | |
| { | |
| "epoch": 3.928532988914003, | |
| "grad_norm": 0.6244843602180481, | |
| "learning_rate": 0.00036475810796250403, | |
| "loss": 3.4429, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 3.933914540953611, | |
| "grad_norm": 0.7105559706687927, | |
| "learning_rate": 0.00036443486693244257, | |
| "loss": 3.4539, | |
| "step": 36550 | |
| }, | |
| { | |
| "epoch": 3.939296092993219, | |
| "grad_norm": 0.5961714386940002, | |
| "learning_rate": 0.00036411162590238116, | |
| "loss": 3.4572, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 3.9446776450328276, | |
| "grad_norm": 0.6044503450393677, | |
| "learning_rate": 0.00036378838487231976, | |
| "loss": 3.4577, | |
| "step": 36650 | |
| }, | |
| { | |
| "epoch": 3.9500591970724357, | |
| "grad_norm": 0.6206450462341309, | |
| "learning_rate": 0.00036346514384225835, | |
| "loss": 3.4502, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 3.955440749112044, | |
| "grad_norm": 0.6197695732116699, | |
| "learning_rate": 0.00036314190281219695, | |
| "loss": 3.4515, | |
| "step": 36750 | |
| }, | |
| { | |
| "epoch": 3.9608223011516523, | |
| "grad_norm": 0.6754399538040161, | |
| "learning_rate": 0.00036281866178213554, | |
| "loss": 3.4478, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 3.9662038531912605, | |
| "grad_norm": 0.6515491604804993, | |
| "learning_rate": 0.0003624954207520741, | |
| "loss": 3.4414, | |
| "step": 36850 | |
| }, | |
| { | |
| "epoch": 3.9715854052308686, | |
| "grad_norm": 0.7095997929573059, | |
| "learning_rate": 0.0003621721797220127, | |
| "loss": 3.4407, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 3.9769669572704767, | |
| "grad_norm": 0.6265909671783447, | |
| "learning_rate": 0.0003618489386919512, | |
| "loss": 3.4681, | |
| "step": 36950 | |
| }, | |
| { | |
| "epoch": 3.9823485093100848, | |
| "grad_norm": 0.6624922156333923, | |
| "learning_rate": 0.00036152569766188987, | |
| "loss": 3.4399, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 3.9823485093100848, | |
| "eval_accuracy": 0.3764659583440039, | |
| "eval_loss": 3.4450950622558594, | |
| "eval_runtime": 184.7248, | |
| "eval_samples_per_second": 97.502, | |
| "eval_steps_per_second": 6.096, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 3.9877300613496933, | |
| "grad_norm": 0.675195574760437, | |
| "learning_rate": 0.00036120245663182846, | |
| "loss": 3.4563, | |
| "step": 37050 | |
| }, | |
| { | |
| "epoch": 3.9931116133893014, | |
| "grad_norm": 0.6407047510147095, | |
| "learning_rate": 0.000360879215601767, | |
| "loss": 3.4663, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 3.9984931654289095, | |
| "grad_norm": 0.6153064966201782, | |
| "learning_rate": 0.0003605559745717056, | |
| "loss": 3.457, | |
| "step": 37150 | |
| }, | |
| { | |
| "epoch": 4.003874717468518, | |
| "grad_norm": 0.6592976450920105, | |
| "learning_rate": 0.0003602327335416442, | |
| "loss": 3.3949, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 4.009256269508126, | |
| "grad_norm": 0.6417883634567261, | |
| "learning_rate": 0.00035990949251158273, | |
| "loss": 3.3379, | |
| "step": 37250 | |
| }, | |
| { | |
| "epoch": 4.014637821547734, | |
| "grad_norm": 0.6505351066589355, | |
| "learning_rate": 0.0003595862514815214, | |
| "loss": 3.3506, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 4.020019373587343, | |
| "grad_norm": 0.622450053691864, | |
| "learning_rate": 0.00035926301045146, | |
| "loss": 3.354, | |
| "step": 37350 | |
| }, | |
| { | |
| "epoch": 4.0254009256269505, | |
| "grad_norm": 0.6620802283287048, | |
| "learning_rate": 0.0003589397694213985, | |
| "loss": 3.3948, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 4.030782477666559, | |
| "grad_norm": 0.674140214920044, | |
| "learning_rate": 0.0003586165283913371, | |
| "loss": 3.374, | |
| "step": 37450 | |
| }, | |
| { | |
| "epoch": 4.036164029706168, | |
| "grad_norm": 0.6449016332626343, | |
| "learning_rate": 0.00035829328736127565, | |
| "loss": 3.3668, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 4.041545581745775, | |
| "grad_norm": 0.6092488169670105, | |
| "learning_rate": 0.00035797004633121425, | |
| "loss": 3.3746, | |
| "step": 37550 | |
| }, | |
| { | |
| "epoch": 4.046927133785384, | |
| "grad_norm": 0.655305027961731, | |
| "learning_rate": 0.0003576468053011529, | |
| "loss": 3.352, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 4.0523086858249915, | |
| "grad_norm": 0.650629460811615, | |
| "learning_rate": 0.00035732356427109143, | |
| "loss": 3.3727, | |
| "step": 37650 | |
| }, | |
| { | |
| "epoch": 4.0576902378646, | |
| "grad_norm": 0.6936136484146118, | |
| "learning_rate": 0.00035700032324103003, | |
| "loss": 3.3571, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 4.063071789904209, | |
| "grad_norm": 0.6292705535888672, | |
| "learning_rate": 0.00035668354703156984, | |
| "loss": 3.3797, | |
| "step": 37750 | |
| }, | |
| { | |
| "epoch": 4.068453341943816, | |
| "grad_norm": 0.6342557072639465, | |
| "learning_rate": 0.00035636030600150843, | |
| "loss": 3.3791, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 4.073834893983425, | |
| "grad_norm": 0.6861465573310852, | |
| "learning_rate": 0.00035603706497144697, | |
| "loss": 3.3849, | |
| "step": 37850 | |
| }, | |
| { | |
| "epoch": 4.079216446023033, | |
| "grad_norm": 0.6176989674568176, | |
| "learning_rate": 0.00035571382394138557, | |
| "loss": 3.3603, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 4.084597998062641, | |
| "grad_norm": 0.6110954880714417, | |
| "learning_rate": 0.0003553905829113242, | |
| "loss": 3.3471, | |
| "step": 37950 | |
| }, | |
| { | |
| "epoch": 4.08997955010225, | |
| "grad_norm": 0.6518476009368896, | |
| "learning_rate": 0.00035506734188126275, | |
| "loss": 3.3962, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 4.08997955010225, | |
| "eval_accuracy": 0.37652365302743346, | |
| "eval_loss": 3.4501006603240967, | |
| "eval_runtime": 184.0449, | |
| "eval_samples_per_second": 97.862, | |
| "eval_steps_per_second": 6.118, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 4.095361102141858, | |
| "grad_norm": 0.7071831822395325, | |
| "learning_rate": 0.00035474410085120135, | |
| "loss": 3.3712, | |
| "step": 38050 | |
| }, | |
| { | |
| "epoch": 4.100742654181466, | |
| "grad_norm": 0.6040788292884827, | |
| "learning_rate": 0.00035442085982113994, | |
| "loss": 3.3925, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 4.106124206221074, | |
| "grad_norm": 0.6902562975883484, | |
| "learning_rate": 0.0003540976187910785, | |
| "loss": 3.3507, | |
| "step": 38150 | |
| }, | |
| { | |
| "epoch": 4.111505758260682, | |
| "grad_norm": 0.6432631015777588, | |
| "learning_rate": 0.0003537743777610171, | |
| "loss": 3.3778, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 4.1168873103002905, | |
| "grad_norm": 0.6424291729927063, | |
| "learning_rate": 0.00035345113673095573, | |
| "loss": 3.3656, | |
| "step": 38250 | |
| }, | |
| { | |
| "epoch": 4.122268862339899, | |
| "grad_norm": 0.6857510209083557, | |
| "learning_rate": 0.00035312789570089427, | |
| "loss": 3.3442, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 4.127650414379507, | |
| "grad_norm": 0.6548146605491638, | |
| "learning_rate": 0.00035280465467083286, | |
| "loss": 3.3866, | |
| "step": 38350 | |
| }, | |
| { | |
| "epoch": 4.133031966419115, | |
| "grad_norm": 0.6027295589447021, | |
| "learning_rate": 0.0003524814136407714, | |
| "loss": 3.3592, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 4.138413518458724, | |
| "grad_norm": 0.6444876194000244, | |
| "learning_rate": 0.00035215817261071, | |
| "loss": 3.3592, | |
| "step": 38450 | |
| }, | |
| { | |
| "epoch": 4.1437950704983315, | |
| "grad_norm": 0.6224494576454163, | |
| "learning_rate": 0.00035183493158064865, | |
| "loss": 3.3752, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 4.14917662253794, | |
| "grad_norm": 0.5903575420379639, | |
| "learning_rate": 0.0003515116905505872, | |
| "loss": 3.3655, | |
| "step": 38550 | |
| }, | |
| { | |
| "epoch": 4.154558174577549, | |
| "grad_norm": 0.7473770976066589, | |
| "learning_rate": 0.0003511884495205258, | |
| "loss": 3.3896, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 4.159939726617156, | |
| "grad_norm": 0.6288912296295166, | |
| "learning_rate": 0.0003508652084904644, | |
| "loss": 3.3903, | |
| "step": 38650 | |
| }, | |
| { | |
| "epoch": 4.165321278656765, | |
| "grad_norm": 0.6335214376449585, | |
| "learning_rate": 0.0003505419674604029, | |
| "loss": 3.3766, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 4.1707028306963725, | |
| "grad_norm": 0.6742566823959351, | |
| "learning_rate": 0.0003502187264303415, | |
| "loss": 3.3673, | |
| "step": 38750 | |
| }, | |
| { | |
| "epoch": 4.176084382735981, | |
| "grad_norm": 0.6295595169067383, | |
| "learning_rate": 0.00034989548540028016, | |
| "loss": 3.384, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 4.18146593477559, | |
| "grad_norm": 0.6516152024269104, | |
| "learning_rate": 0.0003495787091908199, | |
| "loss": 3.3958, | |
| "step": 38850 | |
| }, | |
| { | |
| "epoch": 4.186847486815197, | |
| "grad_norm": 0.6820975542068481, | |
| "learning_rate": 0.0003492554681607585, | |
| "loss": 3.3867, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 4.192229038854806, | |
| "grad_norm": 0.650514543056488, | |
| "learning_rate": 0.0003489322271306971, | |
| "loss": 3.3893, | |
| "step": 38950 | |
| }, | |
| { | |
| "epoch": 4.197610590894414, | |
| "grad_norm": 0.6839084625244141, | |
| "learning_rate": 0.0003486089861006357, | |
| "loss": 3.3787, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 4.197610590894414, | |
| "eval_accuracy": 0.37696271934709175, | |
| "eval_loss": 3.4459009170532227, | |
| "eval_runtime": 184.0684, | |
| "eval_samples_per_second": 97.849, | |
| "eval_steps_per_second": 6.117, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 4.202992142934022, | |
| "grad_norm": 0.6607173085212708, | |
| "learning_rate": 0.00034828574507057424, | |
| "loss": 3.3867, | |
| "step": 39050 | |
| }, | |
| { | |
| "epoch": 4.208373694973631, | |
| "grad_norm": 0.6593530774116516, | |
| "learning_rate": 0.00034796250404051283, | |
| "loss": 3.3728, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 4.213755247013238, | |
| "grad_norm": 0.6216228008270264, | |
| "learning_rate": 0.00034763926301045137, | |
| "loss": 3.3738, | |
| "step": 39150 | |
| }, | |
| { | |
| "epoch": 4.219136799052847, | |
| "grad_norm": 0.6555355191230774, | |
| "learning_rate": 0.00034731602198039, | |
| "loss": 3.3772, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 4.224518351092455, | |
| "grad_norm": 0.6763815879821777, | |
| "learning_rate": 0.0003469927809503286, | |
| "loss": 3.3861, | |
| "step": 39250 | |
| }, | |
| { | |
| "epoch": 4.229899903132063, | |
| "grad_norm": 0.6993557810783386, | |
| "learning_rate": 0.00034666953992026716, | |
| "loss": 3.3736, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 4.2352814551716715, | |
| "grad_norm": 0.6681185960769653, | |
| "learning_rate": 0.00034634629889020575, | |
| "loss": 3.3867, | |
| "step": 39350 | |
| }, | |
| { | |
| "epoch": 4.24066300721128, | |
| "grad_norm": 0.673267126083374, | |
| "learning_rate": 0.00034602305786014435, | |
| "loss": 3.3843, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 4.246044559250888, | |
| "grad_norm": 0.6191767454147339, | |
| "learning_rate": 0.00034569981683008294, | |
| "loss": 3.3884, | |
| "step": 39450 | |
| }, | |
| { | |
| "epoch": 4.251426111290496, | |
| "grad_norm": 0.6691135764122009, | |
| "learning_rate": 0.00034537657580002154, | |
| "loss": 3.3979, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 4.256807663330104, | |
| "grad_norm": 0.6780727505683899, | |
| "learning_rate": 0.00034505333476996013, | |
| "loss": 3.3776, | |
| "step": 39550 | |
| }, | |
| { | |
| "epoch": 4.2621892153697125, | |
| "grad_norm": 0.6386967897415161, | |
| "learning_rate": 0.00034473009373989867, | |
| "loss": 3.3767, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 4.267570767409321, | |
| "grad_norm": 0.6862613558769226, | |
| "learning_rate": 0.00034440685270983727, | |
| "loss": 3.3865, | |
| "step": 39650 | |
| }, | |
| { | |
| "epoch": 4.272952319448929, | |
| "grad_norm": 0.6203247904777527, | |
| "learning_rate": 0.0003440836116797758, | |
| "loss": 3.3937, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 4.278333871488537, | |
| "grad_norm": 0.6543366312980652, | |
| "learning_rate": 0.00034376037064971445, | |
| "loss": 3.3948, | |
| "step": 39750 | |
| }, | |
| { | |
| "epoch": 4.283715423528146, | |
| "grad_norm": 0.6546263098716736, | |
| "learning_rate": 0.00034343712961965305, | |
| "loss": 3.375, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 4.2890969755677535, | |
| "grad_norm": 0.6535509824752808, | |
| "learning_rate": 0.0003431138885895916, | |
| "loss": 3.3772, | |
| "step": 39850 | |
| }, | |
| { | |
| "epoch": 4.294478527607362, | |
| "grad_norm": 0.6452800035476685, | |
| "learning_rate": 0.0003427906475595302, | |
| "loss": 3.3722, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 4.299860079646971, | |
| "grad_norm": 0.6130262613296509, | |
| "learning_rate": 0.0003424674065294688, | |
| "loss": 3.3759, | |
| "step": 39950 | |
| }, | |
| { | |
| "epoch": 4.305241631686578, | |
| "grad_norm": 0.6329742670059204, | |
| "learning_rate": 0.0003421441654994073, | |
| "loss": 3.3976, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 4.305241631686578, | |
| "eval_accuracy": 0.37801404468958466, | |
| "eval_loss": 3.4401395320892334, | |
| "eval_runtime": 184.1282, | |
| "eval_samples_per_second": 97.818, | |
| "eval_steps_per_second": 6.115, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 4.310623183726187, | |
| "grad_norm": 0.6644855737686157, | |
| "learning_rate": 0.00034182092446934597, | |
| "loss": 3.3697, | |
| "step": 40050 | |
| }, | |
| { | |
| "epoch": 4.3160047357657945, | |
| "grad_norm": 0.6772639751434326, | |
| "learning_rate": 0.00034149768343928456, | |
| "loss": 3.389, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 4.321386287805403, | |
| "grad_norm": 0.6445088386535645, | |
| "learning_rate": 0.0003411744424092231, | |
| "loss": 3.39, | |
| "step": 40150 | |
| }, | |
| { | |
| "epoch": 4.326767839845012, | |
| "grad_norm": 0.6840226054191589, | |
| "learning_rate": 0.0003408512013791617, | |
| "loss": 3.3781, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 4.332149391884619, | |
| "grad_norm": 0.7001984715461731, | |
| "learning_rate": 0.00034052796034910024, | |
| "loss": 3.4049, | |
| "step": 40250 | |
| }, | |
| { | |
| "epoch": 4.337530943924228, | |
| "grad_norm": 0.6908742189407349, | |
| "learning_rate": 0.0003402047193190389, | |
| "loss": 3.3838, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 4.342912495963836, | |
| "grad_norm": 0.6147873401641846, | |
| "learning_rate": 0.0003398814782889775, | |
| "loss": 3.3824, | |
| "step": 40350 | |
| }, | |
| { | |
| "epoch": 4.348294048003444, | |
| "grad_norm": 0.6918706893920898, | |
| "learning_rate": 0.000339558237258916, | |
| "loss": 3.4047, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 4.3536756000430525, | |
| "grad_norm": 0.6723856925964355, | |
| "learning_rate": 0.0003392349962288546, | |
| "loss": 3.4004, | |
| "step": 40450 | |
| }, | |
| { | |
| "epoch": 4.359057152082661, | |
| "grad_norm": 0.6632176637649536, | |
| "learning_rate": 0.0003389117551987932, | |
| "loss": 3.3876, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 4.364438704122269, | |
| "grad_norm": 0.6662275791168213, | |
| "learning_rate": 0.00033858851416873175, | |
| "loss": 3.396, | |
| "step": 40550 | |
| }, | |
| { | |
| "epoch": 4.369820256161877, | |
| "grad_norm": 0.6214112043380737, | |
| "learning_rate": 0.0003382652731386704, | |
| "loss": 3.3953, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 4.375201808201485, | |
| "grad_norm": 0.6457570195198059, | |
| "learning_rate": 0.000337942032108609, | |
| "loss": 3.3698, | |
| "step": 40650 | |
| }, | |
| { | |
| "epoch": 4.3805833602410935, | |
| "grad_norm": 0.7015320062637329, | |
| "learning_rate": 0.00033761879107854754, | |
| "loss": 3.4086, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 4.385964912280702, | |
| "grad_norm": 0.6032451391220093, | |
| "learning_rate": 0.00033729555004848613, | |
| "loss": 3.3934, | |
| "step": 40750 | |
| }, | |
| { | |
| "epoch": 4.39134646432031, | |
| "grad_norm": 0.7044788002967834, | |
| "learning_rate": 0.00033697230901842467, | |
| "loss": 3.387, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 4.396728016359918, | |
| "grad_norm": 0.6239883899688721, | |
| "learning_rate": 0.00033664906798836327, | |
| "loss": 3.4018, | |
| "step": 40850 | |
| }, | |
| { | |
| "epoch": 4.402109568399527, | |
| "grad_norm": 0.6300894021987915, | |
| "learning_rate": 0.0003363258269583019, | |
| "loss": 3.3962, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 4.4074911204391345, | |
| "grad_norm": 0.6542284488677979, | |
| "learning_rate": 0.00033600258592824046, | |
| "loss": 3.3824, | |
| "step": 40950 | |
| }, | |
| { | |
| "epoch": 4.412872672478743, | |
| "grad_norm": 0.6408656239509583, | |
| "learning_rate": 0.00033567934489817905, | |
| "loss": 3.3844, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 4.412872672478743, | |
| "eval_accuracy": 0.37776924973333864, | |
| "eval_loss": 3.43493390083313, | |
| "eval_runtime": 184.3217, | |
| "eval_samples_per_second": 97.715, | |
| "eval_steps_per_second": 6.109, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 4.418254224518351, | |
| "grad_norm": 0.6970384120941162, | |
| "learning_rate": 0.00033535610386811764, | |
| "loss": 3.3984, | |
| "step": 41050 | |
| }, | |
| { | |
| "epoch": 4.423635776557959, | |
| "grad_norm": 0.719208300113678, | |
| "learning_rate": 0.0003350328628380562, | |
| "loss": 3.4049, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 4.429017328597568, | |
| "grad_norm": 0.6588863134384155, | |
| "learning_rate": 0.0003347096218079948, | |
| "loss": 3.3783, | |
| "step": 41150 | |
| }, | |
| { | |
| "epoch": 4.4343988806371755, | |
| "grad_norm": 0.6663377285003662, | |
| "learning_rate": 0.00033438638077793343, | |
| "loss": 3.3826, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 4.439780432676784, | |
| "grad_norm": 0.649978518486023, | |
| "learning_rate": 0.00033406313974787197, | |
| "loss": 3.3947, | |
| "step": 41250 | |
| }, | |
| { | |
| "epoch": 4.445161984716393, | |
| "grad_norm": 0.6662973761558533, | |
| "learning_rate": 0.00033373989871781056, | |
| "loss": 3.3922, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 4.450543536756, | |
| "grad_norm": 0.6409628391265869, | |
| "learning_rate": 0.0003334166576877491, | |
| "loss": 3.3832, | |
| "step": 41350 | |
| }, | |
| { | |
| "epoch": 4.455925088795609, | |
| "grad_norm": 0.6468245387077332, | |
| "learning_rate": 0.0003330934166576877, | |
| "loss": 3.3892, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 4.461306640835216, | |
| "grad_norm": 0.6183198690414429, | |
| "learning_rate": 0.00033277017562762635, | |
| "loss": 3.4058, | |
| "step": 41450 | |
| }, | |
| { | |
| "epoch": 4.466688192874825, | |
| "grad_norm": 0.6112544536590576, | |
| "learning_rate": 0.0003324469345975649, | |
| "loss": 3.4, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 4.4720697449144335, | |
| "grad_norm": 0.6665729880332947, | |
| "learning_rate": 0.0003321236935675035, | |
| "loss": 3.4103, | |
| "step": 41550 | |
| }, | |
| { | |
| "epoch": 4.477451296954041, | |
| "grad_norm": 0.6289717555046082, | |
| "learning_rate": 0.0003318004525374421, | |
| "loss": 3.3997, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 4.48283284899365, | |
| "grad_norm": 0.6453819274902344, | |
| "learning_rate": 0.0003314772115073806, | |
| "loss": 3.3909, | |
| "step": 41650 | |
| }, | |
| { | |
| "epoch": 4.488214401033258, | |
| "grad_norm": 0.7051796913146973, | |
| "learning_rate": 0.0003311539704773192, | |
| "loss": 3.4105, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 4.493595953072866, | |
| "grad_norm": 0.7093035578727722, | |
| "learning_rate": 0.00033083072944725786, | |
| "loss": 3.3901, | |
| "step": 41750 | |
| }, | |
| { | |
| "epoch": 4.4989775051124745, | |
| "grad_norm": 0.6689607501029968, | |
| "learning_rate": 0.0003305074884171964, | |
| "loss": 3.3928, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 4.504359057152083, | |
| "grad_norm": 0.6262616515159607, | |
| "learning_rate": 0.000330184247387135, | |
| "loss": 3.3853, | |
| "step": 41850 | |
| }, | |
| { | |
| "epoch": 4.509740609191691, | |
| "grad_norm": 0.663016676902771, | |
| "learning_rate": 0.00032986100635707354, | |
| "loss": 3.3819, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 4.515122161231299, | |
| "grad_norm": 0.6628140807151794, | |
| "learning_rate": 0.00032953776532701213, | |
| "loss": 3.3863, | |
| "step": 41950 | |
| }, | |
| { | |
| "epoch": 4.520503713270907, | |
| "grad_norm": 0.6778246760368347, | |
| "learning_rate": 0.00032921452429695067, | |
| "loss": 3.3913, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 4.520503713270907, | |
| "eval_accuracy": 0.37859772800292407, | |
| "eval_loss": 3.43190598487854, | |
| "eval_runtime": 183.9779, | |
| "eval_samples_per_second": 97.898, | |
| "eval_steps_per_second": 6.12, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 4.5258852653105155, | |
| "grad_norm": 0.6411513686180115, | |
| "learning_rate": 0.0003288912832668893, | |
| "loss": 3.3865, | |
| "step": 42050 | |
| }, | |
| { | |
| "epoch": 4.531266817350124, | |
| "grad_norm": 0.6528933644294739, | |
| "learning_rate": 0.0003285680422368279, | |
| "loss": 3.4013, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 4.536648369389732, | |
| "grad_norm": 0.6158764362335205, | |
| "learning_rate": 0.0003282448012067665, | |
| "loss": 3.3782, | |
| "step": 42150 | |
| }, | |
| { | |
| "epoch": 4.54202992142934, | |
| "grad_norm": 0.7035838961601257, | |
| "learning_rate": 0.00032792156017670505, | |
| "loss": 3.4084, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 4.547411473468949, | |
| "grad_norm": 0.6502971053123474, | |
| "learning_rate": 0.00032759831914664365, | |
| "loss": 3.3929, | |
| "step": 42250 | |
| }, | |
| { | |
| "epoch": 4.5527930255085565, | |
| "grad_norm": 0.6222167611122131, | |
| "learning_rate": 0.0003272750781165823, | |
| "loss": 3.3799, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 4.558174577548165, | |
| "grad_norm": 0.7282003164291382, | |
| "learning_rate": 0.00032695183708652083, | |
| "loss": 3.3734, | |
| "step": 42350 | |
| }, | |
| { | |
| "epoch": 4.563556129587774, | |
| "grad_norm": 0.6520505547523499, | |
| "learning_rate": 0.00032662859605645943, | |
| "loss": 3.3722, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 4.568937681627381, | |
| "grad_norm": 0.6786864399909973, | |
| "learning_rate": 0.00032630535502639797, | |
| "loss": 3.4029, | |
| "step": 42450 | |
| }, | |
| { | |
| "epoch": 4.57431923366699, | |
| "grad_norm": 0.651889979839325, | |
| "learning_rate": 0.00032598211399633656, | |
| "loss": 3.3934, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 4.579700785706597, | |
| "grad_norm": 0.662212610244751, | |
| "learning_rate": 0.0003256588729662751, | |
| "loss": 3.3881, | |
| "step": 42550 | |
| }, | |
| { | |
| "epoch": 4.585082337746206, | |
| "grad_norm": 0.6370143294334412, | |
| "learning_rate": 0.00032533563193621375, | |
| "loss": 3.3962, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 4.5904638897858145, | |
| "grad_norm": 0.6888731718063354, | |
| "learning_rate": 0.00032501239090615235, | |
| "loss": 3.3792, | |
| "step": 42650 | |
| }, | |
| { | |
| "epoch": 4.595845441825422, | |
| "grad_norm": 0.6885330677032471, | |
| "learning_rate": 0.0003246891498760909, | |
| "loss": 3.3741, | |
| "step": 42700 | |
| }, | |
| { | |
| "epoch": 4.601226993865031, | |
| "grad_norm": 0.6344414949417114, | |
| "learning_rate": 0.0003243659088460295, | |
| "loss": 3.3869, | |
| "step": 42750 | |
| }, | |
| { | |
| "epoch": 4.606608545904638, | |
| "grad_norm": 0.680259108543396, | |
| "learning_rate": 0.0003240426678159681, | |
| "loss": 3.3875, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 4.611990097944247, | |
| "grad_norm": 0.6663812398910522, | |
| "learning_rate": 0.0003237258916065079, | |
| "loss": 3.415, | |
| "step": 42850 | |
| }, | |
| { | |
| "epoch": 4.6173716499838555, | |
| "grad_norm": 0.6414598822593689, | |
| "learning_rate": 0.0003234026505764465, | |
| "loss": 3.3925, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 4.622753202023463, | |
| "grad_norm": 0.657367467880249, | |
| "learning_rate": 0.000323079409546385, | |
| "loss": 3.3902, | |
| "step": 42950 | |
| }, | |
| { | |
| "epoch": 4.628134754063072, | |
| "grad_norm": 0.6666000485420227, | |
| "learning_rate": 0.00032275616851632367, | |
| "loss": 3.3802, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 4.628134754063072, | |
| "eval_accuracy": 0.378789283044141, | |
| "eval_loss": 3.4260668754577637, | |
| "eval_runtime": 184.1527, | |
| "eval_samples_per_second": 97.805, | |
| "eval_steps_per_second": 6.114, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 4.63351630610268, | |
| "grad_norm": 0.6748586893081665, | |
| "learning_rate": 0.00032243292748626226, | |
| "loss": 3.4022, | |
| "step": 43050 | |
| }, | |
| { | |
| "epoch": 4.638897858142288, | |
| "grad_norm": 0.6140267252922058, | |
| "learning_rate": 0.0003221096864562008, | |
| "loss": 3.3837, | |
| "step": 43100 | |
| }, | |
| { | |
| "epoch": 4.6442794101818965, | |
| "grad_norm": 0.6773905754089355, | |
| "learning_rate": 0.0003217864454261394, | |
| "loss": 3.3804, | |
| "step": 43150 | |
| }, | |
| { | |
| "epoch": 4.649660962221505, | |
| "grad_norm": 0.6858988404273987, | |
| "learning_rate": 0.00032146320439607794, | |
| "loss": 3.3782, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 4.655042514261113, | |
| "grad_norm": 0.6207734942436218, | |
| "learning_rate": 0.0003211399633660166, | |
| "loss": 3.3924, | |
| "step": 43250 | |
| }, | |
| { | |
| "epoch": 4.660424066300721, | |
| "grad_norm": 0.6594465374946594, | |
| "learning_rate": 0.0003208167223359552, | |
| "loss": 3.4044, | |
| "step": 43300 | |
| }, | |
| { | |
| "epoch": 4.665805618340329, | |
| "grad_norm": 0.7123188972473145, | |
| "learning_rate": 0.0003204934813058937, | |
| "loss": 3.3888, | |
| "step": 43350 | |
| }, | |
| { | |
| "epoch": 4.6711871703799375, | |
| "grad_norm": 0.6485627889633179, | |
| "learning_rate": 0.0003201702402758323, | |
| "loss": 3.4016, | |
| "step": 43400 | |
| }, | |
| { | |
| "epoch": 4.676568722419546, | |
| "grad_norm": 0.6207435727119446, | |
| "learning_rate": 0.0003198469992457709, | |
| "loss": 3.3982, | |
| "step": 43450 | |
| }, | |
| { | |
| "epoch": 4.681950274459154, | |
| "grad_norm": 0.6076232194900513, | |
| "learning_rate": 0.00031952375821570945, | |
| "loss": 3.3853, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 4.687331826498762, | |
| "grad_norm": 0.6661508679389954, | |
| "learning_rate": 0.0003192005171856481, | |
| "loss": 3.397, | |
| "step": 43550 | |
| }, | |
| { | |
| "epoch": 4.692713378538371, | |
| "grad_norm": 0.6532484889030457, | |
| "learning_rate": 0.0003188772761555867, | |
| "loss": 3.3898, | |
| "step": 43600 | |
| }, | |
| { | |
| "epoch": 4.6980949305779784, | |
| "grad_norm": 0.6694145798683167, | |
| "learning_rate": 0.00031855403512552524, | |
| "loss": 3.4124, | |
| "step": 43650 | |
| }, | |
| { | |
| "epoch": 4.703476482617587, | |
| "grad_norm": 0.7081990838050842, | |
| "learning_rate": 0.00031823079409546383, | |
| "loss": 3.3859, | |
| "step": 43700 | |
| }, | |
| { | |
| "epoch": 4.7088580346571955, | |
| "grad_norm": 0.6278550028800964, | |
| "learning_rate": 0.00031790755306540237, | |
| "loss": 3.3846, | |
| "step": 43750 | |
| }, | |
| { | |
| "epoch": 4.714239586696803, | |
| "grad_norm": 0.6548607349395752, | |
| "learning_rate": 0.00031758431203534097, | |
| "loss": 3.388, | |
| "step": 43800 | |
| }, | |
| { | |
| "epoch": 4.719621138736412, | |
| "grad_norm": 0.6434149742126465, | |
| "learning_rate": 0.0003172610710052796, | |
| "loss": 3.3952, | |
| "step": 43850 | |
| }, | |
| { | |
| "epoch": 4.725002690776019, | |
| "grad_norm": 0.6363683938980103, | |
| "learning_rate": 0.00031693782997521816, | |
| "loss": 3.3814, | |
| "step": 43900 | |
| }, | |
| { | |
| "epoch": 4.730384242815628, | |
| "grad_norm": 0.6362830996513367, | |
| "learning_rate": 0.00031661458894515675, | |
| "loss": 3.3889, | |
| "step": 43950 | |
| }, | |
| { | |
| "epoch": 4.7357657948552365, | |
| "grad_norm": 0.6621768474578857, | |
| "learning_rate": 0.0003162913479150953, | |
| "loss": 3.393, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 4.7357657948552365, | |
| "eval_accuracy": 0.3792806114028381, | |
| "eval_loss": 3.420133352279663, | |
| "eval_runtime": 184.4795, | |
| "eval_samples_per_second": 97.631, | |
| "eval_steps_per_second": 6.104, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 4.741147346894844, | |
| "grad_norm": 0.6609659194946289, | |
| "learning_rate": 0.0003159681068850339, | |
| "loss": 3.3955, | |
| "step": 44050 | |
| }, | |
| { | |
| "epoch": 4.746528898934453, | |
| "grad_norm": 0.6639893054962158, | |
| "learning_rate": 0.00031564486585497253, | |
| "loss": 3.3899, | |
| "step": 44100 | |
| }, | |
| { | |
| "epoch": 4.751910450974061, | |
| "grad_norm": 0.6364026069641113, | |
| "learning_rate": 0.0003153216248249111, | |
| "loss": 3.3913, | |
| "step": 44150 | |
| }, | |
| { | |
| "epoch": 4.757292003013669, | |
| "grad_norm": 0.6532016396522522, | |
| "learning_rate": 0.00031499838379484967, | |
| "loss": 3.4, | |
| "step": 44200 | |
| }, | |
| { | |
| "epoch": 4.7626735550532775, | |
| "grad_norm": 0.6646419167518616, | |
| "learning_rate": 0.00031467514276478826, | |
| "loss": 3.3959, | |
| "step": 44250 | |
| }, | |
| { | |
| "epoch": 4.768055107092886, | |
| "grad_norm": 0.6860271096229553, | |
| "learning_rate": 0.0003143519017347268, | |
| "loss": 3.4054, | |
| "step": 44300 | |
| }, | |
| { | |
| "epoch": 4.773436659132494, | |
| "grad_norm": 0.6953567862510681, | |
| "learning_rate": 0.0003140286607046654, | |
| "loss": 3.3912, | |
| "step": 44350 | |
| }, | |
| { | |
| "epoch": 4.778818211172102, | |
| "grad_norm": 0.6766008734703064, | |
| "learning_rate": 0.00031370541967460405, | |
| "loss": 3.3885, | |
| "step": 44400 | |
| }, | |
| { | |
| "epoch": 4.78419976321171, | |
| "grad_norm": 0.6261332631111145, | |
| "learning_rate": 0.0003133821786445426, | |
| "loss": 3.3869, | |
| "step": 44450 | |
| }, | |
| { | |
| "epoch": 4.7895813152513185, | |
| "grad_norm": 0.6700854897499084, | |
| "learning_rate": 0.0003130589376144812, | |
| "loss": 3.4018, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 4.794962867290927, | |
| "grad_norm": 0.680160641670227, | |
| "learning_rate": 0.0003127356965844197, | |
| "loss": 3.3795, | |
| "step": 44550 | |
| }, | |
| { | |
| "epoch": 4.800344419330535, | |
| "grad_norm": 0.6283779740333557, | |
| "learning_rate": 0.0003124124555543583, | |
| "loss": 3.3899, | |
| "step": 44600 | |
| }, | |
| { | |
| "epoch": 4.805725971370143, | |
| "grad_norm": 0.6881921887397766, | |
| "learning_rate": 0.0003120892145242969, | |
| "loss": 3.3931, | |
| "step": 44650 | |
| }, | |
| { | |
| "epoch": 4.811107523409751, | |
| "grad_norm": 0.7671242356300354, | |
| "learning_rate": 0.0003117659734942355, | |
| "loss": 3.4, | |
| "step": 44700 | |
| }, | |
| { | |
| "epoch": 4.8164890754493594, | |
| "grad_norm": 0.6230366826057434, | |
| "learning_rate": 0.00031144919728477526, | |
| "loss": 3.3955, | |
| "step": 44750 | |
| }, | |
| { | |
| "epoch": 4.821870627488968, | |
| "grad_norm": 0.6817081570625305, | |
| "learning_rate": 0.0003111259562547139, | |
| "loss": 3.3913, | |
| "step": 44800 | |
| }, | |
| { | |
| "epoch": 4.827252179528576, | |
| "grad_norm": 0.6694238185882568, | |
| "learning_rate": 0.0003108027152246525, | |
| "loss": 3.3856, | |
| "step": 44850 | |
| }, | |
| { | |
| "epoch": 4.832633731568184, | |
| "grad_norm": 0.6804167628288269, | |
| "learning_rate": 0.0003104794741945911, | |
| "loss": 3.3836, | |
| "step": 44900 | |
| }, | |
| { | |
| "epoch": 4.838015283607793, | |
| "grad_norm": 0.7400542497634888, | |
| "learning_rate": 0.00031015623316452964, | |
| "loss": 3.3945, | |
| "step": 44950 | |
| }, | |
| { | |
| "epoch": 4.8433968356474, | |
| "grad_norm": 0.6841332912445068, | |
| "learning_rate": 0.00030983299213446823, | |
| "loss": 3.4061, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 4.8433968356474, | |
| "eval_accuracy": 0.3799427421011795, | |
| "eval_loss": 3.4151318073272705, | |
| "eval_runtime": 184.0085, | |
| "eval_samples_per_second": 97.881, | |
| "eval_steps_per_second": 6.119, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 4.848778387687009, | |
| "grad_norm": 0.6875273585319519, | |
| "learning_rate": 0.0003095097511044069, | |
| "loss": 3.381, | |
| "step": 45050 | |
| }, | |
| { | |
| "epoch": 4.8541599397266175, | |
| "grad_norm": 0.630673348903656, | |
| "learning_rate": 0.0003091865100743454, | |
| "loss": 3.3863, | |
| "step": 45100 | |
| }, | |
| { | |
| "epoch": 4.859541491766225, | |
| "grad_norm": 0.6930129528045654, | |
| "learning_rate": 0.000308863269044284, | |
| "loss": 3.3922, | |
| "step": 45150 | |
| }, | |
| { | |
| "epoch": 4.864923043805834, | |
| "grad_norm": 0.6872902512550354, | |
| "learning_rate": 0.00030854002801422256, | |
| "loss": 3.4027, | |
| "step": 45200 | |
| }, | |
| { | |
| "epoch": 4.870304595845441, | |
| "grad_norm": 0.6678783297538757, | |
| "learning_rate": 0.00030821678698416115, | |
| "loss": 3.3917, | |
| "step": 45250 | |
| }, | |
| { | |
| "epoch": 4.87568614788505, | |
| "grad_norm": 0.7070086598396301, | |
| "learning_rate": 0.0003078935459540997, | |
| "loss": 3.3919, | |
| "step": 45300 | |
| }, | |
| { | |
| "epoch": 4.8810676999246585, | |
| "grad_norm": 0.7027028203010559, | |
| "learning_rate": 0.00030757030492403834, | |
| "loss": 3.3948, | |
| "step": 45350 | |
| }, | |
| { | |
| "epoch": 4.886449251964266, | |
| "grad_norm": 0.6884028911590576, | |
| "learning_rate": 0.00030724706389397694, | |
| "loss": 3.4086, | |
| "step": 45400 | |
| }, | |
| { | |
| "epoch": 4.891830804003875, | |
| "grad_norm": 0.7701573967933655, | |
| "learning_rate": 0.0003069238228639155, | |
| "loss": 3.3962, | |
| "step": 45450 | |
| }, | |
| { | |
| "epoch": 4.897212356043483, | |
| "grad_norm": 0.6813535094261169, | |
| "learning_rate": 0.00030660058183385407, | |
| "loss": 3.4013, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 4.902593908083091, | |
| "grad_norm": 0.6463482975959778, | |
| "learning_rate": 0.00030627734080379267, | |
| "loss": 3.3926, | |
| "step": 45550 | |
| }, | |
| { | |
| "epoch": 4.9079754601226995, | |
| "grad_norm": 0.6464250683784485, | |
| "learning_rate": 0.0003059540997737312, | |
| "loss": 3.3822, | |
| "step": 45600 | |
| }, | |
| { | |
| "epoch": 4.913357012162308, | |
| "grad_norm": 0.6345609426498413, | |
| "learning_rate": 0.00030563085874366986, | |
| "loss": 3.393, | |
| "step": 45650 | |
| }, | |
| { | |
| "epoch": 4.918738564201916, | |
| "grad_norm": 0.6901547908782959, | |
| "learning_rate": 0.00030530761771360845, | |
| "loss": 3.3725, | |
| "step": 45700 | |
| }, | |
| { | |
| "epoch": 4.924120116241524, | |
| "grad_norm": 0.6914418935775757, | |
| "learning_rate": 0.000304984376683547, | |
| "loss": 3.3789, | |
| "step": 45750 | |
| }, | |
| { | |
| "epoch": 4.929501668281132, | |
| "grad_norm": 0.734369695186615, | |
| "learning_rate": 0.0003046611356534856, | |
| "loss": 3.3904, | |
| "step": 45800 | |
| }, | |
| { | |
| "epoch": 4.9348832203207404, | |
| "grad_norm": 0.6802307963371277, | |
| "learning_rate": 0.0003043378946234241, | |
| "loss": 3.3965, | |
| "step": 45850 | |
| }, | |
| { | |
| "epoch": 4.940264772360349, | |
| "grad_norm": 0.6808255314826965, | |
| "learning_rate": 0.0003040146535933628, | |
| "loss": 3.3812, | |
| "step": 45900 | |
| }, | |
| { | |
| "epoch": 4.945646324399957, | |
| "grad_norm": 0.6597874760627747, | |
| "learning_rate": 0.00030369141256330137, | |
| "loss": 3.3646, | |
| "step": 45950 | |
| }, | |
| { | |
| "epoch": 4.951027876439565, | |
| "grad_norm": 0.6720778346061707, | |
| "learning_rate": 0.0003033681715332399, | |
| "loss": 3.3894, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 4.951027876439565, | |
| "eval_accuracy": 0.38033237135688225, | |
| "eval_loss": 3.410205841064453, | |
| "eval_runtime": 184.0, | |
| "eval_samples_per_second": 97.886, | |
| "eval_steps_per_second": 6.12, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 4.956409428479174, | |
| "grad_norm": 0.6535595059394836, | |
| "learning_rate": 0.0003030449305031785, | |
| "loss": 3.3842, | |
| "step": 46050 | |
| }, | |
| { | |
| "epoch": 4.961790980518781, | |
| "grad_norm": 0.6358091235160828, | |
| "learning_rate": 0.0003027216894731171, | |
| "loss": 3.3774, | |
| "step": 46100 | |
| }, | |
| { | |
| "epoch": 4.96717253255839, | |
| "grad_norm": 1.3344215154647827, | |
| "learning_rate": 0.00030239844844305564, | |
| "loss": 3.3972, | |
| "step": 46150 | |
| }, | |
| { | |
| "epoch": 4.9725540845979985, | |
| "grad_norm": 0.6452608108520508, | |
| "learning_rate": 0.0003020752074129943, | |
| "loss": 3.3968, | |
| "step": 46200 | |
| }, | |
| { | |
| "epoch": 4.977935636637606, | |
| "grad_norm": 0.7212709188461304, | |
| "learning_rate": 0.0003017519663829329, | |
| "loss": 3.3917, | |
| "step": 46250 | |
| }, | |
| { | |
| "epoch": 4.983317188677215, | |
| "grad_norm": 0.6865209341049194, | |
| "learning_rate": 0.0003014287253528714, | |
| "loss": 3.3774, | |
| "step": 46300 | |
| }, | |
| { | |
| "epoch": 4.988698740716822, | |
| "grad_norm": 0.7008549571037292, | |
| "learning_rate": 0.00030110548432281, | |
| "loss": 3.384, | |
| "step": 46350 | |
| }, | |
| { | |
| "epoch": 4.994080292756431, | |
| "grad_norm": 0.6504687666893005, | |
| "learning_rate": 0.00030078224329274856, | |
| "loss": 3.3863, | |
| "step": 46400 | |
| }, | |
| { | |
| "epoch": 4.9994618447960395, | |
| "grad_norm": 0.6690630912780762, | |
| "learning_rate": 0.00030045900226268715, | |
| "loss": 3.3824, | |
| "step": 46450 | |
| }, | |
| { | |
| "epoch": 5.004843396835647, | |
| "grad_norm": 0.6338120102882385, | |
| "learning_rate": 0.0003001357612326258, | |
| "loss": 3.2995, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 5.010224948875256, | |
| "grad_norm": 0.6364915370941162, | |
| "learning_rate": 0.00029981252020256434, | |
| "loss": 3.2997, | |
| "step": 46550 | |
| }, | |
| { | |
| "epoch": 5.015606500914864, | |
| "grad_norm": 0.6910309791564941, | |
| "learning_rate": 0.00029948927917250294, | |
| "loss": 3.3019, | |
| "step": 46600 | |
| }, | |
| { | |
| "epoch": 5.020988052954472, | |
| "grad_norm": 0.6775645613670349, | |
| "learning_rate": 0.00029916603814244153, | |
| "loss": 3.2952, | |
| "step": 46650 | |
| }, | |
| { | |
| "epoch": 5.0263696049940805, | |
| "grad_norm": 0.7527064681053162, | |
| "learning_rate": 0.0002988427971123801, | |
| "loss": 3.3052, | |
| "step": 46700 | |
| }, | |
| { | |
| "epoch": 5.031751157033688, | |
| "grad_norm": 0.6929546594619751, | |
| "learning_rate": 0.00029851955608231867, | |
| "loss": 3.3145, | |
| "step": 46750 | |
| }, | |
| { | |
| "epoch": 5.037132709073297, | |
| "grad_norm": 0.624110996723175, | |
| "learning_rate": 0.00029819631505225726, | |
| "loss": 3.3169, | |
| "step": 46800 | |
| }, | |
| { | |
| "epoch": 5.042514261112905, | |
| "grad_norm": 0.698697566986084, | |
| "learning_rate": 0.00029787307402219586, | |
| "loss": 3.3072, | |
| "step": 46850 | |
| }, | |
| { | |
| "epoch": 5.047895813152513, | |
| "grad_norm": 0.760403573513031, | |
| "learning_rate": 0.00029754983299213445, | |
| "loss": 3.3023, | |
| "step": 46900 | |
| }, | |
| { | |
| "epoch": 5.0532773651921215, | |
| "grad_norm": 0.7195192575454712, | |
| "learning_rate": 0.000297226591962073, | |
| "loss": 3.3267, | |
| "step": 46950 | |
| }, | |
| { | |
| "epoch": 5.05865891723173, | |
| "grad_norm": 0.640288770198822, | |
| "learning_rate": 0.00029690335093201164, | |
| "loss": 3.2988, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 5.05865891723173, | |
| "eval_accuracy": 0.38089660580330287, | |
| "eval_loss": 3.4140713214874268, | |
| "eval_runtime": 184.5126, | |
| "eval_samples_per_second": 97.614, | |
| "eval_steps_per_second": 6.103, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 5.064040469271338, | |
| "grad_norm": 0.7123163938522339, | |
| "learning_rate": 0.0002965801099019502, | |
| "loss": 3.3039, | |
| "step": 47050 | |
| }, | |
| { | |
| "epoch": 5.069422021310946, | |
| "grad_norm": 0.6437656879425049, | |
| "learning_rate": 0.0002962568688718888, | |
| "loss": 3.308, | |
| "step": 47100 | |
| }, | |
| { | |
| "epoch": 5.074803573350554, | |
| "grad_norm": 0.6438406109809875, | |
| "learning_rate": 0.00029593362784182737, | |
| "loss": 3.3086, | |
| "step": 47150 | |
| }, | |
| { | |
| "epoch": 5.080185125390162, | |
| "grad_norm": 0.6933695673942566, | |
| "learning_rate": 0.00029561038681176596, | |
| "loss": 3.3286, | |
| "step": 47200 | |
| }, | |
| { | |
| "epoch": 5.085566677429771, | |
| "grad_norm": 0.6499897241592407, | |
| "learning_rate": 0.00029528714578170456, | |
| "loss": 3.3282, | |
| "step": 47250 | |
| }, | |
| { | |
| "epoch": 5.090948229469379, | |
| "grad_norm": 0.6867937445640564, | |
| "learning_rate": 0.0002949639047516431, | |
| "loss": 3.3086, | |
| "step": 47300 | |
| }, | |
| { | |
| "epoch": 5.096329781508987, | |
| "grad_norm": 0.7074191570281982, | |
| "learning_rate": 0.0002946406637215817, | |
| "loss": 3.2997, | |
| "step": 47350 | |
| }, | |
| { | |
| "epoch": 5.101711333548596, | |
| "grad_norm": 0.6474207043647766, | |
| "learning_rate": 0.0002943174226915203, | |
| "loss": 3.3096, | |
| "step": 47400 | |
| }, | |
| { | |
| "epoch": 5.107092885588203, | |
| "grad_norm": 0.7220308780670166, | |
| "learning_rate": 0.0002939941816614589, | |
| "loss": 3.3151, | |
| "step": 47450 | |
| }, | |
| { | |
| "epoch": 5.112474437627812, | |
| "grad_norm": 0.6929433941841125, | |
| "learning_rate": 0.0002936709406313974, | |
| "loss": 3.3169, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 5.1178559896674205, | |
| "grad_norm": 0.6843468546867371, | |
| "learning_rate": 0.0002933476996013361, | |
| "loss": 3.3147, | |
| "step": 47550 | |
| }, | |
| { | |
| "epoch": 5.123237541707028, | |
| "grad_norm": 0.6644719243049622, | |
| "learning_rate": 0.0002930244585712746, | |
| "loss": 3.3146, | |
| "step": 47600 | |
| }, | |
| { | |
| "epoch": 5.128619093746637, | |
| "grad_norm": 0.7787826657295227, | |
| "learning_rate": 0.0002927012175412132, | |
| "loss": 3.3084, | |
| "step": 47650 | |
| }, | |
| { | |
| "epoch": 5.134000645786244, | |
| "grad_norm": 0.7502739429473877, | |
| "learning_rate": 0.0002923779765111518, | |
| "loss": 3.32, | |
| "step": 47700 | |
| }, | |
| { | |
| "epoch": 5.139382197825853, | |
| "grad_norm": 0.6958211660385132, | |
| "learning_rate": 0.0002920547354810904, | |
| "loss": 3.3128, | |
| "step": 47750 | |
| }, | |
| { | |
| "epoch": 5.1447637498654615, | |
| "grad_norm": 0.7047584652900696, | |
| "learning_rate": 0.00029173149445102894, | |
| "loss": 3.344, | |
| "step": 47800 | |
| }, | |
| { | |
| "epoch": 5.150145301905069, | |
| "grad_norm": 0.6888059377670288, | |
| "learning_rate": 0.00029140825342096753, | |
| "loss": 3.3247, | |
| "step": 47850 | |
| }, | |
| { | |
| "epoch": 5.155526853944678, | |
| "grad_norm": 0.7417131066322327, | |
| "learning_rate": 0.0002910850123909061, | |
| "loss": 3.3113, | |
| "step": 47900 | |
| }, | |
| { | |
| "epoch": 5.160908405984286, | |
| "grad_norm": 0.6905604600906372, | |
| "learning_rate": 0.0002907617713608447, | |
| "loss": 3.3091, | |
| "step": 47950 | |
| }, | |
| { | |
| "epoch": 5.166289958023894, | |
| "grad_norm": 0.6897056698799133, | |
| "learning_rate": 0.0002904385303307833, | |
| "loss": 3.3063, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 5.166289958023894, | |
| "eval_accuracy": 0.3808204401289449, | |
| "eval_loss": 3.41259765625, | |
| "eval_runtime": 184.1998, | |
| "eval_samples_per_second": 97.78, | |
| "eval_steps_per_second": 6.113, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 5.1716715100635025, | |
| "grad_norm": 0.7102882862091064, | |
| "learning_rate": 0.00029011528930072186, | |
| "loss": 3.3235, | |
| "step": 48050 | |
| }, | |
| { | |
| "epoch": 5.17705306210311, | |
| "grad_norm": 0.6777679920196533, | |
| "learning_rate": 0.00028979204827066045, | |
| "loss": 3.325, | |
| "step": 48100 | |
| }, | |
| { | |
| "epoch": 5.182434614142719, | |
| "grad_norm": 0.7214860916137695, | |
| "learning_rate": 0.00028946880724059905, | |
| "loss": 3.3411, | |
| "step": 48150 | |
| }, | |
| { | |
| "epoch": 5.187816166182327, | |
| "grad_norm": 0.6319332718849182, | |
| "learning_rate": 0.00028914556621053764, | |
| "loss": 3.3318, | |
| "step": 48200 | |
| }, | |
| { | |
| "epoch": 5.193197718221935, | |
| "grad_norm": 0.707127034664154, | |
| "learning_rate": 0.00028882232518047624, | |
| "loss": 3.3286, | |
| "step": 48250 | |
| }, | |
| { | |
| "epoch": 5.198579270261543, | |
| "grad_norm": 0.7037991285324097, | |
| "learning_rate": 0.00028849908415041483, | |
| "loss": 3.3115, | |
| "step": 48300 | |
| }, | |
| { | |
| "epoch": 5.203960822301152, | |
| "grad_norm": 0.6779250502586365, | |
| "learning_rate": 0.00028817584312035337, | |
| "loss": 3.3145, | |
| "step": 48350 | |
| }, | |
| { | |
| "epoch": 5.20934237434076, | |
| "grad_norm": 0.6654755473136902, | |
| "learning_rate": 0.00028785260209029197, | |
| "loss": 3.3269, | |
| "step": 48400 | |
| }, | |
| { | |
| "epoch": 5.214723926380368, | |
| "grad_norm": 0.713239848613739, | |
| "learning_rate": 0.00028752936106023056, | |
| "loss": 3.3167, | |
| "step": 48450 | |
| }, | |
| { | |
| "epoch": 5.220105478419977, | |
| "grad_norm": 0.6545668244361877, | |
| "learning_rate": 0.00028720612003016915, | |
| "loss": 3.3313, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 5.225487030459584, | |
| "grad_norm": 0.6693907380104065, | |
| "learning_rate": 0.00028688287900010775, | |
| "loss": 3.3347, | |
| "step": 48550 | |
| }, | |
| { | |
| "epoch": 5.230868582499193, | |
| "grad_norm": 0.7082215547561646, | |
| "learning_rate": 0.0002865596379700463, | |
| "loss": 3.3182, | |
| "step": 48600 | |
| }, | |
| { | |
| "epoch": 5.236250134538801, | |
| "grad_norm": 0.6843087673187256, | |
| "learning_rate": 0.0002862363969399849, | |
| "loss": 3.3212, | |
| "step": 48650 | |
| }, | |
| { | |
| "epoch": 5.241631686578409, | |
| "grad_norm": 0.6862715482711792, | |
| "learning_rate": 0.0002859131559099235, | |
| "loss": 3.3275, | |
| "step": 48700 | |
| }, | |
| { | |
| "epoch": 5.247013238618018, | |
| "grad_norm": 0.6855129599571228, | |
| "learning_rate": 0.0002855963797004633, | |
| "loss": 3.33, | |
| "step": 48750 | |
| }, | |
| { | |
| "epoch": 5.252394790657625, | |
| "grad_norm": 0.6837905049324036, | |
| "learning_rate": 0.0002852731386704019, | |
| "loss": 3.3324, | |
| "step": 48800 | |
| }, | |
| { | |
| "epoch": 5.257776342697234, | |
| "grad_norm": 0.6358972787857056, | |
| "learning_rate": 0.0002849498976403405, | |
| "loss": 3.3167, | |
| "step": 48850 | |
| }, | |
| { | |
| "epoch": 5.2631578947368425, | |
| "grad_norm": 0.6836221218109131, | |
| "learning_rate": 0.000284626656610279, | |
| "loss": 3.3239, | |
| "step": 48900 | |
| }, | |
| { | |
| "epoch": 5.26853944677645, | |
| "grad_norm": 0.7453120946884155, | |
| "learning_rate": 0.0002843034155802176, | |
| "loss": 3.3433, | |
| "step": 48950 | |
| }, | |
| { | |
| "epoch": 5.273920998816059, | |
| "grad_norm": 0.6284498572349548, | |
| "learning_rate": 0.0002839801745501562, | |
| "loss": 3.3432, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 5.273920998816059, | |
| "eval_accuracy": 0.3816899891901242, | |
| "eval_loss": 3.4081289768218994, | |
| "eval_runtime": 183.9045, | |
| "eval_samples_per_second": 97.937, | |
| "eval_steps_per_second": 6.123, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 5.279302550855666, | |
| "grad_norm": 0.6980060338973999, | |
| "learning_rate": 0.000283663398340696, | |
| "loss": 3.3392, | |
| "step": 49050 | |
| }, | |
| { | |
| "epoch": 5.284684102895275, | |
| "grad_norm": 0.6563853621482849, | |
| "learning_rate": 0.0002833401573106346, | |
| "loss": 3.3385, | |
| "step": 49100 | |
| }, | |
| { | |
| "epoch": 5.2900656549348835, | |
| "grad_norm": 0.7191060781478882, | |
| "learning_rate": 0.0002830169162805732, | |
| "loss": 3.3377, | |
| "step": 49150 | |
| }, | |
| { | |
| "epoch": 5.295447206974491, | |
| "grad_norm": 0.6789143085479736, | |
| "learning_rate": 0.0002826936752505118, | |
| "loss": 3.3321, | |
| "step": 49200 | |
| }, | |
| { | |
| "epoch": 5.3008287590141, | |
| "grad_norm": 0.6590706706047058, | |
| "learning_rate": 0.00028237043422045034, | |
| "loss": 3.343, | |
| "step": 49250 | |
| }, | |
| { | |
| "epoch": 5.306210311053708, | |
| "grad_norm": 0.6632896661758423, | |
| "learning_rate": 0.00028204719319038893, | |
| "loss": 3.314, | |
| "step": 49300 | |
| }, | |
| { | |
| "epoch": 5.311591863093316, | |
| "grad_norm": 0.7267322540283203, | |
| "learning_rate": 0.0002817239521603275, | |
| "loss": 3.3426, | |
| "step": 49350 | |
| }, | |
| { | |
| "epoch": 5.316973415132924, | |
| "grad_norm": 0.6905198693275452, | |
| "learning_rate": 0.0002814007111302661, | |
| "loss": 3.3185, | |
| "step": 49400 | |
| }, | |
| { | |
| "epoch": 5.322354967172533, | |
| "grad_norm": 0.6762885451316833, | |
| "learning_rate": 0.0002810774701002047, | |
| "loss": 3.3412, | |
| "step": 49450 | |
| }, | |
| { | |
| "epoch": 5.327736519212141, | |
| "grad_norm": 0.7283837795257568, | |
| "learning_rate": 0.00028075422907014325, | |
| "loss": 3.3324, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 5.333118071251749, | |
| "grad_norm": 0.7100688219070435, | |
| "learning_rate": 0.00028043098804008185, | |
| "loss": 3.3433, | |
| "step": 49550 | |
| }, | |
| { | |
| "epoch": 5.338499623291357, | |
| "grad_norm": 0.6812211275100708, | |
| "learning_rate": 0.00028010774701002044, | |
| "loss": 3.3305, | |
| "step": 49600 | |
| }, | |
| { | |
| "epoch": 5.343881175330965, | |
| "grad_norm": 0.6936942934989929, | |
| "learning_rate": 0.00027978450597995904, | |
| "loss": 3.3567, | |
| "step": 49650 | |
| }, | |
| { | |
| "epoch": 5.349262727370574, | |
| "grad_norm": 0.6920641660690308, | |
| "learning_rate": 0.0002794612649498976, | |
| "loss": 3.3213, | |
| "step": 49700 | |
| }, | |
| { | |
| "epoch": 5.354644279410182, | |
| "grad_norm": 0.7323217988014221, | |
| "learning_rate": 0.00027913802391983623, | |
| "loss": 3.3312, | |
| "step": 49750 | |
| }, | |
| { | |
| "epoch": 5.36002583144979, | |
| "grad_norm": 0.6813651919364929, | |
| "learning_rate": 0.00027881478288977477, | |
| "loss": 3.3503, | |
| "step": 49800 | |
| }, | |
| { | |
| "epoch": 5.365407383489399, | |
| "grad_norm": 0.697881281375885, | |
| "learning_rate": 0.00027849154185971336, | |
| "loss": 3.3306, | |
| "step": 49850 | |
| }, | |
| { | |
| "epoch": 5.370788935529006, | |
| "grad_norm": 0.73151034116745, | |
| "learning_rate": 0.00027816830082965196, | |
| "loss": 3.3337, | |
| "step": 49900 | |
| }, | |
| { | |
| "epoch": 5.376170487568615, | |
| "grad_norm": 0.6968307495117188, | |
| "learning_rate": 0.00027784505979959055, | |
| "loss": 3.3471, | |
| "step": 49950 | |
| }, | |
| { | |
| "epoch": 5.3815520396082235, | |
| "grad_norm": 0.6323778033256531, | |
| "learning_rate": 0.00027752181876952915, | |
| "loss": 3.3325, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 5.3815520396082235, | |
| "eval_accuracy": 0.3820180122584361, | |
| "eval_loss": 3.4034645557403564, | |
| "eval_runtime": 184.1386, | |
| "eval_samples_per_second": 97.812, | |
| "eval_steps_per_second": 6.115, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 5.386933591647831, | |
| "grad_norm": 0.6919659376144409, | |
| "learning_rate": 0.0002771985777394677, | |
| "loss": 3.3212, | |
| "step": 50050 | |
| }, | |
| { | |
| "epoch": 5.39231514368744, | |
| "grad_norm": 0.6490268707275391, | |
| "learning_rate": 0.0002768753367094063, | |
| "loss": 3.3219, | |
| "step": 50100 | |
| }, | |
| { | |
| "epoch": 5.397696695727047, | |
| "grad_norm": 0.7063648700714111, | |
| "learning_rate": 0.0002765520956793449, | |
| "loss": 3.3245, | |
| "step": 50150 | |
| }, | |
| { | |
| "epoch": 5.403078247766656, | |
| "grad_norm": 0.7237613201141357, | |
| "learning_rate": 0.00027622885464928347, | |
| "loss": 3.3175, | |
| "step": 50200 | |
| }, | |
| { | |
| "epoch": 5.4084597998062645, | |
| "grad_norm": 0.6712755560874939, | |
| "learning_rate": 0.000275905613619222, | |
| "loss": 3.3331, | |
| "step": 50250 | |
| }, | |
| { | |
| "epoch": 5.413841351845872, | |
| "grad_norm": 0.7124558091163635, | |
| "learning_rate": 0.00027558237258916066, | |
| "loss": 3.3226, | |
| "step": 50300 | |
| }, | |
| { | |
| "epoch": 5.419222903885481, | |
| "grad_norm": 0.7070554494857788, | |
| "learning_rate": 0.0002752591315590992, | |
| "loss": 3.3364, | |
| "step": 50350 | |
| }, | |
| { | |
| "epoch": 5.424604455925088, | |
| "grad_norm": 0.7024772763252258, | |
| "learning_rate": 0.0002749358905290378, | |
| "loss": 3.3117, | |
| "step": 50400 | |
| }, | |
| { | |
| "epoch": 5.429986007964697, | |
| "grad_norm": 0.6279031038284302, | |
| "learning_rate": 0.0002746126494989764, | |
| "loss": 3.328, | |
| "step": 50450 | |
| }, | |
| { | |
| "epoch": 5.435367560004305, | |
| "grad_norm": 0.6562391519546509, | |
| "learning_rate": 0.000274289408468915, | |
| "loss": 3.3276, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 5.440749112043913, | |
| "grad_norm": 0.7366046905517578, | |
| "learning_rate": 0.0002739661674388535, | |
| "loss": 3.3354, | |
| "step": 50550 | |
| }, | |
| { | |
| "epoch": 5.446130664083522, | |
| "grad_norm": 0.6988213062286377, | |
| "learning_rate": 0.0002736429264087921, | |
| "loss": 3.335, | |
| "step": 50600 | |
| }, | |
| { | |
| "epoch": 5.45151221612313, | |
| "grad_norm": 0.6984215974807739, | |
| "learning_rate": 0.0002733196853787307, | |
| "loss": 3.3239, | |
| "step": 50650 | |
| }, | |
| { | |
| "epoch": 5.456893768162738, | |
| "grad_norm": 0.6875210404396057, | |
| "learning_rate": 0.0002729964443486693, | |
| "loss": 3.3494, | |
| "step": 50700 | |
| }, | |
| { | |
| "epoch": 5.462275320202346, | |
| "grad_norm": 0.7105006575584412, | |
| "learning_rate": 0.0002726732033186079, | |
| "loss": 3.3254, | |
| "step": 50750 | |
| }, | |
| { | |
| "epoch": 5.467656872241955, | |
| "grad_norm": 0.772357702255249, | |
| "learning_rate": 0.00027234996228854644, | |
| "loss": 3.3215, | |
| "step": 50800 | |
| }, | |
| { | |
| "epoch": 5.473038424281563, | |
| "grad_norm": 0.7547757625579834, | |
| "learning_rate": 0.0002720267212584851, | |
| "loss": 3.3237, | |
| "step": 50850 | |
| }, | |
| { | |
| "epoch": 5.478419976321171, | |
| "grad_norm": 0.6670495271682739, | |
| "learning_rate": 0.00027170348022842363, | |
| "loss": 3.3262, | |
| "step": 50900 | |
| }, | |
| { | |
| "epoch": 5.483801528360779, | |
| "grad_norm": 0.6717911958694458, | |
| "learning_rate": 0.00027138023919836223, | |
| "loss": 3.3375, | |
| "step": 50950 | |
| }, | |
| { | |
| "epoch": 5.489183080400387, | |
| "grad_norm": 0.6847409009933472, | |
| "learning_rate": 0.0002710569981683008, | |
| "loss": 3.3373, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 5.489183080400387, | |
| "eval_accuracy": 0.38237319854870166, | |
| "eval_loss": 3.399523973464966, | |
| "eval_runtime": 184.134, | |
| "eval_samples_per_second": 97.815, | |
| "eval_steps_per_second": 6.115, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 5.494564632439996, | |
| "grad_norm": 0.7280586957931519, | |
| "learning_rate": 0.0002707337571382394, | |
| "loss": 3.3331, | |
| "step": 51050 | |
| }, | |
| { | |
| "epoch": 5.499946184479604, | |
| "grad_norm": 0.7104876637458801, | |
| "learning_rate": 0.00027041051610817796, | |
| "loss": 3.346, | |
| "step": 51100 | |
| }, | |
| { | |
| "epoch": 5.505327736519212, | |
| "grad_norm": 0.692229688167572, | |
| "learning_rate": 0.00027008727507811655, | |
| "loss": 3.3201, | |
| "step": 51150 | |
| }, | |
| { | |
| "epoch": 5.510709288558821, | |
| "grad_norm": 0.658868134021759, | |
| "learning_rate": 0.00026976403404805515, | |
| "loss": 3.3207, | |
| "step": 51200 | |
| }, | |
| { | |
| "epoch": 5.516090840598428, | |
| "grad_norm": 0.698153555393219, | |
| "learning_rate": 0.00026944079301799374, | |
| "loss": 3.3452, | |
| "step": 51250 | |
| }, | |
| { | |
| "epoch": 5.521472392638037, | |
| "grad_norm": 0.7251303791999817, | |
| "learning_rate": 0.00026911755198793234, | |
| "loss": 3.3477, | |
| "step": 51300 | |
| }, | |
| { | |
| "epoch": 5.5268539446776455, | |
| "grad_norm": 0.6895780563354492, | |
| "learning_rate": 0.0002687943109578709, | |
| "loss": 3.3188, | |
| "step": 51350 | |
| }, | |
| { | |
| "epoch": 5.532235496717253, | |
| "grad_norm": 0.7204925417900085, | |
| "learning_rate": 0.00026847106992780947, | |
| "loss": 3.3279, | |
| "step": 51400 | |
| }, | |
| { | |
| "epoch": 5.537617048756862, | |
| "grad_norm": 0.7200186252593994, | |
| "learning_rate": 0.00026814782889774807, | |
| "loss": 3.3376, | |
| "step": 51450 | |
| }, | |
| { | |
| "epoch": 5.542998600796469, | |
| "grad_norm": 0.7235830426216125, | |
| "learning_rate": 0.00026782458786768666, | |
| "loss": 3.3367, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 5.548380152836078, | |
| "grad_norm": 0.6808749437332153, | |
| "learning_rate": 0.0002675013468376252, | |
| "loss": 3.3245, | |
| "step": 51550 | |
| }, | |
| { | |
| "epoch": 5.553761704875686, | |
| "grad_norm": 0.7232927680015564, | |
| "learning_rate": 0.00026717810580756385, | |
| "loss": 3.3472, | |
| "step": 51600 | |
| }, | |
| { | |
| "epoch": 5.559143256915294, | |
| "grad_norm": 0.7236559987068176, | |
| "learning_rate": 0.0002668548647775024, | |
| "loss": 3.3412, | |
| "step": 51650 | |
| }, | |
| { | |
| "epoch": 5.564524808954903, | |
| "grad_norm": 0.7104244232177734, | |
| "learning_rate": 0.000266531623747441, | |
| "loss": 3.3445, | |
| "step": 51700 | |
| }, | |
| { | |
| "epoch": 5.569906360994511, | |
| "grad_norm": 0.7107536792755127, | |
| "learning_rate": 0.0002662083827173796, | |
| "loss": 3.3265, | |
| "step": 51750 | |
| }, | |
| { | |
| "epoch": 5.575287913034119, | |
| "grad_norm": 0.7243977189064026, | |
| "learning_rate": 0.0002658851416873182, | |
| "loss": 3.3212, | |
| "step": 51800 | |
| }, | |
| { | |
| "epoch": 5.580669465073727, | |
| "grad_norm": 0.8111730217933655, | |
| "learning_rate": 0.00026556190065725677, | |
| "loss": 3.3283, | |
| "step": 51850 | |
| }, | |
| { | |
| "epoch": 5.586051017113336, | |
| "grad_norm": 0.7091906070709229, | |
| "learning_rate": 0.0002652386596271953, | |
| "loss": 3.3271, | |
| "step": 51900 | |
| }, | |
| { | |
| "epoch": 5.591432569152944, | |
| "grad_norm": 0.7432578802108765, | |
| "learning_rate": 0.0002649154185971339, | |
| "loss": 3.3401, | |
| "step": 51950 | |
| }, | |
| { | |
| "epoch": 5.596814121192552, | |
| "grad_norm": 0.7412343621253967, | |
| "learning_rate": 0.0002645921775670725, | |
| "loss": 3.3506, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 5.596814121192552, | |
| "eval_accuracy": 0.38275261443294983, | |
| "eval_loss": 3.394073486328125, | |
| "eval_runtime": 183.9682, | |
| "eval_samples_per_second": 97.903, | |
| "eval_steps_per_second": 6.121, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 5.60219567323216, | |
| "grad_norm": 0.6942436695098877, | |
| "learning_rate": 0.0002642689365370111, | |
| "loss": 3.3277, | |
| "step": 52050 | |
| }, | |
| { | |
| "epoch": 5.607577225271768, | |
| "grad_norm": 0.7447067499160767, | |
| "learning_rate": 0.00026394569550694963, | |
| "loss": 3.3234, | |
| "step": 52100 | |
| }, | |
| { | |
| "epoch": 5.612958777311377, | |
| "grad_norm": 0.7501024603843689, | |
| "learning_rate": 0.00026362245447688823, | |
| "loss": 3.3183, | |
| "step": 52150 | |
| }, | |
| { | |
| "epoch": 5.618340329350985, | |
| "grad_norm": 0.6899356842041016, | |
| "learning_rate": 0.0002632992134468268, | |
| "loss": 3.3369, | |
| "step": 52200 | |
| }, | |
| { | |
| "epoch": 5.623721881390593, | |
| "grad_norm": 0.7304120659828186, | |
| "learning_rate": 0.0002629759724167654, | |
| "loss": 3.329, | |
| "step": 52250 | |
| }, | |
| { | |
| "epoch": 5.629103433430201, | |
| "grad_norm": 0.6875249147415161, | |
| "learning_rate": 0.000262652731386704, | |
| "loss": 3.3431, | |
| "step": 52300 | |
| }, | |
| { | |
| "epoch": 5.634484985469809, | |
| "grad_norm": 0.770589292049408, | |
| "learning_rate": 0.00026232949035664255, | |
| "loss": 3.3448, | |
| "step": 52350 | |
| }, | |
| { | |
| "epoch": 5.639866537509418, | |
| "grad_norm": 0.6867998838424683, | |
| "learning_rate": 0.00026200624932658115, | |
| "loss": 3.3254, | |
| "step": 52400 | |
| }, | |
| { | |
| "epoch": 5.645248089549026, | |
| "grad_norm": 0.701833188533783, | |
| "learning_rate": 0.00026168300829651974, | |
| "loss": 3.3515, | |
| "step": 52450 | |
| }, | |
| { | |
| "epoch": 5.650629641588634, | |
| "grad_norm": 0.6975635290145874, | |
| "learning_rate": 0.00026135976726645834, | |
| "loss": 3.3351, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 5.656011193628243, | |
| "grad_norm": 0.7372641563415527, | |
| "learning_rate": 0.0002610365262363969, | |
| "loss": 3.329, | |
| "step": 52550 | |
| }, | |
| { | |
| "epoch": 5.66139274566785, | |
| "grad_norm": 0.7018235921859741, | |
| "learning_rate": 0.00026071328520633553, | |
| "loss": 3.3242, | |
| "step": 52600 | |
| }, | |
| { | |
| "epoch": 5.666774297707459, | |
| "grad_norm": 0.7098267674446106, | |
| "learning_rate": 0.00026039004417627407, | |
| "loss": 3.3422, | |
| "step": 52650 | |
| }, | |
| { | |
| "epoch": 5.672155849747067, | |
| "grad_norm": 0.7544245719909668, | |
| "learning_rate": 0.00026006680314621266, | |
| "loss": 3.349, | |
| "step": 52700 | |
| }, | |
| { | |
| "epoch": 5.677537401786675, | |
| "grad_norm": 0.7348368167877197, | |
| "learning_rate": 0.00025974356211615126, | |
| "loss": 3.33, | |
| "step": 52750 | |
| }, | |
| { | |
| "epoch": 5.682918953826284, | |
| "grad_norm": 0.6879133582115173, | |
| "learning_rate": 0.00025942032108608985, | |
| "loss": 3.3439, | |
| "step": 52800 | |
| }, | |
| { | |
| "epoch": 5.688300505865891, | |
| "grad_norm": 0.7134944796562195, | |
| "learning_rate": 0.00025909708005602845, | |
| "loss": 3.3526, | |
| "step": 52850 | |
| }, | |
| { | |
| "epoch": 5.6936820579055, | |
| "grad_norm": 0.7157636880874634, | |
| "learning_rate": 0.000258773839025967, | |
| "loss": 3.3244, | |
| "step": 52900 | |
| }, | |
| { | |
| "epoch": 5.699063609945108, | |
| "grad_norm": 0.6918929815292358, | |
| "learning_rate": 0.0002584505979959056, | |
| "loss": 3.3486, | |
| "step": 52950 | |
| }, | |
| { | |
| "epoch": 5.704445161984716, | |
| "grad_norm": 0.7037851214408875, | |
| "learning_rate": 0.0002581273569658442, | |
| "loss": 3.323, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 5.704445161984716, | |
| "eval_accuracy": 0.3833323862423279, | |
| "eval_loss": 3.3892478942871094, | |
| "eval_runtime": 184.0857, | |
| "eval_samples_per_second": 97.84, | |
| "eval_steps_per_second": 6.117, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 5.709826714024325, | |
| "grad_norm": 0.730811595916748, | |
| "learning_rate": 0.00025780411593578277, | |
| "loss": 3.3242, | |
| "step": 53050 | |
| }, | |
| { | |
| "epoch": 5.715208266063933, | |
| "grad_norm": 0.6601924300193787, | |
| "learning_rate": 0.0002574873397263226, | |
| "loss": 3.3345, | |
| "step": 53100 | |
| }, | |
| { | |
| "epoch": 5.720589818103541, | |
| "grad_norm": 0.7023354768753052, | |
| "learning_rate": 0.00025716409869626117, | |
| "loss": 3.3334, | |
| "step": 53150 | |
| }, | |
| { | |
| "epoch": 5.725971370143149, | |
| "grad_norm": 0.7079684734344482, | |
| "learning_rate": 0.0002568408576661997, | |
| "loss": 3.3024, | |
| "step": 53200 | |
| }, | |
| { | |
| "epoch": 5.731352922182758, | |
| "grad_norm": 0.7269954681396484, | |
| "learning_rate": 0.00025651761663613836, | |
| "loss": 3.3144, | |
| "step": 53250 | |
| }, | |
| { | |
| "epoch": 5.736734474222366, | |
| "grad_norm": 0.6841776967048645, | |
| "learning_rate": 0.0002561943756060769, | |
| "loss": 3.327, | |
| "step": 53300 | |
| }, | |
| { | |
| "epoch": 5.742116026261974, | |
| "grad_norm": null, | |
| "learning_rate": 0.0002558775993966167, | |
| "loss": 3.359, | |
| "step": 53350 | |
| }, | |
| { | |
| "epoch": 5.747497578301582, | |
| "grad_norm": 0.6885505318641663, | |
| "learning_rate": 0.0002555543583665553, | |
| "loss": 3.3354, | |
| "step": 53400 | |
| }, | |
| { | |
| "epoch": 5.75287913034119, | |
| "grad_norm": 0.6598957180976868, | |
| "learning_rate": 0.0002552311173364939, | |
| "loss": 3.3288, | |
| "step": 53450 | |
| }, | |
| { | |
| "epoch": 5.758260682380799, | |
| "grad_norm": 0.7532528638839722, | |
| "learning_rate": 0.0002549078763064325, | |
| "loss": 3.324, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 5.763642234420407, | |
| "grad_norm": 0.7090945839881897, | |
| "learning_rate": 0.00025458463527637103, | |
| "loss": 3.3497, | |
| "step": 53550 | |
| }, | |
| { | |
| "epoch": 5.769023786460015, | |
| "grad_norm": 0.7386473417282104, | |
| "learning_rate": 0.0002542613942463097, | |
| "loss": 3.3253, | |
| "step": 53600 | |
| }, | |
| { | |
| "epoch": 5.774405338499624, | |
| "grad_norm": 0.6926794052124023, | |
| "learning_rate": 0.0002539381532162482, | |
| "loss": 3.3503, | |
| "step": 53650 | |
| }, | |
| { | |
| "epoch": 5.779786890539231, | |
| "grad_norm": 0.6957927942276001, | |
| "learning_rate": 0.0002536149121861868, | |
| "loss": 3.3296, | |
| "step": 53700 | |
| }, | |
| { | |
| "epoch": 5.78516844257884, | |
| "grad_norm": 0.6737018823623657, | |
| "learning_rate": 0.0002532916711561254, | |
| "loss": 3.3303, | |
| "step": 53750 | |
| }, | |
| { | |
| "epoch": 5.790549994618448, | |
| "grad_norm": 0.6735786199569702, | |
| "learning_rate": 0.000252968430126064, | |
| "loss": 3.3267, | |
| "step": 53800 | |
| }, | |
| { | |
| "epoch": 5.795931546658056, | |
| "grad_norm": 0.7254409790039062, | |
| "learning_rate": 0.00025264518909600255, | |
| "loss": 3.317, | |
| "step": 53850 | |
| }, | |
| { | |
| "epoch": 5.801313098697665, | |
| "grad_norm": 0.7492697834968567, | |
| "learning_rate": 0.00025232194806594114, | |
| "loss": 3.3346, | |
| "step": 53900 | |
| }, | |
| { | |
| "epoch": 5.806694650737272, | |
| "grad_norm": 0.7044953107833862, | |
| "learning_rate": 0.00025199870703587974, | |
| "loss": 3.3272, | |
| "step": 53950 | |
| }, | |
| { | |
| "epoch": 5.812076202776881, | |
| "grad_norm": 0.7152127623558044, | |
| "learning_rate": 0.00025167546600581833, | |
| "loss": 3.3388, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 5.812076202776881, | |
| "eval_accuracy": 0.3838037224696671, | |
| "eval_loss": 3.383706569671631, | |
| "eval_runtime": 184.3385, | |
| "eval_samples_per_second": 97.706, | |
| "eval_steps_per_second": 6.108, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 5.817457754816489, | |
| "grad_norm": 0.6760851740837097, | |
| "learning_rate": 0.0002513522249757569, | |
| "loss": 3.3252, | |
| "step": 54050 | |
| }, | |
| { | |
| "epoch": 5.822839306856097, | |
| "grad_norm": 0.7422081232070923, | |
| "learning_rate": 0.00025102898394569547, | |
| "loss": 3.3384, | |
| "step": 54100 | |
| }, | |
| { | |
| "epoch": 5.828220858895706, | |
| "grad_norm": 0.7860513925552368, | |
| "learning_rate": 0.00025070574291563406, | |
| "loss": 3.3348, | |
| "step": 54150 | |
| }, | |
| { | |
| "epoch": 5.833602410935313, | |
| "grad_norm": 0.705047070980072, | |
| "learning_rate": 0.00025038250188557265, | |
| "loss": 3.3267, | |
| "step": 54200 | |
| }, | |
| { | |
| "epoch": 5.838983962974922, | |
| "grad_norm": 0.7171577215194702, | |
| "learning_rate": 0.00025005926085551125, | |
| "loss": 3.3154, | |
| "step": 54250 | |
| }, | |
| { | |
| "epoch": 5.84436551501453, | |
| "grad_norm": 0.7100509405136108, | |
| "learning_rate": 0.0002497360198254498, | |
| "loss": 3.3315, | |
| "step": 54300 | |
| }, | |
| { | |
| "epoch": 5.849747067054138, | |
| "grad_norm": 0.7151844501495361, | |
| "learning_rate": 0.00024941277879538844, | |
| "loss": 3.336, | |
| "step": 54350 | |
| }, | |
| { | |
| "epoch": 5.855128619093747, | |
| "grad_norm": 0.7535050511360168, | |
| "learning_rate": 0.000249089537765327, | |
| "loss": 3.3343, | |
| "step": 54400 | |
| }, | |
| { | |
| "epoch": 5.860510171133355, | |
| "grad_norm": 0.6949968934059143, | |
| "learning_rate": 0.0002487662967352656, | |
| "loss": 3.3246, | |
| "step": 54450 | |
| }, | |
| { | |
| "epoch": 5.865891723172963, | |
| "grad_norm": 0.7528766989707947, | |
| "learning_rate": 0.00024844305570520417, | |
| "loss": 3.3319, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 5.871273275212571, | |
| "grad_norm": 0.6865386962890625, | |
| "learning_rate": 0.00024811981467514276, | |
| "loss": 3.3407, | |
| "step": 54550 | |
| }, | |
| { | |
| "epoch": 5.87665482725218, | |
| "grad_norm": 0.721444308757782, | |
| "learning_rate": 0.00024779657364508136, | |
| "loss": 3.3203, | |
| "step": 54600 | |
| }, | |
| { | |
| "epoch": 5.882036379291788, | |
| "grad_norm": 0.7100372910499573, | |
| "learning_rate": 0.0002474733326150199, | |
| "loss": 3.3361, | |
| "step": 54650 | |
| }, | |
| { | |
| "epoch": 5.887417931331396, | |
| "grad_norm": 0.6949239373207092, | |
| "learning_rate": 0.0002471500915849585, | |
| "loss": 3.3499, | |
| "step": 54700 | |
| }, | |
| { | |
| "epoch": 5.892799483371004, | |
| "grad_norm": 0.7012357115745544, | |
| "learning_rate": 0.0002468268505548971, | |
| "loss": 3.3406, | |
| "step": 54750 | |
| }, | |
| { | |
| "epoch": 5.898181035410612, | |
| "grad_norm": 0.6897000074386597, | |
| "learning_rate": 0.0002465036095248357, | |
| "loss": 3.3447, | |
| "step": 54800 | |
| }, | |
| { | |
| "epoch": 5.903562587450221, | |
| "grad_norm": 0.726473867893219, | |
| "learning_rate": 0.0002461803684947742, | |
| "loss": 3.3388, | |
| "step": 54850 | |
| }, | |
| { | |
| "epoch": 5.9089441394898286, | |
| "grad_norm": 0.7095021605491638, | |
| "learning_rate": 0.0002458571274647128, | |
| "loss": 3.3416, | |
| "step": 54900 | |
| }, | |
| { | |
| "epoch": 5.914325691529437, | |
| "grad_norm": 0.7249286770820618, | |
| "learning_rate": 0.0002455338864346514, | |
| "loss": 3.3427, | |
| "step": 54950 | |
| }, | |
| { | |
| "epoch": 5.919707243569046, | |
| "grad_norm": 0.7552708983421326, | |
| "learning_rate": 0.00024521064540459, | |
| "loss": 3.3396, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 5.919707243569046, | |
| "eval_accuracy": 0.38397126522267705, | |
| "eval_loss": 3.3809568881988525, | |
| "eval_runtime": 184.0462, | |
| "eval_samples_per_second": 97.861, | |
| "eval_steps_per_second": 6.118, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 5.925088795608653, | |
| "grad_norm": 0.7218322157859802, | |
| "learning_rate": 0.0002448874043745286, | |
| "loss": 3.3392, | |
| "step": 55050 | |
| }, | |
| { | |
| "epoch": 5.930470347648262, | |
| "grad_norm": 0.7611395120620728, | |
| "learning_rate": 0.00024456416334446714, | |
| "loss": 3.3307, | |
| "step": 55100 | |
| }, | |
| { | |
| "epoch": 5.93585189968787, | |
| "grad_norm": 0.7061910033226013, | |
| "learning_rate": 0.00024424092231440574, | |
| "loss": 3.3327, | |
| "step": 55150 | |
| }, | |
| { | |
| "epoch": 5.941233451727478, | |
| "grad_norm": 0.6887156367301941, | |
| "learning_rate": 0.00024391768128434436, | |
| "loss": 3.35, | |
| "step": 55200 | |
| }, | |
| { | |
| "epoch": 5.946615003767087, | |
| "grad_norm": 0.7248396277427673, | |
| "learning_rate": 0.00024359444025428293, | |
| "loss": 3.3488, | |
| "step": 55250 | |
| }, | |
| { | |
| "epoch": 5.951996555806694, | |
| "grad_norm": 0.7237182855606079, | |
| "learning_rate": 0.0002432711992242215, | |
| "loss": 3.3265, | |
| "step": 55300 | |
| }, | |
| { | |
| "epoch": 5.957378107846303, | |
| "grad_norm": 0.7203229069709778, | |
| "learning_rate": 0.0002429479581941601, | |
| "loss": 3.3224, | |
| "step": 55350 | |
| }, | |
| { | |
| "epoch": 5.962759659885911, | |
| "grad_norm": 0.7003002762794495, | |
| "learning_rate": 0.00024262471716409868, | |
| "loss": 3.3244, | |
| "step": 55400 | |
| }, | |
| { | |
| "epoch": 5.968141211925519, | |
| "grad_norm": 0.725002110004425, | |
| "learning_rate": 0.00024230147613403728, | |
| "loss": 3.3313, | |
| "step": 55450 | |
| }, | |
| { | |
| "epoch": 5.973522763965128, | |
| "grad_norm": 0.7069681286811829, | |
| "learning_rate": 0.00024197823510397584, | |
| "loss": 3.3428, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 5.978904316004736, | |
| "grad_norm": 0.7072627544403076, | |
| "learning_rate": 0.0002416549940739144, | |
| "loss": 3.3378, | |
| "step": 55550 | |
| }, | |
| { | |
| "epoch": 5.984285868044344, | |
| "grad_norm": 0.7657013535499573, | |
| "learning_rate": 0.00024133175304385303, | |
| "loss": 3.3343, | |
| "step": 55600 | |
| }, | |
| { | |
| "epoch": 5.989667420083952, | |
| "grad_norm": 0.7375352382659912, | |
| "learning_rate": 0.0002410085120137916, | |
| "loss": 3.3289, | |
| "step": 55650 | |
| }, | |
| { | |
| "epoch": 5.995048972123561, | |
| "grad_norm": 0.7755512595176697, | |
| "learning_rate": 0.00024068527098373017, | |
| "loss": 3.3562, | |
| "step": 55700 | |
| }, | |
| { | |
| "epoch": 6.000430524163169, | |
| "grad_norm": 0.7189819812774658, | |
| "learning_rate": 0.0002403620299536688, | |
| "loss": 3.3257, | |
| "step": 55750 | |
| }, | |
| { | |
| "epoch": 6.005812076202777, | |
| "grad_norm": 0.713076114654541, | |
| "learning_rate": 0.00024003878892360736, | |
| "loss": 3.2465, | |
| "step": 55800 | |
| }, | |
| { | |
| "epoch": 6.011193628242385, | |
| "grad_norm": 0.7407640814781189, | |
| "learning_rate": 0.00023971554789354593, | |
| "loss": 3.2446, | |
| "step": 55850 | |
| }, | |
| { | |
| "epoch": 6.016575180281993, | |
| "grad_norm": 0.7104265093803406, | |
| "learning_rate": 0.00023939230686348452, | |
| "loss": 3.2488, | |
| "step": 55900 | |
| }, | |
| { | |
| "epoch": 6.021956732321602, | |
| "grad_norm": 0.7195842266082764, | |
| "learning_rate": 0.0002390690658334231, | |
| "loss": 3.2401, | |
| "step": 55950 | |
| }, | |
| { | |
| "epoch": 6.0273382843612096, | |
| "grad_norm": 0.7403207421302795, | |
| "learning_rate": 0.00023874582480336168, | |
| "loss": 3.237, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 6.0273382843612096, | |
| "eval_accuracy": 0.3839623556858762, | |
| "eval_loss": 3.3840363025665283, | |
| "eval_runtime": 183.8506, | |
| "eval_samples_per_second": 97.965, | |
| "eval_steps_per_second": 6.125, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 6.032719836400818, | |
| "grad_norm": 0.6788797378540039, | |
| "learning_rate": 0.00023842258377330028, | |
| "loss": 3.2471, | |
| "step": 56050 | |
| }, | |
| { | |
| "epoch": 6.038101388440427, | |
| "grad_norm": 0.6888821125030518, | |
| "learning_rate": 0.00023809934274323885, | |
| "loss": 3.2575, | |
| "step": 56100 | |
| }, | |
| { | |
| "epoch": 6.043482940480034, | |
| "grad_norm": 0.7463328838348389, | |
| "learning_rate": 0.0002377761017131774, | |
| "loss": 3.2439, | |
| "step": 56150 | |
| }, | |
| { | |
| "epoch": 6.048864492519643, | |
| "grad_norm": 0.690376341342926, | |
| "learning_rate": 0.00023745286068311603, | |
| "loss": 3.2494, | |
| "step": 56200 | |
| }, | |
| { | |
| "epoch": 6.0542460445592505, | |
| "grad_norm": 0.7049649953842163, | |
| "learning_rate": 0.0002371296196530546, | |
| "loss": 3.2616, | |
| "step": 56250 | |
| }, | |
| { | |
| "epoch": 6.059627596598859, | |
| "grad_norm": 0.7419145703315735, | |
| "learning_rate": 0.00023680637862299317, | |
| "loss": 3.2423, | |
| "step": 56300 | |
| }, | |
| { | |
| "epoch": 6.065009148638468, | |
| "grad_norm": 0.732083261013031, | |
| "learning_rate": 0.0002364831375929318, | |
| "loss": 3.2486, | |
| "step": 56350 | |
| }, | |
| { | |
| "epoch": 6.070390700678075, | |
| "grad_norm": 0.7247629165649414, | |
| "learning_rate": 0.00023615989656287036, | |
| "loss": 3.2325, | |
| "step": 56400 | |
| }, | |
| { | |
| "epoch": 6.075772252717684, | |
| "grad_norm": 0.7294711470603943, | |
| "learning_rate": 0.00023583665553280895, | |
| "loss": 3.2576, | |
| "step": 56450 | |
| }, | |
| { | |
| "epoch": 6.081153804757292, | |
| "grad_norm": 0.6983347535133362, | |
| "learning_rate": 0.00023551341450274752, | |
| "loss": 3.2536, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 6.0865353567969, | |
| "grad_norm": 0.7216185927391052, | |
| "learning_rate": 0.00023519017347268612, | |
| "loss": 3.2575, | |
| "step": 56550 | |
| }, | |
| { | |
| "epoch": 6.091916908836509, | |
| "grad_norm": 0.738471269607544, | |
| "learning_rate": 0.0002348669324426247, | |
| "loss": 3.2558, | |
| "step": 56600 | |
| }, | |
| { | |
| "epoch": 6.097298460876116, | |
| "grad_norm": 0.7062331438064575, | |
| "learning_rate": 0.00023454369141256328, | |
| "loss": 3.2474, | |
| "step": 56650 | |
| }, | |
| { | |
| "epoch": 6.102680012915725, | |
| "grad_norm": 0.7130937576293945, | |
| "learning_rate": 0.00023422045038250185, | |
| "loss": 3.2571, | |
| "step": 56700 | |
| }, | |
| { | |
| "epoch": 6.108061564955333, | |
| "grad_norm": 0.7500676512718201, | |
| "learning_rate": 0.00023389720935244047, | |
| "loss": 3.2526, | |
| "step": 56750 | |
| }, | |
| { | |
| "epoch": 6.113443116994941, | |
| "grad_norm": 0.7299767136573792, | |
| "learning_rate": 0.00023357396832237903, | |
| "loss": 3.2539, | |
| "step": 56800 | |
| }, | |
| { | |
| "epoch": 6.11882466903455, | |
| "grad_norm": 0.7288519740104675, | |
| "learning_rate": 0.0002332507272923176, | |
| "loss": 3.286, | |
| "step": 56850 | |
| }, | |
| { | |
| "epoch": 6.124206221074158, | |
| "grad_norm": 0.7326245903968811, | |
| "learning_rate": 0.00023292748626225622, | |
| "loss": 3.2707, | |
| "step": 56900 | |
| }, | |
| { | |
| "epoch": 6.129587773113766, | |
| "grad_norm": 0.7590323686599731, | |
| "learning_rate": 0.0002326042452321948, | |
| "loss": 3.2554, | |
| "step": 56950 | |
| }, | |
| { | |
| "epoch": 6.134969325153374, | |
| "grad_norm": 0.732631266117096, | |
| "learning_rate": 0.00023228100420213336, | |
| "loss": 3.2561, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 6.134969325153374, | |
| "eval_accuracy": 0.3845425621068056, | |
| "eval_loss": 3.3839833736419678, | |
| "eval_runtime": 184.6599, | |
| "eval_samples_per_second": 97.536, | |
| "eval_steps_per_second": 6.098, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 6.140350877192983, | |
| "grad_norm": 0.7149685621261597, | |
| "learning_rate": 0.00023195776317207195, | |
| "loss": 3.2676, | |
| "step": 57050 | |
| }, | |
| { | |
| "epoch": 6.1457324292325906, | |
| "grad_norm": 0.7450768351554871, | |
| "learning_rate": 0.00023163452214201055, | |
| "loss": 3.267, | |
| "step": 57100 | |
| }, | |
| { | |
| "epoch": 6.151113981272199, | |
| "grad_norm": 0.71061110496521, | |
| "learning_rate": 0.00023131128111194912, | |
| "loss": 3.2632, | |
| "step": 57150 | |
| }, | |
| { | |
| "epoch": 6.156495533311807, | |
| "grad_norm": 0.8081583976745605, | |
| "learning_rate": 0.0002309880400818877, | |
| "loss": 3.265, | |
| "step": 57200 | |
| }, | |
| { | |
| "epoch": 6.161877085351415, | |
| "grad_norm": 0.6962241530418396, | |
| "learning_rate": 0.00023066479905182628, | |
| "loss": 3.2733, | |
| "step": 57250 | |
| }, | |
| { | |
| "epoch": 6.167258637391024, | |
| "grad_norm": 0.7313778400421143, | |
| "learning_rate": 0.00023034155802176487, | |
| "loss": 3.2848, | |
| "step": 57300 | |
| }, | |
| { | |
| "epoch": 6.1726401894306315, | |
| "grad_norm": 0.7355571985244751, | |
| "learning_rate": 0.00023001831699170347, | |
| "loss": 3.2765, | |
| "step": 57350 | |
| }, | |
| { | |
| "epoch": 6.17802174147024, | |
| "grad_norm": 0.7399037480354309, | |
| "learning_rate": 0.00022970154078224327, | |
| "loss": 3.2481, | |
| "step": 57400 | |
| }, | |
| { | |
| "epoch": 6.183403293509849, | |
| "grad_norm": 0.7529656291007996, | |
| "learning_rate": 0.00022937829975218187, | |
| "loss": 3.2717, | |
| "step": 57450 | |
| }, | |
| { | |
| "epoch": 6.188784845549456, | |
| "grad_norm": 0.6955074071884155, | |
| "learning_rate": 0.00022905505872212044, | |
| "loss": 3.2703, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 6.194166397589065, | |
| "grad_norm": 0.704826831817627, | |
| "learning_rate": 0.00022873181769205903, | |
| "loss": 3.2711, | |
| "step": 57550 | |
| }, | |
| { | |
| "epoch": 6.1995479496286725, | |
| "grad_norm": 0.75359708070755, | |
| "learning_rate": 0.0002284085766619976, | |
| "loss": 3.2582, | |
| "step": 57600 | |
| }, | |
| { | |
| "epoch": 6.204929501668281, | |
| "grad_norm": 0.7706862688064575, | |
| "learning_rate": 0.0002280853356319362, | |
| "loss": 3.2637, | |
| "step": 57650 | |
| }, | |
| { | |
| "epoch": 6.21031105370789, | |
| "grad_norm": 0.7804993391036987, | |
| "learning_rate": 0.0002277620946018748, | |
| "loss": 3.2706, | |
| "step": 57700 | |
| }, | |
| { | |
| "epoch": 6.215692605747497, | |
| "grad_norm": 0.7386913299560547, | |
| "learning_rate": 0.00022743885357181336, | |
| "loss": 3.2786, | |
| "step": 57750 | |
| }, | |
| { | |
| "epoch": 6.221074157787106, | |
| "grad_norm": 0.7626097202301025, | |
| "learning_rate": 0.00022711561254175192, | |
| "loss": 3.2787, | |
| "step": 57800 | |
| }, | |
| { | |
| "epoch": 6.226455709826714, | |
| "grad_norm": 0.7361026406288147, | |
| "learning_rate": 0.00022679237151169054, | |
| "loss": 3.2744, | |
| "step": 57850 | |
| }, | |
| { | |
| "epoch": 6.231837261866322, | |
| "grad_norm": 0.7097683548927307, | |
| "learning_rate": 0.0002264691304816291, | |
| "loss": 3.2767, | |
| "step": 57900 | |
| }, | |
| { | |
| "epoch": 6.237218813905931, | |
| "grad_norm": 0.792580783367157, | |
| "learning_rate": 0.00022614588945156768, | |
| "loss": 3.2917, | |
| "step": 57950 | |
| }, | |
| { | |
| "epoch": 6.242600365945538, | |
| "grad_norm": 0.6779635548591614, | |
| "learning_rate": 0.0002258226484215063, | |
| "loss": 3.2553, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 6.242600365945538, | |
| "eval_accuracy": 0.3850510576217773, | |
| "eval_loss": 3.3806025981903076, | |
| "eval_runtime": 184.4045, | |
| "eval_samples_per_second": 97.671, | |
| "eval_steps_per_second": 6.106, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 6.247981917985147, | |
| "grad_norm": 0.7419663667678833, | |
| "learning_rate": 0.00022549940739144487, | |
| "loss": 3.259, | |
| "step": 58050 | |
| }, | |
| { | |
| "epoch": 6.253363470024755, | |
| "grad_norm": 0.7085452675819397, | |
| "learning_rate": 0.00022517616636138344, | |
| "loss": 3.2813, | |
| "step": 58100 | |
| }, | |
| { | |
| "epoch": 6.258745022064363, | |
| "grad_norm": 0.7902224659919739, | |
| "learning_rate": 0.00022485292533132203, | |
| "loss": 3.2794, | |
| "step": 58150 | |
| }, | |
| { | |
| "epoch": 6.264126574103972, | |
| "grad_norm": 0.7155488133430481, | |
| "learning_rate": 0.00022452968430126063, | |
| "loss": 3.2745, | |
| "step": 58200 | |
| }, | |
| { | |
| "epoch": 6.26950812614358, | |
| "grad_norm": 1.2818570137023926, | |
| "learning_rate": 0.00022420644327119922, | |
| "loss": 3.268, | |
| "step": 58250 | |
| }, | |
| { | |
| "epoch": 6.274889678183188, | |
| "grad_norm": 0.750627875328064, | |
| "learning_rate": 0.0002238832022411378, | |
| "loss": 3.2734, | |
| "step": 58300 | |
| }, | |
| { | |
| "epoch": 6.280271230222796, | |
| "grad_norm": 0.7265231013298035, | |
| "learning_rate": 0.00022355996121107636, | |
| "loss": 3.274, | |
| "step": 58350 | |
| }, | |
| { | |
| "epoch": 6.285652782262405, | |
| "grad_norm": 0.679812490940094, | |
| "learning_rate": 0.00022323672018101498, | |
| "loss": 3.29, | |
| "step": 58400 | |
| }, | |
| { | |
| "epoch": 6.2910343343020125, | |
| "grad_norm": 0.7458456754684448, | |
| "learning_rate": 0.00022291347915095355, | |
| "loss": 3.2672, | |
| "step": 58450 | |
| }, | |
| { | |
| "epoch": 6.296415886341621, | |
| "grad_norm": 0.7149653434753418, | |
| "learning_rate": 0.0002225902381208921, | |
| "loss": 3.2806, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 6.301797438381229, | |
| "grad_norm": 0.7718719244003296, | |
| "learning_rate": 0.00022226699709083073, | |
| "loss": 3.2599, | |
| "step": 58550 | |
| }, | |
| { | |
| "epoch": 6.307178990420837, | |
| "grad_norm": 0.7340017557144165, | |
| "learning_rate": 0.0002219437560607693, | |
| "loss": 3.2805, | |
| "step": 58600 | |
| }, | |
| { | |
| "epoch": 6.312560542460446, | |
| "grad_norm": 0.7550706267356873, | |
| "learning_rate": 0.00022162051503070787, | |
| "loss": 3.2808, | |
| "step": 58650 | |
| }, | |
| { | |
| "epoch": 6.3179420945000535, | |
| "grad_norm": 0.7718273401260376, | |
| "learning_rate": 0.00022129727400064646, | |
| "loss": 3.2815, | |
| "step": 58700 | |
| }, | |
| { | |
| "epoch": 6.323323646539662, | |
| "grad_norm": 0.7422057390213013, | |
| "learning_rate": 0.00022097403297058506, | |
| "loss": 3.2822, | |
| "step": 58750 | |
| }, | |
| { | |
| "epoch": 6.328705198579271, | |
| "grad_norm": 0.7373623847961426, | |
| "learning_rate": 0.00022065079194052363, | |
| "loss": 3.2781, | |
| "step": 58800 | |
| }, | |
| { | |
| "epoch": 6.334086750618878, | |
| "grad_norm": 0.716949462890625, | |
| "learning_rate": 0.00022032755091046222, | |
| "loss": 3.2824, | |
| "step": 58850 | |
| }, | |
| { | |
| "epoch": 6.339468302658487, | |
| "grad_norm": 0.7594158053398132, | |
| "learning_rate": 0.0002200043098804008, | |
| "loss": 3.2713, | |
| "step": 58900 | |
| }, | |
| { | |
| "epoch": 6.344849854698095, | |
| "grad_norm": 0.7525697350502014, | |
| "learning_rate": 0.00021968106885033938, | |
| "loss": 3.2712, | |
| "step": 58950 | |
| }, | |
| { | |
| "epoch": 6.350231406737703, | |
| "grad_norm": 0.8482348918914795, | |
| "learning_rate": 0.00021935782782027798, | |
| "loss": 3.2965, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 6.350231406737703, | |
| "eval_accuracy": 0.3853254061635089, | |
| "eval_loss": 3.375702142715454, | |
| "eval_runtime": 184.5908, | |
| "eval_samples_per_second": 97.573, | |
| "eval_steps_per_second": 6.1, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 6.355612958777312, | |
| "grad_norm": 0.7227912545204163, | |
| "learning_rate": 0.00021903458679021655, | |
| "loss": 3.299, | |
| "step": 59050 | |
| }, | |
| { | |
| "epoch": 6.360994510816919, | |
| "grad_norm": 0.7208841443061829, | |
| "learning_rate": 0.0002187113457601551, | |
| "loss": 3.2816, | |
| "step": 59100 | |
| }, | |
| { | |
| "epoch": 6.366376062856528, | |
| "grad_norm": 0.6990681290626526, | |
| "learning_rate": 0.00021838810473009373, | |
| "loss": 3.2922, | |
| "step": 59150 | |
| }, | |
| { | |
| "epoch": 6.371757614896136, | |
| "grad_norm": 0.7942614555358887, | |
| "learning_rate": 0.0002180648637000323, | |
| "loss": 3.2885, | |
| "step": 59200 | |
| }, | |
| { | |
| "epoch": 6.377139166935744, | |
| "grad_norm": 0.7575851678848267, | |
| "learning_rate": 0.0002177416226699709, | |
| "loss": 3.2699, | |
| "step": 59250 | |
| }, | |
| { | |
| "epoch": 6.382520718975353, | |
| "grad_norm": 0.7590829133987427, | |
| "learning_rate": 0.00021741838163990946, | |
| "loss": 3.2947, | |
| "step": 59300 | |
| }, | |
| { | |
| "epoch": 6.387902271014961, | |
| "grad_norm": 0.7357906103134155, | |
| "learning_rate": 0.00021709514060984806, | |
| "loss": 3.2785, | |
| "step": 59350 | |
| }, | |
| { | |
| "epoch": 6.393283823054569, | |
| "grad_norm": 0.7962828278541565, | |
| "learning_rate": 0.00021677189957978665, | |
| "loss": 3.2822, | |
| "step": 59400 | |
| }, | |
| { | |
| "epoch": 6.398665375094177, | |
| "grad_norm": 0.750723659992218, | |
| "learning_rate": 0.00021645512337032643, | |
| "loss": 3.295, | |
| "step": 59450 | |
| }, | |
| { | |
| "epoch": 6.404046927133785, | |
| "grad_norm": 0.7317831516265869, | |
| "learning_rate": 0.00021613188234026506, | |
| "loss": 3.287, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 6.4094284791733935, | |
| "grad_norm": 0.7471715211868286, | |
| "learning_rate": 0.00021580864131020362, | |
| "loss": 3.2852, | |
| "step": 59550 | |
| }, | |
| { | |
| "epoch": 6.414810031213002, | |
| "grad_norm": 0.759760320186615, | |
| "learning_rate": 0.0002154854002801422, | |
| "loss": 3.2947, | |
| "step": 59600 | |
| }, | |
| { | |
| "epoch": 6.42019158325261, | |
| "grad_norm": 0.8137605786323547, | |
| "learning_rate": 0.0002151621592500808, | |
| "loss": 3.2968, | |
| "step": 59650 | |
| }, | |
| { | |
| "epoch": 6.425573135292218, | |
| "grad_norm": 0.7391119599342346, | |
| "learning_rate": 0.00021483891822001938, | |
| "loss": 3.2797, | |
| "step": 59700 | |
| }, | |
| { | |
| "epoch": 6.430954687331827, | |
| "grad_norm": 0.7140395641326904, | |
| "learning_rate": 0.00021451567718995795, | |
| "loss": 3.2755, | |
| "step": 59750 | |
| }, | |
| { | |
| "epoch": 6.4363362393714345, | |
| "grad_norm": 0.7186073064804077, | |
| "learning_rate": 0.00021419243615989654, | |
| "loss": 3.295, | |
| "step": 59800 | |
| }, | |
| { | |
| "epoch": 6.441717791411043, | |
| "grad_norm": 0.7441525459289551, | |
| "learning_rate": 0.00021386919512983514, | |
| "loss": 3.2836, | |
| "step": 59850 | |
| }, | |
| { | |
| "epoch": 6.447099343450651, | |
| "grad_norm": 0.743151843547821, | |
| "learning_rate": 0.0002135459540997737, | |
| "loss": 3.2637, | |
| "step": 59900 | |
| }, | |
| { | |
| "epoch": 6.452480895490259, | |
| "grad_norm": 0.7524924278259277, | |
| "learning_rate": 0.0002132227130697123, | |
| "loss": 3.2825, | |
| "step": 59950 | |
| }, | |
| { | |
| "epoch": 6.457862447529868, | |
| "grad_norm": 0.747959554195404, | |
| "learning_rate": 0.00021289947203965087, | |
| "loss": 3.2712, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 6.457862447529868, | |
| "eval_accuracy": 0.3854671981821068, | |
| "eval_loss": 3.372908353805542, | |
| "eval_runtime": 184.6808, | |
| "eval_samples_per_second": 97.525, | |
| "eval_steps_per_second": 6.097, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 6.4632439995694755, | |
| "grad_norm": 0.8120265007019043, | |
| "learning_rate": 0.0002125762310095895, | |
| "loss": 3.263, | |
| "step": 60050 | |
| }, | |
| { | |
| "epoch": 6.468625551609084, | |
| "grad_norm": 0.7335713505744934, | |
| "learning_rate": 0.00021225298997952806, | |
| "loss": 3.282, | |
| "step": 60100 | |
| }, | |
| { | |
| "epoch": 6.474007103648693, | |
| "grad_norm": 0.744031548500061, | |
| "learning_rate": 0.00021192974894946662, | |
| "loss": 3.2892, | |
| "step": 60150 | |
| }, | |
| { | |
| "epoch": 6.4793886556883, | |
| "grad_norm": 0.7717245221138, | |
| "learning_rate": 0.00021160650791940524, | |
| "loss": 3.3058, | |
| "step": 60200 | |
| }, | |
| { | |
| "epoch": 6.484770207727909, | |
| "grad_norm": 0.7391660213470459, | |
| "learning_rate": 0.0002112832668893438, | |
| "loss": 3.2783, | |
| "step": 60250 | |
| }, | |
| { | |
| "epoch": 6.490151759767517, | |
| "grad_norm": 0.7157920598983765, | |
| "learning_rate": 0.00021096002585928238, | |
| "loss": 3.2907, | |
| "step": 60300 | |
| }, | |
| { | |
| "epoch": 6.495533311807125, | |
| "grad_norm": 0.7572237253189087, | |
| "learning_rate": 0.00021063678482922097, | |
| "loss": 3.28, | |
| "step": 60350 | |
| }, | |
| { | |
| "epoch": 6.500914863846734, | |
| "grad_norm": 0.7682546377182007, | |
| "learning_rate": 0.00021031354379915957, | |
| "loss": 3.2659, | |
| "step": 60400 | |
| }, | |
| { | |
| "epoch": 6.506296415886341, | |
| "grad_norm": 0.7918459177017212, | |
| "learning_rate": 0.00020999030276909814, | |
| "loss": 3.2761, | |
| "step": 60450 | |
| }, | |
| { | |
| "epoch": 6.51167796792595, | |
| "grad_norm": 0.7500776052474976, | |
| "learning_rate": 0.00020966706173903673, | |
| "loss": 3.2804, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 6.517059519965558, | |
| "grad_norm": 0.7752882838249207, | |
| "learning_rate": 0.0002093438207089753, | |
| "loss": 3.287, | |
| "step": 60550 | |
| }, | |
| { | |
| "epoch": 6.522441072005166, | |
| "grad_norm": 0.7291586399078369, | |
| "learning_rate": 0.00020902057967891387, | |
| "loss": 3.2863, | |
| "step": 60600 | |
| }, | |
| { | |
| "epoch": 6.5278226240447745, | |
| "grad_norm": 0.7302510738372803, | |
| "learning_rate": 0.0002086973386488525, | |
| "loss": 3.3077, | |
| "step": 60650 | |
| }, | |
| { | |
| "epoch": 6.533204176084383, | |
| "grad_norm": 0.7242251038551331, | |
| "learning_rate": 0.00020837409761879106, | |
| "loss": 3.2746, | |
| "step": 60700 | |
| }, | |
| { | |
| "epoch": 6.538585728123991, | |
| "grad_norm": 0.7295466661453247, | |
| "learning_rate": 0.00020805085658872962, | |
| "loss": 3.2869, | |
| "step": 60750 | |
| }, | |
| { | |
| "epoch": 6.543967280163599, | |
| "grad_norm": 0.7399840354919434, | |
| "learning_rate": 0.00020772761555866825, | |
| "loss": 3.2958, | |
| "step": 60800 | |
| }, | |
| { | |
| "epoch": 6.549348832203208, | |
| "grad_norm": 0.8592695593833923, | |
| "learning_rate": 0.0002074043745286068, | |
| "loss": 3.2937, | |
| "step": 60850 | |
| }, | |
| { | |
| "epoch": 6.5547303842428155, | |
| "grad_norm": 0.7525405883789062, | |
| "learning_rate": 0.00020708113349854538, | |
| "loss": 3.2769, | |
| "step": 60900 | |
| }, | |
| { | |
| "epoch": 6.560111936282424, | |
| "grad_norm": 0.719514787197113, | |
| "learning_rate": 0.00020675789246848397, | |
| "loss": 3.2716, | |
| "step": 60950 | |
| }, | |
| { | |
| "epoch": 6.565493488322032, | |
| "grad_norm": 0.7651938796043396, | |
| "learning_rate": 0.00020643465143842257, | |
| "loss": 3.2857, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 6.565493488322032, | |
| "eval_accuracy": 0.38612998079777516, | |
| "eval_loss": 3.3699281215667725, | |
| "eval_runtime": 184.6571, | |
| "eval_samples_per_second": 97.538, | |
| "eval_steps_per_second": 6.098, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 6.57087504036164, | |
| "grad_norm": 0.8320351243019104, | |
| "learning_rate": 0.00020611141040836116, | |
| "loss": 3.2741, | |
| "step": 61050 | |
| }, | |
| { | |
| "epoch": 6.576256592401249, | |
| "grad_norm": 0.7015548944473267, | |
| "learning_rate": 0.00020578816937829973, | |
| "loss": 3.3026, | |
| "step": 61100 | |
| }, | |
| { | |
| "epoch": 6.5816381444408565, | |
| "grad_norm": 0.7299136519432068, | |
| "learning_rate": 0.0002054649283482383, | |
| "loss": 3.2858, | |
| "step": 61150 | |
| }, | |
| { | |
| "epoch": 6.587019696480465, | |
| "grad_norm": 0.7206780910491943, | |
| "learning_rate": 0.00020514168731817692, | |
| "loss": 3.2945, | |
| "step": 61200 | |
| }, | |
| { | |
| "epoch": 6.592401248520073, | |
| "grad_norm": 0.7657489776611328, | |
| "learning_rate": 0.0002048184462881155, | |
| "loss": 3.2834, | |
| "step": 61250 | |
| }, | |
| { | |
| "epoch": 6.597782800559681, | |
| "grad_norm": 0.786909282207489, | |
| "learning_rate": 0.00020449520525805406, | |
| "loss": 3.2806, | |
| "step": 61300 | |
| }, | |
| { | |
| "epoch": 6.60316435259929, | |
| "grad_norm": 0.6916054487228394, | |
| "learning_rate": 0.00020417196422799268, | |
| "loss": 3.2892, | |
| "step": 61350 | |
| }, | |
| { | |
| "epoch": 6.608545904638898, | |
| "grad_norm": 0.8359130620956421, | |
| "learning_rate": 0.00020384872319793125, | |
| "loss": 3.2798, | |
| "step": 61400 | |
| }, | |
| { | |
| "epoch": 6.613927456678506, | |
| "grad_norm": 0.7628897428512573, | |
| "learning_rate": 0.00020353194698847105, | |
| "loss": 3.2831, | |
| "step": 61450 | |
| }, | |
| { | |
| "epoch": 6.619309008718115, | |
| "grad_norm": 0.7674877643585205, | |
| "learning_rate": 0.00020321517077901086, | |
| "loss": 3.2636, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 6.624690560757722, | |
| "grad_norm": 0.7460737228393555, | |
| "learning_rate": 0.00020289192974894945, | |
| "loss": 3.2839, | |
| "step": 61550 | |
| }, | |
| { | |
| "epoch": 6.630072112797331, | |
| "grad_norm": 0.7583357691764832, | |
| "learning_rate": 0.00020256868871888802, | |
| "loss": 3.2861, | |
| "step": 61600 | |
| }, | |
| { | |
| "epoch": 6.635453664836939, | |
| "grad_norm": 0.8294389843940735, | |
| "learning_rate": 0.00020224544768882664, | |
| "loss": 3.2915, | |
| "step": 61650 | |
| }, | |
| { | |
| "epoch": 6.640835216876547, | |
| "grad_norm": 0.7348142266273499, | |
| "learning_rate": 0.0002019222066587652, | |
| "loss": 3.2809, | |
| "step": 61700 | |
| }, | |
| { | |
| "epoch": 6.6462167689161555, | |
| "grad_norm": 0.726068913936615, | |
| "learning_rate": 0.00020159896562870378, | |
| "loss": 3.2742, | |
| "step": 61750 | |
| }, | |
| { | |
| "epoch": 6.651598320955763, | |
| "grad_norm": 0.7640007734298706, | |
| "learning_rate": 0.00020127572459864237, | |
| "loss": 3.294, | |
| "step": 61800 | |
| }, | |
| { | |
| "epoch": 6.656979872995372, | |
| "grad_norm": 0.7816165685653687, | |
| "learning_rate": 0.00020095248356858097, | |
| "loss": 3.2971, | |
| "step": 61850 | |
| }, | |
| { | |
| "epoch": 6.66236142503498, | |
| "grad_norm": 0.7704948782920837, | |
| "learning_rate": 0.00020062924253851953, | |
| "loss": 3.2863, | |
| "step": 61900 | |
| }, | |
| { | |
| "epoch": 6.667742977074588, | |
| "grad_norm": 0.7741549611091614, | |
| "learning_rate": 0.00020030600150845813, | |
| "loss": 3.2754, | |
| "step": 61950 | |
| }, | |
| { | |
| "epoch": 6.6731245291141965, | |
| "grad_norm": 0.7607588768005371, | |
| "learning_rate": 0.0001999827604783967, | |
| "loss": 3.2798, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 6.6731245291141965, | |
| "eval_accuracy": 0.386048056520363, | |
| "eval_loss": 3.3638477325439453, | |
| "eval_runtime": 184.5657, | |
| "eval_samples_per_second": 97.586, | |
| "eval_steps_per_second": 6.101, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 6.678506081153805, | |
| "grad_norm": 0.7352211475372314, | |
| "learning_rate": 0.0001996595194483353, | |
| "loss": 3.2716, | |
| "step": 62050 | |
| }, | |
| { | |
| "epoch": 6.683887633193413, | |
| "grad_norm": 0.7325161695480347, | |
| "learning_rate": 0.00019933627841827389, | |
| "loss": 3.2848, | |
| "step": 62100 | |
| }, | |
| { | |
| "epoch": 6.689269185233021, | |
| "grad_norm": 0.7613785266876221, | |
| "learning_rate": 0.00019901303738821245, | |
| "loss": 3.2751, | |
| "step": 62150 | |
| }, | |
| { | |
| "epoch": 6.69465073727263, | |
| "grad_norm": 0.7282025814056396, | |
| "learning_rate": 0.00019868979635815102, | |
| "loss": 3.2849, | |
| "step": 62200 | |
| }, | |
| { | |
| "epoch": 6.7000322893122375, | |
| "grad_norm": 0.7329760789871216, | |
| "learning_rate": 0.00019836655532808964, | |
| "loss": 3.2808, | |
| "step": 62250 | |
| }, | |
| { | |
| "epoch": 6.705413841351846, | |
| "grad_norm": 0.7544186115264893, | |
| "learning_rate": 0.0001980433142980282, | |
| "loss": 3.2799, | |
| "step": 62300 | |
| }, | |
| { | |
| "epoch": 6.710795393391454, | |
| "grad_norm": 0.7261156439781189, | |
| "learning_rate": 0.00019772007326796678, | |
| "loss": 3.2821, | |
| "step": 62350 | |
| }, | |
| { | |
| "epoch": 6.716176945431062, | |
| "grad_norm": 0.722864031791687, | |
| "learning_rate": 0.0001973968322379054, | |
| "loss": 3.2838, | |
| "step": 62400 | |
| }, | |
| { | |
| "epoch": 6.721558497470671, | |
| "grad_norm": 0.7303403615951538, | |
| "learning_rate": 0.00019707359120784397, | |
| "loss": 3.307, | |
| "step": 62450 | |
| }, | |
| { | |
| "epoch": 6.7269400495102785, | |
| "grad_norm": 0.7062789797782898, | |
| "learning_rate": 0.00019675035017778253, | |
| "loss": 3.2812, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 6.732321601549887, | |
| "grad_norm": 0.748625636100769, | |
| "learning_rate": 0.00019642710914772113, | |
| "loss": 3.2913, | |
| "step": 62550 | |
| }, | |
| { | |
| "epoch": 6.737703153589496, | |
| "grad_norm": 0.7621287703514099, | |
| "learning_rate": 0.00019610386811765972, | |
| "loss": 3.2881, | |
| "step": 62600 | |
| }, | |
| { | |
| "epoch": 6.743084705629103, | |
| "grad_norm": 0.7435240149497986, | |
| "learning_rate": 0.00019578062708759832, | |
| "loss": 3.2807, | |
| "step": 62650 | |
| }, | |
| { | |
| "epoch": 6.748466257668712, | |
| "grad_norm": 0.7503770589828491, | |
| "learning_rate": 0.00019545738605753689, | |
| "loss": 3.3027, | |
| "step": 62700 | |
| }, | |
| { | |
| "epoch": 6.75384780970832, | |
| "grad_norm": 0.7563770413398743, | |
| "learning_rate": 0.00019513414502747545, | |
| "loss": 3.2875, | |
| "step": 62750 | |
| }, | |
| { | |
| "epoch": 6.759229361747928, | |
| "grad_norm": 0.8158665299415588, | |
| "learning_rate": 0.00019481090399741408, | |
| "loss": 3.2725, | |
| "step": 62800 | |
| }, | |
| { | |
| "epoch": 6.7646109137875365, | |
| "grad_norm": 0.7590814828872681, | |
| "learning_rate": 0.00019448766296735264, | |
| "loss": 3.3002, | |
| "step": 62850 | |
| }, | |
| { | |
| "epoch": 6.769992465827144, | |
| "grad_norm": 0.7146130800247192, | |
| "learning_rate": 0.0001941644219372912, | |
| "loss": 3.2843, | |
| "step": 62900 | |
| }, | |
| { | |
| "epoch": 6.775374017866753, | |
| "grad_norm": 0.7704887986183167, | |
| "learning_rate": 0.00019384118090722983, | |
| "loss": 3.2762, | |
| "step": 62950 | |
| }, | |
| { | |
| "epoch": 6.780755569906361, | |
| "grad_norm": 0.748637855052948, | |
| "learning_rate": 0.0001935179398771684, | |
| "loss": 3.2641, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 6.780755569906361, | |
| "eval_accuracy": 0.3869664993810588, | |
| "eval_loss": 3.359816551208496, | |
| "eval_runtime": 184.8195, | |
| "eval_samples_per_second": 97.452, | |
| "eval_steps_per_second": 6.092, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 6.786137121945969, | |
| "grad_norm": 0.8034060597419739, | |
| "learning_rate": 0.00019319469884710697, | |
| "loss": 3.2973, | |
| "step": 63050 | |
| }, | |
| { | |
| "epoch": 6.7915186739855775, | |
| "grad_norm": 0.7926182150840759, | |
| "learning_rate": 0.00019287145781704556, | |
| "loss": 3.2821, | |
| "step": 63100 | |
| }, | |
| { | |
| "epoch": 6.796900226025185, | |
| "grad_norm": 0.7208576798439026, | |
| "learning_rate": 0.00019254821678698416, | |
| "loss": 3.283, | |
| "step": 63150 | |
| }, | |
| { | |
| "epoch": 6.802281778064794, | |
| "grad_norm": 0.7681335210800171, | |
| "learning_rate": 0.00019222497575692272, | |
| "loss": 3.2823, | |
| "step": 63200 | |
| }, | |
| { | |
| "epoch": 6.807663330104402, | |
| "grad_norm": 0.7635934352874756, | |
| "learning_rate": 0.00019190173472686132, | |
| "loss": 3.3013, | |
| "step": 63250 | |
| }, | |
| { | |
| "epoch": 6.813044882144011, | |
| "grad_norm": 0.7212764024734497, | |
| "learning_rate": 0.0001915784936967999, | |
| "loss": 3.299, | |
| "step": 63300 | |
| }, | |
| { | |
| "epoch": 6.8184264341836185, | |
| "grad_norm": 0.7291616201400757, | |
| "learning_rate": 0.00019125525266673845, | |
| "loss": 3.2863, | |
| "step": 63350 | |
| }, | |
| { | |
| "epoch": 6.823807986223227, | |
| "grad_norm": 0.7780003547668457, | |
| "learning_rate": 0.00019093201163667708, | |
| "loss": 3.2753, | |
| "step": 63400 | |
| }, | |
| { | |
| "epoch": 6.829189538262835, | |
| "grad_norm": 0.7213042378425598, | |
| "learning_rate": 0.00019060877060661564, | |
| "loss": 3.2894, | |
| "step": 63450 | |
| }, | |
| { | |
| "epoch": 6.834571090302443, | |
| "grad_norm": 0.7561571002006531, | |
| "learning_rate": 0.0001902855295765542, | |
| "loss": 3.2773, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 6.839952642342052, | |
| "grad_norm": 0.8355714678764343, | |
| "learning_rate": 0.00018996228854649283, | |
| "loss": 3.2936, | |
| "step": 63550 | |
| }, | |
| { | |
| "epoch": 6.8453341943816595, | |
| "grad_norm": 0.7631604075431824, | |
| "learning_rate": 0.0001896390475164314, | |
| "loss": 3.2791, | |
| "step": 63600 | |
| }, | |
| { | |
| "epoch": 6.850715746421268, | |
| "grad_norm": 0.81952303647995, | |
| "learning_rate": 0.0001893222713069712, | |
| "loss": 3.2909, | |
| "step": 63650 | |
| }, | |
| { | |
| "epoch": 6.856097298460876, | |
| "grad_norm": 0.7389697432518005, | |
| "learning_rate": 0.0001889990302769098, | |
| "loss": 3.292, | |
| "step": 63700 | |
| }, | |
| { | |
| "epoch": 6.861478850500484, | |
| "grad_norm": 0.7638382315635681, | |
| "learning_rate": 0.0001886757892468484, | |
| "loss": 3.2677, | |
| "step": 63750 | |
| }, | |
| { | |
| "epoch": 6.866860402540093, | |
| "grad_norm": 0.7346477508544922, | |
| "learning_rate": 0.00018835254821678696, | |
| "loss": 3.2803, | |
| "step": 63800 | |
| }, | |
| { | |
| "epoch": 6.8722419545797, | |
| "grad_norm": 0.7897524237632751, | |
| "learning_rate": 0.00018802930718672553, | |
| "loss": 3.2859, | |
| "step": 63850 | |
| }, | |
| { | |
| "epoch": 6.877623506619309, | |
| "grad_norm": 0.791483461856842, | |
| "learning_rate": 0.00018770606615666415, | |
| "loss": 3.2651, | |
| "step": 63900 | |
| }, | |
| { | |
| "epoch": 6.8830050586589175, | |
| "grad_norm": 0.7509163022041321, | |
| "learning_rate": 0.00018738282512660272, | |
| "loss": 3.2808, | |
| "step": 63950 | |
| }, | |
| { | |
| "epoch": 6.888386610698525, | |
| "grad_norm": 0.8272793889045715, | |
| "learning_rate": 0.0001870595840965413, | |
| "loss": 3.2795, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 6.888386610698525, | |
| "eval_accuracy": 0.3871891291481907, | |
| "eval_loss": 3.356895923614502, | |
| "eval_runtime": 184.5849, | |
| "eval_samples_per_second": 97.576, | |
| "eval_steps_per_second": 6.1, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 6.893768162738134, | |
| "grad_norm": 0.7592813968658447, | |
| "learning_rate": 0.0001867363430664799, | |
| "loss": 3.2996, | |
| "step": 64050 | |
| }, | |
| { | |
| "epoch": 6.899149714777742, | |
| "grad_norm": 0.7349041700363159, | |
| "learning_rate": 0.00018641310203641848, | |
| "loss": 3.2695, | |
| "step": 64100 | |
| }, | |
| { | |
| "epoch": 6.90453126681735, | |
| "grad_norm": 0.7781025767326355, | |
| "learning_rate": 0.00018608986100635705, | |
| "loss": 3.2748, | |
| "step": 64150 | |
| }, | |
| { | |
| "epoch": 6.9099128188569585, | |
| "grad_norm": 0.7788296937942505, | |
| "learning_rate": 0.00018576661997629564, | |
| "loss": 3.2818, | |
| "step": 64200 | |
| }, | |
| { | |
| "epoch": 6.915294370896566, | |
| "grad_norm": 0.8603556752204895, | |
| "learning_rate": 0.00018544337894623423, | |
| "loss": 3.2798, | |
| "step": 64250 | |
| }, | |
| { | |
| "epoch": 6.920675922936175, | |
| "grad_norm": 0.7542293667793274, | |
| "learning_rate": 0.0001851201379161728, | |
| "loss": 3.2944, | |
| "step": 64300 | |
| }, | |
| { | |
| "epoch": 6.926057474975783, | |
| "grad_norm": 0.7522438764572144, | |
| "learning_rate": 0.0001847968968861114, | |
| "loss": 3.3009, | |
| "step": 64350 | |
| }, | |
| { | |
| "epoch": 6.931439027015391, | |
| "grad_norm": 0.7493849992752075, | |
| "learning_rate": 0.00018447365585604996, | |
| "loss": 3.2968, | |
| "step": 64400 | |
| }, | |
| { | |
| "epoch": 6.9368205790549995, | |
| "grad_norm": 0.7522703409194946, | |
| "learning_rate": 0.00018415041482598859, | |
| "loss": 3.2705, | |
| "step": 64450 | |
| }, | |
| { | |
| "epoch": 6.942202131094608, | |
| "grad_norm": 0.7539739012718201, | |
| "learning_rate": 0.00018382717379592715, | |
| "loss": 3.2899, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 6.947583683134216, | |
| "grad_norm": 0.8078305125236511, | |
| "learning_rate": 0.00018350393276586572, | |
| "loss": 3.3006, | |
| "step": 64550 | |
| }, | |
| { | |
| "epoch": 6.952965235173824, | |
| "grad_norm": 0.7467326521873474, | |
| "learning_rate": 0.00018318069173580434, | |
| "loss": 3.2725, | |
| "step": 64600 | |
| }, | |
| { | |
| "epoch": 6.958346787213433, | |
| "grad_norm": 0.7241435050964355, | |
| "learning_rate": 0.0001828574507057429, | |
| "loss": 3.29, | |
| "step": 64650 | |
| }, | |
| { | |
| "epoch": 6.9637283392530405, | |
| "grad_norm": 0.7517207860946655, | |
| "learning_rate": 0.00018253420967568148, | |
| "loss": 3.2657, | |
| "step": 64700 | |
| }, | |
| { | |
| "epoch": 6.969109891292649, | |
| "grad_norm": 0.8167835474014282, | |
| "learning_rate": 0.00018221096864562007, | |
| "loss": 3.291, | |
| "step": 64750 | |
| }, | |
| { | |
| "epoch": 6.974491443332257, | |
| "grad_norm": 0.7494839429855347, | |
| "learning_rate": 0.00018188772761555867, | |
| "loss": 3.3016, | |
| "step": 64800 | |
| }, | |
| { | |
| "epoch": 6.979872995371865, | |
| "grad_norm": 0.7137006521224976, | |
| "learning_rate": 0.00018156448658549723, | |
| "loss": 3.2964, | |
| "step": 64850 | |
| }, | |
| { | |
| "epoch": 6.985254547411474, | |
| "grad_norm": 0.7802633047103882, | |
| "learning_rate": 0.00018124124555543583, | |
| "loss": 3.2759, | |
| "step": 64900 | |
| }, | |
| { | |
| "epoch": 6.990636099451081, | |
| "grad_norm": 0.7316725254058838, | |
| "learning_rate": 0.0001809180045253744, | |
| "loss": 3.2743, | |
| "step": 64950 | |
| }, | |
| { | |
| "epoch": 6.99601765149069, | |
| "grad_norm": 0.7353840470314026, | |
| "learning_rate": 0.00018059476349531296, | |
| "loss": 3.2994, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 6.99601765149069, | |
| "eval_accuracy": 0.38757897570966904, | |
| "eval_loss": 3.3525335788726807, | |
| "eval_runtime": 184.903, | |
| "eval_samples_per_second": 97.408, | |
| "eval_steps_per_second": 6.09, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 7.0013992035302985, | |
| "grad_norm": 0.7722090482711792, | |
| "learning_rate": 0.00018027152246525159, | |
| "loss": 3.2508, | |
| "step": 65050 | |
| }, | |
| { | |
| "epoch": 7.006780755569906, | |
| "grad_norm": 0.745133638381958, | |
| "learning_rate": 0.00017994828143519015, | |
| "loss": 3.185, | |
| "step": 65100 | |
| }, | |
| { | |
| "epoch": 7.012162307609515, | |
| "grad_norm": 0.7634187936782837, | |
| "learning_rate": 0.00017962504040512872, | |
| "loss": 3.185, | |
| "step": 65150 | |
| }, | |
| { | |
| "epoch": 7.017543859649122, | |
| "grad_norm": 0.7494111657142639, | |
| "learning_rate": 0.00017930179937506734, | |
| "loss": 3.21, | |
| "step": 65200 | |
| }, | |
| { | |
| "epoch": 7.022925411688731, | |
| "grad_norm": 0.798454999923706, | |
| "learning_rate": 0.0001789785583450059, | |
| "loss": 3.2116, | |
| "step": 65250 | |
| }, | |
| { | |
| "epoch": 7.0283069637283395, | |
| "grad_norm": 0.7529196739196777, | |
| "learning_rate": 0.00017865531731494448, | |
| "loss": 3.2022, | |
| "step": 65300 | |
| }, | |
| { | |
| "epoch": 7.033688515767947, | |
| "grad_norm": 0.7898669242858887, | |
| "learning_rate": 0.00017833207628488307, | |
| "loss": 3.1969, | |
| "step": 65350 | |
| }, | |
| { | |
| "epoch": 7.039070067807556, | |
| "grad_norm": 0.7746036052703857, | |
| "learning_rate": 0.00017800883525482167, | |
| "loss": 3.1942, | |
| "step": 65400 | |
| }, | |
| { | |
| "epoch": 7.044451619847164, | |
| "grad_norm": 0.8117377161979675, | |
| "learning_rate": 0.00017768559422476026, | |
| "loss": 3.2071, | |
| "step": 65450 | |
| }, | |
| { | |
| "epoch": 7.049833171886772, | |
| "grad_norm": 0.7499861121177673, | |
| "learning_rate": 0.00017736235319469883, | |
| "loss": 3.2166, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 7.0552147239263805, | |
| "grad_norm": 0.7656523585319519, | |
| "learning_rate": 0.0001770391121646374, | |
| "loss": 3.1925, | |
| "step": 65550 | |
| }, | |
| { | |
| "epoch": 7.060596275965988, | |
| "grad_norm": 0.7908286452293396, | |
| "learning_rate": 0.00017671587113457602, | |
| "loss": 3.2114, | |
| "step": 65600 | |
| }, | |
| { | |
| "epoch": 7.065977828005597, | |
| "grad_norm": 0.7967495322227478, | |
| "learning_rate": 0.0001763926301045146, | |
| "loss": 3.2132, | |
| "step": 65650 | |
| }, | |
| { | |
| "epoch": 7.071359380045205, | |
| "grad_norm": 0.7433943748474121, | |
| "learning_rate": 0.00017606938907445315, | |
| "loss": 3.2086, | |
| "step": 65700 | |
| }, | |
| { | |
| "epoch": 7.076740932084813, | |
| "grad_norm": 0.7751272320747375, | |
| "learning_rate": 0.00017574614804439178, | |
| "loss": 3.2388, | |
| "step": 65750 | |
| }, | |
| { | |
| "epoch": 7.0821224841244215, | |
| "grad_norm": 0.7907826900482178, | |
| "learning_rate": 0.00017542290701433034, | |
| "loss": 3.2292, | |
| "step": 65800 | |
| }, | |
| { | |
| "epoch": 7.08750403616403, | |
| "grad_norm": 0.7536311745643616, | |
| "learning_rate": 0.0001750996659842689, | |
| "loss": 3.2051, | |
| "step": 65850 | |
| }, | |
| { | |
| "epoch": 7.092885588203638, | |
| "grad_norm": 0.8473614454269409, | |
| "learning_rate": 0.0001747764249542075, | |
| "loss": 3.1845, | |
| "step": 65900 | |
| }, | |
| { | |
| "epoch": 7.098267140243246, | |
| "grad_norm": 0.764302134513855, | |
| "learning_rate": 0.0001744531839241461, | |
| "loss": 3.193, | |
| "step": 65950 | |
| }, | |
| { | |
| "epoch": 7.103648692282855, | |
| "grad_norm": 0.7599833607673645, | |
| "learning_rate": 0.00017412994289408467, | |
| "loss": 3.2226, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 7.103648692282855, | |
| "eval_accuracy": 0.38751063304323374, | |
| "eval_loss": 3.3595468997955322, | |
| "eval_runtime": 184.5489, | |
| "eval_samples_per_second": 97.595, | |
| "eval_steps_per_second": 6.101, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 7.109030244322462, | |
| "grad_norm": 0.7426865100860596, | |
| "learning_rate": 0.00017380670186402326, | |
| "loss": 3.2087, | |
| "step": 66050 | |
| }, | |
| { | |
| "epoch": 7.114411796362071, | |
| "grad_norm": 0.7286989688873291, | |
| "learning_rate": 0.00017348346083396183, | |
| "loss": 3.2345, | |
| "step": 66100 | |
| }, | |
| { | |
| "epoch": 7.119793348401679, | |
| "grad_norm": 0.7323970794677734, | |
| "learning_rate": 0.00017316021980390042, | |
| "loss": 3.2154, | |
| "step": 66150 | |
| }, | |
| { | |
| "epoch": 7.125174900441287, | |
| "grad_norm": 0.8056437373161316, | |
| "learning_rate": 0.00017283697877383902, | |
| "loss": 3.2244, | |
| "step": 66200 | |
| }, | |
| { | |
| "epoch": 7.130556452480896, | |
| "grad_norm": 0.8104016184806824, | |
| "learning_rate": 0.0001725137377437776, | |
| "loss": 3.2237, | |
| "step": 66250 | |
| }, | |
| { | |
| "epoch": 7.135938004520503, | |
| "grad_norm": 0.7950676679611206, | |
| "learning_rate": 0.00017219049671371615, | |
| "loss": 3.2161, | |
| "step": 66300 | |
| }, | |
| { | |
| "epoch": 7.141319556560112, | |
| "grad_norm": 0.7405356764793396, | |
| "learning_rate": 0.00017186725568365478, | |
| "loss": 3.2312, | |
| "step": 66350 | |
| }, | |
| { | |
| "epoch": 7.1467011085997205, | |
| "grad_norm": 0.7578369379043579, | |
| "learning_rate": 0.00017154401465359334, | |
| "loss": 3.2037, | |
| "step": 66400 | |
| }, | |
| { | |
| "epoch": 7.152082660639328, | |
| "grad_norm": 0.7615239024162292, | |
| "learning_rate": 0.00017122077362353194, | |
| "loss": 3.2061, | |
| "step": 66450 | |
| }, | |
| { | |
| "epoch": 7.157464212678937, | |
| "grad_norm": 0.7853385210037231, | |
| "learning_rate": 0.00017089753259347053, | |
| "loss": 3.2221, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 7.162845764718545, | |
| "grad_norm": 0.7812248468399048, | |
| "learning_rate": 0.0001705742915634091, | |
| "loss": 3.2089, | |
| "step": 66550 | |
| }, | |
| { | |
| "epoch": 7.168227316758153, | |
| "grad_norm": 0.79538494348526, | |
| "learning_rate": 0.0001702510505333477, | |
| "loss": 3.2266, | |
| "step": 66600 | |
| }, | |
| { | |
| "epoch": 7.1736088687977615, | |
| "grad_norm": 0.7920806407928467, | |
| "learning_rate": 0.00016992780950328626, | |
| "loss": 3.2159, | |
| "step": 66650 | |
| }, | |
| { | |
| "epoch": 7.178990420837369, | |
| "grad_norm": 0.8788477182388306, | |
| "learning_rate": 0.00016960456847322486, | |
| "loss": 3.2352, | |
| "step": 66700 | |
| }, | |
| { | |
| "epoch": 7.184371972876978, | |
| "grad_norm": 0.787079930305481, | |
| "learning_rate": 0.00016928132744316345, | |
| "loss": 3.2172, | |
| "step": 66750 | |
| }, | |
| { | |
| "epoch": 7.189753524916586, | |
| "grad_norm": 0.7809692025184631, | |
| "learning_rate": 0.00016895808641310202, | |
| "loss": 3.2311, | |
| "step": 66800 | |
| }, | |
| { | |
| "epoch": 7.195135076956194, | |
| "grad_norm": 0.7488144636154175, | |
| "learning_rate": 0.0001686348453830406, | |
| "loss": 3.2273, | |
| "step": 66850 | |
| }, | |
| { | |
| "epoch": 7.2005166289958025, | |
| "grad_norm": 0.7899309396743774, | |
| "learning_rate": 0.0001683116043529792, | |
| "loss": 3.2404, | |
| "step": 66900 | |
| }, | |
| { | |
| "epoch": 7.205898181035411, | |
| "grad_norm": 0.7522909045219421, | |
| "learning_rate": 0.00016798836332291778, | |
| "loss": 3.2386, | |
| "step": 66950 | |
| }, | |
| { | |
| "epoch": 7.211279733075019, | |
| "grad_norm": 0.7979075312614441, | |
| "learning_rate": 0.00016766512229285634, | |
| "loss": 3.236, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 7.211279733075019, | |
| "eval_accuracy": 0.3877005582911335, | |
| "eval_loss": 3.3552801609039307, | |
| "eval_runtime": 184.2064, | |
| "eval_samples_per_second": 97.776, | |
| "eval_steps_per_second": 6.113, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 7.216661285114627, | |
| "grad_norm": 0.7617997527122498, | |
| "learning_rate": 0.00016734188126279494, | |
| "loss": 3.2404, | |
| "step": 67050 | |
| }, | |
| { | |
| "epoch": 7.222042837154235, | |
| "grad_norm": 0.7634185552597046, | |
| "learning_rate": 0.00016701864023273353, | |
| "loss": 3.2352, | |
| "step": 67100 | |
| }, | |
| { | |
| "epoch": 7.2274243891938434, | |
| "grad_norm": 0.8232079148292542, | |
| "learning_rate": 0.0001666953992026721, | |
| "loss": 3.2292, | |
| "step": 67150 | |
| }, | |
| { | |
| "epoch": 7.232805941233452, | |
| "grad_norm": 0.8213094472885132, | |
| "learning_rate": 0.0001663721581726107, | |
| "loss": 3.2231, | |
| "step": 67200 | |
| }, | |
| { | |
| "epoch": 7.23818749327306, | |
| "grad_norm": 0.7654265761375427, | |
| "learning_rate": 0.00016604891714254926, | |
| "loss": 3.2287, | |
| "step": 67250 | |
| }, | |
| { | |
| "epoch": 7.243569045312668, | |
| "grad_norm": 0.7495639324188232, | |
| "learning_rate": 0.00016572567611248786, | |
| "loss": 3.2287, | |
| "step": 67300 | |
| }, | |
| { | |
| "epoch": 7.248950597352277, | |
| "grad_norm": 0.7579506039619446, | |
| "learning_rate": 0.00016540243508242645, | |
| "loss": 3.2304, | |
| "step": 67350 | |
| }, | |
| { | |
| "epoch": 7.254332149391884, | |
| "grad_norm": 0.7823368906974792, | |
| "learning_rate": 0.00016507919405236502, | |
| "loss": 3.209, | |
| "step": 67400 | |
| }, | |
| { | |
| "epoch": 7.259713701431493, | |
| "grad_norm": 0.7844114303588867, | |
| "learning_rate": 0.00016475595302230364, | |
| "loss": 3.2216, | |
| "step": 67450 | |
| }, | |
| { | |
| "epoch": 7.265095253471101, | |
| "grad_norm": 0.7908133864402771, | |
| "learning_rate": 0.0001644327119922422, | |
| "loss": 3.2328, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 7.270476805510709, | |
| "grad_norm": 0.8082903623580933, | |
| "learning_rate": 0.00016410947096218078, | |
| "loss": 3.2432, | |
| "step": 67550 | |
| }, | |
| { | |
| "epoch": 7.275858357550318, | |
| "grad_norm": 0.7870227694511414, | |
| "learning_rate": 0.00016378622993211937, | |
| "loss": 3.2239, | |
| "step": 67600 | |
| }, | |
| { | |
| "epoch": 7.281239909589925, | |
| "grad_norm": 0.7916932702064514, | |
| "learning_rate": 0.00016346298890205797, | |
| "loss": 3.2237, | |
| "step": 67650 | |
| }, | |
| { | |
| "epoch": 7.286621461629534, | |
| "grad_norm": 0.7457194924354553, | |
| "learning_rate": 0.00016314621269259777, | |
| "loss": 3.2412, | |
| "step": 67700 | |
| }, | |
| { | |
| "epoch": 7.2920030136691425, | |
| "grad_norm": 0.8166563510894775, | |
| "learning_rate": 0.00016282297166253634, | |
| "loss": 3.2098, | |
| "step": 67750 | |
| }, | |
| { | |
| "epoch": 7.29738456570875, | |
| "grad_norm": 0.8328801393508911, | |
| "learning_rate": 0.00016249973063247494, | |
| "loss": 3.2359, | |
| "step": 67800 | |
| }, | |
| { | |
| "epoch": 7.302766117748359, | |
| "grad_norm": 0.777811586856842, | |
| "learning_rate": 0.00016217648960241353, | |
| "loss": 3.2321, | |
| "step": 67850 | |
| }, | |
| { | |
| "epoch": 7.308147669787967, | |
| "grad_norm": 0.8480938673019409, | |
| "learning_rate": 0.0001618532485723521, | |
| "loss": 3.2132, | |
| "step": 67900 | |
| }, | |
| { | |
| "epoch": 7.313529221827575, | |
| "grad_norm": 0.7613167762756348, | |
| "learning_rate": 0.00016153000754229067, | |
| "loss": 3.235, | |
| "step": 67950 | |
| }, | |
| { | |
| "epoch": 7.3189107738671835, | |
| "grad_norm": 0.7387501001358032, | |
| "learning_rate": 0.0001612067665122293, | |
| "loss": 3.2177, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 7.3189107738671835, | |
| "eval_accuracy": 0.3881735243117899, | |
| "eval_loss": 3.3549270629882812, | |
| "eval_runtime": 183.9194, | |
| "eval_samples_per_second": 97.929, | |
| "eval_steps_per_second": 6.122, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 7.324292325906791, | |
| "grad_norm": 0.8005468845367432, | |
| "learning_rate": 0.00016088352548216785, | |
| "loss": 3.224, | |
| "step": 68050 | |
| }, | |
| { | |
| "epoch": 7.3296738779464, | |
| "grad_norm": 0.7966600060462952, | |
| "learning_rate": 0.00016056028445210642, | |
| "loss": 3.2397, | |
| "step": 68100 | |
| }, | |
| { | |
| "epoch": 7.335055429986008, | |
| "grad_norm": 0.727301836013794, | |
| "learning_rate": 0.00016023704342204504, | |
| "loss": 3.2308, | |
| "step": 68150 | |
| }, | |
| { | |
| "epoch": 7.340436982025616, | |
| "grad_norm": 0.7933337688446045, | |
| "learning_rate": 0.0001599138023919836, | |
| "loss": 3.2197, | |
| "step": 68200 | |
| }, | |
| { | |
| "epoch": 7.3458185340652244, | |
| "grad_norm": 0.77813321352005, | |
| "learning_rate": 0.0001595905613619222, | |
| "loss": 3.2116, | |
| "step": 68250 | |
| }, | |
| { | |
| "epoch": 7.351200086104833, | |
| "grad_norm": 0.8122656345367432, | |
| "learning_rate": 0.00015926732033186077, | |
| "loss": 3.2054, | |
| "step": 68300 | |
| }, | |
| { | |
| "epoch": 7.356581638144441, | |
| "grad_norm": 0.7639948725700378, | |
| "learning_rate": 0.00015894407930179934, | |
| "loss": 3.2372, | |
| "step": 68350 | |
| }, | |
| { | |
| "epoch": 7.361963190184049, | |
| "grad_norm": 0.7615376710891724, | |
| "learning_rate": 0.00015862083827173796, | |
| "loss": 3.2182, | |
| "step": 68400 | |
| }, | |
| { | |
| "epoch": 7.367344742223658, | |
| "grad_norm": 0.7874367833137512, | |
| "learning_rate": 0.00015829759724167653, | |
| "loss": 3.2262, | |
| "step": 68450 | |
| }, | |
| { | |
| "epoch": 7.372726294263265, | |
| "grad_norm": 0.7851904630661011, | |
| "learning_rate": 0.0001579743562116151, | |
| "loss": 3.2176, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 7.378107846302874, | |
| "grad_norm": 0.7756659388542175, | |
| "learning_rate": 0.00015765111518155372, | |
| "loss": 3.2207, | |
| "step": 68550 | |
| }, | |
| { | |
| "epoch": 7.383489398342482, | |
| "grad_norm": 0.7584941387176514, | |
| "learning_rate": 0.0001573278741514923, | |
| "loss": 3.2037, | |
| "step": 68600 | |
| }, | |
| { | |
| "epoch": 7.38887095038209, | |
| "grad_norm": 0.7656226754188538, | |
| "learning_rate": 0.00015700463312143085, | |
| "loss": 3.241, | |
| "step": 68650 | |
| }, | |
| { | |
| "epoch": 7.394252502421699, | |
| "grad_norm": 0.814250648021698, | |
| "learning_rate": 0.00015668139209136945, | |
| "loss": 3.2298, | |
| "step": 68700 | |
| }, | |
| { | |
| "epoch": 7.399634054461306, | |
| "grad_norm": 0.8023512363433838, | |
| "learning_rate": 0.00015635815106130804, | |
| "loss": 3.2192, | |
| "step": 68750 | |
| }, | |
| { | |
| "epoch": 7.405015606500915, | |
| "grad_norm": 0.845595121383667, | |
| "learning_rate": 0.0001560349100312466, | |
| "loss": 3.2426, | |
| "step": 68800 | |
| }, | |
| { | |
| "epoch": 7.4103971585405235, | |
| "grad_norm": 0.7891848087310791, | |
| "learning_rate": 0.0001557116690011852, | |
| "loss": 3.214, | |
| "step": 68850 | |
| }, | |
| { | |
| "epoch": 7.415778710580131, | |
| "grad_norm": 0.8064113855361938, | |
| "learning_rate": 0.00015538842797112377, | |
| "loss": 3.2318, | |
| "step": 68900 | |
| }, | |
| { | |
| "epoch": 7.42116026261974, | |
| "grad_norm": 0.7896233797073364, | |
| "learning_rate": 0.00015506518694106237, | |
| "loss": 3.2226, | |
| "step": 68950 | |
| }, | |
| { | |
| "epoch": 7.426541814659347, | |
| "grad_norm": 0.7546834945678711, | |
| "learning_rate": 0.00015474194591100096, | |
| "loss": 3.2373, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 7.426541814659347, | |
| "eval_accuracy": 0.3881545100564224, | |
| "eval_loss": 3.350152015686035, | |
| "eval_runtime": 183.5756, | |
| "eval_samples_per_second": 98.112, | |
| "eval_steps_per_second": 6.134, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 7.431923366698956, | |
| "grad_norm": 0.8021217584609985, | |
| "learning_rate": 0.00015441870488093953, | |
| "loss": 3.2293, | |
| "step": 69050 | |
| }, | |
| { | |
| "epoch": 7.4373049187385645, | |
| "grad_norm": 0.7711283564567566, | |
| "learning_rate": 0.0001540954638508781, | |
| "loss": 3.2068, | |
| "step": 69100 | |
| }, | |
| { | |
| "epoch": 7.442686470778172, | |
| "grad_norm": 0.7406951189041138, | |
| "learning_rate": 0.00015377222282081672, | |
| "loss": 3.2363, | |
| "step": 69150 | |
| }, | |
| { | |
| "epoch": 7.448068022817781, | |
| "grad_norm": 0.7828607559204102, | |
| "learning_rate": 0.0001534489817907553, | |
| "loss": 3.23, | |
| "step": 69200 | |
| }, | |
| { | |
| "epoch": 7.453449574857389, | |
| "grad_norm": 0.7775204181671143, | |
| "learning_rate": 0.00015312574076069388, | |
| "loss": 3.2188, | |
| "step": 69250 | |
| }, | |
| { | |
| "epoch": 7.458831126896997, | |
| "grad_norm": 0.8648480176925659, | |
| "learning_rate": 0.00015280249973063248, | |
| "loss": 3.2287, | |
| "step": 69300 | |
| }, | |
| { | |
| "epoch": 7.4642126789366054, | |
| "grad_norm": 0.7523376941680908, | |
| "learning_rate": 0.00015247925870057104, | |
| "loss": 3.2364, | |
| "step": 69350 | |
| }, | |
| { | |
| "epoch": 7.469594230976213, | |
| "grad_norm": 0.7940405607223511, | |
| "learning_rate": 0.00015215601767050964, | |
| "loss": 3.2329, | |
| "step": 69400 | |
| }, | |
| { | |
| "epoch": 7.474975783015822, | |
| "grad_norm": 0.8218305706977844, | |
| "learning_rate": 0.0001518327766404482, | |
| "loss": 3.2279, | |
| "step": 69450 | |
| }, | |
| { | |
| "epoch": 7.48035733505543, | |
| "grad_norm": 0.8601292967796326, | |
| "learning_rate": 0.0001515095356103868, | |
| "loss": 3.2287, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 7.485738887095038, | |
| "grad_norm": 0.7894765138626099, | |
| "learning_rate": 0.0001511862945803254, | |
| "loss": 3.2431, | |
| "step": 69550 | |
| }, | |
| { | |
| "epoch": 7.491120439134646, | |
| "grad_norm": 0.7626713514328003, | |
| "learning_rate": 0.00015086305355026396, | |
| "loss": 3.239, | |
| "step": 69600 | |
| }, | |
| { | |
| "epoch": 7.496501991174255, | |
| "grad_norm": 0.7660843133926392, | |
| "learning_rate": 0.00015053981252020253, | |
| "loss": 3.2452, | |
| "step": 69650 | |
| }, | |
| { | |
| "epoch": 7.501883543213863, | |
| "grad_norm": 0.7898152470588684, | |
| "learning_rate": 0.00015021657149014115, | |
| "loss": 3.2364, | |
| "step": 69700 | |
| }, | |
| { | |
| "epoch": 7.507265095253471, | |
| "grad_norm": 0.7465683221817017, | |
| "learning_rate": 0.00014989333046007972, | |
| "loss": 3.2378, | |
| "step": 69750 | |
| }, | |
| { | |
| "epoch": 7.51264664729308, | |
| "grad_norm": 0.8163168430328369, | |
| "learning_rate": 0.00014957008943001832, | |
| "loss": 3.2374, | |
| "step": 69800 | |
| }, | |
| { | |
| "epoch": 7.518028199332687, | |
| "grad_norm": 0.7777751684188843, | |
| "learning_rate": 0.00014925331322055812, | |
| "loss": 3.2407, | |
| "step": 69850 | |
| }, | |
| { | |
| "epoch": 7.523409751372296, | |
| "grad_norm": 0.7856535911560059, | |
| "learning_rate": 0.0001489300721904967, | |
| "loss": 3.2332, | |
| "step": 69900 | |
| }, | |
| { | |
| "epoch": 7.528791303411904, | |
| "grad_norm": 0.8025381565093994, | |
| "learning_rate": 0.00014860683116043528, | |
| "loss": 3.2268, | |
| "step": 69950 | |
| }, | |
| { | |
| "epoch": 7.534172855451512, | |
| "grad_norm": 0.768328070640564, | |
| "learning_rate": 0.00014828359013037385, | |
| "loss": 3.2271, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 7.534172855451512, | |
| "eval_accuracy": 0.3889133418249187, | |
| "eval_loss": 3.347104549407959, | |
| "eval_runtime": 186.753, | |
| "eval_samples_per_second": 96.443, | |
| "eval_steps_per_second": 6.029, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 7.539554407491121, | |
| "grad_norm": 0.7838550209999084, | |
| "learning_rate": 0.00014796034910031245, | |
| "loss": 3.2358, | |
| "step": 70050 | |
| }, | |
| { | |
| "epoch": 7.544935959530728, | |
| "grad_norm": 0.8263210654258728, | |
| "learning_rate": 0.00014763710807025104, | |
| "loss": 3.2473, | |
| "step": 70100 | |
| }, | |
| { | |
| "epoch": 7.550317511570337, | |
| "grad_norm": 0.7420403361320496, | |
| "learning_rate": 0.0001473138670401896, | |
| "loss": 3.2405, | |
| "step": 70150 | |
| }, | |
| { | |
| "epoch": 7.5556990636099455, | |
| "grad_norm": 0.8509701490402222, | |
| "learning_rate": 0.0001469906260101282, | |
| "loss": 3.2202, | |
| "step": 70200 | |
| }, | |
| { | |
| "epoch": 7.561080615649553, | |
| "grad_norm": 0.7671039700508118, | |
| "learning_rate": 0.0001466673849800668, | |
| "loss": 3.2382, | |
| "step": 70250 | |
| }, | |
| { | |
| "epoch": 7.566462167689162, | |
| "grad_norm": 0.7579676508903503, | |
| "learning_rate": 0.0001463441439500054, | |
| "loss": 3.2368, | |
| "step": 70300 | |
| }, | |
| { | |
| "epoch": 7.57184371972877, | |
| "grad_norm": 0.7678229212760925, | |
| "learning_rate": 0.00014602090291994396, | |
| "loss": 3.2319, | |
| "step": 70350 | |
| }, | |
| { | |
| "epoch": 7.577225271768378, | |
| "grad_norm": 0.7729119658470154, | |
| "learning_rate": 0.00014569766188988255, | |
| "loss": 3.2431, | |
| "step": 70400 | |
| }, | |
| { | |
| "epoch": 7.5826068238079865, | |
| "grad_norm": 0.8020555973052979, | |
| "learning_rate": 0.00014537442085982112, | |
| "loss": 3.2512, | |
| "step": 70450 | |
| }, | |
| { | |
| "epoch": 7.587988375847594, | |
| "grad_norm": 0.8227038383483887, | |
| "learning_rate": 0.00014505117982975972, | |
| "loss": 3.2352, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 7.593369927887203, | |
| "grad_norm": 0.7413498163223267, | |
| "learning_rate": 0.00014472793879969828, | |
| "loss": 3.226, | |
| "step": 70550 | |
| }, | |
| { | |
| "epoch": 7.598751479926811, | |
| "grad_norm": 0.8386573791503906, | |
| "learning_rate": 0.00014440469776963688, | |
| "loss": 3.2471, | |
| "step": 70600 | |
| }, | |
| { | |
| "epoch": 7.604133031966419, | |
| "grad_norm": 0.7619134783744812, | |
| "learning_rate": 0.00014408145673957545, | |
| "loss": 3.213, | |
| "step": 70650 | |
| }, | |
| { | |
| "epoch": 7.609514584006027, | |
| "grad_norm": 0.7510246634483337, | |
| "learning_rate": 0.00014375821570951404, | |
| "loss": 3.2313, | |
| "step": 70700 | |
| }, | |
| { | |
| "epoch": 7.614896136045635, | |
| "grad_norm": 0.795792818069458, | |
| "learning_rate": 0.00014343497467945264, | |
| "loss": 3.2269, | |
| "step": 70750 | |
| }, | |
| { | |
| "epoch": 7.620277688085244, | |
| "grad_norm": 0.803733766078949, | |
| "learning_rate": 0.00014311173364939123, | |
| "loss": 3.2285, | |
| "step": 70800 | |
| }, | |
| { | |
| "epoch": 7.625659240124852, | |
| "grad_norm": 0.7954763770103455, | |
| "learning_rate": 0.0001427884926193298, | |
| "loss": 3.2346, | |
| "step": 70850 | |
| }, | |
| { | |
| "epoch": 7.63104079216446, | |
| "grad_norm": 0.7582803964614868, | |
| "learning_rate": 0.0001424652515892684, | |
| "loss": 3.2259, | |
| "step": 70900 | |
| }, | |
| { | |
| "epoch": 7.636422344204068, | |
| "grad_norm": 0.8451318740844727, | |
| "learning_rate": 0.000142142010559207, | |
| "loss": 3.2392, | |
| "step": 70950 | |
| }, | |
| { | |
| "epoch": 7.641803896243677, | |
| "grad_norm": 0.7700170278549194, | |
| "learning_rate": 0.00014181876952914555, | |
| "loss": 3.2571, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 7.641803896243677, | |
| "eval_accuracy": 0.38935566773121144, | |
| "eval_loss": 3.3431193828582764, | |
| "eval_runtime": 184.6174, | |
| "eval_samples_per_second": 97.559, | |
| "eval_steps_per_second": 6.099, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 7.647185448283285, | |
| "grad_norm": 0.7822245955467224, | |
| "learning_rate": 0.00014149552849908415, | |
| "loss": 3.237, | |
| "step": 71050 | |
| }, | |
| { | |
| "epoch": 7.652567000322893, | |
| "grad_norm": 0.8479968905448914, | |
| "learning_rate": 0.00014117228746902272, | |
| "loss": 3.2428, | |
| "step": 71100 | |
| }, | |
| { | |
| "epoch": 7.657948552362502, | |
| "grad_norm": 0.818569004535675, | |
| "learning_rate": 0.0001408490464389613, | |
| "loss": 3.2402, | |
| "step": 71150 | |
| }, | |
| { | |
| "epoch": 7.663330104402109, | |
| "grad_norm": 0.8019593954086304, | |
| "learning_rate": 0.00014052580540889988, | |
| "loss": 3.2463, | |
| "step": 71200 | |
| }, | |
| { | |
| "epoch": 7.668711656441718, | |
| "grad_norm": 0.7790815234184265, | |
| "learning_rate": 0.00014020256437883847, | |
| "loss": 3.2355, | |
| "step": 71250 | |
| }, | |
| { | |
| "epoch": 7.674093208481326, | |
| "grad_norm": 0.7565615177154541, | |
| "learning_rate": 0.00013987932334877707, | |
| "loss": 3.2197, | |
| "step": 71300 | |
| }, | |
| { | |
| "epoch": 7.679474760520934, | |
| "grad_norm": 0.782861590385437, | |
| "learning_rate": 0.00013955608231871564, | |
| "loss": 3.2444, | |
| "step": 71350 | |
| }, | |
| { | |
| "epoch": 7.684856312560543, | |
| "grad_norm": 0.8096931576728821, | |
| "learning_rate": 0.00013923284128865423, | |
| "loss": 3.2483, | |
| "step": 71400 | |
| }, | |
| { | |
| "epoch": 7.69023786460015, | |
| "grad_norm": 0.7934433817863464, | |
| "learning_rate": 0.00013890960025859283, | |
| "loss": 3.2454, | |
| "step": 71450 | |
| }, | |
| { | |
| "epoch": 7.695619416639759, | |
| "grad_norm": 0.7780919671058655, | |
| "learning_rate": 0.0001385863592285314, | |
| "loss": 3.236, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 7.7010009686793675, | |
| "grad_norm": 0.7714535593986511, | |
| "learning_rate": 0.00013826311819847, | |
| "loss": 3.2354, | |
| "step": 71550 | |
| }, | |
| { | |
| "epoch": 7.706382520718975, | |
| "grad_norm": 0.7559123039245605, | |
| "learning_rate": 0.00013793987716840858, | |
| "loss": 3.2452, | |
| "step": 71600 | |
| }, | |
| { | |
| "epoch": 7.711764072758584, | |
| "grad_norm": 0.7606051564216614, | |
| "learning_rate": 0.00013761663613834715, | |
| "loss": 3.2283, | |
| "step": 71650 | |
| }, | |
| { | |
| "epoch": 7.717145624798192, | |
| "grad_norm": 0.7872258424758911, | |
| "learning_rate": 0.00013729339510828572, | |
| "loss": 3.2319, | |
| "step": 71700 | |
| }, | |
| { | |
| "epoch": 7.7225271768378, | |
| "grad_norm": 0.7712429761886597, | |
| "learning_rate": 0.0001369701540782243, | |
| "loss": 3.2328, | |
| "step": 71750 | |
| }, | |
| { | |
| "epoch": 7.727908728877408, | |
| "grad_norm": 0.8140832185745239, | |
| "learning_rate": 0.0001366469130481629, | |
| "loss": 3.2216, | |
| "step": 71800 | |
| }, | |
| { | |
| "epoch": 7.733290280917016, | |
| "grad_norm": 0.8168127536773682, | |
| "learning_rate": 0.00013632367201810147, | |
| "loss": 3.2295, | |
| "step": 71850 | |
| }, | |
| { | |
| "epoch": 7.738671832956625, | |
| "grad_norm": 0.8033936619758606, | |
| "learning_rate": 0.0001360068958086413, | |
| "loss": 3.2374, | |
| "step": 71900 | |
| }, | |
| { | |
| "epoch": 7.744053384996233, | |
| "grad_norm": 0.9045777320861816, | |
| "learning_rate": 0.00013568365477857988, | |
| "loss": 3.2282, | |
| "step": 71950 | |
| }, | |
| { | |
| "epoch": 7.749434937035841, | |
| "grad_norm": 0.7644913792610168, | |
| "learning_rate": 0.00013536041374851847, | |
| "loss": 3.2354, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 7.749434937035841, | |
| "eval_accuracy": 0.38948757233701825, | |
| "eval_loss": 3.3385231494903564, | |
| "eval_runtime": 185.7025, | |
| "eval_samples_per_second": 96.988, | |
| "eval_steps_per_second": 6.063, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 7.754816489075449, | |
| "grad_norm": 0.7797622680664062, | |
| "learning_rate": 0.00013503717271845706, | |
| "loss": 3.2282, | |
| "step": 72050 | |
| }, | |
| { | |
| "epoch": 7.760198041115058, | |
| "grad_norm": 0.7901070713996887, | |
| "learning_rate": 0.00013471393168839563, | |
| "loss": 3.2469, | |
| "step": 72100 | |
| }, | |
| { | |
| "epoch": 7.765579593154666, | |
| "grad_norm": 0.8164810538291931, | |
| "learning_rate": 0.00013439069065833423, | |
| "loss": 3.2479, | |
| "step": 72150 | |
| }, | |
| { | |
| "epoch": 7.770961145194274, | |
| "grad_norm": 0.8166574835777283, | |
| "learning_rate": 0.0001340674496282728, | |
| "loss": 3.2366, | |
| "step": 72200 | |
| }, | |
| { | |
| "epoch": 7.776342697233883, | |
| "grad_norm": 0.7946748733520508, | |
| "learning_rate": 0.0001337442085982114, | |
| "loss": 3.2279, | |
| "step": 72250 | |
| }, | |
| { | |
| "epoch": 7.78172424927349, | |
| "grad_norm": 0.8163143396377563, | |
| "learning_rate": 0.00013342096756814996, | |
| "loss": 3.2278, | |
| "step": 72300 | |
| }, | |
| { | |
| "epoch": 7.787105801313099, | |
| "grad_norm": 0.7848666906356812, | |
| "learning_rate": 0.00013309772653808855, | |
| "loss": 3.2493, | |
| "step": 72350 | |
| }, | |
| { | |
| "epoch": 7.792487353352707, | |
| "grad_norm": 0.7951918840408325, | |
| "learning_rate": 0.00013277448550802715, | |
| "loss": 3.2208, | |
| "step": 72400 | |
| }, | |
| { | |
| "epoch": 7.797868905392315, | |
| "grad_norm": 0.7784159779548645, | |
| "learning_rate": 0.0001324512444779657, | |
| "loss": 3.2259, | |
| "step": 72450 | |
| }, | |
| { | |
| "epoch": 7.803250457431924, | |
| "grad_norm": 0.8353852033615112, | |
| "learning_rate": 0.0001321280034479043, | |
| "loss": 3.2519, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 7.808632009471531, | |
| "grad_norm": 0.7999012470245361, | |
| "learning_rate": 0.0001318047624178429, | |
| "loss": 3.2405, | |
| "step": 72550 | |
| }, | |
| { | |
| "epoch": 7.81401356151114, | |
| "grad_norm": 0.8065581321716309, | |
| "learning_rate": 0.0001314815213877815, | |
| "loss": 3.2377, | |
| "step": 72600 | |
| }, | |
| { | |
| "epoch": 7.819395113550748, | |
| "grad_norm": 0.7664511799812317, | |
| "learning_rate": 0.00013115828035772007, | |
| "loss": 3.2396, | |
| "step": 72650 | |
| }, | |
| { | |
| "epoch": 7.824776665590356, | |
| "grad_norm": 0.8212724924087524, | |
| "learning_rate": 0.00013083503932765866, | |
| "loss": 3.2414, | |
| "step": 72700 | |
| }, | |
| { | |
| "epoch": 7.830158217629965, | |
| "grad_norm": 0.8192585110664368, | |
| "learning_rate": 0.00013051179829759723, | |
| "loss": 3.2474, | |
| "step": 72750 | |
| }, | |
| { | |
| "epoch": 7.835539769669572, | |
| "grad_norm": 0.7615439295768738, | |
| "learning_rate": 0.00013018855726753582, | |
| "loss": 3.2471, | |
| "step": 72800 | |
| }, | |
| { | |
| "epoch": 7.840921321709181, | |
| "grad_norm": 0.8273650407791138, | |
| "learning_rate": 0.0001298653162374744, | |
| "loss": 3.2322, | |
| "step": 72850 | |
| }, | |
| { | |
| "epoch": 7.846302873748789, | |
| "grad_norm": 0.8271594047546387, | |
| "learning_rate": 0.00012954207520741298, | |
| "loss": 3.2416, | |
| "step": 72900 | |
| }, | |
| { | |
| "epoch": 7.851684425788397, | |
| "grad_norm": 0.7770468592643738, | |
| "learning_rate": 0.00012921883417735155, | |
| "loss": 3.2437, | |
| "step": 72950 | |
| }, | |
| { | |
| "epoch": 7.857065977828006, | |
| "grad_norm": 0.8396730422973633, | |
| "learning_rate": 0.00012889559314729015, | |
| "loss": 3.2343, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 7.857065977828006, | |
| "eval_accuracy": 0.3899009965751523, | |
| "eval_loss": 3.3356707096099854, | |
| "eval_runtime": 185.9451, | |
| "eval_samples_per_second": 96.862, | |
| "eval_steps_per_second": 6.056, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 7.862447529867614, | |
| "grad_norm": 0.8383583426475525, | |
| "learning_rate": 0.00012857235211722874, | |
| "loss": 3.2385, | |
| "step": 73050 | |
| }, | |
| { | |
| "epoch": 7.867829081907222, | |
| "grad_norm": 0.8135746121406555, | |
| "learning_rate": 0.00012824911108716734, | |
| "loss": 3.2319, | |
| "step": 73100 | |
| }, | |
| { | |
| "epoch": 7.87321063394683, | |
| "grad_norm": 0.7756822109222412, | |
| "learning_rate": 0.0001279258700571059, | |
| "loss": 3.2414, | |
| "step": 73150 | |
| }, | |
| { | |
| "epoch": 7.878592185986438, | |
| "grad_norm": 0.7989129424095154, | |
| "learning_rate": 0.0001276026290270445, | |
| "loss": 3.2258, | |
| "step": 73200 | |
| }, | |
| { | |
| "epoch": 7.883973738026047, | |
| "grad_norm": 0.7954966425895691, | |
| "learning_rate": 0.00012727938799698307, | |
| "loss": 3.2598, | |
| "step": 73250 | |
| }, | |
| { | |
| "epoch": 7.889355290065655, | |
| "grad_norm": 0.8074511885643005, | |
| "learning_rate": 0.00012696261178752287, | |
| "loss": 3.2205, | |
| "step": 73300 | |
| }, | |
| { | |
| "epoch": 7.894736842105263, | |
| "grad_norm": 0.8159090876579285, | |
| "learning_rate": 0.00012663937075746147, | |
| "loss": 3.2612, | |
| "step": 73350 | |
| }, | |
| { | |
| "epoch": 7.900118394144871, | |
| "grad_norm": 0.8149001598358154, | |
| "learning_rate": 0.00012631612972740006, | |
| "loss": 3.2642, | |
| "step": 73400 | |
| }, | |
| { | |
| "epoch": 7.90549994618448, | |
| "grad_norm": 0.7865789532661438, | |
| "learning_rate": 0.00012599288869733863, | |
| "loss": 3.2372, | |
| "step": 73450 | |
| }, | |
| { | |
| "epoch": 7.910881498224088, | |
| "grad_norm": 0.805672824382782, | |
| "learning_rate": 0.00012566964766727722, | |
| "loss": 3.232, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 7.916263050263696, | |
| "grad_norm": 0.7899633049964905, | |
| "learning_rate": 0.00012534640663721582, | |
| "loss": 3.2377, | |
| "step": 73550 | |
| }, | |
| { | |
| "epoch": 7.921644602303305, | |
| "grad_norm": 0.795754611492157, | |
| "learning_rate": 0.00012502316560715439, | |
| "loss": 3.2179, | |
| "step": 73600 | |
| }, | |
| { | |
| "epoch": 7.927026154342912, | |
| "grad_norm": 0.7938458323478699, | |
| "learning_rate": 0.00012469992457709298, | |
| "loss": 3.2383, | |
| "step": 73650 | |
| }, | |
| { | |
| "epoch": 7.932407706382521, | |
| "grad_norm": 0.7996332049369812, | |
| "learning_rate": 0.00012437668354703158, | |
| "loss": 3.2522, | |
| "step": 73700 | |
| }, | |
| { | |
| "epoch": 7.937789258422129, | |
| "grad_norm": 0.7663394808769226, | |
| "learning_rate": 0.00012405344251697014, | |
| "loss": 3.2147, | |
| "step": 73750 | |
| }, | |
| { | |
| "epoch": 7.943170810461737, | |
| "grad_norm": 0.7848943471908569, | |
| "learning_rate": 0.00012373020148690874, | |
| "loss": 3.2357, | |
| "step": 73800 | |
| }, | |
| { | |
| "epoch": 7.948552362501346, | |
| "grad_norm": 0.7804843187332153, | |
| "learning_rate": 0.0001234069604568473, | |
| "loss": 3.2358, | |
| "step": 73850 | |
| }, | |
| { | |
| "epoch": 7.953933914540953, | |
| "grad_norm": 0.81059730052948, | |
| "learning_rate": 0.0001230837194267859, | |
| "loss": 3.2335, | |
| "step": 73900 | |
| }, | |
| { | |
| "epoch": 7.959315466580562, | |
| "grad_norm": 0.8302901387214661, | |
| "learning_rate": 0.00012276047839672447, | |
| "loss": 3.2246, | |
| "step": 73950 | |
| }, | |
| { | |
| "epoch": 7.96469701862017, | |
| "grad_norm": 0.793449878692627, | |
| "learning_rate": 0.00012243723736666306, | |
| "loss": 3.2224, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 7.96469701862017, | |
| "eval_accuracy": 0.3902945373348164, | |
| "eval_loss": 3.3304708003997803, | |
| "eval_runtime": 185.3875, | |
| "eval_samples_per_second": 97.153, | |
| "eval_steps_per_second": 6.074, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 7.970078570659778, | |
| "grad_norm": 0.8431436419487, | |
| "learning_rate": 0.00012211399633660166, | |
| "loss": 3.2265, | |
| "step": 74050 | |
| }, | |
| { | |
| "epoch": 7.975460122699387, | |
| "grad_norm": 0.8346486687660217, | |
| "learning_rate": 0.00012179075530654022, | |
| "loss": 3.2429, | |
| "step": 74100 | |
| }, | |
| { | |
| "epoch": 7.980841674738995, | |
| "grad_norm": 0.8443086743354797, | |
| "learning_rate": 0.00012146751427647882, | |
| "loss": 3.2296, | |
| "step": 74150 | |
| }, | |
| { | |
| "epoch": 7.986223226778603, | |
| "grad_norm": 0.8074975609779358, | |
| "learning_rate": 0.0001211442732464174, | |
| "loss": 3.2325, | |
| "step": 74200 | |
| }, | |
| { | |
| "epoch": 7.991604778818211, | |
| "grad_norm": 0.7959332466125488, | |
| "learning_rate": 0.00012082103221635598, | |
| "loss": 3.2252, | |
| "step": 74250 | |
| }, | |
| { | |
| "epoch": 7.996986330857819, | |
| "grad_norm": 0.7904613018035889, | |
| "learning_rate": 0.00012049779118629456, | |
| "loss": 3.2217, | |
| "step": 74300 | |
| }, | |
| { | |
| "epoch": 8.002367882897428, | |
| "grad_norm": 0.830864429473877, | |
| "learning_rate": 0.00012017455015623316, | |
| "loss": 3.2092, | |
| "step": 74350 | |
| }, | |
| { | |
| "epoch": 8.007749434937036, | |
| "grad_norm": 0.8585742712020874, | |
| "learning_rate": 0.00011985130912617175, | |
| "loss": 3.151, | |
| "step": 74400 | |
| }, | |
| { | |
| "epoch": 8.013130986976645, | |
| "grad_norm": 0.7843531370162964, | |
| "learning_rate": 0.00011952806809611032, | |
| "loss": 3.1576, | |
| "step": 74450 | |
| }, | |
| { | |
| "epoch": 8.018512539016251, | |
| "grad_norm": 0.8142103552818298, | |
| "learning_rate": 0.00011920482706604891, | |
| "loss": 3.1468, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 8.02389409105586, | |
| "grad_norm": 0.7832826972007751, | |
| "learning_rate": 0.0001188815860359875, | |
| "loss": 3.1542, | |
| "step": 74550 | |
| }, | |
| { | |
| "epoch": 8.029275643095469, | |
| "grad_norm": 0.825363278388977, | |
| "learning_rate": 0.00011855834500592608, | |
| "loss": 3.1713, | |
| "step": 74600 | |
| }, | |
| { | |
| "epoch": 8.034657195135077, | |
| "grad_norm": 0.8069754242897034, | |
| "learning_rate": 0.00011823510397586466, | |
| "loss": 3.1709, | |
| "step": 74650 | |
| }, | |
| { | |
| "epoch": 8.040038747174686, | |
| "grad_norm": 0.8154868483543396, | |
| "learning_rate": 0.00011791186294580325, | |
| "loss": 3.1677, | |
| "step": 74700 | |
| }, | |
| { | |
| "epoch": 8.045420299214294, | |
| "grad_norm": 0.8420593738555908, | |
| "learning_rate": 0.00011758862191574182, | |
| "loss": 3.1661, | |
| "step": 74750 | |
| }, | |
| { | |
| "epoch": 8.050801851253901, | |
| "grad_norm": 0.7894124388694763, | |
| "learning_rate": 0.00011726538088568041, | |
| "loss": 3.1623, | |
| "step": 74800 | |
| }, | |
| { | |
| "epoch": 8.05618340329351, | |
| "grad_norm": 0.7732349038124084, | |
| "learning_rate": 0.000116942139855619, | |
| "loss": 3.1625, | |
| "step": 74850 | |
| }, | |
| { | |
| "epoch": 8.061564955333118, | |
| "grad_norm": 0.7911288738250732, | |
| "learning_rate": 0.00011661889882555759, | |
| "loss": 3.1792, | |
| "step": 74900 | |
| }, | |
| { | |
| "epoch": 8.066946507372727, | |
| "grad_norm": 0.7990174293518066, | |
| "learning_rate": 0.00011629565779549616, | |
| "loss": 3.1663, | |
| "step": 74950 | |
| }, | |
| { | |
| "epoch": 8.072328059412335, | |
| "grad_norm": 0.8296521902084351, | |
| "learning_rate": 0.00011597241676543475, | |
| "loss": 3.1624, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 8.072328059412335, | |
| "eval_accuracy": 0.390195771859793, | |
| "eval_loss": 3.338721513748169, | |
| "eval_runtime": 185.4374, | |
| "eval_samples_per_second": 97.127, | |
| "eval_steps_per_second": 6.072, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 8.077709611451942, | |
| "grad_norm": 0.8073539733886719, | |
| "learning_rate": 0.00011564917573537335, | |
| "loss": 3.1724, | |
| "step": 75050 | |
| }, | |
| { | |
| "epoch": 8.08309116349155, | |
| "grad_norm": 0.8138803839683533, | |
| "learning_rate": 0.00011532593470531191, | |
| "loss": 3.1767, | |
| "step": 75100 | |
| }, | |
| { | |
| "epoch": 8.088472715531159, | |
| "grad_norm": 0.8345963358879089, | |
| "learning_rate": 0.0001150026936752505, | |
| "loss": 3.1701, | |
| "step": 75150 | |
| }, | |
| { | |
| "epoch": 8.093854267570768, | |
| "grad_norm": 0.8031169176101685, | |
| "learning_rate": 0.00011467945264518909, | |
| "loss": 3.1608, | |
| "step": 75200 | |
| }, | |
| { | |
| "epoch": 8.099235819610376, | |
| "grad_norm": 0.8811976909637451, | |
| "learning_rate": 0.00011435621161512766, | |
| "loss": 3.1759, | |
| "step": 75250 | |
| }, | |
| { | |
| "epoch": 8.104617371649983, | |
| "grad_norm": 0.8665459156036377, | |
| "learning_rate": 0.00011403297058506625, | |
| "loss": 3.1649, | |
| "step": 75300 | |
| }, | |
| { | |
| "epoch": 8.109998923689592, | |
| "grad_norm": 0.8429995775222778, | |
| "learning_rate": 0.00011370972955500485, | |
| "loss": 3.1501, | |
| "step": 75350 | |
| }, | |
| { | |
| "epoch": 8.1153804757292, | |
| "grad_norm": 0.7925933599472046, | |
| "learning_rate": 0.00011338648852494343, | |
| "loss": 3.1473, | |
| "step": 75400 | |
| }, | |
| { | |
| "epoch": 8.120762027768809, | |
| "grad_norm": 0.806531548500061, | |
| "learning_rate": 0.00011306324749488201, | |
| "loss": 3.1832, | |
| "step": 75450 | |
| }, | |
| { | |
| "epoch": 8.126143579808417, | |
| "grad_norm": 0.8357629179954529, | |
| "learning_rate": 0.00011274000646482059, | |
| "loss": 3.1748, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 8.131525131848026, | |
| "grad_norm": 0.8737144470214844, | |
| "learning_rate": 0.00011241676543475918, | |
| "loss": 3.1739, | |
| "step": 75550 | |
| }, | |
| { | |
| "epoch": 8.136906683887632, | |
| "grad_norm": 0.8199858665466309, | |
| "learning_rate": 0.00011209352440469775, | |
| "loss": 3.1759, | |
| "step": 75600 | |
| }, | |
| { | |
| "epoch": 8.142288235927241, | |
| "grad_norm": 0.7963452339172363, | |
| "learning_rate": 0.00011177028337463635, | |
| "loss": 3.1703, | |
| "step": 75650 | |
| }, | |
| { | |
| "epoch": 8.14766978796685, | |
| "grad_norm": 0.8108802437782288, | |
| "learning_rate": 0.00011144704234457493, | |
| "loss": 3.1827, | |
| "step": 75700 | |
| }, | |
| { | |
| "epoch": 8.153051340006458, | |
| "grad_norm": 0.8074478507041931, | |
| "learning_rate": 0.00011112380131451351, | |
| "loss": 3.1549, | |
| "step": 75750 | |
| }, | |
| { | |
| "epoch": 8.158432892046067, | |
| "grad_norm": 0.8079409599304199, | |
| "learning_rate": 0.00011080056028445209, | |
| "loss": 3.1839, | |
| "step": 75800 | |
| }, | |
| { | |
| "epoch": 8.163814444085673, | |
| "grad_norm": 0.8951064944267273, | |
| "learning_rate": 0.00011047731925439068, | |
| "loss": 3.1682, | |
| "step": 75850 | |
| }, | |
| { | |
| "epoch": 8.169195996125282, | |
| "grad_norm": 0.8033889532089233, | |
| "learning_rate": 0.00011015407822432928, | |
| "loss": 3.1732, | |
| "step": 75900 | |
| }, | |
| { | |
| "epoch": 8.17457754816489, | |
| "grad_norm": 0.8481862545013428, | |
| "learning_rate": 0.00010983083719426785, | |
| "loss": 3.1711, | |
| "step": 75950 | |
| }, | |
| { | |
| "epoch": 8.1799591002045, | |
| "grad_norm": 0.8201789855957031, | |
| "learning_rate": 0.00010950759616420644, | |
| "loss": 3.1815, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 8.1799591002045, | |
| "eval_accuracy": 0.39044654272486884, | |
| "eval_loss": 3.3347246646881104, | |
| "eval_runtime": 185.7806, | |
| "eval_samples_per_second": 96.948, | |
| "eval_steps_per_second": 6.061, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 8.185340652244108, | |
| "grad_norm": 0.7705520391464233, | |
| "learning_rate": 0.00010918435513414502, | |
| "loss": 3.1902, | |
| "step": 76050 | |
| }, | |
| { | |
| "epoch": 8.190722204283716, | |
| "grad_norm": 0.8561078310012817, | |
| "learning_rate": 0.00010886111410408359, | |
| "loss": 3.188, | |
| "step": 76100 | |
| }, | |
| { | |
| "epoch": 8.196103756323323, | |
| "grad_norm": 0.8287132382392883, | |
| "learning_rate": 0.00010853787307402218, | |
| "loss": 3.176, | |
| "step": 76150 | |
| }, | |
| { | |
| "epoch": 8.201485308362932, | |
| "grad_norm": 0.8220933675765991, | |
| "learning_rate": 0.00010821463204396078, | |
| "loss": 3.1587, | |
| "step": 76200 | |
| }, | |
| { | |
| "epoch": 8.20686686040254, | |
| "grad_norm": 0.8482598066329956, | |
| "learning_rate": 0.00010789139101389935, | |
| "loss": 3.1814, | |
| "step": 76250 | |
| }, | |
| { | |
| "epoch": 8.212248412442149, | |
| "grad_norm": 0.7827437520027161, | |
| "learning_rate": 0.00010757461480443917, | |
| "loss": 3.184, | |
| "step": 76300 | |
| }, | |
| { | |
| "epoch": 8.217629964481757, | |
| "grad_norm": 0.8484346270561218, | |
| "learning_rate": 0.00010725137377437776, | |
| "loss": 3.176, | |
| "step": 76350 | |
| }, | |
| { | |
| "epoch": 8.223011516521364, | |
| "grad_norm": 0.8320658206939697, | |
| "learning_rate": 0.00010692813274431633, | |
| "loss": 3.1593, | |
| "step": 76400 | |
| }, | |
| { | |
| "epoch": 8.228393068560973, | |
| "grad_norm": 0.8569603562355042, | |
| "learning_rate": 0.00010660489171425492, | |
| "loss": 3.1722, | |
| "step": 76450 | |
| }, | |
| { | |
| "epoch": 8.233774620600581, | |
| "grad_norm": 0.8456915020942688, | |
| "learning_rate": 0.0001062816506841935, | |
| "loss": 3.1646, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 8.23915617264019, | |
| "grad_norm": 0.8972082138061523, | |
| "learning_rate": 0.00010595840965413209, | |
| "loss": 3.1839, | |
| "step": 76550 | |
| }, | |
| { | |
| "epoch": 8.244537724679798, | |
| "grad_norm": 0.7962077260017395, | |
| "learning_rate": 0.00010563516862407067, | |
| "loss": 3.1742, | |
| "step": 76600 | |
| }, | |
| { | |
| "epoch": 8.249919276719407, | |
| "grad_norm": 0.8209040760993958, | |
| "learning_rate": 0.00010531192759400926, | |
| "loss": 3.1883, | |
| "step": 76650 | |
| }, | |
| { | |
| "epoch": 8.255300828759013, | |
| "grad_norm": 0.8176620602607727, | |
| "learning_rate": 0.00010498868656394784, | |
| "loss": 3.1758, | |
| "step": 76700 | |
| }, | |
| { | |
| "epoch": 8.260682380798622, | |
| "grad_norm": 0.8269286751747131, | |
| "learning_rate": 0.00010466544553388642, | |
| "loss": 3.1552, | |
| "step": 76750 | |
| }, | |
| { | |
| "epoch": 8.26606393283823, | |
| "grad_norm": 0.8178252577781677, | |
| "learning_rate": 0.000104342204503825, | |
| "loss": 3.1911, | |
| "step": 76800 | |
| }, | |
| { | |
| "epoch": 8.27144548487784, | |
| "grad_norm": 0.8506616950035095, | |
| "learning_rate": 0.0001040189634737636, | |
| "loss": 3.1894, | |
| "step": 76850 | |
| }, | |
| { | |
| "epoch": 8.276827036917448, | |
| "grad_norm": 0.8493346571922302, | |
| "learning_rate": 0.00010369572244370217, | |
| "loss": 3.179, | |
| "step": 76900 | |
| }, | |
| { | |
| "epoch": 8.282208588957054, | |
| "grad_norm": 0.826920211315155, | |
| "learning_rate": 0.00010337248141364076, | |
| "loss": 3.176, | |
| "step": 76950 | |
| }, | |
| { | |
| "epoch": 8.287590140996663, | |
| "grad_norm": 0.7999362349510193, | |
| "learning_rate": 0.00010304924038357936, | |
| "loss": 3.1758, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 8.287590140996663, | |
| "eval_accuracy": 0.39078890797437227, | |
| "eval_loss": 3.3327534198760986, | |
| "eval_runtime": 185.3503, | |
| "eval_samples_per_second": 97.173, | |
| "eval_steps_per_second": 6.075, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 8.292971693036272, | |
| "grad_norm": 0.9111582040786743, | |
| "learning_rate": 0.00010272599935351792, | |
| "loss": 3.178, | |
| "step": 77050 | |
| }, | |
| { | |
| "epoch": 8.29835324507588, | |
| "grad_norm": 0.8611065745353699, | |
| "learning_rate": 0.00010240275832345652, | |
| "loss": 3.1736, | |
| "step": 77100 | |
| }, | |
| { | |
| "epoch": 8.303734797115489, | |
| "grad_norm": 0.7752180099487305, | |
| "learning_rate": 0.0001020795172933951, | |
| "loss": 3.1778, | |
| "step": 77150 | |
| }, | |
| { | |
| "epoch": 8.309116349155097, | |
| "grad_norm": 0.7995375394821167, | |
| "learning_rate": 0.0001017562762633337, | |
| "loss": 3.2014, | |
| "step": 77200 | |
| }, | |
| { | |
| "epoch": 8.314497901194704, | |
| "grad_norm": 0.8144875168800354, | |
| "learning_rate": 0.00010143303523327226, | |
| "loss": 3.1808, | |
| "step": 77250 | |
| }, | |
| { | |
| "epoch": 8.319879453234313, | |
| "grad_norm": 0.8051859140396118, | |
| "learning_rate": 0.00010110979420321086, | |
| "loss": 3.1826, | |
| "step": 77300 | |
| }, | |
| { | |
| "epoch": 8.325261005273921, | |
| "grad_norm": 0.7994803786277771, | |
| "learning_rate": 0.00010078655317314944, | |
| "loss": 3.1801, | |
| "step": 77350 | |
| }, | |
| { | |
| "epoch": 8.33064255731353, | |
| "grad_norm": 0.8555433750152588, | |
| "learning_rate": 0.00010046331214308802, | |
| "loss": 3.166, | |
| "step": 77400 | |
| }, | |
| { | |
| "epoch": 8.336024109353138, | |
| "grad_norm": 0.8833298087120056, | |
| "learning_rate": 0.0001001400711130266, | |
| "loss": 3.2049, | |
| "step": 77450 | |
| }, | |
| { | |
| "epoch": 8.341405661392745, | |
| "grad_norm": 0.8662601113319397, | |
| "learning_rate": 9.98168300829652e-05, | |
| "loss": 3.1979, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 8.346787213432354, | |
| "grad_norm": 0.8390443325042725, | |
| "learning_rate": 9.949358905290376e-05, | |
| "loss": 3.1907, | |
| "step": 77550 | |
| }, | |
| { | |
| "epoch": 8.352168765471962, | |
| "grad_norm": 0.8560362458229065, | |
| "learning_rate": 9.917034802284236e-05, | |
| "loss": 3.1891, | |
| "step": 77600 | |
| }, | |
| { | |
| "epoch": 8.35755031751157, | |
| "grad_norm": 0.8235760927200317, | |
| "learning_rate": 9.884710699278094e-05, | |
| "loss": 3.1708, | |
| "step": 77650 | |
| }, | |
| { | |
| "epoch": 8.36293186955118, | |
| "grad_norm": 0.8120622634887695, | |
| "learning_rate": 9.852386596271953e-05, | |
| "loss": 3.1958, | |
| "step": 77700 | |
| }, | |
| { | |
| "epoch": 8.368313421590786, | |
| "grad_norm": 0.7914016246795654, | |
| "learning_rate": 9.82006249326581e-05, | |
| "loss": 3.1867, | |
| "step": 77750 | |
| }, | |
| { | |
| "epoch": 8.373694973630395, | |
| "grad_norm": 0.86472088098526, | |
| "learning_rate": 9.78773839025967e-05, | |
| "loss": 3.1636, | |
| "step": 77800 | |
| }, | |
| { | |
| "epoch": 8.379076525670003, | |
| "grad_norm": 0.8104182481765747, | |
| "learning_rate": 9.755414287253529e-05, | |
| "loss": 3.1801, | |
| "step": 77850 | |
| }, | |
| { | |
| "epoch": 8.384458077709612, | |
| "grad_norm": 0.8004899621009827, | |
| "learning_rate": 9.723090184247386e-05, | |
| "loss": 3.2015, | |
| "step": 77900 | |
| }, | |
| { | |
| "epoch": 8.38983962974922, | |
| "grad_norm": 0.8116545081138611, | |
| "learning_rate": 9.690766081241245e-05, | |
| "loss": 3.1839, | |
| "step": 77950 | |
| }, | |
| { | |
| "epoch": 8.395221181788829, | |
| "grad_norm": 0.8094545006752014, | |
| "learning_rate": 9.658441978235103e-05, | |
| "loss": 3.1992, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 8.395221181788829, | |
| "eval_accuracy": 0.3910330510132914, | |
| "eval_loss": 3.3307039737701416, | |
| "eval_runtime": 184.6192, | |
| "eval_samples_per_second": 97.558, | |
| "eval_steps_per_second": 6.099, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 8.400602733828435, | |
| "grad_norm": 0.8134550452232361, | |
| "learning_rate": 9.626117875228961e-05, | |
| "loss": 3.1925, | |
| "step": 78050 | |
| }, | |
| { | |
| "epoch": 8.405984285868044, | |
| "grad_norm": 0.8060505390167236, | |
| "learning_rate": 9.59379377222282e-05, | |
| "loss": 3.1681, | |
| "step": 78100 | |
| }, | |
| { | |
| "epoch": 8.411365837907653, | |
| "grad_norm": 0.8342357873916626, | |
| "learning_rate": 9.561469669216679e-05, | |
| "loss": 3.1893, | |
| "step": 78150 | |
| }, | |
| { | |
| "epoch": 8.416747389947261, | |
| "grad_norm": 0.803632378578186, | |
| "learning_rate": 9.529145566210537e-05, | |
| "loss": 3.1694, | |
| "step": 78200 | |
| }, | |
| { | |
| "epoch": 8.42212894198687, | |
| "grad_norm": 0.8132495284080505, | |
| "learning_rate": 9.496821463204395e-05, | |
| "loss": 3.181, | |
| "step": 78250 | |
| }, | |
| { | |
| "epoch": 8.427510494026476, | |
| "grad_norm": 0.7954928874969482, | |
| "learning_rate": 9.464497360198253e-05, | |
| "loss": 3.1813, | |
| "step": 78300 | |
| }, | |
| { | |
| "epoch": 8.432892046066085, | |
| "grad_norm": 0.8573002815246582, | |
| "learning_rate": 9.432173257192113e-05, | |
| "loss": 3.1867, | |
| "step": 78350 | |
| }, | |
| { | |
| "epoch": 8.438273598105694, | |
| "grad_norm": 0.8724850416183472, | |
| "learning_rate": 9.39984915418597e-05, | |
| "loss": 3.1723, | |
| "step": 78400 | |
| }, | |
| { | |
| "epoch": 8.443655150145302, | |
| "grad_norm": 0.825176477432251, | |
| "learning_rate": 9.367525051179829e-05, | |
| "loss": 3.1915, | |
| "step": 78450 | |
| }, | |
| { | |
| "epoch": 8.44903670218491, | |
| "grad_norm": 0.8631423115730286, | |
| "learning_rate": 9.335200948173688e-05, | |
| "loss": 3.189, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 8.45441825422452, | |
| "grad_norm": 0.8290612697601318, | |
| "learning_rate": 9.302876845167545e-05, | |
| "loss": 3.1749, | |
| "step": 78550 | |
| }, | |
| { | |
| "epoch": 8.459799806264126, | |
| "grad_norm": 0.8389089703559875, | |
| "learning_rate": 9.270552742161403e-05, | |
| "loss": 3.1706, | |
| "step": 78600 | |
| }, | |
| { | |
| "epoch": 8.465181358303735, | |
| "grad_norm": 0.8537444472312927, | |
| "learning_rate": 9.238228639155263e-05, | |
| "loss": 3.1814, | |
| "step": 78650 | |
| }, | |
| { | |
| "epoch": 8.470562910343343, | |
| "grad_norm": 0.8286445140838623, | |
| "learning_rate": 9.205904536149122e-05, | |
| "loss": 3.1737, | |
| "step": 78700 | |
| }, | |
| { | |
| "epoch": 8.475944462382952, | |
| "grad_norm": 0.8667829632759094, | |
| "learning_rate": 9.173580433142979e-05, | |
| "loss": 3.1634, | |
| "step": 78750 | |
| }, | |
| { | |
| "epoch": 8.48132601442256, | |
| "grad_norm": 0.8203412294387817, | |
| "learning_rate": 9.141256330136838e-05, | |
| "loss": 3.1728, | |
| "step": 78800 | |
| }, | |
| { | |
| "epoch": 8.486707566462167, | |
| "grad_norm": 0.8149620294570923, | |
| "learning_rate": 9.108932227130697e-05, | |
| "loss": 3.1789, | |
| "step": 78850 | |
| }, | |
| { | |
| "epoch": 8.492089118501776, | |
| "grad_norm": 0.8609354496002197, | |
| "learning_rate": 9.076608124124555e-05, | |
| "loss": 3.1664, | |
| "step": 78900 | |
| }, | |
| { | |
| "epoch": 8.497470670541384, | |
| "grad_norm": 0.8484876751899719, | |
| "learning_rate": 9.044284021118413e-05, | |
| "loss": 3.1736, | |
| "step": 78950 | |
| }, | |
| { | |
| "epoch": 8.502852222580993, | |
| "grad_norm": 0.817091166973114, | |
| "learning_rate": 9.011959918112272e-05, | |
| "loss": 3.183, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 8.502852222580993, | |
| "eval_accuracy": 0.3913540116438954, | |
| "eval_loss": 3.326340913772583, | |
| "eval_runtime": 183.8032, | |
| "eval_samples_per_second": 97.991, | |
| "eval_steps_per_second": 6.126, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 8.508233774620601, | |
| "grad_norm": 0.846644401550293, | |
| "learning_rate": 8.979635815106129e-05, | |
| "loss": 3.1844, | |
| "step": 79050 | |
| }, | |
| { | |
| "epoch": 8.513615326660208, | |
| "grad_norm": 0.8385283350944519, | |
| "learning_rate": 8.947311712099989e-05, | |
| "loss": 3.196, | |
| "step": 79100 | |
| }, | |
| { | |
| "epoch": 8.518996878699816, | |
| "grad_norm": 0.8298741579055786, | |
| "learning_rate": 8.914987609093847e-05, | |
| "loss": 3.1814, | |
| "step": 79150 | |
| }, | |
| { | |
| "epoch": 8.524378430739425, | |
| "grad_norm": 0.8278105854988098, | |
| "learning_rate": 8.882663506087706e-05, | |
| "loss": 3.1983, | |
| "step": 79200 | |
| }, | |
| { | |
| "epoch": 8.529759982779034, | |
| "grad_norm": 0.8638647198677063, | |
| "learning_rate": 8.850339403081563e-05, | |
| "loss": 3.1806, | |
| "step": 79250 | |
| }, | |
| { | |
| "epoch": 8.535141534818642, | |
| "grad_norm": 0.8197671175003052, | |
| "learning_rate": 8.818015300075422e-05, | |
| "loss": 3.1808, | |
| "step": 79300 | |
| }, | |
| { | |
| "epoch": 8.54052308685825, | |
| "grad_norm": 0.8648247122764587, | |
| "learning_rate": 8.785691197069282e-05, | |
| "loss": 3.2081, | |
| "step": 79350 | |
| }, | |
| { | |
| "epoch": 8.545904638897857, | |
| "grad_norm": 0.8894992470741272, | |
| "learning_rate": 8.753367094063139e-05, | |
| "loss": 3.1711, | |
| "step": 79400 | |
| }, | |
| { | |
| "epoch": 8.551286190937466, | |
| "grad_norm": 0.8495647311210632, | |
| "learning_rate": 8.721042991056998e-05, | |
| "loss": 3.1824, | |
| "step": 79450 | |
| }, | |
| { | |
| "epoch": 8.556667742977075, | |
| "grad_norm": 0.8385526537895203, | |
| "learning_rate": 8.688718888050856e-05, | |
| "loss": 3.1831, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 8.562049295016683, | |
| "grad_norm": 0.8313244581222534, | |
| "learning_rate": 8.656394785044713e-05, | |
| "loss": 3.1923, | |
| "step": 79550 | |
| }, | |
| { | |
| "epoch": 8.567430847056292, | |
| "grad_norm": 0.7992787957191467, | |
| "learning_rate": 8.624070682038572e-05, | |
| "loss": 3.1883, | |
| "step": 79600 | |
| }, | |
| { | |
| "epoch": 8.572812399095898, | |
| "grad_norm": 0.8515611290931702, | |
| "learning_rate": 8.591746579032432e-05, | |
| "loss": 3.1855, | |
| "step": 79650 | |
| }, | |
| { | |
| "epoch": 8.578193951135507, | |
| "grad_norm": 0.8336965441703796, | |
| "learning_rate": 8.55942247602629e-05, | |
| "loss": 3.1975, | |
| "step": 79700 | |
| }, | |
| { | |
| "epoch": 8.583575503175116, | |
| "grad_norm": 0.8350300192832947, | |
| "learning_rate": 8.527098373020148e-05, | |
| "loss": 3.1877, | |
| "step": 79750 | |
| }, | |
| { | |
| "epoch": 8.588957055214724, | |
| "grad_norm": 0.8040255308151245, | |
| "learning_rate": 8.494774270014006e-05, | |
| "loss": 3.1902, | |
| "step": 79800 | |
| }, | |
| { | |
| "epoch": 8.594338607254333, | |
| "grad_norm": 0.8291066884994507, | |
| "learning_rate": 8.462450167007866e-05, | |
| "loss": 3.1798, | |
| "step": 79850 | |
| }, | |
| { | |
| "epoch": 8.599720159293941, | |
| "grad_norm": 0.8915792107582092, | |
| "learning_rate": 8.430126064001722e-05, | |
| "loss": 3.1781, | |
| "step": 79900 | |
| }, | |
| { | |
| "epoch": 8.605101711333548, | |
| "grad_norm": 0.8477835655212402, | |
| "learning_rate": 8.397801960995582e-05, | |
| "loss": 3.1845, | |
| "step": 79950 | |
| }, | |
| { | |
| "epoch": 8.610483263373157, | |
| "grad_norm": 0.8423581123352051, | |
| "learning_rate": 8.36547785798944e-05, | |
| "loss": 3.1571, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 8.610483263373157, | |
| "eval_accuracy": 0.39168312124108545, | |
| "eval_loss": 3.323802947998047, | |
| "eval_runtime": 184.6741, | |
| "eval_samples_per_second": 97.529, | |
| "eval_steps_per_second": 6.097, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 8.615864815412765, | |
| "grad_norm": 0.8273554444313049, | |
| "learning_rate": 8.333153754983298e-05, | |
| "loss": 3.1984, | |
| "step": 80050 | |
| }, | |
| { | |
| "epoch": 8.621246367452374, | |
| "grad_norm": 0.8392621874809265, | |
| "learning_rate": 8.300829651977156e-05, | |
| "loss": 3.1611, | |
| "step": 80100 | |
| }, | |
| { | |
| "epoch": 8.626627919491982, | |
| "grad_norm": 0.8796780109405518, | |
| "learning_rate": 8.268505548971016e-05, | |
| "loss": 3.1941, | |
| "step": 80150 | |
| }, | |
| { | |
| "epoch": 8.632009471531589, | |
| "grad_norm": 0.8048021793365479, | |
| "learning_rate": 8.236181445964875e-05, | |
| "loss": 3.1688, | |
| "step": 80200 | |
| }, | |
| { | |
| "epoch": 8.637391023571197, | |
| "grad_norm": 0.8406497836112976, | |
| "learning_rate": 8.203857342958732e-05, | |
| "loss": 3.179, | |
| "step": 80250 | |
| }, | |
| { | |
| "epoch": 8.642772575610806, | |
| "grad_norm": 0.8190396428108215, | |
| "learning_rate": 8.171533239952591e-05, | |
| "loss": 3.1728, | |
| "step": 80300 | |
| }, | |
| { | |
| "epoch": 8.648154127650415, | |
| "grad_norm": 0.8504931330680847, | |
| "learning_rate": 8.13985561900657e-05, | |
| "loss": 3.1711, | |
| "step": 80350 | |
| }, | |
| { | |
| "epoch": 8.653535679690023, | |
| "grad_norm": 0.8742973208427429, | |
| "learning_rate": 8.10753151600043e-05, | |
| "loss": 3.1971, | |
| "step": 80400 | |
| }, | |
| { | |
| "epoch": 8.658917231729632, | |
| "grad_norm": 0.838006317615509, | |
| "learning_rate": 8.07520741299429e-05, | |
| "loss": 3.1982, | |
| "step": 80450 | |
| }, | |
| { | |
| "epoch": 8.664298783769238, | |
| "grad_norm": 0.8249325752258301, | |
| "learning_rate": 8.042883309988148e-05, | |
| "loss": 3.1787, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 8.669680335808847, | |
| "grad_norm": 0.8424504995346069, | |
| "learning_rate": 8.010559206982006e-05, | |
| "loss": 3.1855, | |
| "step": 80550 | |
| }, | |
| { | |
| "epoch": 8.675061887848456, | |
| "grad_norm": 0.8270896077156067, | |
| "learning_rate": 7.978235103975864e-05, | |
| "loss": 3.1776, | |
| "step": 80600 | |
| }, | |
| { | |
| "epoch": 8.680443439888064, | |
| "grad_norm": 0.8327749967575073, | |
| "learning_rate": 7.945911000969723e-05, | |
| "loss": 3.1889, | |
| "step": 80650 | |
| }, | |
| { | |
| "epoch": 8.685824991927673, | |
| "grad_norm": 0.8465210199356079, | |
| "learning_rate": 7.91358689796358e-05, | |
| "loss": 3.2079, | |
| "step": 80700 | |
| }, | |
| { | |
| "epoch": 8.69120654396728, | |
| "grad_norm": 0.8314090371131897, | |
| "learning_rate": 7.88126279495744e-05, | |
| "loss": 3.1709, | |
| "step": 80750 | |
| }, | |
| { | |
| "epoch": 8.696588096006888, | |
| "grad_norm": 0.8430169224739075, | |
| "learning_rate": 7.849585174011422e-05, | |
| "loss": 3.1854, | |
| "step": 80800 | |
| }, | |
| { | |
| "epoch": 8.701969648046497, | |
| "grad_norm": 0.8381490111351013, | |
| "learning_rate": 7.817261071005278e-05, | |
| "loss": 3.1823, | |
| "step": 80850 | |
| }, | |
| { | |
| "epoch": 8.707351200086105, | |
| "grad_norm": 0.8118202090263367, | |
| "learning_rate": 7.784936967999138e-05, | |
| "loss": 3.1944, | |
| "step": 80900 | |
| }, | |
| { | |
| "epoch": 8.712732752125714, | |
| "grad_norm": 0.8550557494163513, | |
| "learning_rate": 7.752612864992996e-05, | |
| "loss": 3.1696, | |
| "step": 80950 | |
| }, | |
| { | |
| "epoch": 8.718114304165322, | |
| "grad_norm": 0.8136093020439148, | |
| "learning_rate": 7.720288761986854e-05, | |
| "loss": 3.1844, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 8.718114304165322, | |
| "eval_accuracy": 0.39210121755339555, | |
| "eval_loss": 3.3207666873931885, | |
| "eval_runtime": 185.196, | |
| "eval_samples_per_second": 97.254, | |
| "eval_steps_per_second": 6.08, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 8.723495856204929, | |
| "grad_norm": 0.8364823460578918, | |
| "learning_rate": 7.687964658980712e-05, | |
| "loss": 3.1805, | |
| "step": 81050 | |
| }, | |
| { | |
| "epoch": 8.728877408244538, | |
| "grad_norm": 0.7969305515289307, | |
| "learning_rate": 7.655640555974572e-05, | |
| "loss": 3.1823, | |
| "step": 81100 | |
| }, | |
| { | |
| "epoch": 8.734258960284146, | |
| "grad_norm": 0.8257160782814026, | |
| "learning_rate": 7.623316452968428e-05, | |
| "loss": 3.2009, | |
| "step": 81150 | |
| }, | |
| { | |
| "epoch": 8.739640512323755, | |
| "grad_norm": 0.8170008659362793, | |
| "learning_rate": 7.590992349962288e-05, | |
| "loss": 3.1918, | |
| "step": 81200 | |
| }, | |
| { | |
| "epoch": 8.745022064363363, | |
| "grad_norm": 0.834387481212616, | |
| "learning_rate": 7.558668246956147e-05, | |
| "loss": 3.1915, | |
| "step": 81250 | |
| }, | |
| { | |
| "epoch": 8.75040361640297, | |
| "grad_norm": 0.8909284472465515, | |
| "learning_rate": 7.526344143950005e-05, | |
| "loss": 3.192, | |
| "step": 81300 | |
| }, | |
| { | |
| "epoch": 8.755785168442578, | |
| "grad_norm": 0.8296206593513489, | |
| "learning_rate": 7.494020040943862e-05, | |
| "loss": 3.1884, | |
| "step": 81350 | |
| }, | |
| { | |
| "epoch": 8.761166720482187, | |
| "grad_norm": 0.852840781211853, | |
| "learning_rate": 7.461695937937722e-05, | |
| "loss": 3.1816, | |
| "step": 81400 | |
| }, | |
| { | |
| "epoch": 8.766548272521796, | |
| "grad_norm": 0.8436883687973022, | |
| "learning_rate": 7.42937183493158e-05, | |
| "loss": 3.1841, | |
| "step": 81450 | |
| }, | |
| { | |
| "epoch": 8.771929824561404, | |
| "grad_norm": 0.8767805099487305, | |
| "learning_rate": 7.397047731925439e-05, | |
| "loss": 3.1764, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 8.777311376601011, | |
| "grad_norm": 0.8951683640480042, | |
| "learning_rate": 7.364723628919297e-05, | |
| "loss": 3.2043, | |
| "step": 81550 | |
| }, | |
| { | |
| "epoch": 8.78269292864062, | |
| "grad_norm": 0.8517978191375732, | |
| "learning_rate": 7.332399525913155e-05, | |
| "loss": 3.1797, | |
| "step": 81600 | |
| }, | |
| { | |
| "epoch": 8.788074480680228, | |
| "grad_norm": 0.9093903303146362, | |
| "learning_rate": 7.300075422907013e-05, | |
| "loss": 3.1845, | |
| "step": 81650 | |
| }, | |
| { | |
| "epoch": 8.793456032719837, | |
| "grad_norm": 0.8133341670036316, | |
| "learning_rate": 7.267751319900872e-05, | |
| "loss": 3.1801, | |
| "step": 81700 | |
| }, | |
| { | |
| "epoch": 8.798837584759445, | |
| "grad_norm": 0.8153607845306396, | |
| "learning_rate": 7.235427216894731e-05, | |
| "loss": 3.2009, | |
| "step": 81750 | |
| }, | |
| { | |
| "epoch": 8.804219136799054, | |
| "grad_norm": 0.8574018478393555, | |
| "learning_rate": 7.203103113888589e-05, | |
| "loss": 3.1938, | |
| "step": 81800 | |
| }, | |
| { | |
| "epoch": 8.80960068883866, | |
| "grad_norm": 0.8556495904922485, | |
| "learning_rate": 7.170779010882447e-05, | |
| "loss": 3.1888, | |
| "step": 81850 | |
| }, | |
| { | |
| "epoch": 8.814982240878269, | |
| "grad_norm": 0.8293871879577637, | |
| "learning_rate": 7.138454907876305e-05, | |
| "loss": 3.189, | |
| "step": 81900 | |
| }, | |
| { | |
| "epoch": 8.820363792917878, | |
| "grad_norm": 0.8736574649810791, | |
| "learning_rate": 7.106130804870164e-05, | |
| "loss": 3.1674, | |
| "step": 81950 | |
| }, | |
| { | |
| "epoch": 8.825745344957486, | |
| "grad_norm": 0.7945114970207214, | |
| "learning_rate": 7.073806701864023e-05, | |
| "loss": 3.1748, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 8.825745344957486, | |
| "eval_accuracy": 0.3925913507303267, | |
| "eval_loss": 3.317441940307617, | |
| "eval_runtime": 185.4585, | |
| "eval_samples_per_second": 97.116, | |
| "eval_steps_per_second": 6.071, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 8.831126896997095, | |
| "grad_norm": 0.8252271413803101, | |
| "learning_rate": 7.041482598857881e-05, | |
| "loss": 3.1792, | |
| "step": 82050 | |
| }, | |
| { | |
| "epoch": 8.836508449036701, | |
| "grad_norm": 0.8844432830810547, | |
| "learning_rate": 7.009158495851739e-05, | |
| "loss": 3.1762, | |
| "step": 82100 | |
| }, | |
| { | |
| "epoch": 8.84189000107631, | |
| "grad_norm": 0.8228882551193237, | |
| "learning_rate": 6.976834392845599e-05, | |
| "loss": 3.1902, | |
| "step": 82150 | |
| }, | |
| { | |
| "epoch": 8.847271553115919, | |
| "grad_norm": 0.8460994958877563, | |
| "learning_rate": 6.944510289839457e-05, | |
| "loss": 3.201, | |
| "step": 82200 | |
| }, | |
| { | |
| "epoch": 8.852653105155527, | |
| "grad_norm": 0.9102237820625305, | |
| "learning_rate": 6.912186186833315e-05, | |
| "loss": 3.1763, | |
| "step": 82250 | |
| }, | |
| { | |
| "epoch": 8.858034657195136, | |
| "grad_norm": 0.8262957334518433, | |
| "learning_rate": 6.879862083827173e-05, | |
| "loss": 3.1675, | |
| "step": 82300 | |
| }, | |
| { | |
| "epoch": 8.863416209234742, | |
| "grad_norm": 0.8179271817207336, | |
| "learning_rate": 6.847537980821031e-05, | |
| "loss": 3.1844, | |
| "step": 82350 | |
| }, | |
| { | |
| "epoch": 8.868797761274351, | |
| "grad_norm": 0.8240422010421753, | |
| "learning_rate": 6.81521387781489e-05, | |
| "loss": 3.1944, | |
| "step": 82400 | |
| }, | |
| { | |
| "epoch": 8.87417931331396, | |
| "grad_norm": 0.8756139278411865, | |
| "learning_rate": 6.782889774808749e-05, | |
| "loss": 3.1893, | |
| "step": 82450 | |
| }, | |
| { | |
| "epoch": 8.879560865353568, | |
| "grad_norm": 0.8071765303611755, | |
| "learning_rate": 6.750565671802607e-05, | |
| "loss": 3.1865, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 8.884942417393177, | |
| "grad_norm": 0.8261629343032837, | |
| "learning_rate": 6.718241568796465e-05, | |
| "loss": 3.1821, | |
| "step": 82550 | |
| }, | |
| { | |
| "epoch": 8.890323969432785, | |
| "grad_norm": 0.8211767673492432, | |
| "learning_rate": 6.685917465790323e-05, | |
| "loss": 3.1737, | |
| "step": 82600 | |
| }, | |
| { | |
| "epoch": 8.895705521472392, | |
| "grad_norm": 0.8834940791130066, | |
| "learning_rate": 6.653593362784182e-05, | |
| "loss": 3.1813, | |
| "step": 82650 | |
| }, | |
| { | |
| "epoch": 8.901087073512, | |
| "grad_norm": 0.8213329911231995, | |
| "learning_rate": 6.62126925977804e-05, | |
| "loss": 3.1915, | |
| "step": 82700 | |
| }, | |
| { | |
| "epoch": 8.906468625551609, | |
| "grad_norm": 0.8448511958122253, | |
| "learning_rate": 6.588945156771899e-05, | |
| "loss": 3.1989, | |
| "step": 82750 | |
| }, | |
| { | |
| "epoch": 8.911850177591218, | |
| "grad_norm": 0.8716330528259277, | |
| "learning_rate": 6.556621053765757e-05, | |
| "loss": 3.1908, | |
| "step": 82800 | |
| }, | |
| { | |
| "epoch": 8.917231729630826, | |
| "grad_norm": 0.8602597117424011, | |
| "learning_rate": 6.524296950759615e-05, | |
| "loss": 3.1827, | |
| "step": 82850 | |
| }, | |
| { | |
| "epoch": 8.922613281670433, | |
| "grad_norm": 0.9103492498397827, | |
| "learning_rate": 6.491972847753474e-05, | |
| "loss": 3.1925, | |
| "step": 82900 | |
| }, | |
| { | |
| "epoch": 8.927994833710041, | |
| "grad_norm": 0.8320696949958801, | |
| "learning_rate": 6.459648744747333e-05, | |
| "loss": 3.19, | |
| "step": 82950 | |
| }, | |
| { | |
| "epoch": 8.93337638574965, | |
| "grad_norm": 0.8699012994766235, | |
| "learning_rate": 6.427324641741192e-05, | |
| "loss": 3.1609, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 8.93337638574965, | |
| "eval_accuracy": 0.39288330103988417, | |
| "eval_loss": 3.313811779022217, | |
| "eval_runtime": 185.5161, | |
| "eval_samples_per_second": 97.086, | |
| "eval_steps_per_second": 6.07, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 8.938757937789259, | |
| "grad_norm": 0.8381906151771545, | |
| "learning_rate": 6.39500053873505e-05, | |
| "loss": 3.1844, | |
| "step": 83050 | |
| }, | |
| { | |
| "epoch": 8.944139489828867, | |
| "grad_norm": 0.8325570821762085, | |
| "learning_rate": 6.362676435728908e-05, | |
| "loss": 3.1902, | |
| "step": 83100 | |
| }, | |
| { | |
| "epoch": 8.949521041868476, | |
| "grad_norm": 0.8284711241722107, | |
| "learning_rate": 6.330352332722766e-05, | |
| "loss": 3.1778, | |
| "step": 83150 | |
| }, | |
| { | |
| "epoch": 8.954902593908082, | |
| "grad_norm": 0.837222158908844, | |
| "learning_rate": 6.298028229716624e-05, | |
| "loss": 3.1819, | |
| "step": 83200 | |
| }, | |
| { | |
| "epoch": 8.960284145947691, | |
| "grad_norm": 0.8105958104133606, | |
| "learning_rate": 6.265704126710484e-05, | |
| "loss": 3.199, | |
| "step": 83250 | |
| }, | |
| { | |
| "epoch": 8.9656656979873, | |
| "grad_norm": 0.8044517040252686, | |
| "learning_rate": 6.233380023704342e-05, | |
| "loss": 3.1894, | |
| "step": 83300 | |
| }, | |
| { | |
| "epoch": 8.971047250026908, | |
| "grad_norm": 0.8663701415061951, | |
| "learning_rate": 6.2010559206982e-05, | |
| "loss": 3.1987, | |
| "step": 83350 | |
| }, | |
| { | |
| "epoch": 8.976428802066517, | |
| "grad_norm": 0.8896883726119995, | |
| "learning_rate": 6.168731817692058e-05, | |
| "loss": 3.191, | |
| "step": 83400 | |
| }, | |
| { | |
| "epoch": 8.981810354106123, | |
| "grad_norm": 0.8449499607086182, | |
| "learning_rate": 6.136407714685916e-05, | |
| "loss": 3.1882, | |
| "step": 83450 | |
| }, | |
| { | |
| "epoch": 8.987191906145732, | |
| "grad_norm": 0.858260452747345, | |
| "learning_rate": 6.104083611679776e-05, | |
| "loss": 3.1969, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 8.99257345818534, | |
| "grad_norm": 0.8580666780471802, | |
| "learning_rate": 6.071759508673634e-05, | |
| "loss": 3.1819, | |
| "step": 83550 | |
| }, | |
| { | |
| "epoch": 8.997955010224949, | |
| "grad_norm": 0.8120842576026917, | |
| "learning_rate": 6.039435405667492e-05, | |
| "loss": 3.1814, | |
| "step": 83600 | |
| }, | |
| { | |
| "epoch": 9.003336562264558, | |
| "grad_norm": 0.8290035128593445, | |
| "learning_rate": 6.007111302661351e-05, | |
| "loss": 3.1373, | |
| "step": 83650 | |
| }, | |
| { | |
| "epoch": 9.008718114304166, | |
| "grad_norm": 0.8526427149772644, | |
| "learning_rate": 5.974787199655209e-05, | |
| "loss": 3.1279, | |
| "step": 83700 | |
| }, | |
| { | |
| "epoch": 9.014099666343773, | |
| "grad_norm": 0.812345027923584, | |
| "learning_rate": 5.942463096649068e-05, | |
| "loss": 3.1242, | |
| "step": 83750 | |
| }, | |
| { | |
| "epoch": 9.019481218383381, | |
| "grad_norm": 0.8443864583969116, | |
| "learning_rate": 5.910138993642926e-05, | |
| "loss": 3.1184, | |
| "step": 83800 | |
| }, | |
| { | |
| "epoch": 9.02486277042299, | |
| "grad_norm": 0.8466078639030457, | |
| "learning_rate": 5.877814890636784e-05, | |
| "loss": 3.1087, | |
| "step": 83850 | |
| }, | |
| { | |
| "epoch": 9.030244322462599, | |
| "grad_norm": 0.8269876837730408, | |
| "learning_rate": 5.845490787630643e-05, | |
| "loss": 3.1322, | |
| "step": 83900 | |
| }, | |
| { | |
| "epoch": 9.035625874502207, | |
| "grad_norm": 0.9255964756011963, | |
| "learning_rate": 5.813166684624501e-05, | |
| "loss": 3.1263, | |
| "step": 83950 | |
| }, | |
| { | |
| "epoch": 9.041007426541814, | |
| "grad_norm": 0.8110038042068481, | |
| "learning_rate": 5.7808425816183596e-05, | |
| "loss": 3.1272, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 9.041007426541814, | |
| "eval_accuracy": 0.3927278187574217, | |
| "eval_loss": 3.315974473953247, | |
| "eval_runtime": 185.4621, | |
| "eval_samples_per_second": 97.114, | |
| "eval_steps_per_second": 6.071, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 9.046388978581422, | |
| "grad_norm": 0.8578800559043884, | |
| "learning_rate": 5.748518478612218e-05, | |
| "loss": 3.1213, | |
| "step": 84050 | |
| }, | |
| { | |
| "epoch": 9.051770530621031, | |
| "grad_norm": 0.8739824891090393, | |
| "learning_rate": 5.716194375606076e-05, | |
| "loss": 3.123, | |
| "step": 84100 | |
| }, | |
| { | |
| "epoch": 9.05715208266064, | |
| "grad_norm": 0.8663125038146973, | |
| "learning_rate": 5.6838702725999346e-05, | |
| "loss": 3.121, | |
| "step": 84150 | |
| }, | |
| { | |
| "epoch": 9.062533634700248, | |
| "grad_norm": 0.8483325242996216, | |
| "learning_rate": 5.651546169593793e-05, | |
| "loss": 3.1228, | |
| "step": 84200 | |
| }, | |
| { | |
| "epoch": 9.067915186739857, | |
| "grad_norm": 0.8528785109519958, | |
| "learning_rate": 5.619222066587652e-05, | |
| "loss": 3.121, | |
| "step": 84250 | |
| }, | |
| { | |
| "epoch": 9.073296738779463, | |
| "grad_norm": 0.8312401175498962, | |
| "learning_rate": 5.58689796358151e-05, | |
| "loss": 3.128, | |
| "step": 84300 | |
| }, | |
| { | |
| "epoch": 9.078678290819072, | |
| "grad_norm": 0.9178869128227234, | |
| "learning_rate": 5.554573860575369e-05, | |
| "loss": 3.1186, | |
| "step": 84350 | |
| }, | |
| { | |
| "epoch": 9.08405984285868, | |
| "grad_norm": 0.8239533305168152, | |
| "learning_rate": 5.522249757569227e-05, | |
| "loss": 3.122, | |
| "step": 84400 | |
| }, | |
| { | |
| "epoch": 9.089441394898289, | |
| "grad_norm": 0.8259393572807312, | |
| "learning_rate": 5.489925654563085e-05, | |
| "loss": 3.1234, | |
| "step": 84450 | |
| }, | |
| { | |
| "epoch": 9.094822946937898, | |
| "grad_norm": 0.8551408052444458, | |
| "learning_rate": 5.457601551556944e-05, | |
| "loss": 3.1202, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 9.100204498977504, | |
| "grad_norm": 0.8616343140602112, | |
| "learning_rate": 5.425277448550802e-05, | |
| "loss": 3.1344, | |
| "step": 84550 | |
| }, | |
| { | |
| "epoch": 9.105586051017113, | |
| "grad_norm": 0.8499940037727356, | |
| "learning_rate": 5.392953345544661e-05, | |
| "loss": 3.127, | |
| "step": 84600 | |
| }, | |
| { | |
| "epoch": 9.110967603056721, | |
| "grad_norm": 0.8221794366836548, | |
| "learning_rate": 5.360629242538519e-05, | |
| "loss": 3.1158, | |
| "step": 84650 | |
| }, | |
| { | |
| "epoch": 9.11634915509633, | |
| "grad_norm": 0.8354493379592896, | |
| "learning_rate": 5.328305139532377e-05, | |
| "loss": 3.1206, | |
| "step": 84700 | |
| }, | |
| { | |
| "epoch": 9.121730707135939, | |
| "grad_norm": 0.8396961688995361, | |
| "learning_rate": 5.295981036526236e-05, | |
| "loss": 3.1188, | |
| "step": 84750 | |
| }, | |
| { | |
| "epoch": 9.127112259175545, | |
| "grad_norm": 0.8268956542015076, | |
| "learning_rate": 5.264303415580217e-05, | |
| "loss": 3.1341, | |
| "step": 84800 | |
| }, | |
| { | |
| "epoch": 9.132493811215154, | |
| "grad_norm": 0.8730154037475586, | |
| "learning_rate": 5.2319793125740754e-05, | |
| "loss": 3.1424, | |
| "step": 84850 | |
| }, | |
| { | |
| "epoch": 9.137875363254762, | |
| "grad_norm": 0.8668919205665588, | |
| "learning_rate": 5.1996552095679336e-05, | |
| "loss": 3.1165, | |
| "step": 84900 | |
| }, | |
| { | |
| "epoch": 9.143256915294371, | |
| "grad_norm": 0.8446504473686218, | |
| "learning_rate": 5.1673311065617923e-05, | |
| "loss": 3.1291, | |
| "step": 84950 | |
| }, | |
| { | |
| "epoch": 9.14863846733398, | |
| "grad_norm": 0.8692771792411804, | |
| "learning_rate": 5.1350070035556505e-05, | |
| "loss": 3.1285, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 9.14863846733398, | |
| "eval_accuracy": 0.39281289396858043, | |
| "eval_loss": 3.3171780109405518, | |
| "eval_runtime": 185.7486, | |
| "eval_samples_per_second": 96.964, | |
| "eval_steps_per_second": 6.062, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 9.154020019373588, | |
| "grad_norm": 0.8421473503112793, | |
| "learning_rate": 5.102682900549509e-05, | |
| "loss": 3.1497, | |
| "step": 85050 | |
| }, | |
| { | |
| "epoch": 9.159401571413195, | |
| "grad_norm": 0.833160936832428, | |
| "learning_rate": 5.0703587975433674e-05, | |
| "loss": 3.1164, | |
| "step": 85100 | |
| }, | |
| { | |
| "epoch": 9.164783123452803, | |
| "grad_norm": 0.8475486040115356, | |
| "learning_rate": 5.0380346945372255e-05, | |
| "loss": 3.132, | |
| "step": 85150 | |
| }, | |
| { | |
| "epoch": 9.170164675492412, | |
| "grad_norm": 0.8560284972190857, | |
| "learning_rate": 5.005710591531085e-05, | |
| "loss": 3.1301, | |
| "step": 85200 | |
| }, | |
| { | |
| "epoch": 9.17554622753202, | |
| "grad_norm": 0.8586344122886658, | |
| "learning_rate": 4.973386488524943e-05, | |
| "loss": 3.1301, | |
| "step": 85250 | |
| }, | |
| { | |
| "epoch": 9.180927779571629, | |
| "grad_norm": 0.8414719104766846, | |
| "learning_rate": 4.9417088675789244e-05, | |
| "loss": 3.1336, | |
| "step": 85300 | |
| }, | |
| { | |
| "epoch": 9.186309331611236, | |
| "grad_norm": 0.866367757320404, | |
| "learning_rate": 4.909384764572783e-05, | |
| "loss": 3.1325, | |
| "step": 85350 | |
| }, | |
| { | |
| "epoch": 9.191690883650844, | |
| "grad_norm": 0.8298116326332092, | |
| "learning_rate": 4.877060661566641e-05, | |
| "loss": 3.1171, | |
| "step": 85400 | |
| }, | |
| { | |
| "epoch": 9.197072435690453, | |
| "grad_norm": 0.8735843300819397, | |
| "learning_rate": 4.8447365585604994e-05, | |
| "loss": 3.1335, | |
| "step": 85450 | |
| }, | |
| { | |
| "epoch": 9.202453987730062, | |
| "grad_norm": 0.8736468553543091, | |
| "learning_rate": 4.812412455554358e-05, | |
| "loss": 3.1323, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 9.20783553976967, | |
| "grad_norm": 0.8373434543609619, | |
| "learning_rate": 4.780088352548216e-05, | |
| "loss": 3.1252, | |
| "step": 85550 | |
| }, | |
| { | |
| "epoch": 9.213217091809279, | |
| "grad_norm": 0.808627188205719, | |
| "learning_rate": 4.747764249542075e-05, | |
| "loss": 3.1197, | |
| "step": 85600 | |
| }, | |
| { | |
| "epoch": 9.218598643848885, | |
| "grad_norm": 0.9199257493019104, | |
| "learning_rate": 4.715440146535933e-05, | |
| "loss": 3.1223, | |
| "step": 85650 | |
| }, | |
| { | |
| "epoch": 9.223980195888494, | |
| "grad_norm": 0.8038100600242615, | |
| "learning_rate": 4.683116043529791e-05, | |
| "loss": 3.1341, | |
| "step": 85700 | |
| }, | |
| { | |
| "epoch": 9.229361747928102, | |
| "grad_norm": 0.8255937099456787, | |
| "learning_rate": 4.65079194052365e-05, | |
| "loss": 3.1318, | |
| "step": 85750 | |
| }, | |
| { | |
| "epoch": 9.234743299967711, | |
| "grad_norm": 0.8218817710876465, | |
| "learning_rate": 4.618467837517508e-05, | |
| "loss": 3.1116, | |
| "step": 85800 | |
| }, | |
| { | |
| "epoch": 9.24012485200732, | |
| "grad_norm": 0.8301002383232117, | |
| "learning_rate": 4.586143734511367e-05, | |
| "loss": 3.1297, | |
| "step": 85850 | |
| }, | |
| { | |
| "epoch": 9.245506404046926, | |
| "grad_norm": 0.8707810044288635, | |
| "learning_rate": 4.554466113565348e-05, | |
| "loss": 3.1228, | |
| "step": 85900 | |
| }, | |
| { | |
| "epoch": 9.250887956086535, | |
| "grad_norm": 0.840757429599762, | |
| "learning_rate": 4.5221420105592064e-05, | |
| "loss": 3.1417, | |
| "step": 85950 | |
| }, | |
| { | |
| "epoch": 9.256269508126143, | |
| "grad_norm": 0.81743323802948, | |
| "learning_rate": 4.4898179075530645e-05, | |
| "loss": 3.1459, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 9.256269508126143, | |
| "eval_accuracy": 0.39307344359355956, | |
| "eval_loss": 3.3138790130615234, | |
| "eval_runtime": 185.9872, | |
| "eval_samples_per_second": 96.84, | |
| "eval_steps_per_second": 6.054, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 9.261651060165752, | |
| "grad_norm": 0.841184675693512, | |
| "learning_rate": 4.457493804546923e-05, | |
| "loss": 3.1329, | |
| "step": 86050 | |
| }, | |
| { | |
| "epoch": 9.26703261220536, | |
| "grad_norm": 0.8968733549118042, | |
| "learning_rate": 4.4251697015407814e-05, | |
| "loss": 3.1239, | |
| "step": 86100 | |
| }, | |
| { | |
| "epoch": 9.272414164244967, | |
| "grad_norm": 0.8560697436332703, | |
| "learning_rate": 4.392845598534641e-05, | |
| "loss": 3.1327, | |
| "step": 86150 | |
| }, | |
| { | |
| "epoch": 9.277795716284576, | |
| "grad_norm": 0.8354721665382385, | |
| "learning_rate": 4.360521495528499e-05, | |
| "loss": 3.1174, | |
| "step": 86200 | |
| }, | |
| { | |
| "epoch": 9.283177268324184, | |
| "grad_norm": 0.922799289226532, | |
| "learning_rate": 4.3281973925223564e-05, | |
| "loss": 3.1365, | |
| "step": 86250 | |
| }, | |
| { | |
| "epoch": 9.288558820363793, | |
| "grad_norm": 0.873641312122345, | |
| "learning_rate": 4.295873289516216e-05, | |
| "loss": 3.1173, | |
| "step": 86300 | |
| }, | |
| { | |
| "epoch": 9.293940372403402, | |
| "grad_norm": 0.8811352252960205, | |
| "learning_rate": 4.263549186510074e-05, | |
| "loss": 3.1353, | |
| "step": 86350 | |
| }, | |
| { | |
| "epoch": 9.29932192444301, | |
| "grad_norm": 0.8473592400550842, | |
| "learning_rate": 4.231225083503933e-05, | |
| "loss": 3.1451, | |
| "step": 86400 | |
| }, | |
| { | |
| "epoch": 9.304703476482617, | |
| "grad_norm": 0.835388720035553, | |
| "learning_rate": 4.198900980497791e-05, | |
| "loss": 3.1413, | |
| "step": 86450 | |
| }, | |
| { | |
| "epoch": 9.310085028522225, | |
| "grad_norm": 0.8779233694076538, | |
| "learning_rate": 4.166576877491649e-05, | |
| "loss": 3.1286, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 9.315466580561834, | |
| "grad_norm": 0.8000664114952087, | |
| "learning_rate": 4.134252774485508e-05, | |
| "loss": 3.1521, | |
| "step": 86550 | |
| }, | |
| { | |
| "epoch": 9.320848132601443, | |
| "grad_norm": 0.8716637492179871, | |
| "learning_rate": 4.101928671479366e-05, | |
| "loss": 3.1337, | |
| "step": 86600 | |
| }, | |
| { | |
| "epoch": 9.326229684641051, | |
| "grad_norm": 0.8766072392463684, | |
| "learning_rate": 4.069604568473225e-05, | |
| "loss": 3.1502, | |
| "step": 86650 | |
| }, | |
| { | |
| "epoch": 9.331611236680658, | |
| "grad_norm": 0.8547214269638062, | |
| "learning_rate": 4.037280465467083e-05, | |
| "loss": 3.1284, | |
| "step": 86700 | |
| }, | |
| { | |
| "epoch": 9.336992788720266, | |
| "grad_norm": 0.873173713684082, | |
| "learning_rate": 4.004956362460941e-05, | |
| "loss": 3.1394, | |
| "step": 86750 | |
| }, | |
| { | |
| "epoch": 9.342374340759875, | |
| "grad_norm": 0.8491945266723633, | |
| "learning_rate": 3.9726322594548e-05, | |
| "loss": 3.1302, | |
| "step": 86800 | |
| }, | |
| { | |
| "epoch": 9.347755892799483, | |
| "grad_norm": 0.8217043876647949, | |
| "learning_rate": 3.940308156448658e-05, | |
| "loss": 3.1351, | |
| "step": 86850 | |
| }, | |
| { | |
| "epoch": 9.353137444839092, | |
| "grad_norm": 0.8568532466888428, | |
| "learning_rate": 3.9079840534425166e-05, | |
| "loss": 3.1454, | |
| "step": 86900 | |
| }, | |
| { | |
| "epoch": 9.3585189968787, | |
| "grad_norm": 0.8353368639945984, | |
| "learning_rate": 3.875659950436375e-05, | |
| "loss": 3.1309, | |
| "step": 86950 | |
| }, | |
| { | |
| "epoch": 9.363900548918307, | |
| "grad_norm": 0.8400478363037109, | |
| "learning_rate": 3.843335847430233e-05, | |
| "loss": 3.1261, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 9.363900548918307, | |
| "eval_accuracy": 0.3934188511239218, | |
| "eval_loss": 3.3116421699523926, | |
| "eval_runtime": 185.4864, | |
| "eval_samples_per_second": 97.101, | |
| "eval_steps_per_second": 6.071, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 9.369282100957916, | |
| "grad_norm": 0.8323249816894531, | |
| "learning_rate": 3.811011744424092e-05, | |
| "loss": 3.1403, | |
| "step": 87050 | |
| }, | |
| { | |
| "epoch": 9.374663652997524, | |
| "grad_norm": 0.9313833117485046, | |
| "learning_rate": 3.7786876414179504e-05, | |
| "loss": 3.1213, | |
| "step": 87100 | |
| }, | |
| { | |
| "epoch": 9.380045205037133, | |
| "grad_norm": 0.842764675617218, | |
| "learning_rate": 3.7463635384118085e-05, | |
| "loss": 3.1293, | |
| "step": 87150 | |
| }, | |
| { | |
| "epoch": 9.385426757076742, | |
| "grad_norm": 0.8564713597297668, | |
| "learning_rate": 3.714039435405667e-05, | |
| "loss": 3.1349, | |
| "step": 87200 | |
| }, | |
| { | |
| "epoch": 9.390808309116348, | |
| "grad_norm": 0.8736740946769714, | |
| "learning_rate": 3.6817153323995254e-05, | |
| "loss": 3.1443, | |
| "step": 87250 | |
| }, | |
| { | |
| "epoch": 9.396189861155957, | |
| "grad_norm": 0.8590500354766846, | |
| "learning_rate": 3.649391229393384e-05, | |
| "loss": 3.1389, | |
| "step": 87300 | |
| }, | |
| { | |
| "epoch": 9.401571413195565, | |
| "grad_norm": 0.8680191040039062, | |
| "learning_rate": 3.617067126387242e-05, | |
| "loss": 3.1419, | |
| "step": 87350 | |
| }, | |
| { | |
| "epoch": 9.406952965235174, | |
| "grad_norm": 0.8538930416107178, | |
| "learning_rate": 3.5847430233811004e-05, | |
| "loss": 3.1478, | |
| "step": 87400 | |
| }, | |
| { | |
| "epoch": 9.412334517274783, | |
| "grad_norm": 0.8698980808258057, | |
| "learning_rate": 3.552418920374959e-05, | |
| "loss": 3.1476, | |
| "step": 87450 | |
| }, | |
| { | |
| "epoch": 9.417716069314391, | |
| "grad_norm": 0.8061973452568054, | |
| "learning_rate": 3.520094817368818e-05, | |
| "loss": 3.1177, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 9.423097621353998, | |
| "grad_norm": 0.8118069171905518, | |
| "learning_rate": 3.487770714362676e-05, | |
| "loss": 3.1362, | |
| "step": 87550 | |
| }, | |
| { | |
| "epoch": 9.428479173393606, | |
| "grad_norm": 0.8155990839004517, | |
| "learning_rate": 3.455446611356535e-05, | |
| "loss": 3.1171, | |
| "step": 87600 | |
| }, | |
| { | |
| "epoch": 9.433860725433215, | |
| "grad_norm": 0.8968056440353394, | |
| "learning_rate": 3.423122508350393e-05, | |
| "loss": 3.141, | |
| "step": 87650 | |
| }, | |
| { | |
| "epoch": 9.439242277472824, | |
| "grad_norm": 0.874661922454834, | |
| "learning_rate": 3.390798405344251e-05, | |
| "loss": 3.1435, | |
| "step": 87700 | |
| }, | |
| { | |
| "epoch": 9.444623829512432, | |
| "grad_norm": 0.8804532885551453, | |
| "learning_rate": 3.35847430233811e-05, | |
| "loss": 3.1396, | |
| "step": 87750 | |
| }, | |
| { | |
| "epoch": 9.450005381552039, | |
| "grad_norm": 0.8631547689437866, | |
| "learning_rate": 3.326150199331968e-05, | |
| "loss": 3.138, | |
| "step": 87800 | |
| }, | |
| { | |
| "epoch": 9.455386933591647, | |
| "grad_norm": 0.8592112064361572, | |
| "learning_rate": 3.293826096325827e-05, | |
| "loss": 3.1345, | |
| "step": 87850 | |
| }, | |
| { | |
| "epoch": 9.460768485631256, | |
| "grad_norm": 0.8821307420730591, | |
| "learning_rate": 3.261501993319685e-05, | |
| "loss": 3.1486, | |
| "step": 87900 | |
| }, | |
| { | |
| "epoch": 9.466150037670864, | |
| "grad_norm": 0.8409145474433899, | |
| "learning_rate": 3.229177890313544e-05, | |
| "loss": 3.1303, | |
| "step": 87950 | |
| }, | |
| { | |
| "epoch": 9.471531589710473, | |
| "grad_norm": 0.8484240174293518, | |
| "learning_rate": 3.196853787307402e-05, | |
| "loss": 3.1134, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 9.471531589710473, | |
| "eval_accuracy": 0.3936306156022722, | |
| "eval_loss": 3.310249090194702, | |
| "eval_runtime": 185.3061, | |
| "eval_samples_per_second": 97.196, | |
| "eval_steps_per_second": 6.076, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 9.476913141750082, | |
| "grad_norm": 0.8421609997749329, | |
| "learning_rate": 3.1645296843012606e-05, | |
| "loss": 3.1351, | |
| "step": 88050 | |
| }, | |
| { | |
| "epoch": 9.482294693789688, | |
| "grad_norm": 0.8248376846313477, | |
| "learning_rate": 3.132205581295119e-05, | |
| "loss": 3.1349, | |
| "step": 88100 | |
| }, | |
| { | |
| "epoch": 9.487676245829297, | |
| "grad_norm": 0.8693441152572632, | |
| "learning_rate": 3.099881478288977e-05, | |
| "loss": 3.1267, | |
| "step": 88150 | |
| }, | |
| { | |
| "epoch": 9.493057797868905, | |
| "grad_norm": 0.8903070092201233, | |
| "learning_rate": 3.0675573752828356e-05, | |
| "loss": 3.1221, | |
| "step": 88200 | |
| }, | |
| { | |
| "epoch": 9.498439349908514, | |
| "grad_norm": 0.8783222436904907, | |
| "learning_rate": 3.035233272276694e-05, | |
| "loss": 3.1374, | |
| "step": 88250 | |
| }, | |
| { | |
| "epoch": 9.503820901948123, | |
| "grad_norm": 0.8198007941246033, | |
| "learning_rate": 3.0029091692705525e-05, | |
| "loss": 3.1365, | |
| "step": 88300 | |
| }, | |
| { | |
| "epoch": 9.50920245398773, | |
| "grad_norm": 0.8365300297737122, | |
| "learning_rate": 2.970585066264411e-05, | |
| "loss": 3.1285, | |
| "step": 88350 | |
| }, | |
| { | |
| "epoch": 9.514584006027338, | |
| "grad_norm": 0.8559662103652954, | |
| "learning_rate": 2.938260963258269e-05, | |
| "loss": 3.1212, | |
| "step": 88400 | |
| }, | |
| { | |
| "epoch": 9.519965558066946, | |
| "grad_norm": 0.8498660922050476, | |
| "learning_rate": 2.9059368602521275e-05, | |
| "loss": 3.1306, | |
| "step": 88450 | |
| }, | |
| { | |
| "epoch": 9.525347110106555, | |
| "grad_norm": 0.8696129322052002, | |
| "learning_rate": 2.8736127572459863e-05, | |
| "loss": 3.1373, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 9.530728662146164, | |
| "grad_norm": 0.8482581377029419, | |
| "learning_rate": 2.8412886542398448e-05, | |
| "loss": 3.1295, | |
| "step": 88550 | |
| }, | |
| { | |
| "epoch": 9.536110214185772, | |
| "grad_norm": 0.9024134278297424, | |
| "learning_rate": 2.8089645512337032e-05, | |
| "loss": 3.1596, | |
| "step": 88600 | |
| }, | |
| { | |
| "epoch": 9.541491766225379, | |
| "grad_norm": 0.9035645723342896, | |
| "learning_rate": 2.7766404482275613e-05, | |
| "loss": 3.112, | |
| "step": 88650 | |
| }, | |
| { | |
| "epoch": 9.546873318264987, | |
| "grad_norm": 0.8633137941360474, | |
| "learning_rate": 2.7443163452214198e-05, | |
| "loss": 3.1364, | |
| "step": 88700 | |
| }, | |
| { | |
| "epoch": 9.552254870304596, | |
| "grad_norm": 0.8794911503791809, | |
| "learning_rate": 2.7119922422152782e-05, | |
| "loss": 3.1351, | |
| "step": 88750 | |
| }, | |
| { | |
| "epoch": 9.557636422344205, | |
| "grad_norm": 0.8476806879043579, | |
| "learning_rate": 2.6796681392091367e-05, | |
| "loss": 3.1373, | |
| "step": 88800 | |
| }, | |
| { | |
| "epoch": 9.563017974383813, | |
| "grad_norm": 0.8718453049659729, | |
| "learning_rate": 2.6473440362029955e-05, | |
| "loss": 3.1415, | |
| "step": 88850 | |
| }, | |
| { | |
| "epoch": 9.56839952642342, | |
| "grad_norm": 0.8849031329154968, | |
| "learning_rate": 2.6150199331968536e-05, | |
| "loss": 3.1315, | |
| "step": 88900 | |
| }, | |
| { | |
| "epoch": 9.573781078463028, | |
| "grad_norm": 0.8923875689506531, | |
| "learning_rate": 2.582695830190712e-05, | |
| "loss": 3.1496, | |
| "step": 88950 | |
| }, | |
| { | |
| "epoch": 9.579162630502637, | |
| "grad_norm": 0.8623582124710083, | |
| "learning_rate": 2.5503717271845705e-05, | |
| "loss": 3.1413, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 9.579162630502637, | |
| "eval_accuracy": 0.3940205708166384, | |
| "eval_loss": 3.3068735599517822, | |
| "eval_runtime": 184.392, | |
| "eval_samples_per_second": 97.678, | |
| "eval_steps_per_second": 6.107, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 9.584544182542245, | |
| "grad_norm": 0.9111592769622803, | |
| "learning_rate": 2.518047624178429e-05, | |
| "loss": 3.1464, | |
| "step": 89050 | |
| }, | |
| { | |
| "epoch": 9.589925734581854, | |
| "grad_norm": 0.8834981918334961, | |
| "learning_rate": 2.4857235211722874e-05, | |
| "loss": 3.1262, | |
| "step": 89100 | |
| }, | |
| { | |
| "epoch": 9.59530728662146, | |
| "grad_norm": 0.8303783535957336, | |
| "learning_rate": 2.4533994181661455e-05, | |
| "loss": 3.1282, | |
| "step": 89150 | |
| }, | |
| { | |
| "epoch": 9.60068883866107, | |
| "grad_norm": 0.8445287346839905, | |
| "learning_rate": 2.421075315160004e-05, | |
| "loss": 3.1328, | |
| "step": 89200 | |
| }, | |
| { | |
| "epoch": 9.606070390700678, | |
| "grad_norm": 0.8937562704086304, | |
| "learning_rate": 2.3887512121538624e-05, | |
| "loss": 3.1477, | |
| "step": 89250 | |
| }, | |
| { | |
| "epoch": 9.611451942740286, | |
| "grad_norm": 0.8591155409812927, | |
| "learning_rate": 2.3564271091477212e-05, | |
| "loss": 3.1471, | |
| "step": 89300 | |
| }, | |
| { | |
| "epoch": 9.616833494779895, | |
| "grad_norm": 0.8643044233322144, | |
| "learning_rate": 2.3241030061415796e-05, | |
| "loss": 3.144, | |
| "step": 89350 | |
| }, | |
| { | |
| "epoch": 9.622215046819504, | |
| "grad_norm": 0.9129336476325989, | |
| "learning_rate": 2.2917789031354377e-05, | |
| "loss": 3.1312, | |
| "step": 89400 | |
| }, | |
| { | |
| "epoch": 9.62759659885911, | |
| "grad_norm": 0.8323395848274231, | |
| "learning_rate": 2.2594548001292962e-05, | |
| "loss": 3.15, | |
| "step": 89450 | |
| }, | |
| { | |
| "epoch": 9.632978150898719, | |
| "grad_norm": 0.8554784059524536, | |
| "learning_rate": 2.2271306971231546e-05, | |
| "loss": 3.1275, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 9.638359702938327, | |
| "grad_norm": 0.8535261154174805, | |
| "learning_rate": 2.194806594117013e-05, | |
| "loss": 3.1496, | |
| "step": 89550 | |
| }, | |
| { | |
| "epoch": 9.643741254977936, | |
| "grad_norm": 0.8813555240631104, | |
| "learning_rate": 2.1624824911108715e-05, | |
| "loss": 3.1192, | |
| "step": 89600 | |
| }, | |
| { | |
| "epoch": 9.649122807017545, | |
| "grad_norm": 0.9066751003265381, | |
| "learning_rate": 2.1301583881047296e-05, | |
| "loss": 3.1424, | |
| "step": 89650 | |
| }, | |
| { | |
| "epoch": 9.654504359057151, | |
| "grad_norm": 0.8248937129974365, | |
| "learning_rate": 2.097834285098588e-05, | |
| "loss": 3.133, | |
| "step": 89700 | |
| }, | |
| { | |
| "epoch": 9.65988591109676, | |
| "grad_norm": 0.8432828783988953, | |
| "learning_rate": 2.065510182092447e-05, | |
| "loss": 3.1264, | |
| "step": 89750 | |
| }, | |
| { | |
| "epoch": 9.665267463136368, | |
| "grad_norm": 0.872593104839325, | |
| "learning_rate": 2.0331860790863053e-05, | |
| "loss": 3.1371, | |
| "step": 89800 | |
| }, | |
| { | |
| "epoch": 9.670649015175977, | |
| "grad_norm": 0.8409303426742554, | |
| "learning_rate": 2.0008619760801638e-05, | |
| "loss": 3.1162, | |
| "step": 89850 | |
| }, | |
| { | |
| "epoch": 9.676030567215586, | |
| "grad_norm": 0.8890901207923889, | |
| "learning_rate": 1.968537873074022e-05, | |
| "loss": 3.1171, | |
| "step": 89900 | |
| }, | |
| { | |
| "epoch": 9.681412119255192, | |
| "grad_norm": 0.9161780476570129, | |
| "learning_rate": 1.9362137700678803e-05, | |
| "loss": 3.1319, | |
| "step": 89950 | |
| }, | |
| { | |
| "epoch": 9.6867936712948, | |
| "grad_norm": 0.873457670211792, | |
| "learning_rate": 1.9038896670617388e-05, | |
| "loss": 3.1369, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 9.6867936712948, | |
| "eval_accuracy": 0.3942919857303989, | |
| "eval_loss": 3.304586172103882, | |
| "eval_runtime": 184.3962, | |
| "eval_samples_per_second": 97.676, | |
| "eval_steps_per_second": 6.106, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 9.69217522333441, | |
| "grad_norm": 0.9777708053588867, | |
| "learning_rate": 1.8715655640555972e-05, | |
| "loss": 3.1312, | |
| "step": 90050 | |
| }, | |
| { | |
| "epoch": 9.697556775374018, | |
| "grad_norm": 0.8539653420448303, | |
| "learning_rate": 1.8392414610494557e-05, | |
| "loss": 3.1435, | |
| "step": 90100 | |
| }, | |
| { | |
| "epoch": 9.702938327413626, | |
| "grad_norm": 0.8861962556838989, | |
| "learning_rate": 1.806917358043314e-05, | |
| "loss": 3.1381, | |
| "step": 90150 | |
| }, | |
| { | |
| "epoch": 9.708319879453235, | |
| "grad_norm": 0.8735243678092957, | |
| "learning_rate": 1.7745932550371726e-05, | |
| "loss": 3.1296, | |
| "step": 90200 | |
| }, | |
| { | |
| "epoch": 9.713701431492842, | |
| "grad_norm": 0.9188650250434875, | |
| "learning_rate": 1.742269152031031e-05, | |
| "loss": 3.1088, | |
| "step": 90250 | |
| }, | |
| { | |
| "epoch": 9.71908298353245, | |
| "grad_norm": 0.8372655510902405, | |
| "learning_rate": 1.7099450490248895e-05, | |
| "loss": 3.139, | |
| "step": 90300 | |
| }, | |
| { | |
| "epoch": 9.724464535572059, | |
| "grad_norm": 0.8758909702301025, | |
| "learning_rate": 1.677620946018748e-05, | |
| "loss": 3.1331, | |
| "step": 90350 | |
| }, | |
| { | |
| "epoch": 9.729846087611667, | |
| "grad_norm": 0.8464704155921936, | |
| "learning_rate": 1.6452968430126064e-05, | |
| "loss": 3.1213, | |
| "step": 90400 | |
| }, | |
| { | |
| "epoch": 9.735227639651276, | |
| "grad_norm": 0.8519407510757446, | |
| "learning_rate": 1.6129727400064645e-05, | |
| "loss": 3.1362, | |
| "step": 90450 | |
| }, | |
| { | |
| "epoch": 9.740609191690883, | |
| "grad_norm": 0.8398826122283936, | |
| "learning_rate": 1.580648637000323e-05, | |
| "loss": 3.1388, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 9.745990743730491, | |
| "grad_norm": 0.8363977074623108, | |
| "learning_rate": 1.5483245339941817e-05, | |
| "loss": 3.127, | |
| "step": 90550 | |
| }, | |
| { | |
| "epoch": 9.7513722957701, | |
| "grad_norm": 0.8710938692092896, | |
| "learning_rate": 1.5166469130481629e-05, | |
| "loss": 3.1246, | |
| "step": 90600 | |
| }, | |
| { | |
| "epoch": 9.756753847809708, | |
| "grad_norm": 0.8815416097640991, | |
| "learning_rate": 1.4843228100420212e-05, | |
| "loss": 3.1324, | |
| "step": 90650 | |
| }, | |
| { | |
| "epoch": 9.762135399849317, | |
| "grad_norm": 0.8630658984184265, | |
| "learning_rate": 1.4519987070358796e-05, | |
| "loss": 3.132, | |
| "step": 90700 | |
| }, | |
| { | |
| "epoch": 9.767516951888926, | |
| "grad_norm": 0.8698672652244568, | |
| "learning_rate": 1.419674604029738e-05, | |
| "loss": 3.141, | |
| "step": 90750 | |
| }, | |
| { | |
| "epoch": 9.772898503928532, | |
| "grad_norm": 0.8320836424827576, | |
| "learning_rate": 1.3873505010235965e-05, | |
| "loss": 3.1428, | |
| "step": 90800 | |
| }, | |
| { | |
| "epoch": 9.77828005596814, | |
| "grad_norm": 0.8394739031791687, | |
| "learning_rate": 1.355026398017455e-05, | |
| "loss": 3.1335, | |
| "step": 90850 | |
| }, | |
| { | |
| "epoch": 9.78366160800775, | |
| "grad_norm": 0.8621455430984497, | |
| "learning_rate": 1.3227022950113132e-05, | |
| "loss": 3.1404, | |
| "step": 90900 | |
| }, | |
| { | |
| "epoch": 9.789043160047358, | |
| "grad_norm": 0.8300653696060181, | |
| "learning_rate": 1.2903781920051719e-05, | |
| "loss": 3.1376, | |
| "step": 90950 | |
| }, | |
| { | |
| "epoch": 9.794424712086967, | |
| "grad_norm": 0.8661669492721558, | |
| "learning_rate": 1.2580540889990301e-05, | |
| "loss": 3.1464, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 9.794424712086967, | |
| "eval_accuracy": 0.3945432998599138, | |
| "eval_loss": 3.3021764755249023, | |
| "eval_runtime": 184.4409, | |
| "eval_samples_per_second": 97.652, | |
| "eval_steps_per_second": 6.105, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 9.799806264126573, | |
| "grad_norm": 0.8688340187072754, | |
| "learning_rate": 1.2257299859928886e-05, | |
| "loss": 3.1412, | |
| "step": 91050 | |
| }, | |
| { | |
| "epoch": 9.805187816166182, | |
| "grad_norm": 0.8872902393341064, | |
| "learning_rate": 1.193405882986747e-05, | |
| "loss": 3.1244, | |
| "step": 91100 | |
| }, | |
| { | |
| "epoch": 9.81056936820579, | |
| "grad_norm": 0.892867922782898, | |
| "learning_rate": 1.1610817799806053e-05, | |
| "loss": 3.1254, | |
| "step": 91150 | |
| }, | |
| { | |
| "epoch": 9.815950920245399, | |
| "grad_norm": 0.8671237230300903, | |
| "learning_rate": 1.128757676974464e-05, | |
| "loss": 3.1384, | |
| "step": 91200 | |
| }, | |
| { | |
| "epoch": 9.821332472285007, | |
| "grad_norm": 0.9021868705749512, | |
| "learning_rate": 1.0964335739683222e-05, | |
| "loss": 3.1351, | |
| "step": 91250 | |
| }, | |
| { | |
| "epoch": 9.826714024324616, | |
| "grad_norm": 0.8489910960197449, | |
| "learning_rate": 1.0641094709621807e-05, | |
| "loss": 3.1339, | |
| "step": 91300 | |
| }, | |
| { | |
| "epoch": 9.832095576364223, | |
| "grad_norm": 0.8646838665008545, | |
| "learning_rate": 1.0317853679560393e-05, | |
| "loss": 3.1222, | |
| "step": 91350 | |
| }, | |
| { | |
| "epoch": 9.837477128403831, | |
| "grad_norm": 0.8420634865760803, | |
| "learning_rate": 9.994612649498976e-06, | |
| "loss": 3.1074, | |
| "step": 91400 | |
| }, | |
| { | |
| "epoch": 9.84285868044344, | |
| "grad_norm": 0.8360474109649658, | |
| "learning_rate": 9.67137161943756e-06, | |
| "loss": 3.1366, | |
| "step": 91450 | |
| }, | |
| { | |
| "epoch": 9.848240232483048, | |
| "grad_norm": 0.8966155648231506, | |
| "learning_rate": 9.348130589376145e-06, | |
| "loss": 3.1395, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 9.853621784522657, | |
| "grad_norm": 0.816577136516571, | |
| "learning_rate": 9.024889559314727e-06, | |
| "loss": 3.1317, | |
| "step": 91550 | |
| }, | |
| { | |
| "epoch": 9.859003336562264, | |
| "grad_norm": 0.8456005454063416, | |
| "learning_rate": 8.701648529253312e-06, | |
| "loss": 3.1147, | |
| "step": 91600 | |
| }, | |
| { | |
| "epoch": 9.864384888601872, | |
| "grad_norm": 0.8357172608375549, | |
| "learning_rate": 8.378407499191896e-06, | |
| "loss": 3.1533, | |
| "step": 91650 | |
| }, | |
| { | |
| "epoch": 9.869766440641481, | |
| "grad_norm": 0.8470631837844849, | |
| "learning_rate": 8.055166469130481e-06, | |
| "loss": 3.1173, | |
| "step": 91700 | |
| }, | |
| { | |
| "epoch": 9.87514799268109, | |
| "grad_norm": 0.860017716884613, | |
| "learning_rate": 7.731925439069065e-06, | |
| "loss": 3.1249, | |
| "step": 91750 | |
| }, | |
| { | |
| "epoch": 9.880529544720698, | |
| "grad_norm": 0.90912264585495, | |
| "learning_rate": 7.408684409007649e-06, | |
| "loss": 3.1311, | |
| "step": 91800 | |
| }, | |
| { | |
| "epoch": 9.885911096760307, | |
| "grad_norm": 0.8600446581840515, | |
| "learning_rate": 7.085443378946234e-06, | |
| "loss": 3.1362, | |
| "step": 91850 | |
| }, | |
| { | |
| "epoch": 9.891292648799913, | |
| "grad_norm": 0.9116879105567932, | |
| "learning_rate": 6.762202348884817e-06, | |
| "loss": 3.1527, | |
| "step": 91900 | |
| }, | |
| { | |
| "epoch": 9.896674200839522, | |
| "grad_norm": 0.8597344160079956, | |
| "learning_rate": 6.438961318823403e-06, | |
| "loss": 3.1249, | |
| "step": 91950 | |
| }, | |
| { | |
| "epoch": 9.90205575287913, | |
| "grad_norm": 0.830201268196106, | |
| "learning_rate": 6.115720288761986e-06, | |
| "loss": 3.1453, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 9.90205575287913, | |
| "eval_accuracy": 0.3947720141887633, | |
| "eval_loss": 3.30094051361084, | |
| "eval_runtime": 184.0615, | |
| "eval_samples_per_second": 97.853, | |
| "eval_steps_per_second": 6.118, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 9.907437304918739, | |
| "grad_norm": 0.8901052474975586, | |
| "learning_rate": 5.792479258700571e-06, | |
| "loss": 3.1262, | |
| "step": 92050 | |
| }, | |
| { | |
| "epoch": 9.912818856958348, | |
| "grad_norm": 0.8816144466400146, | |
| "learning_rate": 5.469238228639154e-06, | |
| "loss": 3.1218, | |
| "step": 92100 | |
| }, | |
| { | |
| "epoch": 9.918200408997954, | |
| "grad_norm": 0.8819746375083923, | |
| "learning_rate": 5.145997198577739e-06, | |
| "loss": 3.1237, | |
| "step": 92150 | |
| }, | |
| { | |
| "epoch": 9.923581961037563, | |
| "grad_norm": 0.8707346320152283, | |
| "learning_rate": 4.822756168516323e-06, | |
| "loss": 3.1305, | |
| "step": 92200 | |
| }, | |
| { | |
| "epoch": 9.928963513077171, | |
| "grad_norm": 0.9079093337059021, | |
| "learning_rate": 4.499515138454908e-06, | |
| "loss": 3.1423, | |
| "step": 92250 | |
| }, | |
| { | |
| "epoch": 9.93434506511678, | |
| "grad_norm": 0.8696450591087341, | |
| "learning_rate": 4.1762741083934915e-06, | |
| "loss": 3.1339, | |
| "step": 92300 | |
| }, | |
| { | |
| "epoch": 9.939726617156388, | |
| "grad_norm": 0.8753088116645813, | |
| "learning_rate": 3.853033078332076e-06, | |
| "loss": 3.1185, | |
| "step": 92350 | |
| }, | |
| { | |
| "epoch": 9.945108169195997, | |
| "grad_norm": 0.8948190808296204, | |
| "learning_rate": 3.52979204827066e-06, | |
| "loss": 3.1383, | |
| "step": 92400 | |
| }, | |
| { | |
| "epoch": 9.950489721235604, | |
| "grad_norm": 0.8588597178459167, | |
| "learning_rate": 3.206551018209244e-06, | |
| "loss": 3.1417, | |
| "step": 92450 | |
| }, | |
| { | |
| "epoch": 9.955871273275212, | |
| "grad_norm": 0.811172604560852, | |
| "learning_rate": 2.8833099881478286e-06, | |
| "loss": 3.1271, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 9.961252825314821, | |
| "grad_norm": 0.863527238368988, | |
| "learning_rate": 2.5600689580864127e-06, | |
| "loss": 3.1285, | |
| "step": 92550 | |
| }, | |
| { | |
| "epoch": 9.96663437735443, | |
| "grad_norm": 0.8756324648857117, | |
| "learning_rate": 2.2368279280249972e-06, | |
| "loss": 3.1397, | |
| "step": 92600 | |
| }, | |
| { | |
| "epoch": 9.972015929394038, | |
| "grad_norm": 0.8331931233406067, | |
| "learning_rate": 1.9135868979635813e-06, | |
| "loss": 3.1276, | |
| "step": 92650 | |
| }, | |
| { | |
| "epoch": 9.977397481433645, | |
| "grad_norm": 0.8202987313270569, | |
| "learning_rate": 1.5903458679021656e-06, | |
| "loss": 3.1323, | |
| "step": 92700 | |
| }, | |
| { | |
| "epoch": 9.982779033473253, | |
| "grad_norm": 0.9294122457504272, | |
| "learning_rate": 1.2671048378407496e-06, | |
| "loss": 3.1288, | |
| "step": 92750 | |
| }, | |
| { | |
| "epoch": 9.988160585512862, | |
| "grad_norm": 0.8398417234420776, | |
| "learning_rate": 9.438638077793341e-07, | |
| "loss": 3.132, | |
| "step": 92800 | |
| }, | |
| { | |
| "epoch": 9.99354213755247, | |
| "grad_norm": 0.8458587527275085, | |
| "learning_rate": 6.206227777179182e-07, | |
| "loss": 3.1209, | |
| "step": 92850 | |
| }, | |
| { | |
| "epoch": 9.998923689592079, | |
| "grad_norm": 0.8288222551345825, | |
| "learning_rate": 2.9738174765650254e-07, | |
| "loss": 3.1182, | |
| "step": 92900 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "step": 92910, | |
| "total_flos": 7.7681075945472e+17, | |
| "train_loss": 3.4561853061462933, | |
| "train_runtime": 79879.1766, | |
| "train_samples_per_second": 37.218, | |
| "train_steps_per_second": 1.163 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 92910, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 10000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 7.7681075945472e+17, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |