| { |
| "best_metric": 3.3022682666778564, |
| "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_634/checkpoint-90000", |
| "epoch": 10.0, |
| "eval_steps": 1000, |
| "global_step": 92910, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.005381552039608223, |
| "grad_norm": 2.052231550216675, |
| "learning_rate": 0.0003, |
| "loss": 8.4926, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.010763104079216447, |
| "grad_norm": 1.5619138479232788, |
| "learning_rate": 0.0006, |
| "loss": 6.9523, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.01614465611882467, |
| "grad_norm": 1.1164947748184204, |
| "learning_rate": 0.0005996767589699385, |
| "loss": 6.5308, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.021526208158432893, |
| "grad_norm": 1.8875620365142822, |
| "learning_rate": 0.0005993535179398771, |
| "loss": 6.2593, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.026907760198041114, |
| "grad_norm": 1.2109925746917725, |
| "learning_rate": 0.0005990302769098158, |
| "loss": 6.0853, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.03228931223764934, |
| "grad_norm": 1.7742042541503906, |
| "learning_rate": 0.0005987070358797543, |
| "loss": 5.9511, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.03767086427725756, |
| "grad_norm": 1.2113205194473267, |
| "learning_rate": 0.0005983837948496929, |
| "loss": 5.8652, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.04305241631686579, |
| "grad_norm": 1.3714256286621094, |
| "learning_rate": 0.0005980605538196314, |
| "loss": 5.7874, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.048433968356474004, |
| "grad_norm": 1.0156667232513428, |
| "learning_rate": 0.0005977373127895701, |
| "loss": 5.6947, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.05381552039608223, |
| "grad_norm": 1.5654748678207397, |
| "learning_rate": 0.0005974140717595086, |
| "loss": 5.6658, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.05919707243569045, |
| "grad_norm": 1.3490835428237915, |
| "learning_rate": 0.0005970908307294472, |
| "loss": 5.5831, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.06457862447529868, |
| "grad_norm": 1.2378731966018677, |
| "learning_rate": 0.0005967675896993858, |
| "loss": 5.4948, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.0699601765149069, |
| "grad_norm": 1.5635018348693848, |
| "learning_rate": 0.0005964443486693243, |
| "loss": 5.4534, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.07534172855451512, |
| "grad_norm": 1.1525553464889526, |
| "learning_rate": 0.000596121107639263, |
| "loss": 5.3752, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.08072328059412334, |
| "grad_norm": 1.0958820581436157, |
| "learning_rate": 0.0005957978666092015, |
| "loss": 5.315, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.08610483263373157, |
| "grad_norm": 1.1695090532302856, |
| "learning_rate": 0.0005954746255791401, |
| "loss": 5.2518, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.09148638467333979, |
| "grad_norm": 1.161879539489746, |
| "learning_rate": 0.0005951513845490787, |
| "loss": 5.2155, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.09686793671294801, |
| "grad_norm": 0.9739872813224792, |
| "learning_rate": 0.0005948281435190174, |
| "loss": 5.1544, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.10224948875255624, |
| "grad_norm": 1.5205905437469482, |
| "learning_rate": 0.0005945049024889559, |
| "loss": 5.1359, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.10763104079216446, |
| "grad_norm": 1.3709813356399536, |
| "learning_rate": 0.0005941816614588944, |
| "loss": 5.0858, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.10763104079216446, |
| "eval_accuracy": 0.2278260914915988, |
| "eval_loss": 5.015499114990234, |
| "eval_runtime": 186.9987, |
| "eval_samples_per_second": 96.316, |
| "eval_steps_per_second": 6.021, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.11301259283177269, |
| "grad_norm": 1.0402472019195557, |
| "learning_rate": 0.000593858420428833, |
| "loss": 5.0431, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.1183941448713809, |
| "grad_norm": 0.8706046342849731, |
| "learning_rate": 0.0005935351793987716, |
| "loss": 5.0351, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.12377569691098914, |
| "grad_norm": 0.8232613205909729, |
| "learning_rate": 0.0005932119383687103, |
| "loss": 4.9863, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.12915724895059735, |
| "grad_norm": 1.4329087734222412, |
| "learning_rate": 0.0005928886973386488, |
| "loss": 4.9463, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.13453880099020557, |
| "grad_norm": 0.941353976726532, |
| "learning_rate": 0.0005925654563085874, |
| "loss": 4.9111, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.1399203530298138, |
| "grad_norm": 1.3174821138381958, |
| "learning_rate": 0.000592242215278526, |
| "loss": 4.9106, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.14530190506942203, |
| "grad_norm": 1.029506802558899, |
| "learning_rate": 0.0005919189742484645, |
| "loss": 4.8696, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.15068345710903025, |
| "grad_norm": 1.0898398160934448, |
| "learning_rate": 0.0005915957332184032, |
| "loss": 4.8182, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.15606500914863847, |
| "grad_norm": 0.975596010684967, |
| "learning_rate": 0.0005912724921883417, |
| "loss": 4.8058, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.16144656118824668, |
| "grad_norm": 0.8824706077575684, |
| "learning_rate": 0.0005909492511582803, |
| "loss": 4.7975, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.1668281132278549, |
| "grad_norm": 0.8124077320098877, |
| "learning_rate": 0.0005906260101282189, |
| "loss": 4.7705, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.17220966526746315, |
| "grad_norm": 0.9325596690177917, |
| "learning_rate": 0.0005903027690981575, |
| "loss": 4.7586, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.17759121730707136, |
| "grad_norm": 1.0702885389328003, |
| "learning_rate": 0.000589979528068096, |
| "loss": 4.7277, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.18297276934667958, |
| "grad_norm": 0.8514541983604431, |
| "learning_rate": 0.0005896562870380347, |
| "loss": 4.6998, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.1883543213862878, |
| "grad_norm": 0.9110283255577087, |
| "learning_rate": 0.0005893330460079732, |
| "loss": 4.6754, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.19373587342589602, |
| "grad_norm": 0.9315692782402039, |
| "learning_rate": 0.0005890098049779118, |
| "loss": 4.6766, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.19911742546550426, |
| "grad_norm": 0.8537421822547913, |
| "learning_rate": 0.0005886865639478504, |
| "loss": 4.6399, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.20449897750511248, |
| "grad_norm": 1.0513622760772705, |
| "learning_rate": 0.0005883633229177889, |
| "loss": 4.6233, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.2098805295447207, |
| "grad_norm": 1.177033543586731, |
| "learning_rate": 0.0005880400818877276, |
| "loss": 4.5863, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.2152620815843289, |
| "grad_norm": 0.9385454058647156, |
| "learning_rate": 0.0005877168408576662, |
| "loss": 4.5722, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.2152620815843289, |
| "eval_accuracy": 0.27065575603341335, |
| "eval_loss": 4.51419734954834, |
| "eval_runtime": 185.8696, |
| "eval_samples_per_second": 96.901, |
| "eval_steps_per_second": 6.058, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.22064363362393713, |
| "grad_norm": 0.9258783459663391, |
| "learning_rate": 0.0005873935998276048, |
| "loss": 4.5719, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.22602518566354537, |
| "grad_norm": 0.9797497987747192, |
| "learning_rate": 0.0005870703587975433, |
| "loss": 4.5343, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.2314067377031536, |
| "grad_norm": 0.8767006397247314, |
| "learning_rate": 0.0005867471177674818, |
| "loss": 4.5316, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.2367882897427618, |
| "grad_norm": 0.9726645946502686, |
| "learning_rate": 0.0005864238767374205, |
| "loss": 4.5213, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.24216984178237003, |
| "grad_norm": 1.0684372186660767, |
| "learning_rate": 0.0005861006357073591, |
| "loss": 4.4855, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.24755139382197827, |
| "grad_norm": 1.1976218223571777, |
| "learning_rate": 0.0005857773946772977, |
| "loss": 4.4821, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.2529329458615865, |
| "grad_norm": 0.8945443630218506, |
| "learning_rate": 0.0005854541536472362, |
| "loss": 4.4691, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.2583144979011947, |
| "grad_norm": 0.8019833564758301, |
| "learning_rate": 0.0005851309126171749, |
| "loss": 4.4531, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.2636960499408029, |
| "grad_norm": 0.885037899017334, |
| "learning_rate": 0.0005848076715871134, |
| "loss": 4.4344, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.26907760198041114, |
| "grad_norm": 0.9687507748603821, |
| "learning_rate": 0.000584484430557052, |
| "loss": 4.4189, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.27445915402001936, |
| "grad_norm": 0.6978208422660828, |
| "learning_rate": 0.0005841611895269906, |
| "loss": 4.3996, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.2798407060596276, |
| "grad_norm": 0.918928325176239, |
| "learning_rate": 0.0005838379484969291, |
| "loss": 4.4231, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.2852222580992358, |
| "grad_norm": 0.8406935334205627, |
| "learning_rate": 0.0005835147074668678, |
| "loss": 4.3805, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.29060381013884407, |
| "grad_norm": 0.9145887494087219, |
| "learning_rate": 0.0005831914664368063, |
| "loss": 4.3672, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.2959853621784523, |
| "grad_norm": 0.6630155444145203, |
| "learning_rate": 0.0005828682254067449, |
| "loss": 4.3586, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.3013669142180605, |
| "grad_norm": 0.7521917819976807, |
| "learning_rate": 0.0005825449843766835, |
| "loss": 4.3556, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.3067484662576687, |
| "grad_norm": 0.9356996417045593, |
| "learning_rate": 0.0005822217433466221, |
| "loss": 4.3275, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.31213001829727693, |
| "grad_norm": 0.8674432635307312, |
| "learning_rate": 0.0005818985023165607, |
| "loss": 4.3087, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.31751157033688515, |
| "grad_norm": 0.7058337330818176, |
| "learning_rate": 0.0005815752612864992, |
| "loss": 4.3152, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.32289312237649337, |
| "grad_norm": 0.7263809442520142, |
| "learning_rate": 0.0005812520202564378, |
| "loss": 4.3199, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.32289312237649337, |
| "eval_accuracy": 0.2993136071118096, |
| "eval_loss": 4.23207426071167, |
| "eval_runtime": 185.6955, |
| "eval_samples_per_second": 96.992, |
| "eval_steps_per_second": 6.064, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.3282746744161016, |
| "grad_norm": 0.950084388256073, |
| "learning_rate": 0.0005809287792263764, |
| "loss": 4.2983, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.3336562264557098, |
| "grad_norm": 0.6798693537712097, |
| "learning_rate": 0.0005806055381963151, |
| "loss": 4.2793, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.3390377784953181, |
| "grad_norm": 0.7456609010696411, |
| "learning_rate": 0.0005802822971662536, |
| "loss": 4.2776, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.3444193305349263, |
| "grad_norm": 0.7102623581886292, |
| "learning_rate": 0.0005799590561361922, |
| "loss": 4.28, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.3498008825745345, |
| "grad_norm": 0.6629081964492798, |
| "learning_rate": 0.0005796358151061307, |
| "loss": 4.2568, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.35518243461414273, |
| "grad_norm": 0.7504428625106812, |
| "learning_rate": 0.0005793125740760694, |
| "loss": 4.2702, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.36056398665375095, |
| "grad_norm": 0.8710833191871643, |
| "learning_rate": 0.0005789893330460079, |
| "loss": 4.26, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.36594553869335916, |
| "grad_norm": 0.7749010920524597, |
| "learning_rate": 0.0005786660920159465, |
| "loss": 4.2404, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.3713270907329674, |
| "grad_norm": 0.7596012949943542, |
| "learning_rate": 0.0005783428509858851, |
| "loss": 4.2251, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.3767086427725756, |
| "grad_norm": 0.922978401184082, |
| "learning_rate": 0.0005780196099558237, |
| "loss": 4.2135, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.3820901948121838, |
| "grad_norm": 0.7895872592926025, |
| "learning_rate": 0.0005776963689257623, |
| "loss": 4.2069, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.38747174685179203, |
| "grad_norm": 0.7416473627090454, |
| "learning_rate": 0.0005773731278957008, |
| "loss": 4.1992, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.3928532988914003, |
| "grad_norm": 0.6661369204521179, |
| "learning_rate": 0.0005770498868656394, |
| "loss": 4.217, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.3982348509310085, |
| "grad_norm": 0.6775140166282654, |
| "learning_rate": 0.000576726645835578, |
| "loss": 4.1955, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.40361640297061674, |
| "grad_norm": 0.6402688026428223, |
| "learning_rate": 0.0005764034048055167, |
| "loss": 4.1927, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.40899795501022496, |
| "grad_norm": 0.7230303883552551, |
| "learning_rate": 0.0005760801637754552, |
| "loss": 4.1789, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.4143795070498332, |
| "grad_norm": 0.6041465401649475, |
| "learning_rate": 0.0005757569227453937, |
| "loss": 4.1825, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.4197610590894414, |
| "grad_norm": 0.6345029473304749, |
| "learning_rate": 0.0005754336817153324, |
| "loss": 4.1569, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.4251426111290496, |
| "grad_norm": 0.6404072642326355, |
| "learning_rate": 0.0005751104406852709, |
| "loss": 4.167, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.4305241631686578, |
| "grad_norm": 0.625283420085907, |
| "learning_rate": 0.0005747871996552096, |
| "loss": 4.1627, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.4305241631686578, |
| "eval_accuracy": 0.31280949096013405, |
| "eval_loss": 4.0881171226501465, |
| "eval_runtime": 185.8867, |
| "eval_samples_per_second": 96.892, |
| "eval_steps_per_second": 6.057, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.43590571520826604, |
| "grad_norm": 0.6916139721870422, |
| "learning_rate": 0.0005744639586251481, |
| "loss": 4.1593, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.44128726724787426, |
| "grad_norm": 0.6562201380729675, |
| "learning_rate": 0.0005741407175950867, |
| "loss": 4.1461, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.44666881928748253, |
| "grad_norm": 0.6418390274047852, |
| "learning_rate": 0.0005738174765650253, |
| "loss": 4.1162, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.45205037132709075, |
| "grad_norm": 0.6089012622833252, |
| "learning_rate": 0.0005734942355349638, |
| "loss": 4.1414, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.45743192336669897, |
| "grad_norm": 0.6137473583221436, |
| "learning_rate": 0.0005731709945049025, |
| "loss": 4.1413, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.4628134754063072, |
| "grad_norm": 0.8024901151657104, |
| "learning_rate": 0.000572847753474841, |
| "loss": 4.1291, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.4681950274459154, |
| "grad_norm": 0.6898571848869324, |
| "learning_rate": 0.0005725245124447796, |
| "loss": 4.1269, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.4735765794855236, |
| "grad_norm": 0.6257875561714172, |
| "learning_rate": 0.0005722012714147182, |
| "loss": 4.1281, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.47895813152513184, |
| "grad_norm": 0.6706247329711914, |
| "learning_rate": 0.0005718780303846568, |
| "loss": 4.1279, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.48433968356474005, |
| "grad_norm": 0.9896004796028137, |
| "learning_rate": 0.0005715547893545953, |
| "loss": 4.1109, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.48972123560434827, |
| "grad_norm": 0.8057212829589844, |
| "learning_rate": 0.000571231548324534, |
| "loss": 4.0953, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.49510278764395654, |
| "grad_norm": 0.5745981931686401, |
| "learning_rate": 0.0005709083072944725, |
| "loss": 4.1006, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.5004843396835648, |
| "grad_norm": 0.7094736099243164, |
| "learning_rate": 0.0005705850662644111, |
| "loss": 4.0881, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.505865891723173, |
| "grad_norm": 0.7068034410476685, |
| "learning_rate": 0.0005702618252343497, |
| "loss": 4.0945, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.5112474437627812, |
| "grad_norm": 0.8101431131362915, |
| "learning_rate": 0.0005699385842042882, |
| "loss": 4.1055, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.5166289958023894, |
| "grad_norm": 0.6561006903648376, |
| "learning_rate": 0.0005696153431742269, |
| "loss": 4.0656, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.5220105478419976, |
| "grad_norm": 0.6312971115112305, |
| "learning_rate": 0.0005692921021441655, |
| "loss": 4.0574, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.5273920998816058, |
| "grad_norm": 0.6330981850624084, |
| "learning_rate": 0.0005689688611141041, |
| "loss": 4.059, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.5327736519212141, |
| "grad_norm": 0.6987113356590271, |
| "learning_rate": 0.0005686456200840426, |
| "loss": 4.0672, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.5381552039608223, |
| "grad_norm": 0.7659459710121155, |
| "learning_rate": 0.0005683223790539811, |
| "loss": 4.0414, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.5381552039608223, |
| "eval_accuracy": 0.3217370641402987, |
| "eval_loss": 3.9916396141052246, |
| "eval_runtime": 185.6213, |
| "eval_samples_per_second": 97.031, |
| "eval_steps_per_second": 6.066, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.5435367560004305, |
| "grad_norm": 0.6476443409919739, |
| "learning_rate": 0.0005679991380239198, |
| "loss": 4.0474, |
| "step": 5050 |
| }, |
| { |
| "epoch": 0.5489183080400387, |
| "grad_norm": 0.5426554679870605, |
| "learning_rate": 0.0005676758969938584, |
| "loss": 4.0554, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.5542998600796469, |
| "grad_norm": 0.5993754863739014, |
| "learning_rate": 0.000567352655963797, |
| "loss": 4.0318, |
| "step": 5150 |
| }, |
| { |
| "epoch": 0.5596814121192552, |
| "grad_norm": 0.6983941197395325, |
| "learning_rate": 0.0005670294149337355, |
| "loss": 4.023, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.5650629641588634, |
| "grad_norm": 0.5401231050491333, |
| "learning_rate": 0.0005667061739036742, |
| "loss": 4.0461, |
| "step": 5250 |
| }, |
| { |
| "epoch": 0.5704445161984716, |
| "grad_norm": 0.5936373472213745, |
| "learning_rate": 0.0005663829328736127, |
| "loss": 4.0383, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.5758260682380799, |
| "grad_norm": 0.6668462157249451, |
| "learning_rate": 0.0005660596918435512, |
| "loss": 4.0291, |
| "step": 5350 |
| }, |
| { |
| "epoch": 0.5812076202776881, |
| "grad_norm": 0.6027731895446777, |
| "learning_rate": 0.0005657364508134899, |
| "loss": 4.0398, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.5865891723172963, |
| "grad_norm": 0.6766003370285034, |
| "learning_rate": 0.0005654132097834284, |
| "loss": 4.0254, |
| "step": 5450 |
| }, |
| { |
| "epoch": 0.5919707243569046, |
| "grad_norm": 0.6466162204742432, |
| "learning_rate": 0.0005650899687533671, |
| "loss": 4.0305, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.5973522763965128, |
| "grad_norm": 0.6626183390617371, |
| "learning_rate": 0.0005647667277233056, |
| "loss": 3.996, |
| "step": 5550 |
| }, |
| { |
| "epoch": 0.602733828436121, |
| "grad_norm": 0.6999292969703674, |
| "learning_rate": 0.0005644434866932442, |
| "loss": 4.0376, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.6081153804757292, |
| "grad_norm": 0.5966658592224121, |
| "learning_rate": 0.0005641202456631828, |
| "loss": 4.0054, |
| "step": 5650 |
| }, |
| { |
| "epoch": 0.6134969325153374, |
| "grad_norm": 0.6622937917709351, |
| "learning_rate": 0.0005637970046331214, |
| "loss": 4.0101, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.6188784845549457, |
| "grad_norm": 0.729719340801239, |
| "learning_rate": 0.00056347376360306, |
| "loss": 3.9984, |
| "step": 5750 |
| }, |
| { |
| "epoch": 0.6242600365945539, |
| "grad_norm": 0.5816929936408997, |
| "learning_rate": 0.0005631505225729985, |
| "loss": 4.0058, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.6296415886341621, |
| "grad_norm": 0.6277357935905457, |
| "learning_rate": 0.0005628272815429371, |
| "loss": 3.9839, |
| "step": 5850 |
| }, |
| { |
| "epoch": 0.6350231406737703, |
| "grad_norm": 0.6899020075798035, |
| "learning_rate": 0.0005625040405128757, |
| "loss": 3.9972, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.6404046927133785, |
| "grad_norm": 0.5838122963905334, |
| "learning_rate": 0.0005621807994828143, |
| "loss": 3.9762, |
| "step": 5950 |
| }, |
| { |
| "epoch": 0.6457862447529867, |
| "grad_norm": 0.6582158207893372, |
| "learning_rate": 0.0005618575584527529, |
| "loss": 4.0012, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.6457862447529867, |
| "eval_accuracy": 0.3283593489997035, |
| "eval_loss": 3.9181182384490967, |
| "eval_runtime": 185.5106, |
| "eval_samples_per_second": 97.089, |
| "eval_steps_per_second": 6.07, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.651167796792595, |
| "grad_norm": 0.5642307996749878, |
| "learning_rate": 0.0005615343174226915, |
| "loss": 3.98, |
| "step": 6050 |
| }, |
| { |
| "epoch": 0.6565493488322032, |
| "grad_norm": 0.6277635097503662, |
| "learning_rate": 0.00056121107639263, |
| "loss": 4.0013, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.6619309008718114, |
| "grad_norm": 0.6095792651176453, |
| "learning_rate": 0.0005608878353625687, |
| "loss": 3.9694, |
| "step": 6150 |
| }, |
| { |
| "epoch": 0.6673124529114196, |
| "grad_norm": 0.6177421808242798, |
| "learning_rate": 0.0005605645943325072, |
| "loss": 3.9839, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.6726940049510278, |
| "grad_norm": 0.6448411345481873, |
| "learning_rate": 0.0005602413533024458, |
| "loss": 3.9715, |
| "step": 6250 |
| }, |
| { |
| "epoch": 0.6780755569906362, |
| "grad_norm": 0.681561291217804, |
| "learning_rate": 0.0005599181122723844, |
| "loss": 3.976, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.6834571090302444, |
| "grad_norm": 0.6730055212974548, |
| "learning_rate": 0.000559594871242323, |
| "loss": 3.9631, |
| "step": 6350 |
| }, |
| { |
| "epoch": 0.6888386610698526, |
| "grad_norm": 0.5638357400894165, |
| "learning_rate": 0.0005592716302122616, |
| "loss": 3.9409, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.6942202131094608, |
| "grad_norm": 0.6370199918746948, |
| "learning_rate": 0.0005589483891822001, |
| "loss": 3.9694, |
| "step": 6450 |
| }, |
| { |
| "epoch": 0.699601765149069, |
| "grad_norm": 0.6043134927749634, |
| "learning_rate": 0.0005586251481521387, |
| "loss": 3.9708, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.7049833171886772, |
| "grad_norm": 0.6449065804481506, |
| "learning_rate": 0.0005583019071220773, |
| "loss": 3.9571, |
| "step": 6550 |
| }, |
| { |
| "epoch": 0.7103648692282855, |
| "grad_norm": 0.6545883417129517, |
| "learning_rate": 0.000557978666092016, |
| "loss": 3.9349, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.7157464212678937, |
| "grad_norm": 0.5792790055274963, |
| "learning_rate": 0.0005576554250619545, |
| "loss": 3.9358, |
| "step": 6650 |
| }, |
| { |
| "epoch": 0.7211279733075019, |
| "grad_norm": 0.6026337742805481, |
| "learning_rate": 0.000557332184031893, |
| "loss": 3.9339, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.7265095253471101, |
| "grad_norm": 0.5688988566398621, |
| "learning_rate": 0.0005570089430018317, |
| "loss": 3.9572, |
| "step": 6750 |
| }, |
| { |
| "epoch": 0.7318910773867183, |
| "grad_norm": 0.6680663824081421, |
| "learning_rate": 0.0005566857019717702, |
| "loss": 3.9462, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.7372726294263265, |
| "grad_norm": 0.5734186172485352, |
| "learning_rate": 0.0005563624609417089, |
| "loss": 3.9287, |
| "step": 6850 |
| }, |
| { |
| "epoch": 0.7426541814659348, |
| "grad_norm": 0.5625024437904358, |
| "learning_rate": 0.0005560392199116474, |
| "loss": 3.931, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.748035733505543, |
| "grad_norm": 0.5822864174842834, |
| "learning_rate": 0.000555715978881586, |
| "loss": 3.9193, |
| "step": 6950 |
| }, |
| { |
| "epoch": 0.7534172855451512, |
| "grad_norm": 0.6197649240493774, |
| "learning_rate": 0.0005553927378515246, |
| "loss": 3.9278, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.7534172855451512, |
| "eval_accuracy": 0.3329792697895752, |
| "eval_loss": 3.8630151748657227, |
| "eval_runtime": 185.8118, |
| "eval_samples_per_second": 96.931, |
| "eval_steps_per_second": 6.06, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.7587988375847594, |
| "grad_norm": 0.53708416223526, |
| "learning_rate": 0.0005550694968214631, |
| "loss": 3.9353, |
| "step": 7050 |
| }, |
| { |
| "epoch": 0.7641803896243676, |
| "grad_norm": 0.602484405040741, |
| "learning_rate": 0.0005547462557914018, |
| "loss": 3.9234, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.7695619416639758, |
| "grad_norm": 0.645382285118103, |
| "learning_rate": 0.0005544294795819415, |
| "loss": 3.9319, |
| "step": 7150 |
| }, |
| { |
| "epoch": 0.7749434937035841, |
| "grad_norm": 0.5793185234069824, |
| "learning_rate": 0.0005541062385518801, |
| "loss": 3.9308, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.7803250457431924, |
| "grad_norm": 0.6125531196594238, |
| "learning_rate": 0.0005537829975218188, |
| "loss": 3.9287, |
| "step": 7250 |
| }, |
| { |
| "epoch": 0.7857065977828006, |
| "grad_norm": 0.7164999842643738, |
| "learning_rate": 0.0005534597564917573, |
| "loss": 3.9188, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.7910881498224088, |
| "grad_norm": 0.6857118606567383, |
| "learning_rate": 0.0005531365154616959, |
| "loss": 3.923, |
| "step": 7350 |
| }, |
| { |
| "epoch": 0.796469701862017, |
| "grad_norm": 0.6662998795509338, |
| "learning_rate": 0.0005528132744316344, |
| "loss": 3.914, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.8018512539016253, |
| "grad_norm": 0.5667300820350647, |
| "learning_rate": 0.0005524900334015731, |
| "loss": 3.9011, |
| "step": 7450 |
| }, |
| { |
| "epoch": 0.8072328059412335, |
| "grad_norm": 0.6200372576713562, |
| "learning_rate": 0.0005521667923715117, |
| "loss": 3.9075, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.8126143579808417, |
| "grad_norm": 0.5791467428207397, |
| "learning_rate": 0.0005518435513414502, |
| "loss": 3.8803, |
| "step": 7550 |
| }, |
| { |
| "epoch": 0.8179959100204499, |
| "grad_norm": 0.6154515743255615, |
| "learning_rate": 0.0005515203103113888, |
| "loss": 3.8899, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.8233774620600581, |
| "grad_norm": 0.5968267917633057, |
| "learning_rate": 0.0005511970692813274, |
| "loss": 3.8906, |
| "step": 7650 |
| }, |
| { |
| "epoch": 0.8287590140996663, |
| "grad_norm": 0.5160370469093323, |
| "learning_rate": 0.000550873828251266, |
| "loss": 3.8705, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.8341405661392746, |
| "grad_norm": 0.5990711450576782, |
| "learning_rate": 0.0005505505872212045, |
| "loss": 3.8951, |
| "step": 7750 |
| }, |
| { |
| "epoch": 0.8395221181788828, |
| "grad_norm": 0.5485094785690308, |
| "learning_rate": 0.0005502273461911432, |
| "loss": 3.9085, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.844903670218491, |
| "grad_norm": 0.6018003225326538, |
| "learning_rate": 0.0005499041051610817, |
| "loss": 3.89, |
| "step": 7850 |
| }, |
| { |
| "epoch": 0.8502852222580992, |
| "grad_norm": 0.639639675617218, |
| "learning_rate": 0.0005495808641310204, |
| "loss": 3.8926, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.8556667742977074, |
| "grad_norm": 0.5396811366081238, |
| "learning_rate": 0.0005492576231009589, |
| "loss": 3.8864, |
| "step": 7950 |
| }, |
| { |
| "epoch": 0.8610483263373157, |
| "grad_norm": 0.5980393886566162, |
| "learning_rate": 0.0005489343820708974, |
| "loss": 3.8962, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.8610483263373157, |
| "eval_accuracy": 0.33778237934830213, |
| "eval_loss": 3.817612648010254, |
| "eval_runtime": 185.5632, |
| "eval_samples_per_second": 97.061, |
| "eval_steps_per_second": 6.068, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.8664298783769239, |
| "grad_norm": 0.5757644772529602, |
| "learning_rate": 0.0005486111410408361, |
| "loss": 3.8644, |
| "step": 8050 |
| }, |
| { |
| "epoch": 0.8718114304165321, |
| "grad_norm": 0.5959907174110413, |
| "learning_rate": 0.0005482879000107746, |
| "loss": 3.874, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.8771929824561403, |
| "grad_norm": 0.6737036108970642, |
| "learning_rate": 0.0005479646589807133, |
| "loss": 3.8756, |
| "step": 8150 |
| }, |
| { |
| "epoch": 0.8825745344957485, |
| "grad_norm": 0.5899474024772644, |
| "learning_rate": 0.0005476414179506518, |
| "loss": 3.8929, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.8879560865353568, |
| "grad_norm": 0.5099090933799744, |
| "learning_rate": 0.0005473181769205904, |
| "loss": 3.8529, |
| "step": 8250 |
| }, |
| { |
| "epoch": 0.8933376385749651, |
| "grad_norm": 0.5241236686706543, |
| "learning_rate": 0.000546994935890529, |
| "loss": 3.872, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.8987191906145733, |
| "grad_norm": 0.6906284093856812, |
| "learning_rate": 0.0005466716948604677, |
| "loss": 3.8657, |
| "step": 8350 |
| }, |
| { |
| "epoch": 0.9041007426541815, |
| "grad_norm": 0.5540809035301208, |
| "learning_rate": 0.0005463484538304062, |
| "loss": 3.8542, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.9094822946937897, |
| "grad_norm": 0.5402543544769287, |
| "learning_rate": 0.0005460252128003447, |
| "loss": 3.8643, |
| "step": 8450 |
| }, |
| { |
| "epoch": 0.9148638467333979, |
| "grad_norm": 0.5779940485954285, |
| "learning_rate": 0.0005457019717702833, |
| "loss": 3.8442, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.9202453987730062, |
| "grad_norm": 0.5777854323387146, |
| "learning_rate": 0.0005453787307402219, |
| "loss": 3.8477, |
| "step": 8550 |
| }, |
| { |
| "epoch": 0.9256269508126144, |
| "grad_norm": 0.6585916876792908, |
| "learning_rate": 0.0005450554897101605, |
| "loss": 3.8575, |
| "step": 8600 |
| }, |
| { |
| "epoch": 0.9310085028522226, |
| "grad_norm": 0.7081482410430908, |
| "learning_rate": 0.0005447322486800991, |
| "loss": 3.8511, |
| "step": 8650 |
| }, |
| { |
| "epoch": 0.9363900548918308, |
| "grad_norm": 0.5857383608818054, |
| "learning_rate": 0.0005444090076500377, |
| "loss": 3.8659, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.941771606931439, |
| "grad_norm": 0.6456316709518433, |
| "learning_rate": 0.0005440857666199763, |
| "loss": 3.8628, |
| "step": 8750 |
| }, |
| { |
| "epoch": 0.9471531589710472, |
| "grad_norm": 0.7383223176002502, |
| "learning_rate": 0.0005437625255899148, |
| "loss": 3.8441, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.9525347110106555, |
| "grad_norm": 0.56940096616745, |
| "learning_rate": 0.0005434392845598534, |
| "loss": 3.85, |
| "step": 8850 |
| }, |
| { |
| "epoch": 0.9579162630502637, |
| "grad_norm": 0.60551518201828, |
| "learning_rate": 0.000543116043529792, |
| "loss": 3.8546, |
| "step": 8900 |
| }, |
| { |
| "epoch": 0.9632978150898719, |
| "grad_norm": 0.6073434948921204, |
| "learning_rate": 0.0005427928024997306, |
| "loss": 3.8403, |
| "step": 8950 |
| }, |
| { |
| "epoch": 0.9686793671294801, |
| "grad_norm": 0.6240805983543396, |
| "learning_rate": 0.0005424695614696692, |
| "loss": 3.8312, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.9686793671294801, |
| "eval_accuracy": 0.34048327283359453, |
| "eval_loss": 3.7796497344970703, |
| "eval_runtime": 185.3945, |
| "eval_samples_per_second": 97.15, |
| "eval_steps_per_second": 6.074, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.9740609191690883, |
| "grad_norm": 0.6017458438873291, |
| "learning_rate": 0.0005421463204396078, |
| "loss": 3.8507, |
| "step": 9050 |
| }, |
| { |
| "epoch": 0.9794424712086965, |
| "grad_norm": 0.5660796761512756, |
| "learning_rate": 0.0005418230794095463, |
| "loss": 3.8091, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.9848240232483048, |
| "grad_norm": 0.6631521582603455, |
| "learning_rate": 0.000541499838379485, |
| "loss": 3.8444, |
| "step": 9150 |
| }, |
| { |
| "epoch": 0.9902055752879131, |
| "grad_norm": 0.5666031241416931, |
| "learning_rate": 0.0005411765973494235, |
| "loss": 3.8533, |
| "step": 9200 |
| }, |
| { |
| "epoch": 0.9955871273275213, |
| "grad_norm": 0.6138345003128052, |
| "learning_rate": 0.0005408533563193621, |
| "loss": 3.8266, |
| "step": 9250 |
| }, |
| { |
| "epoch": 1.0009686793671295, |
| "grad_norm": 0.6937859058380127, |
| "learning_rate": 0.0005405301152893007, |
| "loss": 3.8116, |
| "step": 9300 |
| }, |
| { |
| "epoch": 1.0063502314067376, |
| "grad_norm": 0.5748305916786194, |
| "learning_rate": 0.0005402068742592392, |
| "loss": 3.765, |
| "step": 9350 |
| }, |
| { |
| "epoch": 1.011731783446346, |
| "grad_norm": 0.5838444828987122, |
| "learning_rate": 0.0005398836332291779, |
| "loss": 3.7823, |
| "step": 9400 |
| }, |
| { |
| "epoch": 1.017113335485954, |
| "grad_norm": 0.6028748154640198, |
| "learning_rate": 0.0005395603921991164, |
| "loss": 3.7869, |
| "step": 9450 |
| }, |
| { |
| "epoch": 1.0224948875255624, |
| "grad_norm": 0.5265587568283081, |
| "learning_rate": 0.0005392371511690551, |
| "loss": 3.7743, |
| "step": 9500 |
| }, |
| { |
| "epoch": 1.0278764395651705, |
| "grad_norm": 0.5637922286987305, |
| "learning_rate": 0.0005389139101389936, |
| "loss": 3.7891, |
| "step": 9550 |
| }, |
| { |
| "epoch": 1.0332579916047788, |
| "grad_norm": 0.5316060185432434, |
| "learning_rate": 0.0005385906691089321, |
| "loss": 3.7662, |
| "step": 9600 |
| }, |
| { |
| "epoch": 1.0386395436443872, |
| "grad_norm": 0.5659850835800171, |
| "learning_rate": 0.0005382674280788708, |
| "loss": 3.7684, |
| "step": 9650 |
| }, |
| { |
| "epoch": 1.0440210956839953, |
| "grad_norm": 0.6028578877449036, |
| "learning_rate": 0.0005379441870488093, |
| "loss": 3.7662, |
| "step": 9700 |
| }, |
| { |
| "epoch": 1.0494026477236036, |
| "grad_norm": 0.5714396238327026, |
| "learning_rate": 0.0005376209460187479, |
| "loss": 3.7604, |
| "step": 9750 |
| }, |
| { |
| "epoch": 1.0547841997632117, |
| "grad_norm": 0.5520520210266113, |
| "learning_rate": 0.0005372977049886865, |
| "loss": 3.7719, |
| "step": 9800 |
| }, |
| { |
| "epoch": 1.06016575180282, |
| "grad_norm": 0.6741361021995544, |
| "learning_rate": 0.0005369809287792263, |
| "loss": 3.7773, |
| "step": 9850 |
| }, |
| { |
| "epoch": 1.0655473038424281, |
| "grad_norm": 0.6235219240188599, |
| "learning_rate": 0.000536657687749165, |
| "loss": 3.7799, |
| "step": 9900 |
| }, |
| { |
| "epoch": 1.0709288558820365, |
| "grad_norm": 0.5697728991508484, |
| "learning_rate": 0.0005363344467191035, |
| "loss": 3.7784, |
| "step": 9950 |
| }, |
| { |
| "epoch": 1.0763104079216446, |
| "grad_norm": 0.5355727076530457, |
| "learning_rate": 0.000536011205689042, |
| "loss": 3.7868, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.0763104079216446, |
| "eval_accuracy": 0.3444463869166277, |
| "eval_loss": 3.7480411529541016, |
| "eval_runtime": 185.7327, |
| "eval_samples_per_second": 96.973, |
| "eval_steps_per_second": 6.062, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.081691959961253, |
| "grad_norm": 0.6252759099006653, |
| "learning_rate": 0.0005356879646589807, |
| "loss": 3.7668, |
| "step": 10050 |
| }, |
| { |
| "epoch": 1.087073512000861, |
| "grad_norm": 0.6111653447151184, |
| "learning_rate": 0.0005353647236289192, |
| "loss": 3.7588, |
| "step": 10100 |
| }, |
| { |
| "epoch": 1.0924550640404693, |
| "grad_norm": 0.5253251791000366, |
| "learning_rate": 0.0005350414825988579, |
| "loss": 3.7567, |
| "step": 10150 |
| }, |
| { |
| "epoch": 1.0978366160800774, |
| "grad_norm": 0.5540984869003296, |
| "learning_rate": 0.0005347182415687964, |
| "loss": 3.7718, |
| "step": 10200 |
| }, |
| { |
| "epoch": 1.1032181681196858, |
| "grad_norm": 0.5651341080665588, |
| "learning_rate": 0.000534395000538735, |
| "loss": 3.7654, |
| "step": 10250 |
| }, |
| { |
| "epoch": 1.1085997201592939, |
| "grad_norm": 0.5826306343078613, |
| "learning_rate": 0.0005340717595086736, |
| "loss": 3.7721, |
| "step": 10300 |
| }, |
| { |
| "epoch": 1.1139812721989022, |
| "grad_norm": 0.5581619143486023, |
| "learning_rate": 0.0005337485184786122, |
| "loss": 3.75, |
| "step": 10350 |
| }, |
| { |
| "epoch": 1.1193628242385103, |
| "grad_norm": 0.6028370261192322, |
| "learning_rate": 0.0005334252774485507, |
| "loss": 3.748, |
| "step": 10400 |
| }, |
| { |
| "epoch": 1.1247443762781186, |
| "grad_norm": 0.5490254163742065, |
| "learning_rate": 0.0005331020364184894, |
| "loss": 3.7654, |
| "step": 10450 |
| }, |
| { |
| "epoch": 1.1301259283177267, |
| "grad_norm": 0.5509082674980164, |
| "learning_rate": 0.0005327787953884279, |
| "loss": 3.7476, |
| "step": 10500 |
| }, |
| { |
| "epoch": 1.135507480357335, |
| "grad_norm": 0.5552167892456055, |
| "learning_rate": 0.0005324555543583665, |
| "loss": 3.7398, |
| "step": 10550 |
| }, |
| { |
| "epoch": 1.1408890323969434, |
| "grad_norm": 0.5624234080314636, |
| "learning_rate": 0.0005321323133283051, |
| "loss": 3.7613, |
| "step": 10600 |
| }, |
| { |
| "epoch": 1.1462705844365515, |
| "grad_norm": 0.7011666297912598, |
| "learning_rate": 0.0005318090722982436, |
| "loss": 3.7521, |
| "step": 10650 |
| }, |
| { |
| "epoch": 1.1516521364761596, |
| "grad_norm": 0.635560154914856, |
| "learning_rate": 0.0005314858312681823, |
| "loss": 3.788, |
| "step": 10700 |
| }, |
| { |
| "epoch": 1.157033688515768, |
| "grad_norm": 0.6116316318511963, |
| "learning_rate": 0.0005311625902381209, |
| "loss": 3.7697, |
| "step": 10750 |
| }, |
| { |
| "epoch": 1.1624152405553763, |
| "grad_norm": 0.5557190775871277, |
| "learning_rate": 0.0005308393492080595, |
| "loss": 3.7506, |
| "step": 10800 |
| }, |
| { |
| "epoch": 1.1677967925949844, |
| "grad_norm": 0.5881202220916748, |
| "learning_rate": 0.000530516108177998, |
| "loss": 3.7431, |
| "step": 10850 |
| }, |
| { |
| "epoch": 1.1731783446345927, |
| "grad_norm": 0.5743594765663147, |
| "learning_rate": 0.0005301928671479365, |
| "loss": 3.7439, |
| "step": 10900 |
| }, |
| { |
| "epoch": 1.1785598966742008, |
| "grad_norm": 0.6218465566635132, |
| "learning_rate": 0.0005298696261178752, |
| "loss": 3.7635, |
| "step": 10950 |
| }, |
| { |
| "epoch": 1.1839414487138091, |
| "grad_norm": 0.5709580183029175, |
| "learning_rate": 0.0005295463850878138, |
| "loss": 3.7611, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.1839414487138091, |
| "eval_accuracy": 0.34666529619157505, |
| "eval_loss": 3.722410202026367, |
| "eval_runtime": 185.2726, |
| "eval_samples_per_second": 97.214, |
| "eval_steps_per_second": 6.078, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.1893230007534172, |
| "grad_norm": 0.5581254363059998, |
| "learning_rate": 0.0005292231440577524, |
| "loss": 3.7542, |
| "step": 11050 |
| }, |
| { |
| "epoch": 1.1947045527930256, |
| "grad_norm": 0.6101961135864258, |
| "learning_rate": 0.0005288999030276909, |
| "loss": 3.7289, |
| "step": 11100 |
| }, |
| { |
| "epoch": 1.2000861048326337, |
| "grad_norm": 0.5990533232688904, |
| "learning_rate": 0.0005285766619976295, |
| "loss": 3.7564, |
| "step": 11150 |
| }, |
| { |
| "epoch": 1.205467656872242, |
| "grad_norm": 0.574143648147583, |
| "learning_rate": 0.0005282534209675681, |
| "loss": 3.7598, |
| "step": 11200 |
| }, |
| { |
| "epoch": 1.21084920891185, |
| "grad_norm": 0.584513247013092, |
| "learning_rate": 0.0005279301799375066, |
| "loss": 3.7433, |
| "step": 11250 |
| }, |
| { |
| "epoch": 1.2162307609514584, |
| "grad_norm": 0.5963412523269653, |
| "learning_rate": 0.0005276069389074453, |
| "loss": 3.7542, |
| "step": 11300 |
| }, |
| { |
| "epoch": 1.2216123129910665, |
| "grad_norm": 0.5428022742271423, |
| "learning_rate": 0.0005272836978773838, |
| "loss": 3.7485, |
| "step": 11350 |
| }, |
| { |
| "epoch": 1.2269938650306749, |
| "grad_norm": 0.6036804914474487, |
| "learning_rate": 0.0005269604568473225, |
| "loss": 3.759, |
| "step": 11400 |
| }, |
| { |
| "epoch": 1.232375417070283, |
| "grad_norm": 0.5967918038368225, |
| "learning_rate": 0.000526637215817261, |
| "loss": 3.7351, |
| "step": 11450 |
| }, |
| { |
| "epoch": 1.2377569691098913, |
| "grad_norm": 0.5945722460746765, |
| "learning_rate": 0.0005263139747871996, |
| "loss": 3.728, |
| "step": 11500 |
| }, |
| { |
| "epoch": 1.2431385211494996, |
| "grad_norm": 0.5260943174362183, |
| "learning_rate": 0.0005259907337571381, |
| "loss": 3.7511, |
| "step": 11550 |
| }, |
| { |
| "epoch": 1.2485200731891077, |
| "grad_norm": 0.6230841279029846, |
| "learning_rate": 0.0005256674927270768, |
| "loss": 3.744, |
| "step": 11600 |
| }, |
| { |
| "epoch": 1.2539016252287158, |
| "grad_norm": 0.5355697870254517, |
| "learning_rate": 0.0005253442516970154, |
| "loss": 3.7327, |
| "step": 11650 |
| }, |
| { |
| "epoch": 1.2592831772683242, |
| "grad_norm": 0.5850827097892761, |
| "learning_rate": 0.000525021010666954, |
| "loss": 3.7355, |
| "step": 11700 |
| }, |
| { |
| "epoch": 1.2646647293079325, |
| "grad_norm": 0.5686675310134888, |
| "learning_rate": 0.0005246977696368925, |
| "loss": 3.7248, |
| "step": 11750 |
| }, |
| { |
| "epoch": 1.2700462813475406, |
| "grad_norm": 0.605828046798706, |
| "learning_rate": 0.0005243745286068311, |
| "loss": 3.7337, |
| "step": 11800 |
| }, |
| { |
| "epoch": 1.275427833387149, |
| "grad_norm": 0.5307839512825012, |
| "learning_rate": 0.0005240512875767697, |
| "loss": 3.7365, |
| "step": 11850 |
| }, |
| { |
| "epoch": 1.280809385426757, |
| "grad_norm": 0.504356861114502, |
| "learning_rate": 0.0005237280465467083, |
| "loss": 3.7246, |
| "step": 11900 |
| }, |
| { |
| "epoch": 1.2861909374663654, |
| "grad_norm": 0.546125054359436, |
| "learning_rate": 0.0005234048055166469, |
| "loss": 3.7337, |
| "step": 11950 |
| }, |
| { |
| "epoch": 1.2915724895059735, |
| "grad_norm": 0.5529025197029114, |
| "learning_rate": 0.0005230815644865854, |
| "loss": 3.7129, |
| "step": 12000 |
| }, |
| { |
| "epoch": 1.2915724895059735, |
| "eval_accuracy": 0.3493630387431208, |
| "eval_loss": 3.7001094818115234, |
| "eval_runtime": 185.4155, |
| "eval_samples_per_second": 97.139, |
| "eval_steps_per_second": 6.073, |
| "step": 12000 |
| }, |
| { |
| "epoch": 1.2969540415455818, |
| "grad_norm": 0.5733916163444519, |
| "learning_rate": 0.0005227583234565241, |
| "loss": 3.7344, |
| "step": 12050 |
| }, |
| { |
| "epoch": 1.30233559358519, |
| "grad_norm": 0.5374879837036133, |
| "learning_rate": 0.0005224350824264626, |
| "loss": 3.7114, |
| "step": 12100 |
| }, |
| { |
| "epoch": 1.3077171456247982, |
| "grad_norm": 0.5722625255584717, |
| "learning_rate": 0.0005221118413964012, |
| "loss": 3.7367, |
| "step": 12150 |
| }, |
| { |
| "epoch": 1.3130986976644063, |
| "grad_norm": 0.589179277420044, |
| "learning_rate": 0.0005217886003663398, |
| "loss": 3.7241, |
| "step": 12200 |
| }, |
| { |
| "epoch": 1.3184802497040147, |
| "grad_norm": 0.6286568641662598, |
| "learning_rate": 0.0005214718241568796, |
| "loss": 3.7201, |
| "step": 12250 |
| }, |
| { |
| "epoch": 1.3238618017436228, |
| "grad_norm": 0.5651589035987854, |
| "learning_rate": 0.0005211485831268182, |
| "loss": 3.7257, |
| "step": 12300 |
| }, |
| { |
| "epoch": 1.329243353783231, |
| "grad_norm": 0.536525547504425, |
| "learning_rate": 0.0005208253420967568, |
| "loss": 3.7238, |
| "step": 12350 |
| }, |
| { |
| "epoch": 1.3346249058228392, |
| "grad_norm": 0.5890007615089417, |
| "learning_rate": 0.0005205021010666953, |
| "loss": 3.7209, |
| "step": 12400 |
| }, |
| { |
| "epoch": 1.3400064578624475, |
| "grad_norm": 0.5613293647766113, |
| "learning_rate": 0.0005201788600366339, |
| "loss": 3.7316, |
| "step": 12450 |
| }, |
| { |
| "epoch": 1.3453880099020559, |
| "grad_norm": 0.5462381839752197, |
| "learning_rate": 0.0005198556190065725, |
| "loss": 3.7214, |
| "step": 12500 |
| }, |
| { |
| "epoch": 1.350769561941664, |
| "grad_norm": 0.5214793682098389, |
| "learning_rate": 0.0005195323779765112, |
| "loss": 3.7354, |
| "step": 12550 |
| }, |
| { |
| "epoch": 1.356151113981272, |
| "grad_norm": 0.5193334817886353, |
| "learning_rate": 0.0005192091369464497, |
| "loss": 3.7275, |
| "step": 12600 |
| }, |
| { |
| "epoch": 1.3615326660208804, |
| "grad_norm": 0.535210132598877, |
| "learning_rate": 0.0005188858959163882, |
| "loss": 3.7249, |
| "step": 12650 |
| }, |
| { |
| "epoch": 1.3669142180604887, |
| "grad_norm": 0.5688884258270264, |
| "learning_rate": 0.0005185626548863269, |
| "loss": 3.702, |
| "step": 12700 |
| }, |
| { |
| "epoch": 1.3722957701000968, |
| "grad_norm": 0.5889831185340881, |
| "learning_rate": 0.0005182394138562654, |
| "loss": 3.7041, |
| "step": 12750 |
| }, |
| { |
| "epoch": 1.3776773221397052, |
| "grad_norm": 0.587273895740509, |
| "learning_rate": 0.0005179161728262041, |
| "loss": 3.7091, |
| "step": 12800 |
| }, |
| { |
| "epoch": 1.3830588741793133, |
| "grad_norm": 0.6570191979408264, |
| "learning_rate": 0.0005175929317961426, |
| "loss": 3.749, |
| "step": 12850 |
| }, |
| { |
| "epoch": 1.3884404262189216, |
| "grad_norm": 0.5788666605949402, |
| "learning_rate": 0.0005172696907660812, |
| "loss": 3.7098, |
| "step": 12900 |
| }, |
| { |
| "epoch": 1.3938219782585297, |
| "grad_norm": 0.6514964699745178, |
| "learning_rate": 0.0005169464497360198, |
| "loss": 3.7179, |
| "step": 12950 |
| }, |
| { |
| "epoch": 1.399203530298138, |
| "grad_norm": 0.5161887407302856, |
| "learning_rate": 0.0005166232087059583, |
| "loss": 3.7199, |
| "step": 13000 |
| }, |
| { |
| "epoch": 1.399203530298138, |
| "eval_accuracy": 0.35144993475937353, |
| "eval_loss": 3.6775624752044678, |
| "eval_runtime": 185.6521, |
| "eval_samples_per_second": 97.015, |
| "eval_steps_per_second": 6.065, |
| "step": 13000 |
| }, |
| { |
| "epoch": 1.4045850823377461, |
| "grad_norm": 0.586769163608551, |
| "learning_rate": 0.0005162999676758969, |
| "loss": 3.7145, |
| "step": 13050 |
| }, |
| { |
| "epoch": 1.4099666343773545, |
| "grad_norm": 0.5492085218429565, |
| "learning_rate": 0.0005159767266458355, |
| "loss": 3.7345, |
| "step": 13100 |
| }, |
| { |
| "epoch": 1.4153481864169626, |
| "grad_norm": 0.55711430311203, |
| "learning_rate": 0.0005156534856157741, |
| "loss": 3.7109, |
| "step": 13150 |
| }, |
| { |
| "epoch": 1.420729738456571, |
| "grad_norm": 0.5128664970397949, |
| "learning_rate": 0.0005153302445857127, |
| "loss": 3.7049, |
| "step": 13200 |
| }, |
| { |
| "epoch": 1.426111290496179, |
| "grad_norm": 0.618269681930542, |
| "learning_rate": 0.0005150070035556513, |
| "loss": 3.7022, |
| "step": 13250 |
| }, |
| { |
| "epoch": 1.4314928425357873, |
| "grad_norm": 0.5524218082427979, |
| "learning_rate": 0.0005146837625255898, |
| "loss": 3.6908, |
| "step": 13300 |
| }, |
| { |
| "epoch": 1.4368743945753955, |
| "grad_norm": 0.4995570182800293, |
| "learning_rate": 0.0005143605214955285, |
| "loss": 3.7093, |
| "step": 13350 |
| }, |
| { |
| "epoch": 1.4422559466150038, |
| "grad_norm": 0.5366711020469666, |
| "learning_rate": 0.0005140372804654671, |
| "loss": 3.7193, |
| "step": 13400 |
| }, |
| { |
| "epoch": 1.447637498654612, |
| "grad_norm": 0.6028699278831482, |
| "learning_rate": 0.0005137140394354056, |
| "loss": 3.7101, |
| "step": 13450 |
| }, |
| { |
| "epoch": 1.4530190506942202, |
| "grad_norm": 0.6548466682434082, |
| "learning_rate": 0.0005133907984053442, |
| "loss": 3.6972, |
| "step": 13500 |
| }, |
| { |
| "epoch": 1.4584006027338283, |
| "grad_norm": 0.5610542893409729, |
| "learning_rate": 0.0005130675573752827, |
| "loss": 3.7236, |
| "step": 13550 |
| }, |
| { |
| "epoch": 1.4637821547734367, |
| "grad_norm": 0.5922684669494629, |
| "learning_rate": 0.0005127443163452214, |
| "loss": 3.688, |
| "step": 13600 |
| }, |
| { |
| "epoch": 1.469163706813045, |
| "grad_norm": 0.5754106044769287, |
| "learning_rate": 0.00051242107531516, |
| "loss": 3.7083, |
| "step": 13650 |
| }, |
| { |
| "epoch": 1.474545258852653, |
| "grad_norm": 0.6285116672515869, |
| "learning_rate": 0.0005120978342850986, |
| "loss": 3.6849, |
| "step": 13700 |
| }, |
| { |
| "epoch": 1.4799268108922612, |
| "grad_norm": 0.5683203339576721, |
| "learning_rate": 0.0005117745932550371, |
| "loss": 3.7126, |
| "step": 13750 |
| }, |
| { |
| "epoch": 1.4853083629318695, |
| "grad_norm": 0.5266847014427185, |
| "learning_rate": 0.0005114513522249758, |
| "loss": 3.7145, |
| "step": 13800 |
| }, |
| { |
| "epoch": 1.4906899149714778, |
| "grad_norm": 0.5902765393257141, |
| "learning_rate": 0.0005111281111949143, |
| "loss": 3.691, |
| "step": 13850 |
| }, |
| { |
| "epoch": 1.496071467011086, |
| "grad_norm": 0.6617315411567688, |
| "learning_rate": 0.0005108048701648528, |
| "loss": 3.7024, |
| "step": 13900 |
| }, |
| { |
| "epoch": 1.501453019050694, |
| "grad_norm": 0.6652734279632568, |
| "learning_rate": 0.0005104816291347915, |
| "loss": 3.7146, |
| "step": 13950 |
| }, |
| { |
| "epoch": 1.5068345710903024, |
| "grad_norm": 0.5432553291320801, |
| "learning_rate": 0.00051015838810473, |
| "loss": 3.6898, |
| "step": 14000 |
| }, |
| { |
| "epoch": 1.5068345710903024, |
| "eval_accuracy": 0.3533115933391868, |
| "eval_loss": 3.6564745903015137, |
| "eval_runtime": 185.4642, |
| "eval_samples_per_second": 97.113, |
| "eval_steps_per_second": 6.071, |
| "step": 14000 |
| }, |
| { |
| "epoch": 1.5122161231299107, |
| "grad_norm": 0.5797891616821289, |
| "learning_rate": 0.0005098351470746687, |
| "loss": 3.6824, |
| "step": 14050 |
| }, |
| { |
| "epoch": 1.5175976751695188, |
| "grad_norm": 0.5523927211761475, |
| "learning_rate": 0.0005095119060446072, |
| "loss": 3.6874, |
| "step": 14100 |
| }, |
| { |
| "epoch": 1.5229792272091272, |
| "grad_norm": 0.505785346031189, |
| "learning_rate": 0.0005091886650145458, |
| "loss": 3.703, |
| "step": 14150 |
| }, |
| { |
| "epoch": 1.5283607792487355, |
| "grad_norm": 0.5742488503456116, |
| "learning_rate": 0.0005088654239844844, |
| "loss": 3.7047, |
| "step": 14200 |
| }, |
| { |
| "epoch": 1.5337423312883436, |
| "grad_norm": 0.5759657621383667, |
| "learning_rate": 0.0005085421829544229, |
| "loss": 3.6823, |
| "step": 14250 |
| }, |
| { |
| "epoch": 1.5391238833279517, |
| "grad_norm": 0.5873520374298096, |
| "learning_rate": 0.0005082254067449629, |
| "loss": 3.6973, |
| "step": 14300 |
| }, |
| { |
| "epoch": 1.54450543536756, |
| "grad_norm": 0.5512503981590271, |
| "learning_rate": 0.0005079021657149014, |
| "loss": 3.6917, |
| "step": 14350 |
| }, |
| { |
| "epoch": 1.5498869874071683, |
| "grad_norm": 0.5719089508056641, |
| "learning_rate": 0.0005075789246848399, |
| "loss": 3.7047, |
| "step": 14400 |
| }, |
| { |
| "epoch": 1.5552685394467765, |
| "grad_norm": 0.5563808679580688, |
| "learning_rate": 0.0005072556836547785, |
| "loss": 3.6888, |
| "step": 14450 |
| }, |
| { |
| "epoch": 1.5606500914863846, |
| "grad_norm": 0.6009058952331543, |
| "learning_rate": 0.0005069324426247171, |
| "loss": 3.6743, |
| "step": 14500 |
| }, |
| { |
| "epoch": 1.566031643525993, |
| "grad_norm": 0.5218216180801392, |
| "learning_rate": 0.0005066092015946557, |
| "loss": 3.6711, |
| "step": 14550 |
| }, |
| { |
| "epoch": 1.5714131955656012, |
| "grad_norm": 0.5436237454414368, |
| "learning_rate": 0.0005062859605645943, |
| "loss": 3.6818, |
| "step": 14600 |
| }, |
| { |
| "epoch": 1.5767947476052093, |
| "grad_norm": 0.5251820683479309, |
| "learning_rate": 0.0005059627195345329, |
| "loss": 3.6783, |
| "step": 14650 |
| }, |
| { |
| "epoch": 1.5821762996448174, |
| "grad_norm": 0.5401293635368347, |
| "learning_rate": 0.0005056394785044715, |
| "loss": 3.6812, |
| "step": 14700 |
| }, |
| { |
| "epoch": 1.5875578516844258, |
| "grad_norm": 0.5772663950920105, |
| "learning_rate": 0.00050531623747441, |
| "loss": 3.6836, |
| "step": 14750 |
| }, |
| { |
| "epoch": 1.592939403724034, |
| "grad_norm": 0.5626757740974426, |
| "learning_rate": 0.0005049929964443486, |
| "loss": 3.6798, |
| "step": 14800 |
| }, |
| { |
| "epoch": 1.5983209557636422, |
| "grad_norm": 0.5350626707077026, |
| "learning_rate": 0.0005046697554142871, |
| "loss": 3.685, |
| "step": 14850 |
| }, |
| { |
| "epoch": 1.6037025078032503, |
| "grad_norm": 0.5198737382888794, |
| "learning_rate": 0.0005043465143842258, |
| "loss": 3.6869, |
| "step": 14900 |
| }, |
| { |
| "epoch": 1.6090840598428586, |
| "grad_norm": 0.5422475934028625, |
| "learning_rate": 0.0005040232733541644, |
| "loss": 3.6807, |
| "step": 14950 |
| }, |
| { |
| "epoch": 1.614465611882467, |
| "grad_norm": 0.5870875120162964, |
| "learning_rate": 0.000503700032324103, |
| "loss": 3.6861, |
| "step": 15000 |
| }, |
| { |
| "epoch": 1.614465611882467, |
| "eval_accuracy": 0.3554053344873719, |
| "eval_loss": 3.636244058609009, |
| "eval_runtime": 185.6793, |
| "eval_samples_per_second": 97.001, |
| "eval_steps_per_second": 6.064, |
| "step": 15000 |
| }, |
| { |
| "epoch": 1.619847163922075, |
| "grad_norm": 0.54155033826828, |
| "learning_rate": 0.0005033767912940415, |
| "loss": 3.6776, |
| "step": 15050 |
| }, |
| { |
| "epoch": 1.6252287159616834, |
| "grad_norm": 0.5892346501350403, |
| "learning_rate": 0.0005030535502639802, |
| "loss": 3.6845, |
| "step": 15100 |
| }, |
| { |
| "epoch": 1.6306102680012917, |
| "grad_norm": 0.5343015789985657, |
| "learning_rate": 0.0005027303092339187, |
| "loss": 3.6907, |
| "step": 15150 |
| }, |
| { |
| "epoch": 1.6359918200408998, |
| "grad_norm": 0.5575476288795471, |
| "learning_rate": 0.0005024070682038573, |
| "loss": 3.6723, |
| "step": 15200 |
| }, |
| { |
| "epoch": 1.641373372080508, |
| "grad_norm": 0.5770471692085266, |
| "learning_rate": 0.0005020838271737959, |
| "loss": 3.6807, |
| "step": 15250 |
| }, |
| { |
| "epoch": 1.6467549241201163, |
| "grad_norm": 0.582455039024353, |
| "learning_rate": 0.0005017605861437344, |
| "loss": 3.6737, |
| "step": 15300 |
| }, |
| { |
| "epoch": 1.6521364761597246, |
| "grad_norm": 0.5098881721496582, |
| "learning_rate": 0.0005014373451136731, |
| "loss": 3.6739, |
| "step": 15350 |
| }, |
| { |
| "epoch": 1.6575180281993327, |
| "grad_norm": 0.5664951205253601, |
| "learning_rate": 0.0005011141040836116, |
| "loss": 3.6853, |
| "step": 15400 |
| }, |
| { |
| "epoch": 1.6628995802389408, |
| "grad_norm": 0.5914148092269897, |
| "learning_rate": 0.0005007908630535503, |
| "loss": 3.6916, |
| "step": 15450 |
| }, |
| { |
| "epoch": 1.6682811322785491, |
| "grad_norm": 0.5618295669555664, |
| "learning_rate": 0.0005004676220234888, |
| "loss": 3.6552, |
| "step": 15500 |
| }, |
| { |
| "epoch": 1.6736626843181575, |
| "grad_norm": 0.5628893971443176, |
| "learning_rate": 0.0005001443809934273, |
| "loss": 3.6842, |
| "step": 15550 |
| }, |
| { |
| "epoch": 1.6790442363577656, |
| "grad_norm": 0.5461952090263367, |
| "learning_rate": 0.000499821139963366, |
| "loss": 3.6547, |
| "step": 15600 |
| }, |
| { |
| "epoch": 1.6844257883973737, |
| "grad_norm": 0.504944920539856, |
| "learning_rate": 0.0004994978989333045, |
| "loss": 3.6782, |
| "step": 15650 |
| }, |
| { |
| "epoch": 1.689807340436982, |
| "grad_norm": 0.5063119530677795, |
| "learning_rate": 0.0004991746579032431, |
| "loss": 3.682, |
| "step": 15700 |
| }, |
| { |
| "epoch": 1.6951888924765903, |
| "grad_norm": 0.5550683736801147, |
| "learning_rate": 0.0004988514168731817, |
| "loss": 3.6713, |
| "step": 15750 |
| }, |
| { |
| "epoch": 1.7005704445161984, |
| "grad_norm": 0.7855005860328674, |
| "learning_rate": 0.0004985281758431204, |
| "loss": 3.6689, |
| "step": 15800 |
| }, |
| { |
| "epoch": 1.7059519965558065, |
| "grad_norm": 0.5781165361404419, |
| "learning_rate": 0.0004982049348130589, |
| "loss": 3.6637, |
| "step": 15850 |
| }, |
| { |
| "epoch": 1.7113335485954149, |
| "grad_norm": 0.5900784134864807, |
| "learning_rate": 0.0004978816937829975, |
| "loss": 3.6821, |
| "step": 15900 |
| }, |
| { |
| "epoch": 1.7167151006350232, |
| "grad_norm": 0.5913995504379272, |
| "learning_rate": 0.000497558452752936, |
| "loss": 3.6791, |
| "step": 15950 |
| }, |
| { |
| "epoch": 1.7220966526746313, |
| "grad_norm": 0.5486482977867126, |
| "learning_rate": 0.0004972352117228746, |
| "loss": 3.6454, |
| "step": 16000 |
| }, |
| { |
| "epoch": 1.7220966526746313, |
| "eval_accuracy": 0.35726449405076544, |
| "eval_loss": 3.6199259757995605, |
| "eval_runtime": 185.6867, |
| "eval_samples_per_second": 96.997, |
| "eval_steps_per_second": 6.064, |
| "step": 16000 |
| }, |
| { |
| "epoch": 1.7274782047142396, |
| "grad_norm": 0.5616929531097412, |
| "learning_rate": 0.0004969119706928133, |
| "loss": 3.6571, |
| "step": 16050 |
| }, |
| { |
| "epoch": 1.732859756753848, |
| "grad_norm": 0.5681639313697815, |
| "learning_rate": 0.0004965887296627518, |
| "loss": 3.6742, |
| "step": 16100 |
| }, |
| { |
| "epoch": 1.738241308793456, |
| "grad_norm": 0.5359152555465698, |
| "learning_rate": 0.0004962654886326904, |
| "loss": 3.6848, |
| "step": 16150 |
| }, |
| { |
| "epoch": 1.7436228608330642, |
| "grad_norm": 0.5574436187744141, |
| "learning_rate": 0.000495942247602629, |
| "loss": 3.6787, |
| "step": 16200 |
| }, |
| { |
| "epoch": 1.7490044128726725, |
| "grad_norm": 0.5218374729156494, |
| "learning_rate": 0.0004956190065725676, |
| "loss": 3.6511, |
| "step": 16250 |
| }, |
| { |
| "epoch": 1.7543859649122808, |
| "grad_norm": 0.5530843734741211, |
| "learning_rate": 0.0004952957655425062, |
| "loss": 3.6587, |
| "step": 16300 |
| }, |
| { |
| "epoch": 1.759767516951889, |
| "grad_norm": 0.5670496821403503, |
| "learning_rate": 0.0004949725245124448, |
| "loss": 3.674, |
| "step": 16350 |
| }, |
| { |
| "epoch": 1.765149068991497, |
| "grad_norm": 0.5473534464836121, |
| "learning_rate": 0.0004946492834823833, |
| "loss": 3.6532, |
| "step": 16400 |
| }, |
| { |
| "epoch": 1.7705306210311054, |
| "grad_norm": 0.5115002989768982, |
| "learning_rate": 0.0004943260424523219, |
| "loss": 3.6579, |
| "step": 16450 |
| }, |
| { |
| "epoch": 1.7759121730707137, |
| "grad_norm": 0.5874372124671936, |
| "learning_rate": 0.0004940028014222605, |
| "loss": 3.6689, |
| "step": 16500 |
| }, |
| { |
| "epoch": 1.7812937251103218, |
| "grad_norm": 0.5768347382545471, |
| "learning_rate": 0.0004936860252128003, |
| "loss": 3.6443, |
| "step": 16550 |
| }, |
| { |
| "epoch": 1.78667527714993, |
| "grad_norm": 0.5852294564247131, |
| "learning_rate": 0.0004933627841827388, |
| "loss": 3.6585, |
| "step": 16600 |
| }, |
| { |
| "epoch": 1.7920568291895382, |
| "grad_norm": 0.5869889259338379, |
| "learning_rate": 0.0004930395431526775, |
| "loss": 3.6664, |
| "step": 16650 |
| }, |
| { |
| "epoch": 1.7974383812291466, |
| "grad_norm": 0.5510962605476379, |
| "learning_rate": 0.0004927163021226161, |
| "loss": 3.6523, |
| "step": 16700 |
| }, |
| { |
| "epoch": 1.8028199332687547, |
| "grad_norm": 0.5650386810302734, |
| "learning_rate": 0.0004923930610925547, |
| "loss": 3.661, |
| "step": 16750 |
| }, |
| { |
| "epoch": 1.8082014853083628, |
| "grad_norm": 0.5477886199951172, |
| "learning_rate": 0.0004920698200624932, |
| "loss": 3.6555, |
| "step": 16800 |
| }, |
| { |
| "epoch": 1.813583037347971, |
| "grad_norm": 0.5766158103942871, |
| "learning_rate": 0.0004917465790324317, |
| "loss": 3.6397, |
| "step": 16850 |
| }, |
| { |
| "epoch": 1.8189645893875794, |
| "grad_norm": 0.553198516368866, |
| "learning_rate": 0.0004914233380023704, |
| "loss": 3.6534, |
| "step": 16900 |
| }, |
| { |
| "epoch": 1.8243461414271875, |
| "grad_norm": 0.5633429288864136, |
| "learning_rate": 0.0004911000969723089, |
| "loss": 3.6505, |
| "step": 16950 |
| }, |
| { |
| "epoch": 1.8297276934667959, |
| "grad_norm": 0.5467578768730164, |
| "learning_rate": 0.0004907768559422476, |
| "loss": 3.6619, |
| "step": 17000 |
| }, |
| { |
| "epoch": 1.8297276934667959, |
| "eval_accuracy": 0.3587219638879089, |
| "eval_loss": 3.6078338623046875, |
| "eval_runtime": 185.4865, |
| "eval_samples_per_second": 97.101, |
| "eval_steps_per_second": 6.071, |
| "step": 17000 |
| }, |
| { |
| "epoch": 1.8351092455064042, |
| "grad_norm": 0.565685510635376, |
| "learning_rate": 0.0004904536149121861, |
| "loss": 3.6553, |
| "step": 17050 |
| }, |
| { |
| "epoch": 1.8404907975460123, |
| "grad_norm": 0.5468007922172546, |
| "learning_rate": 0.0004901303738821248, |
| "loss": 3.6446, |
| "step": 17100 |
| }, |
| { |
| "epoch": 1.8458723495856204, |
| "grad_norm": 0.5537818074226379, |
| "learning_rate": 0.0004898071328520633, |
| "loss": 3.6513, |
| "step": 17150 |
| }, |
| { |
| "epoch": 1.8512539016252287, |
| "grad_norm": 0.5450910925865173, |
| "learning_rate": 0.0004894838918220019, |
| "loss": 3.6363, |
| "step": 17200 |
| }, |
| { |
| "epoch": 1.856635453664837, |
| "grad_norm": 0.5500758290290833, |
| "learning_rate": 0.0004891606507919405, |
| "loss": 3.6319, |
| "step": 17250 |
| }, |
| { |
| "epoch": 1.8620170057044452, |
| "grad_norm": 0.5199875235557556, |
| "learning_rate": 0.000488837409761879, |
| "loss": 3.6382, |
| "step": 17300 |
| }, |
| { |
| "epoch": 1.8673985577440533, |
| "grad_norm": 0.5702790021896362, |
| "learning_rate": 0.0004885141687318177, |
| "loss": 3.647, |
| "step": 17350 |
| }, |
| { |
| "epoch": 1.8727801097836616, |
| "grad_norm": 0.5479942560195923, |
| "learning_rate": 0.00048819092770175623, |
| "loss": 3.6346, |
| "step": 17400 |
| }, |
| { |
| "epoch": 1.87816166182327, |
| "grad_norm": 0.5453149080276489, |
| "learning_rate": 0.0004878676866716948, |
| "loss": 3.6345, |
| "step": 17450 |
| }, |
| { |
| "epoch": 1.883543213862878, |
| "grad_norm": 0.5539295673370361, |
| "learning_rate": 0.00048754444564163337, |
| "loss": 3.6431, |
| "step": 17500 |
| }, |
| { |
| "epoch": 1.8889247659024861, |
| "grad_norm": 0.5381399393081665, |
| "learning_rate": 0.000487221204611572, |
| "loss": 3.6471, |
| "step": 17550 |
| }, |
| { |
| "epoch": 1.8943063179420945, |
| "grad_norm": 0.4991917312145233, |
| "learning_rate": 0.00048689796358151056, |
| "loss": 3.6445, |
| "step": 17600 |
| }, |
| { |
| "epoch": 1.8996878699817028, |
| "grad_norm": 0.556541383266449, |
| "learning_rate": 0.00048657472255144915, |
| "loss": 3.6528, |
| "step": 17650 |
| }, |
| { |
| "epoch": 1.905069422021311, |
| "grad_norm": 0.5819994807243347, |
| "learning_rate": 0.00048625148152138775, |
| "loss": 3.6255, |
| "step": 17700 |
| }, |
| { |
| "epoch": 1.910450974060919, |
| "grad_norm": 0.6031482219696045, |
| "learning_rate": 0.0004859282404913263, |
| "loss": 3.6435, |
| "step": 17750 |
| }, |
| { |
| "epoch": 1.9158325261005273, |
| "grad_norm": 0.5360264182090759, |
| "learning_rate": 0.0004856049994612649, |
| "loss": 3.641, |
| "step": 17800 |
| }, |
| { |
| "epoch": 1.9212140781401357, |
| "grad_norm": 0.6075764298439026, |
| "learning_rate": 0.00048528175843120353, |
| "loss": 3.6423, |
| "step": 17850 |
| }, |
| { |
| "epoch": 1.9265956301797438, |
| "grad_norm": 0.5733447670936584, |
| "learning_rate": 0.0004849585174011421, |
| "loss": 3.6544, |
| "step": 17900 |
| }, |
| { |
| "epoch": 1.931977182219352, |
| "grad_norm": 0.5679136514663696, |
| "learning_rate": 0.00048463527637108067, |
| "loss": 3.6382, |
| "step": 17950 |
| }, |
| { |
| "epoch": 1.9373587342589604, |
| "grad_norm": 0.590400218963623, |
| "learning_rate": 0.0004843120353410192, |
| "loss": 3.645, |
| "step": 18000 |
| }, |
| { |
| "epoch": 1.9373587342589604, |
| "eval_accuracy": 0.3596518153018252, |
| "eval_loss": 3.5906753540039062, |
| "eval_runtime": 185.6395, |
| "eval_samples_per_second": 97.021, |
| "eval_steps_per_second": 6.066, |
| "step": 18000 |
| }, |
| { |
| "epoch": 1.9427402862985685, |
| "grad_norm": 0.554408073425293, |
| "learning_rate": 0.0004839887943109578, |
| "loss": 3.6355, |
| "step": 18050 |
| }, |
| { |
| "epoch": 1.9481218383381766, |
| "grad_norm": 0.6150152087211609, |
| "learning_rate": 0.00048366555328089645, |
| "loss": 3.6376, |
| "step": 18100 |
| }, |
| { |
| "epoch": 1.953503390377785, |
| "grad_norm": 0.5678287744522095, |
| "learning_rate": 0.000483342312250835, |
| "loss": 3.6329, |
| "step": 18150 |
| }, |
| { |
| "epoch": 1.9588849424173933, |
| "grad_norm": 0.6119444370269775, |
| "learning_rate": 0.0004830190712207736, |
| "loss": 3.6275, |
| "step": 18200 |
| }, |
| { |
| "epoch": 1.9642664944570014, |
| "grad_norm": 0.5304839611053467, |
| "learning_rate": 0.0004826958301907122, |
| "loss": 3.6432, |
| "step": 18250 |
| }, |
| { |
| "epoch": 1.9696480464966095, |
| "grad_norm": 0.5501518845558167, |
| "learning_rate": 0.0004823725891606507, |
| "loss": 3.6158, |
| "step": 18300 |
| }, |
| { |
| "epoch": 1.9750295985362178, |
| "grad_norm": 0.579287588596344, |
| "learning_rate": 0.0004820493481305893, |
| "loss": 3.6095, |
| "step": 18350 |
| }, |
| { |
| "epoch": 1.9804111505758262, |
| "grad_norm": 0.5825153589248657, |
| "learning_rate": 0.00048172610710052797, |
| "loss": 3.6458, |
| "step": 18400 |
| }, |
| { |
| "epoch": 1.9857927026154343, |
| "grad_norm": 0.53499835729599, |
| "learning_rate": 0.0004814028660704665, |
| "loss": 3.6129, |
| "step": 18450 |
| }, |
| { |
| "epoch": 1.9911742546550424, |
| "grad_norm": 0.536543607711792, |
| "learning_rate": 0.0004810796250404051, |
| "loss": 3.6291, |
| "step": 18500 |
| }, |
| { |
| "epoch": 1.9965558066946507, |
| "grad_norm": 0.5926487445831299, |
| "learning_rate": 0.0004807628488309449, |
| "loss": 3.6218, |
| "step": 18550 |
| }, |
| { |
| "epoch": 2.001937358734259, |
| "grad_norm": 0.5897657871246338, |
| "learning_rate": 0.0004804396078008835, |
| "loss": 3.5963, |
| "step": 18600 |
| }, |
| { |
| "epoch": 2.007318910773867, |
| "grad_norm": 0.5554097890853882, |
| "learning_rate": 0.00048011636677082204, |
| "loss": 3.5297, |
| "step": 18650 |
| }, |
| { |
| "epoch": 2.0127004628134753, |
| "grad_norm": 0.5472369194030762, |
| "learning_rate": 0.00047979312574076064, |
| "loss": 3.5556, |
| "step": 18700 |
| }, |
| { |
| "epoch": 2.018082014853084, |
| "grad_norm": 0.5412234663963318, |
| "learning_rate": 0.0004794698847106992, |
| "loss": 3.5443, |
| "step": 18750 |
| }, |
| { |
| "epoch": 2.023463566892692, |
| "grad_norm": 0.5668666958808899, |
| "learning_rate": 0.0004791466436806378, |
| "loss": 3.5528, |
| "step": 18800 |
| }, |
| { |
| "epoch": 2.0288451189323, |
| "grad_norm": 0.6467781662940979, |
| "learning_rate": 0.0004788234026505764, |
| "loss": 3.5488, |
| "step": 18850 |
| }, |
| { |
| "epoch": 2.034226670971908, |
| "grad_norm": 0.600711464881897, |
| "learning_rate": 0.00047850016162051496, |
| "loss": 3.5408, |
| "step": 18900 |
| }, |
| { |
| "epoch": 2.0396082230115167, |
| "grad_norm": 0.5562477111816406, |
| "learning_rate": 0.00047817692059045356, |
| "loss": 3.5364, |
| "step": 18950 |
| }, |
| { |
| "epoch": 2.044989775051125, |
| "grad_norm": 0.5936967134475708, |
| "learning_rate": 0.00047785367956039215, |
| "loss": 3.5353, |
| "step": 19000 |
| }, |
| { |
| "epoch": 2.044989775051125, |
| "eval_accuracy": 0.3616070240180468, |
| "eval_loss": 3.5791800022125244, |
| "eval_runtime": 185.4596, |
| "eval_samples_per_second": 97.115, |
| "eval_steps_per_second": 6.071, |
| "step": 19000 |
| }, |
| { |
| "epoch": 2.050371327090733, |
| "grad_norm": 0.6128162145614624, |
| "learning_rate": 0.00047753043853033075, |
| "loss": 3.5707, |
| "step": 19050 |
| }, |
| { |
| "epoch": 2.055752879130341, |
| "grad_norm": 0.521029531955719, |
| "learning_rate": 0.00047720719750026934, |
| "loss": 3.5599, |
| "step": 19100 |
| }, |
| { |
| "epoch": 2.0611344311699495, |
| "grad_norm": 0.5703673958778381, |
| "learning_rate": 0.00047688395647020793, |
| "loss": 3.5541, |
| "step": 19150 |
| }, |
| { |
| "epoch": 2.0665159832095576, |
| "grad_norm": 0.6372736692428589, |
| "learning_rate": 0.0004765607154401465, |
| "loss": 3.5518, |
| "step": 19200 |
| }, |
| { |
| "epoch": 2.0718975352491658, |
| "grad_norm": 0.6135616302490234, |
| "learning_rate": 0.00047623747441008507, |
| "loss": 3.5705, |
| "step": 19250 |
| }, |
| { |
| "epoch": 2.0772790872887743, |
| "grad_norm": 0.5645393133163452, |
| "learning_rate": 0.0004759142333800236, |
| "loss": 3.5492, |
| "step": 19300 |
| }, |
| { |
| "epoch": 2.0826606393283824, |
| "grad_norm": 0.6214520335197449, |
| "learning_rate": 0.00047559099234996226, |
| "loss": 3.568, |
| "step": 19350 |
| }, |
| { |
| "epoch": 2.0880421913679905, |
| "grad_norm": 0.6217697262763977, |
| "learning_rate": 0.00047526775131990085, |
| "loss": 3.5728, |
| "step": 19400 |
| }, |
| { |
| "epoch": 2.0934237434075986, |
| "grad_norm": 0.641295850276947, |
| "learning_rate": 0.0004749445102898394, |
| "loss": 3.5796, |
| "step": 19450 |
| }, |
| { |
| "epoch": 2.098805295447207, |
| "grad_norm": 0.6296442747116089, |
| "learning_rate": 0.000474621269259778, |
| "loss": 3.5503, |
| "step": 19500 |
| }, |
| { |
| "epoch": 2.1041868474868153, |
| "grad_norm": 0.5382377505302429, |
| "learning_rate": 0.0004742980282297166, |
| "loss": 3.5469, |
| "step": 19550 |
| }, |
| { |
| "epoch": 2.1095683995264234, |
| "grad_norm": 0.5474888682365417, |
| "learning_rate": 0.0004739747871996551, |
| "loss": 3.5498, |
| "step": 19600 |
| }, |
| { |
| "epoch": 2.1149499515660315, |
| "grad_norm": 0.5835118889808655, |
| "learning_rate": 0.00047365154616959377, |
| "loss": 3.55, |
| "step": 19650 |
| }, |
| { |
| "epoch": 2.12033150360564, |
| "grad_norm": 0.5570899248123169, |
| "learning_rate": 0.00047332830513953237, |
| "loss": 3.5411, |
| "step": 19700 |
| }, |
| { |
| "epoch": 2.125713055645248, |
| "grad_norm": 0.5999816656112671, |
| "learning_rate": 0.0004730050641094709, |
| "loss": 3.5559, |
| "step": 19750 |
| }, |
| { |
| "epoch": 2.1310946076848563, |
| "grad_norm": 0.5363185405731201, |
| "learning_rate": 0.0004726818230794095, |
| "loss": 3.5604, |
| "step": 19800 |
| }, |
| { |
| "epoch": 2.1364761597244644, |
| "grad_norm": 0.5836130976676941, |
| "learning_rate": 0.00047235858204934804, |
| "loss": 3.5618, |
| "step": 19850 |
| }, |
| { |
| "epoch": 2.141857711764073, |
| "grad_norm": 0.6064532995223999, |
| "learning_rate": 0.0004720353410192867, |
| "loss": 3.5501, |
| "step": 19900 |
| }, |
| { |
| "epoch": 2.147239263803681, |
| "grad_norm": 0.5889922976493835, |
| "learning_rate": 0.0004717120999892253, |
| "loss": 3.5502, |
| "step": 19950 |
| }, |
| { |
| "epoch": 2.152620815843289, |
| "grad_norm": 0.5489098429679871, |
| "learning_rate": 0.0004713888589591638, |
| "loss": 3.5718, |
| "step": 20000 |
| }, |
| { |
| "epoch": 2.152620815843289, |
| "eval_accuracy": 0.3625153621601759, |
| "eval_loss": 3.5724239349365234, |
| "eval_runtime": 185.832, |
| "eval_samples_per_second": 96.921, |
| "eval_steps_per_second": 6.059, |
| "step": 20000 |
| }, |
| { |
| "epoch": 2.1580023678828972, |
| "grad_norm": 0.5338764786720276, |
| "learning_rate": 0.0004710656179291024, |
| "loss": 3.5719, |
| "step": 20050 |
| }, |
| { |
| "epoch": 2.163383919922506, |
| "grad_norm": 0.6207489371299744, |
| "learning_rate": 0.000470742376899041, |
| "loss": 3.5627, |
| "step": 20100 |
| }, |
| { |
| "epoch": 2.168765471962114, |
| "grad_norm": 0.5809758901596069, |
| "learning_rate": 0.00047041913586897956, |
| "loss": 3.5313, |
| "step": 20150 |
| }, |
| { |
| "epoch": 2.174147024001722, |
| "grad_norm": 0.591349720954895, |
| "learning_rate": 0.0004700958948389182, |
| "loss": 3.5604, |
| "step": 20200 |
| }, |
| { |
| "epoch": 2.1795285760413305, |
| "grad_norm": 0.6110274195671082, |
| "learning_rate": 0.0004697726538088568, |
| "loss": 3.5473, |
| "step": 20250 |
| }, |
| { |
| "epoch": 2.1849101280809387, |
| "grad_norm": 0.5402040481567383, |
| "learning_rate": 0.00046944941277879534, |
| "loss": 3.5689, |
| "step": 20300 |
| }, |
| { |
| "epoch": 2.1902916801205468, |
| "grad_norm": 0.5570117235183716, |
| "learning_rate": 0.00046912617174873394, |
| "loss": 3.5731, |
| "step": 20350 |
| }, |
| { |
| "epoch": 2.195673232160155, |
| "grad_norm": 0.5699936151504517, |
| "learning_rate": 0.0004688029307186725, |
| "loss": 3.5533, |
| "step": 20400 |
| }, |
| { |
| "epoch": 2.2010547841997634, |
| "grad_norm": 0.5406856536865234, |
| "learning_rate": 0.00046847968968861107, |
| "loss": 3.5658, |
| "step": 20450 |
| }, |
| { |
| "epoch": 2.2064363362393715, |
| "grad_norm": 0.5034304857254028, |
| "learning_rate": 0.0004681564486585497, |
| "loss": 3.5457, |
| "step": 20500 |
| }, |
| { |
| "epoch": 2.2118178882789796, |
| "grad_norm": 0.5645242929458618, |
| "learning_rate": 0.00046783320762848826, |
| "loss": 3.5725, |
| "step": 20550 |
| }, |
| { |
| "epoch": 2.2171994403185877, |
| "grad_norm": 0.5770477652549744, |
| "learning_rate": 0.00046750996659842685, |
| "loss": 3.5631, |
| "step": 20600 |
| }, |
| { |
| "epoch": 2.2225809923581963, |
| "grad_norm": 0.5933657288551331, |
| "learning_rate": 0.00046719319038896666, |
| "loss": 3.5596, |
| "step": 20650 |
| }, |
| { |
| "epoch": 2.2279625443978044, |
| "grad_norm": 0.5493210554122925, |
| "learning_rate": 0.00046686994935890526, |
| "loss": 3.5763, |
| "step": 20700 |
| }, |
| { |
| "epoch": 2.2333440964374125, |
| "grad_norm": 0.5703498125076294, |
| "learning_rate": 0.0004665467083288438, |
| "loss": 3.5525, |
| "step": 20750 |
| }, |
| { |
| "epoch": 2.2387256484770206, |
| "grad_norm": 0.5591681599617004, |
| "learning_rate": 0.0004662234672987824, |
| "loss": 3.5664, |
| "step": 20800 |
| }, |
| { |
| "epoch": 2.244107200516629, |
| "grad_norm": 0.5998018980026245, |
| "learning_rate": 0.00046590022626872104, |
| "loss": 3.5644, |
| "step": 20850 |
| }, |
| { |
| "epoch": 2.2494887525562373, |
| "grad_norm": 0.5582026839256287, |
| "learning_rate": 0.0004655769852386596, |
| "loss": 3.5478, |
| "step": 20900 |
| }, |
| { |
| "epoch": 2.2548703045958454, |
| "grad_norm": 0.5756790637969971, |
| "learning_rate": 0.0004652537442085982, |
| "loss": 3.5535, |
| "step": 20950 |
| }, |
| { |
| "epoch": 2.2602518566354535, |
| "grad_norm": 0.5503938794136047, |
| "learning_rate": 0.00046493050317853677, |
| "loss": 3.5333, |
| "step": 21000 |
| }, |
| { |
| "epoch": 2.2602518566354535, |
| "eval_accuracy": 0.36392198244582213, |
| "eval_loss": 3.560683012008667, |
| "eval_runtime": 185.4032, |
| "eval_samples_per_second": 97.145, |
| "eval_steps_per_second": 6.073, |
| "step": 21000 |
| }, |
| { |
| "epoch": 2.265633408675062, |
| "grad_norm": 0.6041116714477539, |
| "learning_rate": 0.0004646072621484753, |
| "loss": 3.5622, |
| "step": 21050 |
| }, |
| { |
| "epoch": 2.27101496071467, |
| "grad_norm": 0.5796629190444946, |
| "learning_rate": 0.0004642840211184139, |
| "loss": 3.5563, |
| "step": 21100 |
| }, |
| { |
| "epoch": 2.2763965127542782, |
| "grad_norm": 0.5678168535232544, |
| "learning_rate": 0.00046396078008835255, |
| "loss": 3.5505, |
| "step": 21150 |
| }, |
| { |
| "epoch": 2.281778064793887, |
| "grad_norm": 0.5537850856781006, |
| "learning_rate": 0.0004636375390582911, |
| "loss": 3.5608, |
| "step": 21200 |
| }, |
| { |
| "epoch": 2.287159616833495, |
| "grad_norm": 0.6114761829376221, |
| "learning_rate": 0.0004633142980282297, |
| "loss": 3.561, |
| "step": 21250 |
| }, |
| { |
| "epoch": 2.292541168873103, |
| "grad_norm": 0.5960226655006409, |
| "learning_rate": 0.00046299105699816823, |
| "loss": 3.5457, |
| "step": 21300 |
| }, |
| { |
| "epoch": 2.297922720912711, |
| "grad_norm": 0.6092424988746643, |
| "learning_rate": 0.0004626678159681068, |
| "loss": 3.5531, |
| "step": 21350 |
| }, |
| { |
| "epoch": 2.303304272952319, |
| "grad_norm": 0.5451756119728088, |
| "learning_rate": 0.0004623445749380454, |
| "loss": 3.5542, |
| "step": 21400 |
| }, |
| { |
| "epoch": 2.3086858249919278, |
| "grad_norm": 0.5567100644111633, |
| "learning_rate": 0.000462021333907984, |
| "loss": 3.5407, |
| "step": 21450 |
| }, |
| { |
| "epoch": 2.314067377031536, |
| "grad_norm": 0.5734255313873291, |
| "learning_rate": 0.0004616980928779226, |
| "loss": 3.5512, |
| "step": 21500 |
| }, |
| { |
| "epoch": 2.319448929071144, |
| "grad_norm": 0.6125470399856567, |
| "learning_rate": 0.0004613748518478612, |
| "loss": 3.5699, |
| "step": 21550 |
| }, |
| { |
| "epoch": 2.3248304811107525, |
| "grad_norm": 0.5693233609199524, |
| "learning_rate": 0.00046105161081779974, |
| "loss": 3.5476, |
| "step": 21600 |
| }, |
| { |
| "epoch": 2.3302120331503606, |
| "grad_norm": 0.5511974096298218, |
| "learning_rate": 0.00046072836978773834, |
| "loss": 3.5453, |
| "step": 21650 |
| }, |
| { |
| "epoch": 2.3355935851899687, |
| "grad_norm": 0.5851377844810486, |
| "learning_rate": 0.000460405128757677, |
| "loss": 3.5708, |
| "step": 21700 |
| }, |
| { |
| "epoch": 2.340975137229577, |
| "grad_norm": 0.6215223670005798, |
| "learning_rate": 0.0004600818877276155, |
| "loss": 3.5613, |
| "step": 21750 |
| }, |
| { |
| "epoch": 2.3463566892691854, |
| "grad_norm": 0.5615935921669006, |
| "learning_rate": 0.0004597586466975541, |
| "loss": 3.5803, |
| "step": 21800 |
| }, |
| { |
| "epoch": 2.3517382413087935, |
| "grad_norm": 0.5780821442604065, |
| "learning_rate": 0.00045943540566749266, |
| "loss": 3.5638, |
| "step": 21850 |
| }, |
| { |
| "epoch": 2.3571197933484016, |
| "grad_norm": 0.552423357963562, |
| "learning_rate": 0.00045911216463743126, |
| "loss": 3.5614, |
| "step": 21900 |
| }, |
| { |
| "epoch": 2.3625013453880097, |
| "grad_norm": 0.5924360752105713, |
| "learning_rate": 0.00045878892360736985, |
| "loss": 3.5583, |
| "step": 21950 |
| }, |
| { |
| "epoch": 2.3678828974276183, |
| "grad_norm": 0.5988591909408569, |
| "learning_rate": 0.00045846568257730845, |
| "loss": 3.5584, |
| "step": 22000 |
| }, |
| { |
| "epoch": 2.3678828974276183, |
| "eval_accuracy": 0.36474970014519287, |
| "eval_loss": 3.5505900382995605, |
| "eval_runtime": 185.6951, |
| "eval_samples_per_second": 96.992, |
| "eval_steps_per_second": 6.064, |
| "step": 22000 |
| }, |
| { |
| "epoch": 2.3732644494672264, |
| "grad_norm": 0.5761330723762512, |
| "learning_rate": 0.00045814244154724704, |
| "loss": 3.5604, |
| "step": 22050 |
| }, |
| { |
| "epoch": 2.3786460015068345, |
| "grad_norm": 0.5908006429672241, |
| "learning_rate": 0.00045781920051718563, |
| "loss": 3.5549, |
| "step": 22100 |
| }, |
| { |
| "epoch": 2.384027553546443, |
| "grad_norm": 0.5807985663414001, |
| "learning_rate": 0.0004574959594871242, |
| "loss": 3.5442, |
| "step": 22150 |
| }, |
| { |
| "epoch": 2.389409105586051, |
| "grad_norm": 0.582998514175415, |
| "learning_rate": 0.00045717271845706277, |
| "loss": 3.5399, |
| "step": 22200 |
| }, |
| { |
| "epoch": 2.3947906576256592, |
| "grad_norm": 0.5681512355804443, |
| "learning_rate": 0.0004568494774270013, |
| "loss": 3.5484, |
| "step": 22250 |
| }, |
| { |
| "epoch": 2.4001722096652673, |
| "grad_norm": 0.606206476688385, |
| "learning_rate": 0.00045652623639693996, |
| "loss": 3.5678, |
| "step": 22300 |
| }, |
| { |
| "epoch": 2.4055537617048754, |
| "grad_norm": 0.5849497318267822, |
| "learning_rate": 0.00045620299536687855, |
| "loss": 3.555, |
| "step": 22350 |
| }, |
| { |
| "epoch": 2.410935313744484, |
| "grad_norm": 0.539687991142273, |
| "learning_rate": 0.0004558797543368171, |
| "loss": 3.5503, |
| "step": 22400 |
| }, |
| { |
| "epoch": 2.416316865784092, |
| "grad_norm": 0.5568169355392456, |
| "learning_rate": 0.0004555565133067557, |
| "loss": 3.5482, |
| "step": 22450 |
| }, |
| { |
| "epoch": 2.4216984178237, |
| "grad_norm": 0.5671691298484802, |
| "learning_rate": 0.0004552332722766943, |
| "loss": 3.5619, |
| "step": 22500 |
| }, |
| { |
| "epoch": 2.4270799698633088, |
| "grad_norm": 0.5890142917633057, |
| "learning_rate": 0.0004549100312466328, |
| "loss": 3.53, |
| "step": 22550 |
| }, |
| { |
| "epoch": 2.432461521902917, |
| "grad_norm": 0.566472053527832, |
| "learning_rate": 0.0004545867902165715, |
| "loss": 3.5258, |
| "step": 22600 |
| }, |
| { |
| "epoch": 2.437843073942525, |
| "grad_norm": 0.5533962249755859, |
| "learning_rate": 0.0004542700140071113, |
| "loss": 3.556, |
| "step": 22650 |
| }, |
| { |
| "epoch": 2.443224625982133, |
| "grad_norm": 0.5774956941604614, |
| "learning_rate": 0.0004539467729770499, |
| "loss": 3.5584, |
| "step": 22700 |
| }, |
| { |
| "epoch": 2.4486061780217416, |
| "grad_norm": 0.579090416431427, |
| "learning_rate": 0.0004536235319469884, |
| "loss": 3.5518, |
| "step": 22750 |
| }, |
| { |
| "epoch": 2.4539877300613497, |
| "grad_norm": 0.6130251884460449, |
| "learning_rate": 0.000453300290916927, |
| "loss": 3.5559, |
| "step": 22800 |
| }, |
| { |
| "epoch": 2.459369282100958, |
| "grad_norm": 0.6626943945884705, |
| "learning_rate": 0.0004529770498868656, |
| "loss": 3.5504, |
| "step": 22850 |
| }, |
| { |
| "epoch": 2.464750834140566, |
| "grad_norm": 0.5800821781158447, |
| "learning_rate": 0.00045265380885680414, |
| "loss": 3.5502, |
| "step": 22900 |
| }, |
| { |
| "epoch": 2.4701323861801745, |
| "grad_norm": 0.5731578469276428, |
| "learning_rate": 0.0004523305678267428, |
| "loss": 3.5335, |
| "step": 22950 |
| }, |
| { |
| "epoch": 2.4755139382197826, |
| "grad_norm": 0.5558658838272095, |
| "learning_rate": 0.0004520073267966814, |
| "loss": 3.5405, |
| "step": 23000 |
| }, |
| { |
| "epoch": 2.4755139382197826, |
| "eval_accuracy": 0.3659733489677595, |
| "eval_loss": 3.54081392288208, |
| "eval_runtime": 185.8411, |
| "eval_samples_per_second": 96.916, |
| "eval_steps_per_second": 6.059, |
| "step": 23000 |
| }, |
| { |
| "epoch": 2.4808954902593907, |
| "grad_norm": 0.5571113228797913, |
| "learning_rate": 0.00045168408576661993, |
| "loss": 3.5652, |
| "step": 23050 |
| }, |
| { |
| "epoch": 2.4862770422989993, |
| "grad_norm": 0.5380064845085144, |
| "learning_rate": 0.0004513608447365585, |
| "loss": 3.5644, |
| "step": 23100 |
| }, |
| { |
| "epoch": 2.4916585943386074, |
| "grad_norm": 0.5949292182922363, |
| "learning_rate": 0.00045103760370649706, |
| "loss": 3.5409, |
| "step": 23150 |
| }, |
| { |
| "epoch": 2.4970401463782155, |
| "grad_norm": 0.568659245967865, |
| "learning_rate": 0.00045071436267643566, |
| "loss": 3.5683, |
| "step": 23200 |
| }, |
| { |
| "epoch": 2.5024216984178236, |
| "grad_norm": 0.5630667805671692, |
| "learning_rate": 0.0004503911216463743, |
| "loss": 3.5461, |
| "step": 23250 |
| }, |
| { |
| "epoch": 2.5078032504574317, |
| "grad_norm": 0.688456654548645, |
| "learning_rate": 0.00045006788061631285, |
| "loss": 3.5459, |
| "step": 23300 |
| }, |
| { |
| "epoch": 2.5131848024970402, |
| "grad_norm": 0.5994473695755005, |
| "learning_rate": 0.00044974463958625144, |
| "loss": 3.5343, |
| "step": 23350 |
| }, |
| { |
| "epoch": 2.5185663545366483, |
| "grad_norm": 0.5428929328918457, |
| "learning_rate": 0.00044942139855619004, |
| "loss": 3.5474, |
| "step": 23400 |
| }, |
| { |
| "epoch": 2.5239479065762565, |
| "grad_norm": 0.5463264584541321, |
| "learning_rate": 0.0004490981575261286, |
| "loss": 3.5356, |
| "step": 23450 |
| }, |
| { |
| "epoch": 2.529329458615865, |
| "grad_norm": 0.5961200594902039, |
| "learning_rate": 0.0004487749164960672, |
| "loss": 3.5698, |
| "step": 23500 |
| }, |
| { |
| "epoch": 2.534711010655473, |
| "grad_norm": 0.5858139395713806, |
| "learning_rate": 0.0004484516754660058, |
| "loss": 3.5626, |
| "step": 23550 |
| }, |
| { |
| "epoch": 2.540092562695081, |
| "grad_norm": 0.5949637293815613, |
| "learning_rate": 0.00044812843443594436, |
| "loss": 3.5551, |
| "step": 23600 |
| }, |
| { |
| "epoch": 2.5454741147346893, |
| "grad_norm": 0.5771591663360596, |
| "learning_rate": 0.00044780519340588296, |
| "loss": 3.5651, |
| "step": 23650 |
| }, |
| { |
| "epoch": 2.550855666774298, |
| "grad_norm": 0.6235713362693787, |
| "learning_rate": 0.0004474819523758215, |
| "loss": 3.5495, |
| "step": 23700 |
| }, |
| { |
| "epoch": 2.556237218813906, |
| "grad_norm": 0.6080948114395142, |
| "learning_rate": 0.0004471587113457601, |
| "loss": 3.5228, |
| "step": 23750 |
| }, |
| { |
| "epoch": 2.561618770853514, |
| "grad_norm": 0.6454449892044067, |
| "learning_rate": 0.00044683547031569874, |
| "loss": 3.5617, |
| "step": 23800 |
| }, |
| { |
| "epoch": 2.567000322893122, |
| "grad_norm": 0.5808669924736023, |
| "learning_rate": 0.0004465122292856373, |
| "loss": 3.5527, |
| "step": 23850 |
| }, |
| { |
| "epoch": 2.5723818749327307, |
| "grad_norm": 0.5919604897499084, |
| "learning_rate": 0.0004461889882555759, |
| "loss": 3.5259, |
| "step": 23900 |
| }, |
| { |
| "epoch": 2.577763426972339, |
| "grad_norm": 0.5724890232086182, |
| "learning_rate": 0.00044586574722551447, |
| "loss": 3.5388, |
| "step": 23950 |
| }, |
| { |
| "epoch": 2.583144979011947, |
| "grad_norm": 0.6070609092712402, |
| "learning_rate": 0.000445542506195453, |
| "loss": 3.544, |
| "step": 24000 |
| }, |
| { |
| "epoch": 2.583144979011947, |
| "eval_accuracy": 0.36672370581100633, |
| "eval_loss": 3.531214714050293, |
| "eval_runtime": 185.324, |
| "eval_samples_per_second": 97.187, |
| "eval_steps_per_second": 6.076, |
| "step": 24000 |
| }, |
| { |
| "epoch": 2.5885265310515555, |
| "grad_norm": 0.5855563282966614, |
| "learning_rate": 0.0004452192651653916, |
| "loss": 3.562, |
| "step": 24050 |
| }, |
| { |
| "epoch": 2.5939080830911636, |
| "grad_norm": 0.6444674134254456, |
| "learning_rate": 0.00044489602413533025, |
| "loss": 3.5296, |
| "step": 24100 |
| }, |
| { |
| "epoch": 2.5992896351307717, |
| "grad_norm": 0.5749318599700928, |
| "learning_rate": 0.0004445727831052688, |
| "loss": 3.5455, |
| "step": 24150 |
| }, |
| { |
| "epoch": 2.60467118717038, |
| "grad_norm": 0.5962168574333191, |
| "learning_rate": 0.0004442495420752074, |
| "loss": 3.5727, |
| "step": 24200 |
| }, |
| { |
| "epoch": 2.610052739209988, |
| "grad_norm": 0.599294126033783, |
| "learning_rate": 0.00044392630104514593, |
| "loss": 3.552, |
| "step": 24250 |
| }, |
| { |
| "epoch": 2.6154342912495965, |
| "grad_norm": 0.5795555710792542, |
| "learning_rate": 0.0004436030600150845, |
| "loss": 3.554, |
| "step": 24300 |
| }, |
| { |
| "epoch": 2.6208158432892046, |
| "grad_norm": 0.5728325247764587, |
| "learning_rate": 0.0004432798189850231, |
| "loss": 3.5461, |
| "step": 24350 |
| }, |
| { |
| "epoch": 2.6261973953288127, |
| "grad_norm": 0.6078081727027893, |
| "learning_rate": 0.0004429565779549617, |
| "loss": 3.5405, |
| "step": 24400 |
| }, |
| { |
| "epoch": 2.6315789473684212, |
| "grad_norm": 0.5665083527565002, |
| "learning_rate": 0.0004426333369249003, |
| "loss": 3.5429, |
| "step": 24450 |
| }, |
| { |
| "epoch": 2.6369604994080293, |
| "grad_norm": 0.5623336434364319, |
| "learning_rate": 0.0004423100958948389, |
| "loss": 3.5414, |
| "step": 24500 |
| }, |
| { |
| "epoch": 2.6423420514476375, |
| "grad_norm": 0.6207952499389648, |
| "learning_rate": 0.00044198685486477744, |
| "loss": 3.5395, |
| "step": 24550 |
| }, |
| { |
| "epoch": 2.6477236034872456, |
| "grad_norm": 0.5874627828598022, |
| "learning_rate": 0.00044166361383471604, |
| "loss": 3.5593, |
| "step": 24600 |
| }, |
| { |
| "epoch": 2.653105155526854, |
| "grad_norm": 0.6552971601486206, |
| "learning_rate": 0.0004413403728046547, |
| "loss": 3.5296, |
| "step": 24650 |
| }, |
| { |
| "epoch": 2.658486707566462, |
| "grad_norm": 0.5899220108985901, |
| "learning_rate": 0.00044102359659519444, |
| "loss": 3.5484, |
| "step": 24700 |
| }, |
| { |
| "epoch": 2.6638682596060703, |
| "grad_norm": 0.6007041931152344, |
| "learning_rate": 0.00044070035556513303, |
| "loss": 3.5429, |
| "step": 24750 |
| }, |
| { |
| "epoch": 2.6692498116456784, |
| "grad_norm": 0.6164174675941467, |
| "learning_rate": 0.00044037711453507163, |
| "loss": 3.5439, |
| "step": 24800 |
| }, |
| { |
| "epoch": 2.674631363685287, |
| "grad_norm": 0.6024141311645508, |
| "learning_rate": 0.0004400538735050102, |
| "loss": 3.5384, |
| "step": 24850 |
| }, |
| { |
| "epoch": 2.680012915724895, |
| "grad_norm": 0.6285790801048279, |
| "learning_rate": 0.00043973063247494876, |
| "loss": 3.5236, |
| "step": 24900 |
| }, |
| { |
| "epoch": 2.685394467764503, |
| "grad_norm": 0.5836975574493408, |
| "learning_rate": 0.00043940739144488736, |
| "loss": 3.532, |
| "step": 24950 |
| }, |
| { |
| "epoch": 2.6907760198041117, |
| "grad_norm": 0.562268078327179, |
| "learning_rate": 0.0004390841504148259, |
| "loss": 3.5247, |
| "step": 25000 |
| }, |
| { |
| "epoch": 2.6907760198041117, |
| "eval_accuracy": 0.3671533193294248, |
| "eval_loss": 3.5225062370300293, |
| "eval_runtime": 185.6947, |
| "eval_samples_per_second": 96.993, |
| "eval_steps_per_second": 6.064, |
| "step": 25000 |
| }, |
| { |
| "epoch": 2.69615757184372, |
| "grad_norm": 0.5885962843894958, |
| "learning_rate": 0.00043876090938476455, |
| "loss": 3.5316, |
| "step": 25050 |
| }, |
| { |
| "epoch": 2.701539123883328, |
| "grad_norm": 0.595336377620697, |
| "learning_rate": 0.00043843766835470314, |
| "loss": 3.5347, |
| "step": 25100 |
| }, |
| { |
| "epoch": 2.706920675922936, |
| "grad_norm": 0.6119809746742249, |
| "learning_rate": 0.0004381144273246417, |
| "loss": 3.5388, |
| "step": 25150 |
| }, |
| { |
| "epoch": 2.712302227962544, |
| "grad_norm": 0.5899800658226013, |
| "learning_rate": 0.0004377911862945803, |
| "loss": 3.5526, |
| "step": 25200 |
| }, |
| { |
| "epoch": 2.7176837800021527, |
| "grad_norm": 0.5826917290687561, |
| "learning_rate": 0.00043746794526451887, |
| "loss": 3.517, |
| "step": 25250 |
| }, |
| { |
| "epoch": 2.723065332041761, |
| "grad_norm": 0.5683618187904358, |
| "learning_rate": 0.00043714470423445747, |
| "loss": 3.543, |
| "step": 25300 |
| }, |
| { |
| "epoch": 2.728446884081369, |
| "grad_norm": 0.5878129601478577, |
| "learning_rate": 0.00043682146320439606, |
| "loss": 3.5183, |
| "step": 25350 |
| }, |
| { |
| "epoch": 2.7338284361209775, |
| "grad_norm": 0.5678836703300476, |
| "learning_rate": 0.00043649822217433466, |
| "loss": 3.5296, |
| "step": 25400 |
| }, |
| { |
| "epoch": 2.7392099881605856, |
| "grad_norm": 0.5555063486099243, |
| "learning_rate": 0.0004361749811442732, |
| "loss": 3.5433, |
| "step": 25450 |
| }, |
| { |
| "epoch": 2.7445915402001937, |
| "grad_norm": 0.59803307056427, |
| "learning_rate": 0.0004358517401142118, |
| "loss": 3.5302, |
| "step": 25500 |
| }, |
| { |
| "epoch": 2.749973092239802, |
| "grad_norm": 0.6216201782226562, |
| "learning_rate": 0.00043552849908415033, |
| "loss": 3.543, |
| "step": 25550 |
| }, |
| { |
| "epoch": 2.7553546442794103, |
| "grad_norm": 0.6179243326187134, |
| "learning_rate": 0.000435205258054089, |
| "loss": 3.5417, |
| "step": 25600 |
| }, |
| { |
| "epoch": 2.7607361963190185, |
| "grad_norm": 0.6047491431236267, |
| "learning_rate": 0.0004348820170240276, |
| "loss": 3.5469, |
| "step": 25650 |
| }, |
| { |
| "epoch": 2.7661177483586266, |
| "grad_norm": 0.5766834020614624, |
| "learning_rate": 0.0004345587759939661, |
| "loss": 3.5255, |
| "step": 25700 |
| }, |
| { |
| "epoch": 2.7714993003982347, |
| "grad_norm": 0.6169473528862, |
| "learning_rate": 0.0004342355349639047, |
| "loss": 3.5612, |
| "step": 25750 |
| }, |
| { |
| "epoch": 2.776880852437843, |
| "grad_norm": 0.6286610960960388, |
| "learning_rate": 0.0004339122939338433, |
| "loss": 3.5331, |
| "step": 25800 |
| }, |
| { |
| "epoch": 2.7822624044774513, |
| "grad_norm": 0.5553529858589172, |
| "learning_rate": 0.00043358905290378184, |
| "loss": 3.5424, |
| "step": 25850 |
| }, |
| { |
| "epoch": 2.7876439565170594, |
| "grad_norm": 0.6053258776664734, |
| "learning_rate": 0.0004332658118737205, |
| "loss": 3.522, |
| "step": 25900 |
| }, |
| { |
| "epoch": 2.793025508556668, |
| "grad_norm": 0.6327517032623291, |
| "learning_rate": 0.0004329425708436591, |
| "loss": 3.5433, |
| "step": 25950 |
| }, |
| { |
| "epoch": 2.798407060596276, |
| "grad_norm": 0.6058651208877563, |
| "learning_rate": 0.00043261932981359763, |
| "loss": 3.5232, |
| "step": 26000 |
| }, |
| { |
| "epoch": 2.798407060596276, |
| "eval_accuracy": 0.3688032134308877, |
| "eval_loss": 3.511896848678589, |
| "eval_runtime": 185.7309, |
| "eval_samples_per_second": 96.974, |
| "eval_steps_per_second": 6.063, |
| "step": 26000 |
| }, |
| { |
| "epoch": 2.803788612635884, |
| "grad_norm": 0.5944856405258179, |
| "learning_rate": 0.0004322960887835362, |
| "loss": 3.5289, |
| "step": 26050 |
| }, |
| { |
| "epoch": 2.8091701646754923, |
| "grad_norm": 0.562701940536499, |
| "learning_rate": 0.00043197284775347476, |
| "loss": 3.5318, |
| "step": 26100 |
| }, |
| { |
| "epoch": 2.8145517167151004, |
| "grad_norm": 0.5836777687072754, |
| "learning_rate": 0.00043164960672341336, |
| "loss": 3.5313, |
| "step": 26150 |
| }, |
| { |
| "epoch": 2.819933268754709, |
| "grad_norm": 0.5569685101509094, |
| "learning_rate": 0.000431326365693352, |
| "loss": 3.5187, |
| "step": 26200 |
| }, |
| { |
| "epoch": 2.825314820794317, |
| "grad_norm": 0.5612331032752991, |
| "learning_rate": 0.00043100312466329055, |
| "loss": 3.5188, |
| "step": 26250 |
| }, |
| { |
| "epoch": 2.830696372833925, |
| "grad_norm": 0.6201794147491455, |
| "learning_rate": 0.00043067988363322914, |
| "loss": 3.5305, |
| "step": 26300 |
| }, |
| { |
| "epoch": 2.8360779248735337, |
| "grad_norm": 0.5919789671897888, |
| "learning_rate": 0.00043035664260316774, |
| "loss": 3.5374, |
| "step": 26350 |
| }, |
| { |
| "epoch": 2.841459476913142, |
| "grad_norm": 0.599686861038208, |
| "learning_rate": 0.0004300334015731063, |
| "loss": 3.5291, |
| "step": 26400 |
| }, |
| { |
| "epoch": 2.84684102895275, |
| "grad_norm": 0.6061415076255798, |
| "learning_rate": 0.0004297101605430449, |
| "loss": 3.5212, |
| "step": 26450 |
| }, |
| { |
| "epoch": 2.852222580992358, |
| "grad_norm": 0.6409788727760315, |
| "learning_rate": 0.0004293869195129835, |
| "loss": 3.5183, |
| "step": 26500 |
| }, |
| { |
| "epoch": 2.857604133031966, |
| "grad_norm": 0.5795250535011292, |
| "learning_rate": 0.00042906367848292206, |
| "loss": 3.517, |
| "step": 26550 |
| }, |
| { |
| "epoch": 2.8629856850715747, |
| "grad_norm": 0.5574374794960022, |
| "learning_rate": 0.00042874043745286066, |
| "loss": 3.526, |
| "step": 26600 |
| }, |
| { |
| "epoch": 2.868367237111183, |
| "grad_norm": 0.6044853329658508, |
| "learning_rate": 0.0004284171964227992, |
| "loss": 3.5232, |
| "step": 26650 |
| }, |
| { |
| "epoch": 2.873748789150791, |
| "grad_norm": 0.6033005118370056, |
| "learning_rate": 0.00042810042021333906, |
| "loss": 3.5278, |
| "step": 26700 |
| }, |
| { |
| "epoch": 2.8791303411903995, |
| "grad_norm": 0.6437329053878784, |
| "learning_rate": 0.0004277771791832776, |
| "loss": 3.5189, |
| "step": 26750 |
| }, |
| { |
| "epoch": 2.8845118932300076, |
| "grad_norm": 0.5768671631813049, |
| "learning_rate": 0.0004274539381532162, |
| "loss": 3.5177, |
| "step": 26800 |
| }, |
| { |
| "epoch": 2.8898934452696157, |
| "grad_norm": 0.5965304374694824, |
| "learning_rate": 0.00042713069712315484, |
| "loss": 3.539, |
| "step": 26850 |
| }, |
| { |
| "epoch": 2.895274997309224, |
| "grad_norm": 0.6222568154335022, |
| "learning_rate": 0.0004268074560930934, |
| "loss": 3.5065, |
| "step": 26900 |
| }, |
| { |
| "epoch": 2.9006565493488323, |
| "grad_norm": 0.609158992767334, |
| "learning_rate": 0.000426484215063032, |
| "loss": 3.5093, |
| "step": 26950 |
| }, |
| { |
| "epoch": 2.9060381013884404, |
| "grad_norm": 0.6081030368804932, |
| "learning_rate": 0.0004261609740329705, |
| "loss": 3.5345, |
| "step": 27000 |
| }, |
| { |
| "epoch": 2.9060381013884404, |
| "eval_accuracy": 0.3694739277073665, |
| "eval_loss": 3.5055761337280273, |
| "eval_runtime": 185.7948, |
| "eval_samples_per_second": 96.94, |
| "eval_steps_per_second": 6.06, |
| "step": 27000 |
| }, |
| { |
| "epoch": 2.9114196534280485, |
| "grad_norm": 0.7365734577178955, |
| "learning_rate": 0.0004258377330029091, |
| "loss": 3.5275, |
| "step": 27050 |
| }, |
| { |
| "epoch": 2.9168012054676566, |
| "grad_norm": 0.6976646184921265, |
| "learning_rate": 0.00042551449197284776, |
| "loss": 3.5558, |
| "step": 27100 |
| }, |
| { |
| "epoch": 2.922182757507265, |
| "grad_norm": 0.621337890625, |
| "learning_rate": 0.0004251912509427863, |
| "loss": 3.5125, |
| "step": 27150 |
| }, |
| { |
| "epoch": 2.9275643095468733, |
| "grad_norm": 0.5724226236343384, |
| "learning_rate": 0.0004248680099127249, |
| "loss": 3.5259, |
| "step": 27200 |
| }, |
| { |
| "epoch": 2.9329458615864814, |
| "grad_norm": 0.5589929223060608, |
| "learning_rate": 0.0004245447688826635, |
| "loss": 3.525, |
| "step": 27250 |
| }, |
| { |
| "epoch": 2.93832741362609, |
| "grad_norm": 0.583922266960144, |
| "learning_rate": 0.00042422152785260203, |
| "loss": 3.5213, |
| "step": 27300 |
| }, |
| { |
| "epoch": 2.943708965665698, |
| "grad_norm": 0.5856994986534119, |
| "learning_rate": 0.0004238982868225406, |
| "loss": 3.5205, |
| "step": 27350 |
| }, |
| { |
| "epoch": 2.949090517705306, |
| "grad_norm": 0.6094067096710205, |
| "learning_rate": 0.0004235750457924793, |
| "loss": 3.5073, |
| "step": 27400 |
| }, |
| { |
| "epoch": 2.9544720697449143, |
| "grad_norm": 0.6294287443161011, |
| "learning_rate": 0.0004232518047624178, |
| "loss": 3.5346, |
| "step": 27450 |
| }, |
| { |
| "epoch": 2.9598536217845224, |
| "grad_norm": 0.6073342561721802, |
| "learning_rate": 0.0004229285637323564, |
| "loss": 3.502, |
| "step": 27500 |
| }, |
| { |
| "epoch": 2.965235173824131, |
| "grad_norm": 0.5862210988998413, |
| "learning_rate": 0.00042260532270229495, |
| "loss": 3.5288, |
| "step": 27550 |
| }, |
| { |
| "epoch": 2.970616725863739, |
| "grad_norm": 0.5769654512405396, |
| "learning_rate": 0.00042228208167223354, |
| "loss": 3.5263, |
| "step": 27600 |
| }, |
| { |
| "epoch": 2.975998277903347, |
| "grad_norm": 0.60785311460495, |
| "learning_rate": 0.00042195884064217214, |
| "loss": 3.5442, |
| "step": 27650 |
| }, |
| { |
| "epoch": 2.9813798299429557, |
| "grad_norm": 0.6289162635803223, |
| "learning_rate": 0.00042163559961211073, |
| "loss": 3.5293, |
| "step": 27700 |
| }, |
| { |
| "epoch": 2.986761381982564, |
| "grad_norm": 0.566299557685852, |
| "learning_rate": 0.00042131235858204933, |
| "loss": 3.529, |
| "step": 27750 |
| }, |
| { |
| "epoch": 2.992142934022172, |
| "grad_norm": 0.6333411931991577, |
| "learning_rate": 0.0004209891175519879, |
| "loss": 3.5108, |
| "step": 27800 |
| }, |
| { |
| "epoch": 2.9975244860617805, |
| "grad_norm": 0.5798822045326233, |
| "learning_rate": 0.00042066587652192646, |
| "loss": 3.5387, |
| "step": 27850 |
| }, |
| { |
| "epoch": 3.0029060381013886, |
| "grad_norm": 0.6096293330192566, |
| "learning_rate": 0.00042034263549186506, |
| "loss": 3.4674, |
| "step": 27900 |
| }, |
| { |
| "epoch": 3.0082875901409967, |
| "grad_norm": 0.6166154742240906, |
| "learning_rate": 0.0004200193944618036, |
| "loss": 3.4207, |
| "step": 27950 |
| }, |
| { |
| "epoch": 3.0136691421806048, |
| "grad_norm": 0.6569551825523376, |
| "learning_rate": 0.00041969615343174225, |
| "loss": 3.422, |
| "step": 28000 |
| }, |
| { |
| "epoch": 3.0136691421806048, |
| "eval_accuracy": 0.3701314949844197, |
| "eval_loss": 3.500117063522339, |
| "eval_runtime": 185.1492, |
| "eval_samples_per_second": 97.278, |
| "eval_steps_per_second": 6.082, |
| "step": 28000 |
| }, |
| { |
| "epoch": 3.0190506942202133, |
| "grad_norm": 0.5694539546966553, |
| "learning_rate": 0.00041937291240168084, |
| "loss": 3.434, |
| "step": 28050 |
| }, |
| { |
| "epoch": 3.0244322462598214, |
| "grad_norm": 0.7617325782775879, |
| "learning_rate": 0.0004190496713716194, |
| "loss": 3.4439, |
| "step": 28100 |
| }, |
| { |
| "epoch": 3.0298137982994295, |
| "grad_norm": 0.6876115798950195, |
| "learning_rate": 0.000418726430341558, |
| "loss": 3.4311, |
| "step": 28150 |
| }, |
| { |
| "epoch": 3.0351953503390376, |
| "grad_norm": 0.5625863671302795, |
| "learning_rate": 0.00041840318931149657, |
| "loss": 3.4371, |
| "step": 28200 |
| }, |
| { |
| "epoch": 3.040576902378646, |
| "grad_norm": 0.596316397190094, |
| "learning_rate": 0.00041807994828143517, |
| "loss": 3.4107, |
| "step": 28250 |
| }, |
| { |
| "epoch": 3.0459584544182543, |
| "grad_norm": 0.5689908862113953, |
| "learning_rate": 0.00041775670725137376, |
| "loss": 3.4322, |
| "step": 28300 |
| }, |
| { |
| "epoch": 3.0513400064578624, |
| "grad_norm": 0.5795773863792419, |
| "learning_rate": 0.00041743346622131236, |
| "loss": 3.4443, |
| "step": 28350 |
| }, |
| { |
| "epoch": 3.0567215584974705, |
| "grad_norm": 0.6113329529762268, |
| "learning_rate": 0.0004171102251912509, |
| "loss": 3.4564, |
| "step": 28400 |
| }, |
| { |
| "epoch": 3.062103110537079, |
| "grad_norm": 0.6338844895362854, |
| "learning_rate": 0.0004167869841611895, |
| "loss": 3.4345, |
| "step": 28450 |
| }, |
| { |
| "epoch": 3.067484662576687, |
| "grad_norm": 0.626085102558136, |
| "learning_rate": 0.00041646374313112803, |
| "loss": 3.4487, |
| "step": 28500 |
| }, |
| { |
| "epoch": 3.0728662146162953, |
| "grad_norm": 0.5793798565864563, |
| "learning_rate": 0.0004161405021010667, |
| "loss": 3.4436, |
| "step": 28550 |
| }, |
| { |
| "epoch": 3.0782477666559034, |
| "grad_norm": 0.568972110748291, |
| "learning_rate": 0.0004158172610710053, |
| "loss": 3.4451, |
| "step": 28600 |
| }, |
| { |
| "epoch": 3.083629318695512, |
| "grad_norm": 0.6020107865333557, |
| "learning_rate": 0.0004154940200409438, |
| "loss": 3.446, |
| "step": 28650 |
| }, |
| { |
| "epoch": 3.08901087073512, |
| "grad_norm": 0.6391506195068359, |
| "learning_rate": 0.0004151707790108824, |
| "loss": 3.4373, |
| "step": 28700 |
| }, |
| { |
| "epoch": 3.094392422774728, |
| "grad_norm": 0.6044766306877136, |
| "learning_rate": 0.0004148540028014222, |
| "loss": 3.433, |
| "step": 28750 |
| }, |
| { |
| "epoch": 3.0997739748143363, |
| "grad_norm": 0.6538757085800171, |
| "learning_rate": 0.0004145307617713608, |
| "loss": 3.4527, |
| "step": 28800 |
| }, |
| { |
| "epoch": 3.105155526853945, |
| "grad_norm": 0.6544004082679749, |
| "learning_rate": 0.0004142139855619006, |
| "loss": 3.4564, |
| "step": 28850 |
| }, |
| { |
| "epoch": 3.110537078893553, |
| "grad_norm": 0.6053832769393921, |
| "learning_rate": 0.0004138907445318392, |
| "loss": 3.4491, |
| "step": 28900 |
| }, |
| { |
| "epoch": 3.115918630933161, |
| "grad_norm": 0.6252599358558655, |
| "learning_rate": 0.00041356750350177775, |
| "loss": 3.4348, |
| "step": 28950 |
| }, |
| { |
| "epoch": 3.121300182972769, |
| "grad_norm": 0.5669277310371399, |
| "learning_rate": 0.0004132442624717164, |
| "loss": 3.4492, |
| "step": 29000 |
| }, |
| { |
| "epoch": 3.121300182972769, |
| "eval_accuracy": 0.37086740099358717, |
| "eval_loss": 3.4967188835144043, |
| "eval_runtime": 185.8784, |
| "eval_samples_per_second": 96.897, |
| "eval_steps_per_second": 6.058, |
| "step": 29000 |
| }, |
| { |
| "epoch": 3.1266817350123777, |
| "grad_norm": 0.6672929525375366, |
| "learning_rate": 0.000412921021441655, |
| "loss": 3.4528, |
| "step": 29050 |
| }, |
| { |
| "epoch": 3.132063287051986, |
| "grad_norm": 0.553104817867279, |
| "learning_rate": 0.00041259778041159354, |
| "loss": 3.4456, |
| "step": 29100 |
| }, |
| { |
| "epoch": 3.137444839091594, |
| "grad_norm": 0.6385326385498047, |
| "learning_rate": 0.00041227453938153213, |
| "loss": 3.4414, |
| "step": 29150 |
| }, |
| { |
| "epoch": 3.1428263911312024, |
| "grad_norm": 0.6217435002326965, |
| "learning_rate": 0.00041195129835147067, |
| "loss": 3.4378, |
| "step": 29200 |
| }, |
| { |
| "epoch": 3.1482079431708105, |
| "grad_norm": 0.6174982190132141, |
| "learning_rate": 0.00041162805732140927, |
| "loss": 3.4712, |
| "step": 29250 |
| }, |
| { |
| "epoch": 3.1535894952104186, |
| "grad_norm": 0.6691080331802368, |
| "learning_rate": 0.0004113048162913479, |
| "loss": 3.4641, |
| "step": 29300 |
| }, |
| { |
| "epoch": 3.1589710472500268, |
| "grad_norm": 0.6154392957687378, |
| "learning_rate": 0.00041098157526128646, |
| "loss": 3.4258, |
| "step": 29350 |
| }, |
| { |
| "epoch": 3.1643525992896353, |
| "grad_norm": 0.5835902690887451, |
| "learning_rate": 0.00041065833423122505, |
| "loss": 3.4538, |
| "step": 29400 |
| }, |
| { |
| "epoch": 3.1697341513292434, |
| "grad_norm": 0.6393734812736511, |
| "learning_rate": 0.00041033509320116365, |
| "loss": 3.4687, |
| "step": 29450 |
| }, |
| { |
| "epoch": 3.1751157033688515, |
| "grad_norm": 0.5956642627716064, |
| "learning_rate": 0.0004100118521711022, |
| "loss": 3.4473, |
| "step": 29500 |
| }, |
| { |
| "epoch": 3.1804972554084596, |
| "grad_norm": 0.5974727272987366, |
| "learning_rate": 0.0004096886111410408, |
| "loss": 3.4732, |
| "step": 29550 |
| }, |
| { |
| "epoch": 3.185878807448068, |
| "grad_norm": 0.7023374438285828, |
| "learning_rate": 0.00040936537011097943, |
| "loss": 3.4686, |
| "step": 29600 |
| }, |
| { |
| "epoch": 3.1912603594876763, |
| "grad_norm": 0.6096805930137634, |
| "learning_rate": 0.00040904212908091797, |
| "loss": 3.4445, |
| "step": 29650 |
| }, |
| { |
| "epoch": 3.1966419115272844, |
| "grad_norm": 0.586425244808197, |
| "learning_rate": 0.00040871888805085656, |
| "loss": 3.4571, |
| "step": 29700 |
| }, |
| { |
| "epoch": 3.2020234635668925, |
| "grad_norm": 0.6391606330871582, |
| "learning_rate": 0.0004083956470207951, |
| "loss": 3.4527, |
| "step": 29750 |
| }, |
| { |
| "epoch": 3.207405015606501, |
| "grad_norm": 0.6039808988571167, |
| "learning_rate": 0.0004080724059907337, |
| "loss": 3.4524, |
| "step": 29800 |
| }, |
| { |
| "epoch": 3.212786567646109, |
| "grad_norm": 0.542382001876831, |
| "learning_rate": 0.00040774916496067235, |
| "loss": 3.4626, |
| "step": 29850 |
| }, |
| { |
| "epoch": 3.2181681196857173, |
| "grad_norm": 0.5787290930747986, |
| "learning_rate": 0.0004074259239306109, |
| "loss": 3.4471, |
| "step": 29900 |
| }, |
| { |
| "epoch": 3.2235496717253254, |
| "grad_norm": 0.6871864199638367, |
| "learning_rate": 0.0004071026829005495, |
| "loss": 3.4628, |
| "step": 29950 |
| }, |
| { |
| "epoch": 3.228931223764934, |
| "grad_norm": 0.5888828635215759, |
| "learning_rate": 0.0004067794418704881, |
| "loss": 3.4356, |
| "step": 30000 |
| }, |
| { |
| "epoch": 3.228931223764934, |
| "eval_accuracy": 0.37196903262313824, |
| "eval_loss": 3.4901552200317383, |
| "eval_runtime": 185.3297, |
| "eval_samples_per_second": 97.184, |
| "eval_steps_per_second": 6.076, |
| "step": 30000 |
| }, |
| { |
| "epoch": 3.234312775804542, |
| "grad_norm": 0.6342280507087708, |
| "learning_rate": 0.0004064562008404266, |
| "loss": 3.4487, |
| "step": 30050 |
| }, |
| { |
| "epoch": 3.23969432784415, |
| "grad_norm": 0.5991923213005066, |
| "learning_rate": 0.0004061329598103652, |
| "loss": 3.4602, |
| "step": 30100 |
| }, |
| { |
| "epoch": 3.2450758798837587, |
| "grad_norm": 0.6479193568229675, |
| "learning_rate": 0.00040580971878030386, |
| "loss": 3.4575, |
| "step": 30150 |
| }, |
| { |
| "epoch": 3.250457431923367, |
| "grad_norm": 0.5759966969490051, |
| "learning_rate": 0.0004054864777502424, |
| "loss": 3.4597, |
| "step": 30200 |
| }, |
| { |
| "epoch": 3.255838983962975, |
| "grad_norm": 0.6298264861106873, |
| "learning_rate": 0.000405163236720181, |
| "loss": 3.4651, |
| "step": 30250 |
| }, |
| { |
| "epoch": 3.261220536002583, |
| "grad_norm": 0.6173065304756165, |
| "learning_rate": 0.00040483999569011954, |
| "loss": 3.4356, |
| "step": 30300 |
| }, |
| { |
| "epoch": 3.2666020880421915, |
| "grad_norm": 0.614152729511261, |
| "learning_rate": 0.00040451675466005813, |
| "loss": 3.4493, |
| "step": 30350 |
| }, |
| { |
| "epoch": 3.2719836400817996, |
| "grad_norm": 0.6581416130065918, |
| "learning_rate": 0.0004041935136299967, |
| "loss": 3.4515, |
| "step": 30400 |
| }, |
| { |
| "epoch": 3.2773651921214078, |
| "grad_norm": 0.5537705421447754, |
| "learning_rate": 0.0004038702725999353, |
| "loss": 3.4715, |
| "step": 30450 |
| }, |
| { |
| "epoch": 3.282746744161016, |
| "grad_norm": 0.6090721487998962, |
| "learning_rate": 0.0004035470315698739, |
| "loss": 3.4579, |
| "step": 30500 |
| }, |
| { |
| "epoch": 3.2881282962006244, |
| "grad_norm": 0.5673661828041077, |
| "learning_rate": 0.0004032237905398125, |
| "loss": 3.4457, |
| "step": 30550 |
| }, |
| { |
| "epoch": 3.2935098482402325, |
| "grad_norm": 0.6037548780441284, |
| "learning_rate": 0.00040290054950975105, |
| "loss": 3.447, |
| "step": 30600 |
| }, |
| { |
| "epoch": 3.2988914002798406, |
| "grad_norm": 0.5798225998878479, |
| "learning_rate": 0.00040257730847968965, |
| "loss": 3.4458, |
| "step": 30650 |
| }, |
| { |
| "epoch": 3.304272952319449, |
| "grad_norm": 0.6514201760292053, |
| "learning_rate": 0.0004022540674496283, |
| "loss": 3.4601, |
| "step": 30700 |
| }, |
| { |
| "epoch": 3.3096545043590573, |
| "grad_norm": 0.6187174320220947, |
| "learning_rate": 0.00040193082641956684, |
| "loss": 3.4627, |
| "step": 30750 |
| }, |
| { |
| "epoch": 3.3150360563986654, |
| "grad_norm": 0.6195241212844849, |
| "learning_rate": 0.00040160758538950543, |
| "loss": 3.4607, |
| "step": 30800 |
| }, |
| { |
| "epoch": 3.3204176084382735, |
| "grad_norm": 0.6429746747016907, |
| "learning_rate": 0.00040128434435944397, |
| "loss": 3.4756, |
| "step": 30850 |
| }, |
| { |
| "epoch": 3.3257991604778816, |
| "grad_norm": 0.6028315424919128, |
| "learning_rate": 0.00040096110332938257, |
| "loss": 3.451, |
| "step": 30900 |
| }, |
| { |
| "epoch": 3.33118071251749, |
| "grad_norm": 0.6164515614509583, |
| "learning_rate": 0.00040063786229932116, |
| "loss": 3.4455, |
| "step": 30950 |
| }, |
| { |
| "epoch": 3.3365622645570983, |
| "grad_norm": 0.6018369793891907, |
| "learning_rate": 0.00040031462126925975, |
| "loss": 3.4674, |
| "step": 31000 |
| }, |
| { |
| "epoch": 3.3365622645570983, |
| "eval_accuracy": 0.3720835527668947, |
| "eval_loss": 3.482224941253662, |
| "eval_runtime": 185.97, |
| "eval_samples_per_second": 96.849, |
| "eval_steps_per_second": 6.055, |
| "step": 31000 |
| }, |
| { |
| "epoch": 3.3419438165967064, |
| "grad_norm": 0.5857089161872864, |
| "learning_rate": 0.00039999138023919835, |
| "loss": 3.4712, |
| "step": 31050 |
| }, |
| { |
| "epoch": 3.347325368636315, |
| "grad_norm": 0.5926389694213867, |
| "learning_rate": 0.00039966813920913694, |
| "loss": 3.4409, |
| "step": 31100 |
| }, |
| { |
| "epoch": 3.352706920675923, |
| "grad_norm": 0.6544318199157715, |
| "learning_rate": 0.0003993448981790755, |
| "loss": 3.4366, |
| "step": 31150 |
| }, |
| { |
| "epoch": 3.358088472715531, |
| "grad_norm": 0.6314635276794434, |
| "learning_rate": 0.0003990216571490141, |
| "loss": 3.4735, |
| "step": 31200 |
| }, |
| { |
| "epoch": 3.3634700247551392, |
| "grad_norm": 0.687687873840332, |
| "learning_rate": 0.0003986984161189526, |
| "loss": 3.4621, |
| "step": 31250 |
| }, |
| { |
| "epoch": 3.368851576794748, |
| "grad_norm": 0.6045171618461609, |
| "learning_rate": 0.00039837517508889127, |
| "loss": 3.4655, |
| "step": 31300 |
| }, |
| { |
| "epoch": 3.374233128834356, |
| "grad_norm": 0.6058654189109802, |
| "learning_rate": 0.00039805193405882986, |
| "loss": 3.4728, |
| "step": 31350 |
| }, |
| { |
| "epoch": 3.379614680873964, |
| "grad_norm": 0.6477822661399841, |
| "learning_rate": 0.0003977286930287684, |
| "loss": 3.4453, |
| "step": 31400 |
| }, |
| { |
| "epoch": 3.384996232913572, |
| "grad_norm": 0.6337372660636902, |
| "learning_rate": 0.000397405451998707, |
| "loss": 3.4629, |
| "step": 31450 |
| }, |
| { |
| "epoch": 3.3903777849531807, |
| "grad_norm": 0.5923517942428589, |
| "learning_rate": 0.00039708221096864554, |
| "loss": 3.47, |
| "step": 31500 |
| }, |
| { |
| "epoch": 3.3957593369927888, |
| "grad_norm": 0.5875824093818665, |
| "learning_rate": 0.00039675896993858413, |
| "loss": 3.4527, |
| "step": 31550 |
| }, |
| { |
| "epoch": 3.401140889032397, |
| "grad_norm": 0.5956109166145325, |
| "learning_rate": 0.0003964357289085228, |
| "loss": 3.456, |
| "step": 31600 |
| }, |
| { |
| "epoch": 3.4065224410720054, |
| "grad_norm": 0.6072897911071777, |
| "learning_rate": 0.0003961124878784613, |
| "loss": 3.4426, |
| "step": 31650 |
| }, |
| { |
| "epoch": 3.4119039931116135, |
| "grad_norm": 0.5899261236190796, |
| "learning_rate": 0.0003957892468483999, |
| "loss": 3.4624, |
| "step": 31700 |
| }, |
| { |
| "epoch": 3.4172855451512216, |
| "grad_norm": 0.602802038192749, |
| "learning_rate": 0.0003954660058183385, |
| "loss": 3.4672, |
| "step": 31750 |
| }, |
| { |
| "epoch": 3.4226670971908297, |
| "grad_norm": 0.6016173362731934, |
| "learning_rate": 0.00039514276478827705, |
| "loss": 3.4611, |
| "step": 31800 |
| }, |
| { |
| "epoch": 3.428048649230438, |
| "grad_norm": 0.6393997669219971, |
| "learning_rate": 0.0003948195237582157, |
| "loss": 3.4606, |
| "step": 31850 |
| }, |
| { |
| "epoch": 3.4334302012700464, |
| "grad_norm": 0.5917162299156189, |
| "learning_rate": 0.0003944962827281543, |
| "loss": 3.4523, |
| "step": 31900 |
| }, |
| { |
| "epoch": 3.4388117533096545, |
| "grad_norm": 0.5613840222358704, |
| "learning_rate": 0.00039417304169809284, |
| "loss": 3.4522, |
| "step": 31950 |
| }, |
| { |
| "epoch": 3.4441933053492626, |
| "grad_norm": 0.5975550413131714, |
| "learning_rate": 0.00039384980066803143, |
| "loss": 3.465, |
| "step": 32000 |
| }, |
| { |
| "epoch": 3.4441933053492626, |
| "eval_accuracy": 0.3733598982400514, |
| "eval_loss": 3.4756972789764404, |
| "eval_runtime": 185.3849, |
| "eval_samples_per_second": 97.155, |
| "eval_steps_per_second": 6.074, |
| "step": 32000 |
| }, |
| { |
| "epoch": 3.449574857388871, |
| "grad_norm": 0.6139984726905823, |
| "learning_rate": 0.00039352655963796997, |
| "loss": 3.4671, |
| "step": 32050 |
| }, |
| { |
| "epoch": 3.4549564094284793, |
| "grad_norm": 0.6320118308067322, |
| "learning_rate": 0.00039320978342850983, |
| "loss": 3.4532, |
| "step": 32100 |
| }, |
| { |
| "epoch": 3.4603379614680874, |
| "grad_norm": 0.6727486252784729, |
| "learning_rate": 0.00039288654239844837, |
| "loss": 3.4728, |
| "step": 32150 |
| }, |
| { |
| "epoch": 3.4657195135076955, |
| "grad_norm": 0.6185712218284607, |
| "learning_rate": 0.00039256330136838697, |
| "loss": 3.4533, |
| "step": 32200 |
| }, |
| { |
| "epoch": 3.471101065547304, |
| "grad_norm": 0.6031298041343689, |
| "learning_rate": 0.0003922400603383256, |
| "loss": 3.4635, |
| "step": 32250 |
| }, |
| { |
| "epoch": 3.476482617586912, |
| "grad_norm": 0.6574801802635193, |
| "learning_rate": 0.00039191681930826416, |
| "loss": 3.4633, |
| "step": 32300 |
| }, |
| { |
| "epoch": 3.4818641696265202, |
| "grad_norm": 0.6520865559577942, |
| "learning_rate": 0.00039159357827820275, |
| "loss": 3.4454, |
| "step": 32350 |
| }, |
| { |
| "epoch": 3.4872457216661283, |
| "grad_norm": 0.6058455109596252, |
| "learning_rate": 0.00039127033724814135, |
| "loss": 3.4659, |
| "step": 32400 |
| }, |
| { |
| "epoch": 3.492627273705737, |
| "grad_norm": 0.6234257817268372, |
| "learning_rate": 0.0003909470962180799, |
| "loss": 3.4475, |
| "step": 32450 |
| }, |
| { |
| "epoch": 3.498008825745345, |
| "grad_norm": 0.5986215472221375, |
| "learning_rate": 0.00039062385518801854, |
| "loss": 3.4459, |
| "step": 32500 |
| }, |
| { |
| "epoch": 3.503390377784953, |
| "grad_norm": 0.5907883048057556, |
| "learning_rate": 0.00039030061415795713, |
| "loss": 3.4576, |
| "step": 32550 |
| }, |
| { |
| "epoch": 3.5087719298245617, |
| "grad_norm": 0.6612138748168945, |
| "learning_rate": 0.00038997737312789567, |
| "loss": 3.4441, |
| "step": 32600 |
| }, |
| { |
| "epoch": 3.5141534818641698, |
| "grad_norm": 0.5793318152427673, |
| "learning_rate": 0.00038965413209783426, |
| "loss": 3.4967, |
| "step": 32650 |
| }, |
| { |
| "epoch": 3.519535033903778, |
| "grad_norm": 0.6358268857002258, |
| "learning_rate": 0.0003893308910677728, |
| "loss": 3.4473, |
| "step": 32700 |
| }, |
| { |
| "epoch": 3.524916585943386, |
| "grad_norm": 0.6008777022361755, |
| "learning_rate": 0.0003890076500377114, |
| "loss": 3.4596, |
| "step": 32750 |
| }, |
| { |
| "epoch": 3.530298137982994, |
| "grad_norm": 0.6194238066673279, |
| "learning_rate": 0.0003886908738282512, |
| "loss": 3.4624, |
| "step": 32800 |
| }, |
| { |
| "epoch": 3.5356796900226026, |
| "grad_norm": 0.5898000597953796, |
| "learning_rate": 0.0003883676327981898, |
| "loss": 3.4512, |
| "step": 32850 |
| }, |
| { |
| "epoch": 3.5410612420622107, |
| "grad_norm": 0.604904294013977, |
| "learning_rate": 0.00038804439176812845, |
| "loss": 3.4572, |
| "step": 32900 |
| }, |
| { |
| "epoch": 3.546442794101819, |
| "grad_norm": 0.5905935764312744, |
| "learning_rate": 0.000387721150738067, |
| "loss": 3.4599, |
| "step": 32950 |
| }, |
| { |
| "epoch": 3.5518243461414274, |
| "grad_norm": 0.6019571423530579, |
| "learning_rate": 0.0003873979097080056, |
| "loss": 3.4619, |
| "step": 33000 |
| }, |
| { |
| "epoch": 3.5518243461414274, |
| "eval_accuracy": 0.37374518138024154, |
| "eval_loss": 3.470459222793579, |
| "eval_runtime": 185.9681, |
| "eval_samples_per_second": 96.85, |
| "eval_steps_per_second": 6.055, |
| "step": 33000 |
| }, |
| { |
| "epoch": 3.5572058981810355, |
| "grad_norm": 0.6063734889030457, |
| "learning_rate": 0.0003870746686779441, |
| "loss": 3.4475, |
| "step": 33050 |
| }, |
| { |
| "epoch": 3.5625874502206436, |
| "grad_norm": 0.648643434047699, |
| "learning_rate": 0.0003867514276478827, |
| "loss": 3.4669, |
| "step": 33100 |
| }, |
| { |
| "epoch": 3.5679690022602517, |
| "grad_norm": 0.5970544815063477, |
| "learning_rate": 0.0003864281866178213, |
| "loss": 3.4543, |
| "step": 33150 |
| }, |
| { |
| "epoch": 3.57335055429986, |
| "grad_norm": 0.6207253336906433, |
| "learning_rate": 0.0003861049455877599, |
| "loss": 3.4752, |
| "step": 33200 |
| }, |
| { |
| "epoch": 3.5787321063394684, |
| "grad_norm": 0.6894705891609192, |
| "learning_rate": 0.0003857817045576985, |
| "loss": 3.4544, |
| "step": 33250 |
| }, |
| { |
| "epoch": 3.5841136583790765, |
| "grad_norm": 0.6370822191238403, |
| "learning_rate": 0.0003854584635276371, |
| "loss": 3.4602, |
| "step": 33300 |
| }, |
| { |
| "epoch": 3.5894952104186846, |
| "grad_norm": 0.607257604598999, |
| "learning_rate": 0.00038513522249757564, |
| "loss": 3.4606, |
| "step": 33350 |
| }, |
| { |
| "epoch": 3.594876762458293, |
| "grad_norm": 0.5875098705291748, |
| "learning_rate": 0.00038481198146751423, |
| "loss": 3.4622, |
| "step": 33400 |
| }, |
| { |
| "epoch": 3.6002583144979012, |
| "grad_norm": 0.605970561504364, |
| "learning_rate": 0.0003844887404374529, |
| "loss": 3.4492, |
| "step": 33450 |
| }, |
| { |
| "epoch": 3.6056398665375093, |
| "grad_norm": 0.6834209561347961, |
| "learning_rate": 0.0003841654994073914, |
| "loss": 3.4523, |
| "step": 33500 |
| }, |
| { |
| "epoch": 3.611021418577118, |
| "grad_norm": 0.6439088582992554, |
| "learning_rate": 0.00038384225837733, |
| "loss": 3.446, |
| "step": 33550 |
| }, |
| { |
| "epoch": 3.616402970616726, |
| "grad_norm": 0.6024984121322632, |
| "learning_rate": 0.00038351901734726856, |
| "loss": 3.4561, |
| "step": 33600 |
| }, |
| { |
| "epoch": 3.621784522656334, |
| "grad_norm": 0.6383893489837646, |
| "learning_rate": 0.00038319577631720715, |
| "loss": 3.467, |
| "step": 33650 |
| }, |
| { |
| "epoch": 3.627166074695942, |
| "grad_norm": 0.6166204214096069, |
| "learning_rate": 0.00038287253528714575, |
| "loss": 3.4563, |
| "step": 33700 |
| }, |
| { |
| "epoch": 3.6325476267355503, |
| "grad_norm": 0.6604927182197571, |
| "learning_rate": 0.00038254929425708434, |
| "loss": 3.4591, |
| "step": 33750 |
| }, |
| { |
| "epoch": 3.637929178775159, |
| "grad_norm": 0.5971705317497253, |
| "learning_rate": 0.00038222605322702294, |
| "loss": 3.4505, |
| "step": 33800 |
| }, |
| { |
| "epoch": 3.643310730814767, |
| "grad_norm": 0.5814175605773926, |
| "learning_rate": 0.00038190281219696153, |
| "loss": 3.4408, |
| "step": 33850 |
| }, |
| { |
| "epoch": 3.648692282854375, |
| "grad_norm": 0.6446985602378845, |
| "learning_rate": 0.00038157957116690007, |
| "loss": 3.4502, |
| "step": 33900 |
| }, |
| { |
| "epoch": 3.6540738348939836, |
| "grad_norm": 0.6055765748023987, |
| "learning_rate": 0.00038125633013683867, |
| "loss": 3.4574, |
| "step": 33950 |
| }, |
| { |
| "epoch": 3.6594553869335917, |
| "grad_norm": 0.6023905873298645, |
| "learning_rate": 0.0003809330891067772, |
| "loss": 3.4499, |
| "step": 34000 |
| }, |
| { |
| "epoch": 3.6594553869335917, |
| "eval_accuracy": 0.3743394040236989, |
| "eval_loss": 3.463463306427002, |
| "eval_runtime": 185.3498, |
| "eval_samples_per_second": 97.173, |
| "eval_steps_per_second": 6.075, |
| "step": 34000 |
| }, |
| { |
| "epoch": 3.6648369389732, |
| "grad_norm": 0.6245377063751221, |
| "learning_rate": 0.00038060984807671586, |
| "loss": 3.4636, |
| "step": 34050 |
| }, |
| { |
| "epoch": 3.670218491012808, |
| "grad_norm": 0.5832623839378357, |
| "learning_rate": 0.00038028660704665445, |
| "loss": 3.4716, |
| "step": 34100 |
| }, |
| { |
| "epoch": 3.675600043052416, |
| "grad_norm": 0.6104280948638916, |
| "learning_rate": 0.000379963366016593, |
| "loss": 3.467, |
| "step": 34150 |
| }, |
| { |
| "epoch": 3.6809815950920246, |
| "grad_norm": 0.6102921366691589, |
| "learning_rate": 0.0003796401249865316, |
| "loss": 3.4588, |
| "step": 34200 |
| }, |
| { |
| "epoch": 3.6863631471316327, |
| "grad_norm": 0.680322527885437, |
| "learning_rate": 0.0003793168839564701, |
| "loss": 3.4555, |
| "step": 34250 |
| }, |
| { |
| "epoch": 3.691744699171241, |
| "grad_norm": 0.5855444669723511, |
| "learning_rate": 0.0003789936429264088, |
| "loss": 3.4561, |
| "step": 34300 |
| }, |
| { |
| "epoch": 3.6971262512108494, |
| "grad_norm": 0.5749221444129944, |
| "learning_rate": 0.00037867040189634737, |
| "loss": 3.4398, |
| "step": 34350 |
| }, |
| { |
| "epoch": 3.7025078032504575, |
| "grad_norm": 0.6411744356155396, |
| "learning_rate": 0.0003783471608662859, |
| "loss": 3.4339, |
| "step": 34400 |
| }, |
| { |
| "epoch": 3.7078893552900656, |
| "grad_norm": 0.62653648853302, |
| "learning_rate": 0.0003780239198362245, |
| "loss": 3.4553, |
| "step": 34450 |
| }, |
| { |
| "epoch": 3.713270907329674, |
| "grad_norm": 0.6701282262802124, |
| "learning_rate": 0.0003777006788061631, |
| "loss": 3.4685, |
| "step": 34500 |
| }, |
| { |
| "epoch": 3.7186524593692822, |
| "grad_norm": 0.6589163541793823, |
| "learning_rate": 0.00037737743777610164, |
| "loss": 3.4411, |
| "step": 34550 |
| }, |
| { |
| "epoch": 3.7240340114088903, |
| "grad_norm": 0.6170485019683838, |
| "learning_rate": 0.0003770541967460403, |
| "loss": 3.4477, |
| "step": 34600 |
| }, |
| { |
| "epoch": 3.7294155634484984, |
| "grad_norm": 0.6524089574813843, |
| "learning_rate": 0.0003767309557159789, |
| "loss": 3.4341, |
| "step": 34650 |
| }, |
| { |
| "epoch": 3.7347971154881066, |
| "grad_norm": 0.6533591747283936, |
| "learning_rate": 0.0003764077146859174, |
| "loss": 3.4535, |
| "step": 34700 |
| }, |
| { |
| "epoch": 3.740178667527715, |
| "grad_norm": 0.6661497950553894, |
| "learning_rate": 0.000376084473655856, |
| "loss": 3.4341, |
| "step": 34750 |
| }, |
| { |
| "epoch": 3.745560219567323, |
| "grad_norm": 0.6009532809257507, |
| "learning_rate": 0.00037576123262579456, |
| "loss": 3.4592, |
| "step": 34800 |
| }, |
| { |
| "epoch": 3.7509417716069313, |
| "grad_norm": 0.6715372800827026, |
| "learning_rate": 0.00037543799159573315, |
| "loss": 3.4464, |
| "step": 34850 |
| }, |
| { |
| "epoch": 3.75632332364654, |
| "grad_norm": 0.6724957227706909, |
| "learning_rate": 0.0003751147505656718, |
| "loss": 3.4591, |
| "step": 34900 |
| }, |
| { |
| "epoch": 3.761704875686148, |
| "grad_norm": 0.5966370105743408, |
| "learning_rate": 0.00037479150953561034, |
| "loss": 3.4587, |
| "step": 34950 |
| }, |
| { |
| "epoch": 3.767086427725756, |
| "grad_norm": 0.6564235687255859, |
| "learning_rate": 0.00037446826850554894, |
| "loss": 3.4533, |
| "step": 35000 |
| }, |
| { |
| "epoch": 3.767086427725756, |
| "eval_accuracy": 0.3750346738528238, |
| "eval_loss": 3.4556665420532227, |
| "eval_runtime": 186.006, |
| "eval_samples_per_second": 96.83, |
| "eval_steps_per_second": 6.054, |
| "step": 35000 |
| }, |
| { |
| "epoch": 3.772467979765364, |
| "grad_norm": 0.6435751914978027, |
| "learning_rate": 0.00037414502747548753, |
| "loss": 3.4489, |
| "step": 35050 |
| }, |
| { |
| "epoch": 3.7778495318049723, |
| "grad_norm": 0.6148671507835388, |
| "learning_rate": 0.00037382178644542607, |
| "loss": 3.4621, |
| "step": 35100 |
| }, |
| { |
| "epoch": 3.783231083844581, |
| "grad_norm": 0.6056584715843201, |
| "learning_rate": 0.00037349854541536467, |
| "loss": 3.4435, |
| "step": 35150 |
| }, |
| { |
| "epoch": 3.788612635884189, |
| "grad_norm": 0.5860791206359863, |
| "learning_rate": 0.0003731753043853033, |
| "loss": 3.4396, |
| "step": 35200 |
| }, |
| { |
| "epoch": 3.793994187923797, |
| "grad_norm": 0.5746303200721741, |
| "learning_rate": 0.00037285206335524186, |
| "loss": 3.4524, |
| "step": 35250 |
| }, |
| { |
| "epoch": 3.7993757399634056, |
| "grad_norm": 0.6134446859359741, |
| "learning_rate": 0.00037252882232518045, |
| "loss": 3.4416, |
| "step": 35300 |
| }, |
| { |
| "epoch": 3.8047572920030137, |
| "grad_norm": 0.6056711673736572, |
| "learning_rate": 0.000372205581295119, |
| "loss": 3.4589, |
| "step": 35350 |
| }, |
| { |
| "epoch": 3.810138844042622, |
| "grad_norm": 0.6229293942451477, |
| "learning_rate": 0.0003718823402650576, |
| "loss": 3.4504, |
| "step": 35400 |
| }, |
| { |
| "epoch": 3.8155203960822304, |
| "grad_norm": 0.6463286280632019, |
| "learning_rate": 0.00037155909923499624, |
| "loss": 3.4534, |
| "step": 35450 |
| }, |
| { |
| "epoch": 3.8209019481218385, |
| "grad_norm": 0.6630687713623047, |
| "learning_rate": 0.0003712358582049348, |
| "loss": 3.459, |
| "step": 35500 |
| }, |
| { |
| "epoch": 3.8262835001614466, |
| "grad_norm": 0.6307542324066162, |
| "learning_rate": 0.00037091261717487337, |
| "loss": 3.4528, |
| "step": 35550 |
| }, |
| { |
| "epoch": 3.8316650522010547, |
| "grad_norm": 0.6218998432159424, |
| "learning_rate": 0.00037058937614481197, |
| "loss": 3.4378, |
| "step": 35600 |
| }, |
| { |
| "epoch": 3.837046604240663, |
| "grad_norm": 0.5981392860412598, |
| "learning_rate": 0.0003702661351147505, |
| "loss": 3.4587, |
| "step": 35650 |
| }, |
| { |
| "epoch": 3.8424281562802713, |
| "grad_norm": 0.604943573474884, |
| "learning_rate": 0.0003699428940846891, |
| "loss": 3.4417, |
| "step": 35700 |
| }, |
| { |
| "epoch": 3.8478097083198795, |
| "grad_norm": 0.6054553389549255, |
| "learning_rate": 0.00036961965305462775, |
| "loss": 3.4481, |
| "step": 35750 |
| }, |
| { |
| "epoch": 3.8531912603594876, |
| "grad_norm": 0.6459552049636841, |
| "learning_rate": 0.0003692964120245663, |
| "loss": 3.4729, |
| "step": 35800 |
| }, |
| { |
| "epoch": 3.858572812399096, |
| "grad_norm": 0.6239967942237854, |
| "learning_rate": 0.0003689731709945049, |
| "loss": 3.4456, |
| "step": 35850 |
| }, |
| { |
| "epoch": 3.863954364438704, |
| "grad_norm": 0.6265727281570435, |
| "learning_rate": 0.0003686499299644434, |
| "loss": 3.4373, |
| "step": 35900 |
| }, |
| { |
| "epoch": 3.8693359164783123, |
| "grad_norm": 0.6187468767166138, |
| "learning_rate": 0.000368326688934382, |
| "loss": 3.4287, |
| "step": 35950 |
| }, |
| { |
| "epoch": 3.8747174685179204, |
| "grad_norm": 0.5702441334724426, |
| "learning_rate": 0.0003680034479043206, |
| "loss": 3.4533, |
| "step": 36000 |
| }, |
| { |
| "epoch": 3.8747174685179204, |
| "eval_accuracy": 0.3755187224680373, |
| "eval_loss": 3.4498579502105713, |
| "eval_runtime": 185.4565, |
| "eval_samples_per_second": 97.117, |
| "eval_steps_per_second": 6.072, |
| "step": 36000 |
| }, |
| { |
| "epoch": 3.8800990205575285, |
| "grad_norm": 0.6100196838378906, |
| "learning_rate": 0.0003676802068742592, |
| "loss": 3.4235, |
| "step": 36050 |
| }, |
| { |
| "epoch": 3.885480572597137, |
| "grad_norm": 0.6305633783340454, |
| "learning_rate": 0.0003673569658441978, |
| "loss": 3.4455, |
| "step": 36100 |
| }, |
| { |
| "epoch": 3.890862124636745, |
| "grad_norm": 0.6668870449066162, |
| "learning_rate": 0.0003670337248141364, |
| "loss": 3.4423, |
| "step": 36150 |
| }, |
| { |
| "epoch": 3.8962436766763533, |
| "grad_norm": 0.6084736585617065, |
| "learning_rate": 0.00036671048378407494, |
| "loss": 3.4535, |
| "step": 36200 |
| }, |
| { |
| "epoch": 3.901625228715962, |
| "grad_norm": 0.6447023153305054, |
| "learning_rate": 0.00036638724275401353, |
| "loss": 3.4569, |
| "step": 36250 |
| }, |
| { |
| "epoch": 3.90700678075557, |
| "grad_norm": 0.6358567476272583, |
| "learning_rate": 0.0003660640017239522, |
| "loss": 3.4369, |
| "step": 36300 |
| }, |
| { |
| "epoch": 3.912388332795178, |
| "grad_norm": 0.597412645816803, |
| "learning_rate": 0.0003657407606938907, |
| "loss": 3.4437, |
| "step": 36350 |
| }, |
| { |
| "epoch": 3.9177698848347866, |
| "grad_norm": 0.6119159460067749, |
| "learning_rate": 0.0003654175196638293, |
| "loss": 3.4496, |
| "step": 36400 |
| }, |
| { |
| "epoch": 3.9231514368743947, |
| "grad_norm": 0.6461895108222961, |
| "learning_rate": 0.00036509427863376786, |
| "loss": 3.4356, |
| "step": 36450 |
| }, |
| { |
| "epoch": 3.928532988914003, |
| "grad_norm": 0.6408407092094421, |
| "learning_rate": 0.00036477103760370645, |
| "loss": 3.4544, |
| "step": 36500 |
| }, |
| { |
| "epoch": 3.933914540953611, |
| "grad_norm": 0.6049111485481262, |
| "learning_rate": 0.00036444779657364505, |
| "loss": 3.4484, |
| "step": 36550 |
| }, |
| { |
| "epoch": 3.939296092993219, |
| "grad_norm": 0.6260215044021606, |
| "learning_rate": 0.00036412455554358364, |
| "loss": 3.4454, |
| "step": 36600 |
| }, |
| { |
| "epoch": 3.9446776450328276, |
| "grad_norm": 0.6023298501968384, |
| "learning_rate": 0.00036380131451352224, |
| "loss": 3.4414, |
| "step": 36650 |
| }, |
| { |
| "epoch": 3.9500591970724357, |
| "grad_norm": 0.6622657775878906, |
| "learning_rate": 0.00036347807348346083, |
| "loss": 3.4632, |
| "step": 36700 |
| }, |
| { |
| "epoch": 3.955440749112044, |
| "grad_norm": 0.6552717089653015, |
| "learning_rate": 0.00036315483245339937, |
| "loss": 3.4478, |
| "step": 36750 |
| }, |
| { |
| "epoch": 3.9608223011516523, |
| "grad_norm": 0.6252656579017639, |
| "learning_rate": 0.00036283159142333797, |
| "loss": 3.4527, |
| "step": 36800 |
| }, |
| { |
| "epoch": 3.9662038531912605, |
| "grad_norm": 0.5924856662750244, |
| "learning_rate": 0.0003625083503932765, |
| "loss": 3.448, |
| "step": 36850 |
| }, |
| { |
| "epoch": 3.9715854052308686, |
| "grad_norm": 0.6145959496498108, |
| "learning_rate": 0.00036218510936321516, |
| "loss": 3.4675, |
| "step": 36900 |
| }, |
| { |
| "epoch": 3.9769669572704767, |
| "grad_norm": 0.663978636264801, |
| "learning_rate": 0.00036186186833315375, |
| "loss": 3.4394, |
| "step": 36950 |
| }, |
| { |
| "epoch": 3.9823485093100848, |
| "grad_norm": 0.642401933670044, |
| "learning_rate": 0.0003615386273030923, |
| "loss": 3.4527, |
| "step": 37000 |
| }, |
| { |
| "epoch": 3.9823485093100848, |
| "eval_accuracy": 0.37618628581076946, |
| "eval_loss": 3.4478323459625244, |
| "eval_runtime": 185.7273, |
| "eval_samples_per_second": 96.976, |
| "eval_steps_per_second": 6.063, |
| "step": 37000 |
| }, |
| { |
| "epoch": 3.9877300613496933, |
| "grad_norm": 0.7342509627342224, |
| "learning_rate": 0.0003612153862730309, |
| "loss": 3.4525, |
| "step": 37050 |
| }, |
| { |
| "epoch": 3.9931116133893014, |
| "grad_norm": 0.6472655534744263, |
| "learning_rate": 0.0003608921452429695, |
| "loss": 3.4575, |
| "step": 37100 |
| }, |
| { |
| "epoch": 3.9984931654289095, |
| "grad_norm": 0.6445570588111877, |
| "learning_rate": 0.000360568904212908, |
| "loss": 3.4452, |
| "step": 37150 |
| }, |
| { |
| "epoch": 4.003874717468518, |
| "grad_norm": 0.6552912592887878, |
| "learning_rate": 0.00036024566318284667, |
| "loss": 3.367, |
| "step": 37200 |
| }, |
| { |
| "epoch": 4.009256269508126, |
| "grad_norm": 0.6877759695053101, |
| "learning_rate": 0.00035992242215278526, |
| "loss": 3.3401, |
| "step": 37250 |
| }, |
| { |
| "epoch": 4.014637821547734, |
| "grad_norm": 0.6554164290428162, |
| "learning_rate": 0.0003595991811227238, |
| "loss": 3.3635, |
| "step": 37300 |
| }, |
| { |
| "epoch": 4.020019373587343, |
| "grad_norm": 0.6302908658981323, |
| "learning_rate": 0.0003592759400926624, |
| "loss": 3.3559, |
| "step": 37350 |
| }, |
| { |
| "epoch": 4.0254009256269505, |
| "grad_norm": 0.688528299331665, |
| "learning_rate": 0.00035895269906260094, |
| "loss": 3.3326, |
| "step": 37400 |
| }, |
| { |
| "epoch": 4.030782477666559, |
| "grad_norm": 0.624566376209259, |
| "learning_rate": 0.0003586294580325396, |
| "loss": 3.371, |
| "step": 37450 |
| }, |
| { |
| "epoch": 4.036164029706168, |
| "grad_norm": 0.6329576373100281, |
| "learning_rate": 0.0003583062170024782, |
| "loss": 3.3754, |
| "step": 37500 |
| }, |
| { |
| "epoch": 4.041545581745775, |
| "grad_norm": 0.6370219588279724, |
| "learning_rate": 0.0003579829759724167, |
| "loss": 3.3571, |
| "step": 37550 |
| }, |
| { |
| "epoch": 4.046927133785384, |
| "grad_norm": 0.6691529750823975, |
| "learning_rate": 0.0003576597349423553, |
| "loss": 3.3653, |
| "step": 37600 |
| }, |
| { |
| "epoch": 4.0523086858249915, |
| "grad_norm": 0.6015875935554504, |
| "learning_rate": 0.0003573364939122939, |
| "loss": 3.3504, |
| "step": 37650 |
| }, |
| { |
| "epoch": 4.0576902378646, |
| "grad_norm": 0.630630373954773, |
| "learning_rate": 0.00035701325288223245, |
| "loss": 3.3662, |
| "step": 37700 |
| }, |
| { |
| "epoch": 4.063071789904209, |
| "grad_norm": 0.6788750886917114, |
| "learning_rate": 0.0003566900118521711, |
| "loss": 3.3776, |
| "step": 37750 |
| }, |
| { |
| "epoch": 4.068453341943816, |
| "grad_norm": 0.6798415184020996, |
| "learning_rate": 0.0003563667708221097, |
| "loss": 3.3712, |
| "step": 37800 |
| }, |
| { |
| "epoch": 4.073834893983425, |
| "grad_norm": 0.6421825885772705, |
| "learning_rate": 0.00035604352979204824, |
| "loss": 3.3668, |
| "step": 37850 |
| }, |
| { |
| "epoch": 4.079216446023033, |
| "grad_norm": 0.6127395629882812, |
| "learning_rate": 0.00035572028876198683, |
| "loss": 3.3658, |
| "step": 37900 |
| }, |
| { |
| "epoch": 4.084597998062641, |
| "grad_norm": 0.6443917751312256, |
| "learning_rate": 0.00035539704773192537, |
| "loss": 3.3753, |
| "step": 37950 |
| }, |
| { |
| "epoch": 4.08997955010225, |
| "grad_norm": 0.6737672090530396, |
| "learning_rate": 0.00035507380670186397, |
| "loss": 3.3723, |
| "step": 38000 |
| }, |
| { |
| "epoch": 4.08997955010225, |
| "eval_accuracy": 0.3766218752380177, |
| "eval_loss": 3.4456825256347656, |
| "eval_runtime": 185.5813, |
| "eval_samples_per_second": 97.052, |
| "eval_steps_per_second": 6.067, |
| "step": 38000 |
| }, |
| { |
| "epoch": 4.095361102141858, |
| "grad_norm": 0.6715104579925537, |
| "learning_rate": 0.0003547505656718026, |
| "loss": 3.35, |
| "step": 38050 |
| }, |
| { |
| "epoch": 4.100742654181466, |
| "grad_norm": 0.6445884704589844, |
| "learning_rate": 0.00035442732464174116, |
| "loss": 3.3821, |
| "step": 38100 |
| }, |
| { |
| "epoch": 4.106124206221074, |
| "grad_norm": 0.6589037179946899, |
| "learning_rate": 0.00035410408361167975, |
| "loss": 3.3716, |
| "step": 38150 |
| }, |
| { |
| "epoch": 4.111505758260682, |
| "grad_norm": 0.6113186478614807, |
| "learning_rate": 0.0003537808425816183, |
| "loss": 3.3783, |
| "step": 38200 |
| }, |
| { |
| "epoch": 4.1168873103002905, |
| "grad_norm": 0.6394211053848267, |
| "learning_rate": 0.00035346406637215815, |
| "loss": 3.3518, |
| "step": 38250 |
| }, |
| { |
| "epoch": 4.122268862339899, |
| "grad_norm": 0.6456443071365356, |
| "learning_rate": 0.0003531408253420967, |
| "loss": 3.3758, |
| "step": 38300 |
| }, |
| { |
| "epoch": 4.127650414379507, |
| "grad_norm": 0.6623437404632568, |
| "learning_rate": 0.0003528175843120353, |
| "loss": 3.3705, |
| "step": 38350 |
| }, |
| { |
| "epoch": 4.133031966419115, |
| "grad_norm": 0.690635621547699, |
| "learning_rate": 0.00035249434328197394, |
| "loss": 3.3639, |
| "step": 38400 |
| }, |
| { |
| "epoch": 4.138413518458724, |
| "grad_norm": 0.6645708084106445, |
| "learning_rate": 0.0003521711022519125, |
| "loss": 3.3804, |
| "step": 38450 |
| }, |
| { |
| "epoch": 4.1437950704983315, |
| "grad_norm": 0.6038868427276611, |
| "learning_rate": 0.00035184786122185107, |
| "loss": 3.3703, |
| "step": 38500 |
| }, |
| { |
| "epoch": 4.14917662253794, |
| "grad_norm": 0.6192077994346619, |
| "learning_rate": 0.00035152462019178967, |
| "loss": 3.3592, |
| "step": 38550 |
| }, |
| { |
| "epoch": 4.154558174577549, |
| "grad_norm": 0.6280696392059326, |
| "learning_rate": 0.0003512013791617282, |
| "loss": 3.3679, |
| "step": 38600 |
| }, |
| { |
| "epoch": 4.159939726617156, |
| "grad_norm": 0.6606400012969971, |
| "learning_rate": 0.0003508781381316668, |
| "loss": 3.3629, |
| "step": 38650 |
| }, |
| { |
| "epoch": 4.165321278656765, |
| "grad_norm": 0.6507160067558289, |
| "learning_rate": 0.00035055489710160545, |
| "loss": 3.3746, |
| "step": 38700 |
| }, |
| { |
| "epoch": 4.1707028306963725, |
| "grad_norm": 0.6669695377349854, |
| "learning_rate": 0.000350231656071544, |
| "loss": 3.3742, |
| "step": 38750 |
| }, |
| { |
| "epoch": 4.176084382735981, |
| "grad_norm": 0.6331527829170227, |
| "learning_rate": 0.0003499084150414826, |
| "loss": 3.3731, |
| "step": 38800 |
| }, |
| { |
| "epoch": 4.18146593477559, |
| "grad_norm": 0.6521340012550354, |
| "learning_rate": 0.0003495851740114211, |
| "loss": 3.3777, |
| "step": 38850 |
| }, |
| { |
| "epoch": 4.186847486815197, |
| "grad_norm": 0.5923029184341431, |
| "learning_rate": 0.0003492619329813597, |
| "loss": 3.3559, |
| "step": 38900 |
| }, |
| { |
| "epoch": 4.192229038854806, |
| "grad_norm": 0.6522383689880371, |
| "learning_rate": 0.0003489386919512983, |
| "loss": 3.3809, |
| "step": 38950 |
| }, |
| { |
| "epoch": 4.197610590894414, |
| "grad_norm": 0.6232262253761292, |
| "learning_rate": 0.0003486154509212369, |
| "loss": 3.3858, |
| "step": 39000 |
| }, |
| { |
| "epoch": 4.197610590894414, |
| "eval_accuracy": 0.37713145728186764, |
| "eval_loss": 3.4412105083465576, |
| "eval_runtime": 185.311, |
| "eval_samples_per_second": 97.193, |
| "eval_steps_per_second": 6.076, |
| "step": 39000 |
| }, |
| { |
| "epoch": 4.202992142934022, |
| "grad_norm": 0.6506069898605347, |
| "learning_rate": 0.0003482922098911755, |
| "loss": 3.3824, |
| "step": 39050 |
| }, |
| { |
| "epoch": 4.208373694973631, |
| "grad_norm": 0.660612165927887, |
| "learning_rate": 0.0003479689688611141, |
| "loss": 3.3863, |
| "step": 39100 |
| }, |
| { |
| "epoch": 4.213755247013238, |
| "grad_norm": 0.611361026763916, |
| "learning_rate": 0.00034764572783105264, |
| "loss": 3.3891, |
| "step": 39150 |
| }, |
| { |
| "epoch": 4.219136799052847, |
| "grad_norm": 0.631380558013916, |
| "learning_rate": 0.00034732248680099123, |
| "loss": 3.3824, |
| "step": 39200 |
| }, |
| { |
| "epoch": 4.224518351092455, |
| "grad_norm": 0.6021390557289124, |
| "learning_rate": 0.0003469992457709299, |
| "loss": 3.3832, |
| "step": 39250 |
| }, |
| { |
| "epoch": 4.229899903132063, |
| "grad_norm": 0.6790795922279358, |
| "learning_rate": 0.0003466760047408684, |
| "loss": 3.3917, |
| "step": 39300 |
| }, |
| { |
| "epoch": 4.2352814551716715, |
| "grad_norm": 0.6313596963882446, |
| "learning_rate": 0.000346352763710807, |
| "loss": 3.3881, |
| "step": 39350 |
| }, |
| { |
| "epoch": 4.24066300721128, |
| "grad_norm": 0.6013673543930054, |
| "learning_rate": 0.00034602952268074556, |
| "loss": 3.3728, |
| "step": 39400 |
| }, |
| { |
| "epoch": 4.246044559250888, |
| "grad_norm": 0.6409134268760681, |
| "learning_rate": 0.00034570628165068415, |
| "loss": 3.3769, |
| "step": 39450 |
| }, |
| { |
| "epoch": 4.251426111290496, |
| "grad_norm": 0.6430742740631104, |
| "learning_rate": 0.0003453830406206227, |
| "loss": 3.3783, |
| "step": 39500 |
| }, |
| { |
| "epoch": 4.256807663330104, |
| "grad_norm": 0.6880787014961243, |
| "learning_rate": 0.00034506626441116255, |
| "loss": 3.3918, |
| "step": 39550 |
| }, |
| { |
| "epoch": 4.2621892153697125, |
| "grad_norm": 0.663536548614502, |
| "learning_rate": 0.0003447430233811011, |
| "loss": 3.3739, |
| "step": 39600 |
| }, |
| { |
| "epoch": 4.267570767409321, |
| "grad_norm": 0.6652539372444153, |
| "learning_rate": 0.00034441978235103974, |
| "loss": 3.3811, |
| "step": 39650 |
| }, |
| { |
| "epoch": 4.272952319448929, |
| "grad_norm": 0.6741958856582642, |
| "learning_rate": 0.00034409654132097834, |
| "loss": 3.3752, |
| "step": 39700 |
| }, |
| { |
| "epoch": 4.278333871488537, |
| "grad_norm": 0.6360245943069458, |
| "learning_rate": 0.0003437733002909169, |
| "loss": 3.3803, |
| "step": 39750 |
| }, |
| { |
| "epoch": 4.283715423528146, |
| "grad_norm": 0.6430355310440063, |
| "learning_rate": 0.00034345005926085547, |
| "loss": 3.3764, |
| "step": 39800 |
| }, |
| { |
| "epoch": 4.2890969755677535, |
| "grad_norm": 0.6121541261672974, |
| "learning_rate": 0.00034312681823079407, |
| "loss": 3.3881, |
| "step": 39850 |
| }, |
| { |
| "epoch": 4.294478527607362, |
| "grad_norm": 0.6350565552711487, |
| "learning_rate": 0.00034280357720073266, |
| "loss": 3.3844, |
| "step": 39900 |
| }, |
| { |
| "epoch": 4.299860079646971, |
| "grad_norm": 0.6175565123558044, |
| "learning_rate": 0.00034248033617067126, |
| "loss": 3.3748, |
| "step": 39950 |
| }, |
| { |
| "epoch": 4.305241631686578, |
| "grad_norm": 0.6013319492340088, |
| "learning_rate": 0.00034215709514060985, |
| "loss": 3.3796, |
| "step": 40000 |
| }, |
| { |
| "epoch": 4.305241631686578, |
| "eval_accuracy": 0.37709842680397204, |
| "eval_loss": 3.441098928451538, |
| "eval_runtime": 185.8449, |
| "eval_samples_per_second": 96.914, |
| "eval_steps_per_second": 6.059, |
| "step": 40000 |
| }, |
| { |
| "epoch": 4.310623183726187, |
| "grad_norm": 0.6016003489494324, |
| "learning_rate": 0.0003418338541105484, |
| "loss": 3.3772, |
| "step": 40050 |
| }, |
| { |
| "epoch": 4.3160047357657945, |
| "grad_norm": 0.6799412369728088, |
| "learning_rate": 0.000341510613080487, |
| "loss": 3.3938, |
| "step": 40100 |
| }, |
| { |
| "epoch": 4.321386287805403, |
| "grad_norm": 0.7045021057128906, |
| "learning_rate": 0.0003411873720504255, |
| "loss": 3.3969, |
| "step": 40150 |
| }, |
| { |
| "epoch": 4.326767839845012, |
| "grad_norm": 0.684752345085144, |
| "learning_rate": 0.0003408641310203642, |
| "loss": 3.3735, |
| "step": 40200 |
| }, |
| { |
| "epoch": 4.332149391884619, |
| "grad_norm": 0.637198805809021, |
| "learning_rate": 0.00034054088999030277, |
| "loss": 3.3918, |
| "step": 40250 |
| }, |
| { |
| "epoch": 4.337530943924228, |
| "grad_norm": 0.6360993385314941, |
| "learning_rate": 0.0003402176489602413, |
| "loss": 3.3864, |
| "step": 40300 |
| }, |
| { |
| "epoch": 4.342912495963836, |
| "grad_norm": 0.6425589323043823, |
| "learning_rate": 0.0003398944079301799, |
| "loss": 3.4061, |
| "step": 40350 |
| }, |
| { |
| "epoch": 4.348294048003444, |
| "grad_norm": 0.6526085734367371, |
| "learning_rate": 0.0003395711669001185, |
| "loss": 3.3708, |
| "step": 40400 |
| }, |
| { |
| "epoch": 4.3536756000430525, |
| "grad_norm": 0.6961384415626526, |
| "learning_rate": 0.00033924792587005704, |
| "loss": 3.3778, |
| "step": 40450 |
| }, |
| { |
| "epoch": 4.359057152082661, |
| "grad_norm": 0.6156586408615112, |
| "learning_rate": 0.0003389246848399957, |
| "loss": 3.4006, |
| "step": 40500 |
| }, |
| { |
| "epoch": 4.364438704122269, |
| "grad_norm": 0.6043623685836792, |
| "learning_rate": 0.0003386014438099343, |
| "loss": 3.3768, |
| "step": 40550 |
| }, |
| { |
| "epoch": 4.369820256161877, |
| "grad_norm": 0.5980582237243652, |
| "learning_rate": 0.0003382782027798728, |
| "loss": 3.4036, |
| "step": 40600 |
| }, |
| { |
| "epoch": 4.375201808201485, |
| "grad_norm": 0.6055423021316528, |
| "learning_rate": 0.0003379549617498114, |
| "loss": 3.37, |
| "step": 40650 |
| }, |
| { |
| "epoch": 4.3805833602410935, |
| "grad_norm": 0.6060426831245422, |
| "learning_rate": 0.00033763172071974996, |
| "loss": 3.3714, |
| "step": 40700 |
| }, |
| { |
| "epoch": 4.385964912280702, |
| "grad_norm": 0.6431573033332825, |
| "learning_rate": 0.00033730847968968855, |
| "loss": 3.3858, |
| "step": 40750 |
| }, |
| { |
| "epoch": 4.39134646432031, |
| "grad_norm": 0.6799246668815613, |
| "learning_rate": 0.0003369852386596272, |
| "loss": 3.3819, |
| "step": 40800 |
| }, |
| { |
| "epoch": 4.396728016359918, |
| "grad_norm": 0.6246432065963745, |
| "learning_rate": 0.00033666199762956574, |
| "loss": 3.396, |
| "step": 40850 |
| }, |
| { |
| "epoch": 4.402109568399527, |
| "grad_norm": 0.6059042811393738, |
| "learning_rate": 0.00033633875659950434, |
| "loss": 3.3765, |
| "step": 40900 |
| }, |
| { |
| "epoch": 4.4074911204391345, |
| "grad_norm": 0.6426601409912109, |
| "learning_rate": 0.0003360155155694429, |
| "loss": 3.3655, |
| "step": 40950 |
| }, |
| { |
| "epoch": 4.412872672478743, |
| "grad_norm": 0.6187671422958374, |
| "learning_rate": 0.0003356922745393815, |
| "loss": 3.3666, |
| "step": 41000 |
| }, |
| { |
| "epoch": 4.412872672478743, |
| "eval_accuracy": 0.3776884119848047, |
| "eval_loss": 3.4329841136932373, |
| "eval_runtime": 185.6939, |
| "eval_samples_per_second": 96.993, |
| "eval_steps_per_second": 6.064, |
| "step": 41000 |
| }, |
| { |
| "epoch": 4.418254224518351, |
| "grad_norm": 0.6929706335067749, |
| "learning_rate": 0.0003353690335093201, |
| "loss": 3.3905, |
| "step": 41050 |
| }, |
| { |
| "epoch": 4.423635776557959, |
| "grad_norm": 0.608570396900177, |
| "learning_rate": 0.0003350457924792587, |
| "loss": 3.3883, |
| "step": 41100 |
| }, |
| { |
| "epoch": 4.429017328597568, |
| "grad_norm": 0.6500557661056519, |
| "learning_rate": 0.00033472255144919726, |
| "loss": 3.3772, |
| "step": 41150 |
| }, |
| { |
| "epoch": 4.4343988806371755, |
| "grad_norm": 0.6308916211128235, |
| "learning_rate": 0.00033439931041913585, |
| "loss": 3.3973, |
| "step": 41200 |
| }, |
| { |
| "epoch": 4.439780432676784, |
| "grad_norm": 0.6509377956390381, |
| "learning_rate": 0.0003340760693890744, |
| "loss": 3.3805, |
| "step": 41250 |
| }, |
| { |
| "epoch": 4.445161984716393, |
| "grad_norm": 0.6561933755874634, |
| "learning_rate": 0.000333752828359013, |
| "loss": 3.3886, |
| "step": 41300 |
| }, |
| { |
| "epoch": 4.450543536756, |
| "grad_norm": 0.6214322447776794, |
| "learning_rate": 0.00033342958732895164, |
| "loss": 3.3794, |
| "step": 41350 |
| }, |
| { |
| "epoch": 4.455925088795609, |
| "grad_norm": 0.627403974533081, |
| "learning_rate": 0.0003331063462988902, |
| "loss": 3.3921, |
| "step": 41400 |
| }, |
| { |
| "epoch": 4.461306640835216, |
| "grad_norm": 0.6275430917739868, |
| "learning_rate": 0.00033278310526882877, |
| "loss": 3.3876, |
| "step": 41450 |
| }, |
| { |
| "epoch": 4.466688192874825, |
| "grad_norm": 0.7008453607559204, |
| "learning_rate": 0.0003324598642387673, |
| "loss": 3.3855, |
| "step": 41500 |
| }, |
| { |
| "epoch": 4.4720697449144335, |
| "grad_norm": 0.6666591167449951, |
| "learning_rate": 0.0003321366232087059, |
| "loss": 3.396, |
| "step": 41550 |
| }, |
| { |
| "epoch": 4.477451296954041, |
| "grad_norm": 0.6528044939041138, |
| "learning_rate": 0.0003318133821786445, |
| "loss": 3.3807, |
| "step": 41600 |
| }, |
| { |
| "epoch": 4.48283284899365, |
| "grad_norm": 0.6648622751235962, |
| "learning_rate": 0.0003314901411485831, |
| "loss": 3.3692, |
| "step": 41650 |
| }, |
| { |
| "epoch": 4.488214401033258, |
| "grad_norm": 0.6213133931159973, |
| "learning_rate": 0.0003311669001185217, |
| "loss": 3.3885, |
| "step": 41700 |
| }, |
| { |
| "epoch": 4.493595953072866, |
| "grad_norm": 0.6371303200721741, |
| "learning_rate": 0.0003308436590884603, |
| "loss": 3.3915, |
| "step": 41750 |
| }, |
| { |
| "epoch": 4.4989775051124745, |
| "grad_norm": 0.640751838684082, |
| "learning_rate": 0.0003305204180583988, |
| "loss": 3.3858, |
| "step": 41800 |
| }, |
| { |
| "epoch": 4.504359057152083, |
| "grad_norm": 0.652368426322937, |
| "learning_rate": 0.0003301971770283374, |
| "loss": 3.3916, |
| "step": 41850 |
| }, |
| { |
| "epoch": 4.509740609191691, |
| "grad_norm": 0.6258159875869751, |
| "learning_rate": 0.00032987393599827607, |
| "loss": 3.3906, |
| "step": 41900 |
| }, |
| { |
| "epoch": 4.515122161231299, |
| "grad_norm": 0.662338376045227, |
| "learning_rate": 0.0003295506949682146, |
| "loss": 3.3902, |
| "step": 41950 |
| }, |
| { |
| "epoch": 4.520503713270907, |
| "grad_norm": 0.7796279191970825, |
| "learning_rate": 0.0003292274539381532, |
| "loss": 3.3933, |
| "step": 42000 |
| }, |
| { |
| "epoch": 4.520503713270907, |
| "eval_accuracy": 0.37852623440274213, |
| "eval_loss": 3.4288346767425537, |
| "eval_runtime": 185.4738, |
| "eval_samples_per_second": 97.108, |
| "eval_steps_per_second": 6.071, |
| "step": 42000 |
| }, |
| { |
| "epoch": 4.5258852653105155, |
| "grad_norm": 0.6728432178497314, |
| "learning_rate": 0.00032890421290809174, |
| "loss": 3.3829, |
| "step": 42050 |
| }, |
| { |
| "epoch": 4.531266817350124, |
| "grad_norm": 0.6380643248558044, |
| "learning_rate": 0.00032858097187803034, |
| "loss": 3.4061, |
| "step": 42100 |
| }, |
| { |
| "epoch": 4.536648369389732, |
| "grad_norm": 0.6570500135421753, |
| "learning_rate": 0.00032825773084796893, |
| "loss": 3.3774, |
| "step": 42150 |
| }, |
| { |
| "epoch": 4.54202992142934, |
| "grad_norm": 0.6483356952667236, |
| "learning_rate": 0.00032793448981790753, |
| "loss": 3.3952, |
| "step": 42200 |
| }, |
| { |
| "epoch": 4.547411473468949, |
| "grad_norm": 0.6839144825935364, |
| "learning_rate": 0.0003276112487878461, |
| "loss": 3.3896, |
| "step": 42250 |
| }, |
| { |
| "epoch": 4.5527930255085565, |
| "grad_norm": 0.7169089913368225, |
| "learning_rate": 0.0003272880077577847, |
| "loss": 3.3932, |
| "step": 42300 |
| }, |
| { |
| "epoch": 4.558174577548165, |
| "grad_norm": 0.7072483897209167, |
| "learning_rate": 0.00032696476672772326, |
| "loss": 3.3805, |
| "step": 42350 |
| }, |
| { |
| "epoch": 4.563556129587774, |
| "grad_norm": 0.6454222798347473, |
| "learning_rate": 0.00032664152569766185, |
| "loss": 3.3798, |
| "step": 42400 |
| }, |
| { |
| "epoch": 4.568937681627381, |
| "grad_norm": 0.6448003053665161, |
| "learning_rate": 0.0003263182846676004, |
| "loss": 3.3848, |
| "step": 42450 |
| }, |
| { |
| "epoch": 4.57431923366699, |
| "grad_norm": 0.638205885887146, |
| "learning_rate": 0.00032599504363753904, |
| "loss": 3.3848, |
| "step": 42500 |
| }, |
| { |
| "epoch": 4.579700785706597, |
| "grad_norm": 0.6642127633094788, |
| "learning_rate": 0.00032567180260747764, |
| "loss": 3.4048, |
| "step": 42550 |
| }, |
| { |
| "epoch": 4.585082337746206, |
| "grad_norm": 0.8099965453147888, |
| "learning_rate": 0.0003253485615774162, |
| "loss": 3.3962, |
| "step": 42600 |
| }, |
| { |
| "epoch": 4.5904638897858145, |
| "grad_norm": 0.6563544869422913, |
| "learning_rate": 0.00032502532054735477, |
| "loss": 3.4022, |
| "step": 42650 |
| }, |
| { |
| "epoch": 4.595845441825422, |
| "grad_norm": 0.665276825428009, |
| "learning_rate": 0.00032470207951729337, |
| "loss": 3.3891, |
| "step": 42700 |
| }, |
| { |
| "epoch": 4.601226993865031, |
| "grad_norm": 0.677116334438324, |
| "learning_rate": 0.0003243788384872319, |
| "loss": 3.3994, |
| "step": 42750 |
| }, |
| { |
| "epoch": 4.606608545904638, |
| "grad_norm": 0.6451838612556458, |
| "learning_rate": 0.00032405559745717056, |
| "loss": 3.4089, |
| "step": 42800 |
| }, |
| { |
| "epoch": 4.611990097944247, |
| "grad_norm": 0.6646443009376526, |
| "learning_rate": 0.00032373235642710915, |
| "loss": 3.384, |
| "step": 42850 |
| }, |
| { |
| "epoch": 4.6173716499838555, |
| "grad_norm": 0.6215267181396484, |
| "learning_rate": 0.0003234091153970477, |
| "loss": 3.3835, |
| "step": 42900 |
| }, |
| { |
| "epoch": 4.622753202023463, |
| "grad_norm": 0.6914271712303162, |
| "learning_rate": 0.0003230858743669863, |
| "loss": 3.3865, |
| "step": 42950 |
| }, |
| { |
| "epoch": 4.628134754063072, |
| "grad_norm": 0.6758708357810974, |
| "learning_rate": 0.0003227626333369248, |
| "loss": 3.388, |
| "step": 43000 |
| }, |
| { |
| "epoch": 4.628134754063072, |
| "eval_accuracy": 0.3789760573582941, |
| "eval_loss": 3.4235808849334717, |
| "eval_runtime": 188.2775, |
| "eval_samples_per_second": 95.662, |
| "eval_steps_per_second": 5.981, |
| "step": 43000 |
| }, |
| { |
| "epoch": 4.63351630610268, |
| "grad_norm": 0.644496738910675, |
| "learning_rate": 0.0003224393923068635, |
| "loss": 3.3989, |
| "step": 43050 |
| }, |
| { |
| "epoch": 4.638897858142288, |
| "grad_norm": 0.6717650294303894, |
| "learning_rate": 0.00032211615127680207, |
| "loss": 3.3677, |
| "step": 43100 |
| }, |
| { |
| "epoch": 4.6442794101818965, |
| "grad_norm": 0.6316590309143066, |
| "learning_rate": 0.0003217929102467406, |
| "loss": 3.3836, |
| "step": 43150 |
| }, |
| { |
| "epoch": 4.649660962221505, |
| "grad_norm": 0.6026739478111267, |
| "learning_rate": 0.0003214696692166792, |
| "loss": 3.3819, |
| "step": 43200 |
| }, |
| { |
| "epoch": 4.655042514261113, |
| "grad_norm": 0.7291637063026428, |
| "learning_rate": 0.0003211464281866178, |
| "loss": 3.3878, |
| "step": 43250 |
| }, |
| { |
| "epoch": 4.660424066300721, |
| "grad_norm": 0.6246160864830017, |
| "learning_rate": 0.00032082318715655634, |
| "loss": 3.3831, |
| "step": 43300 |
| }, |
| { |
| "epoch": 4.665805618340329, |
| "grad_norm": 0.6072555184364319, |
| "learning_rate": 0.000320499946126495, |
| "loss": 3.3819, |
| "step": 43350 |
| }, |
| { |
| "epoch": 4.6711871703799375, |
| "grad_norm": 0.6010484099388123, |
| "learning_rate": 0.0003201767050964336, |
| "loss": 3.3925, |
| "step": 43400 |
| }, |
| { |
| "epoch": 4.676568722419546, |
| "grad_norm": 0.689566433429718, |
| "learning_rate": 0.0003198534640663721, |
| "loss": 3.3882, |
| "step": 43450 |
| }, |
| { |
| "epoch": 4.681950274459154, |
| "grad_norm": 0.6601406335830688, |
| "learning_rate": 0.0003195302230363107, |
| "loss": 3.3921, |
| "step": 43500 |
| }, |
| { |
| "epoch": 4.687331826498762, |
| "grad_norm": 0.66224205493927, |
| "learning_rate": 0.00031920698200624926, |
| "loss": 3.3878, |
| "step": 43550 |
| }, |
| { |
| "epoch": 4.692713378538371, |
| "grad_norm": 0.6448682546615601, |
| "learning_rate": 0.00031888374097618785, |
| "loss": 3.3908, |
| "step": 43600 |
| }, |
| { |
| "epoch": 4.6980949305779784, |
| "grad_norm": 0.6359822154045105, |
| "learning_rate": 0.0003185604999461265, |
| "loss": 3.408, |
| "step": 43650 |
| }, |
| { |
| "epoch": 4.703476482617587, |
| "grad_norm": 0.6792516708374023, |
| "learning_rate": 0.00031823725891606504, |
| "loss": 3.3934, |
| "step": 43700 |
| }, |
| { |
| "epoch": 4.7088580346571955, |
| "grad_norm": 0.7235320210456848, |
| "learning_rate": 0.00031791401788600364, |
| "loss": 3.3974, |
| "step": 43750 |
| }, |
| { |
| "epoch": 4.714239586696803, |
| "grad_norm": 0.6749063730239868, |
| "learning_rate": 0.00031759077685594223, |
| "loss": 3.391, |
| "step": 43800 |
| }, |
| { |
| "epoch": 4.719621138736412, |
| "grad_norm": 0.6731683611869812, |
| "learning_rate": 0.00031726753582588077, |
| "loss": 3.3736, |
| "step": 43850 |
| }, |
| { |
| "epoch": 4.725002690776019, |
| "grad_norm": 0.64322829246521, |
| "learning_rate": 0.0003169442947958194, |
| "loss": 3.3817, |
| "step": 43900 |
| }, |
| { |
| "epoch": 4.730384242815628, |
| "grad_norm": 0.6686130166053772, |
| "learning_rate": 0.000316621053765758, |
| "loss": 3.3752, |
| "step": 43950 |
| }, |
| { |
| "epoch": 4.7357657948552365, |
| "grad_norm": 0.6506117582321167, |
| "learning_rate": 0.00031629781273569656, |
| "loss": 3.3868, |
| "step": 44000 |
| }, |
| { |
| "epoch": 4.7357657948552365, |
| "eval_accuracy": 0.37949128935231036, |
| "eval_loss": 3.415797472000122, |
| "eval_runtime": 195.946, |
| "eval_samples_per_second": 91.918, |
| "eval_steps_per_second": 5.746, |
| "step": 44000 |
| }, |
| { |
| "epoch": 4.741147346894844, |
| "grad_norm": 0.6574264764785767, |
| "learning_rate": 0.00031597457170563515, |
| "loss": 3.3731, |
| "step": 44050 |
| }, |
| { |
| "epoch": 4.746528898934453, |
| "grad_norm": 0.7197338938713074, |
| "learning_rate": 0.0003156513306755737, |
| "loss": 3.3724, |
| "step": 44100 |
| }, |
| { |
| "epoch": 4.751910450974061, |
| "grad_norm": 0.6819115877151489, |
| "learning_rate": 0.0003153280896455123, |
| "loss": 3.4, |
| "step": 44150 |
| }, |
| { |
| "epoch": 4.757292003013669, |
| "grad_norm": 0.6807391047477722, |
| "learning_rate": 0.00031500484861545094, |
| "loss": 3.3764, |
| "step": 44200 |
| }, |
| { |
| "epoch": 4.7626735550532775, |
| "grad_norm": 0.6691242456436157, |
| "learning_rate": 0.0003146816075853895, |
| "loss": 3.4043, |
| "step": 44250 |
| }, |
| { |
| "epoch": 4.768055107092886, |
| "grad_norm": 0.6639679670333862, |
| "learning_rate": 0.00031435836655532807, |
| "loss": 3.3884, |
| "step": 44300 |
| }, |
| { |
| "epoch": 4.773436659132494, |
| "grad_norm": 0.6559722423553467, |
| "learning_rate": 0.00031403512552526667, |
| "loss": 3.3983, |
| "step": 44350 |
| }, |
| { |
| "epoch": 4.778818211172102, |
| "grad_norm": 0.6629297733306885, |
| "learning_rate": 0.0003137118844952052, |
| "loss": 3.3854, |
| "step": 44400 |
| }, |
| { |
| "epoch": 4.78419976321171, |
| "grad_norm": 0.6778562664985657, |
| "learning_rate": 0.0003133886434651438, |
| "loss": 3.3915, |
| "step": 44450 |
| }, |
| { |
| "epoch": 4.7895813152513185, |
| "grad_norm": 0.6818774938583374, |
| "learning_rate": 0.00031306540243508245, |
| "loss": 3.3948, |
| "step": 44500 |
| }, |
| { |
| "epoch": 4.794962867290927, |
| "grad_norm": 0.7193388938903809, |
| "learning_rate": 0.000312742161405021, |
| "loss": 3.4046, |
| "step": 44550 |
| }, |
| { |
| "epoch": 4.800344419330535, |
| "grad_norm": 0.6422929167747498, |
| "learning_rate": 0.0003124189203749596, |
| "loss": 3.3852, |
| "step": 44600 |
| }, |
| { |
| "epoch": 4.805725971370143, |
| "grad_norm": 0.6536322832107544, |
| "learning_rate": 0.0003120956793448981, |
| "loss": 3.3816, |
| "step": 44650 |
| }, |
| { |
| "epoch": 4.811107523409751, |
| "grad_norm": 0.6786619424819946, |
| "learning_rate": 0.0003117724383148367, |
| "loss": 3.3685, |
| "step": 44700 |
| }, |
| { |
| "epoch": 4.8164890754493594, |
| "grad_norm": 0.6211053729057312, |
| "learning_rate": 0.00031144919728477526, |
| "loss": 3.3991, |
| "step": 44750 |
| }, |
| { |
| "epoch": 4.821870627488968, |
| "grad_norm": 0.647323727607727, |
| "learning_rate": 0.0003111259562547139, |
| "loss": 3.3846, |
| "step": 44800 |
| }, |
| { |
| "epoch": 4.827252179528576, |
| "grad_norm": 0.6519112586975098, |
| "learning_rate": 0.0003108027152246525, |
| "loss": 3.3724, |
| "step": 44850 |
| }, |
| { |
| "epoch": 4.832633731568184, |
| "grad_norm": 0.7176257967948914, |
| "learning_rate": 0.0003104794741945911, |
| "loss": 3.3855, |
| "step": 44900 |
| }, |
| { |
| "epoch": 4.838015283607793, |
| "grad_norm": 0.6432632803916931, |
| "learning_rate": 0.00031015623316452964, |
| "loss": 3.397, |
| "step": 44950 |
| }, |
| { |
| "epoch": 4.8433968356474, |
| "grad_norm": 0.6224542260169983, |
| "learning_rate": 0.00030983299213446823, |
| "loss": 3.3877, |
| "step": 45000 |
| }, |
| { |
| "epoch": 4.8433968356474, |
| "eval_accuracy": 0.38022828189035596, |
| "eval_loss": 3.41146183013916, |
| "eval_runtime": 206.0262, |
| "eval_samples_per_second": 87.421, |
| "eval_steps_per_second": 5.465, |
| "step": 45000 |
| }, |
| { |
| "epoch": 4.848778387687009, |
| "grad_norm": 0.6494463086128235, |
| "learning_rate": 0.0003095097511044069, |
| "loss": 3.389, |
| "step": 45050 |
| }, |
| { |
| "epoch": 4.8541599397266175, |
| "grad_norm": 0.6661505103111267, |
| "learning_rate": 0.0003091865100743454, |
| "loss": 3.3898, |
| "step": 45100 |
| }, |
| { |
| "epoch": 4.859541491766225, |
| "grad_norm": 0.6509565114974976, |
| "learning_rate": 0.000308863269044284, |
| "loss": 3.4159, |
| "step": 45150 |
| }, |
| { |
| "epoch": 4.864923043805834, |
| "grad_norm": 0.6561294794082642, |
| "learning_rate": 0.00030854002801422256, |
| "loss": 3.4058, |
| "step": 45200 |
| }, |
| { |
| "epoch": 4.870304595845441, |
| "grad_norm": 0.6605173349380493, |
| "learning_rate": 0.00030821678698416115, |
| "loss": 3.3868, |
| "step": 45250 |
| }, |
| { |
| "epoch": 4.87568614788505, |
| "grad_norm": 0.6553974747657776, |
| "learning_rate": 0.0003078935459540997, |
| "loss": 3.37, |
| "step": 45300 |
| }, |
| { |
| "epoch": 4.8810676999246585, |
| "grad_norm": 0.6796717047691345, |
| "learning_rate": 0.00030757676974463955, |
| "loss": 3.3938, |
| "step": 45350 |
| }, |
| { |
| "epoch": 4.886449251964266, |
| "grad_norm": 0.7089065313339233, |
| "learning_rate": 0.0003072535287145781, |
| "loss": 3.4027, |
| "step": 45400 |
| }, |
| { |
| "epoch": 4.891830804003875, |
| "grad_norm": 0.6419367790222168, |
| "learning_rate": 0.00030693028768451674, |
| "loss": 3.3955, |
| "step": 45450 |
| }, |
| { |
| "epoch": 4.897212356043483, |
| "grad_norm": 0.7183190584182739, |
| "learning_rate": 0.00030660704665445534, |
| "loss": 3.405, |
| "step": 45500 |
| }, |
| { |
| "epoch": 4.902593908083091, |
| "grad_norm": 0.6650996208190918, |
| "learning_rate": 0.0003062838056243939, |
| "loss": 3.3796, |
| "step": 45550 |
| }, |
| { |
| "epoch": 4.9079754601226995, |
| "grad_norm": 0.6407854557037354, |
| "learning_rate": 0.00030596056459433247, |
| "loss": 3.3895, |
| "step": 45600 |
| }, |
| { |
| "epoch": 4.913357012162308, |
| "grad_norm": 0.677783191204071, |
| "learning_rate": 0.00030563732356427107, |
| "loss": 3.3871, |
| "step": 45650 |
| }, |
| { |
| "epoch": 4.918738564201916, |
| "grad_norm": 0.6902631521224976, |
| "learning_rate": 0.00030531408253420966, |
| "loss": 3.3757, |
| "step": 45700 |
| }, |
| { |
| "epoch": 4.924120116241524, |
| "grad_norm": 0.6598906517028809, |
| "learning_rate": 0.00030499084150414826, |
| "loss": 3.3974, |
| "step": 45750 |
| }, |
| { |
| "epoch": 4.929501668281132, |
| "grad_norm": 0.6694639921188354, |
| "learning_rate": 0.00030466760047408685, |
| "loss": 3.4009, |
| "step": 45800 |
| }, |
| { |
| "epoch": 4.9348832203207404, |
| "grad_norm": 0.6821097135543823, |
| "learning_rate": 0.0003043443594440254, |
| "loss": 3.384, |
| "step": 45850 |
| }, |
| { |
| "epoch": 4.940264772360349, |
| "grad_norm": 0.6829236745834351, |
| "learning_rate": 0.000304021118413964, |
| "loss": 3.3811, |
| "step": 45900 |
| }, |
| { |
| "epoch": 4.945646324399957, |
| "grad_norm": 0.6571699380874634, |
| "learning_rate": 0.0003036978773839025, |
| "loss": 3.388, |
| "step": 45950 |
| }, |
| { |
| "epoch": 4.951027876439565, |
| "grad_norm": 0.6993294358253479, |
| "learning_rate": 0.0003033746363538412, |
| "loss": 3.3995, |
| "step": 46000 |
| }, |
| { |
| "epoch": 4.951027876439565, |
| "eval_accuracy": 0.3807766530151557, |
| "eval_loss": 3.4071731567382812, |
| "eval_runtime": 196.2525, |
| "eval_samples_per_second": 91.775, |
| "eval_steps_per_second": 5.738, |
| "step": 46000 |
| }, |
| { |
| "epoch": 4.956409428479174, |
| "grad_norm": 0.6646604537963867, |
| "learning_rate": 0.00030305139532377977, |
| "loss": 3.389, |
| "step": 46050 |
| }, |
| { |
| "epoch": 4.961790980518781, |
| "grad_norm": 0.6480951309204102, |
| "learning_rate": 0.0003027281542937183, |
| "loss": 3.389, |
| "step": 46100 |
| }, |
| { |
| "epoch": 4.96717253255839, |
| "grad_norm": 0.6770681738853455, |
| "learning_rate": 0.0003024049132636569, |
| "loss": 3.3843, |
| "step": 46150 |
| }, |
| { |
| "epoch": 4.9725540845979985, |
| "grad_norm": 0.6758816838264465, |
| "learning_rate": 0.0003020816722335955, |
| "loss": 3.3836, |
| "step": 46200 |
| }, |
| { |
| "epoch": 4.977935636637606, |
| "grad_norm": 0.7239846587181091, |
| "learning_rate": 0.00030175843120353404, |
| "loss": 3.3927, |
| "step": 46250 |
| }, |
| { |
| "epoch": 4.983317188677215, |
| "grad_norm": 0.6626628041267395, |
| "learning_rate": 0.0003014351901734727, |
| "loss": 3.3825, |
| "step": 46300 |
| }, |
| { |
| "epoch": 4.988698740716822, |
| "grad_norm": 0.653659999370575, |
| "learning_rate": 0.0003011119491434113, |
| "loss": 3.3807, |
| "step": 46350 |
| }, |
| { |
| "epoch": 4.994080292756431, |
| "grad_norm": 0.653965950012207, |
| "learning_rate": 0.0003007887081133498, |
| "loss": 3.3818, |
| "step": 46400 |
| }, |
| { |
| "epoch": 4.9994618447960395, |
| "grad_norm": 0.6614517569541931, |
| "learning_rate": 0.0003004654670832884, |
| "loss": 3.3878, |
| "step": 46450 |
| }, |
| { |
| "epoch": 5.004843396835647, |
| "grad_norm": 0.6403462886810303, |
| "learning_rate": 0.00030014222605322696, |
| "loss": 3.3043, |
| "step": 46500 |
| }, |
| { |
| "epoch": 5.010224948875256, |
| "grad_norm": 0.6614930033683777, |
| "learning_rate": 0.00029981898502316555, |
| "loss": 3.2837, |
| "step": 46550 |
| }, |
| { |
| "epoch": 5.015606500914864, |
| "grad_norm": 0.6274611353874207, |
| "learning_rate": 0.00029949574399310415, |
| "loss": 3.3024, |
| "step": 46600 |
| }, |
| { |
| "epoch": 5.020988052954472, |
| "grad_norm": 0.6498275399208069, |
| "learning_rate": 0.00029917250296304274, |
| "loss": 3.2855, |
| "step": 46650 |
| }, |
| { |
| "epoch": 5.0263696049940805, |
| "grad_norm": 0.6935259103775024, |
| "learning_rate": 0.00029884926193298134, |
| "loss": 3.294, |
| "step": 46700 |
| }, |
| { |
| "epoch": 5.031751157033688, |
| "grad_norm": 0.6822585463523865, |
| "learning_rate": 0.0002985260209029199, |
| "loss": 3.2945, |
| "step": 46750 |
| }, |
| { |
| "epoch": 5.037132709073297, |
| "grad_norm": 0.6383519768714905, |
| "learning_rate": 0.00029820277987285853, |
| "loss": 3.314, |
| "step": 46800 |
| }, |
| { |
| "epoch": 5.042514261112905, |
| "grad_norm": 0.6817286014556885, |
| "learning_rate": 0.00029787953884279707, |
| "loss": 3.3041, |
| "step": 46850 |
| }, |
| { |
| "epoch": 5.047895813152513, |
| "grad_norm": 0.6522748470306396, |
| "learning_rate": 0.00029755629781273566, |
| "loss": 3.2853, |
| "step": 46900 |
| }, |
| { |
| "epoch": 5.0532773651921215, |
| "grad_norm": 0.6280950903892517, |
| "learning_rate": 0.00029723305678267426, |
| "loss": 3.3078, |
| "step": 46950 |
| }, |
| { |
| "epoch": 5.05865891723173, |
| "grad_norm": 0.6908829212188721, |
| "learning_rate": 0.00029690981575261285, |
| "loss": 3.3128, |
| "step": 47000 |
| }, |
| { |
| "epoch": 5.05865891723173, |
| "eval_accuracy": 0.3805925950231979, |
| "eval_loss": 3.4133989810943604, |
| "eval_runtime": 202.0265, |
| "eval_samples_per_second": 89.152, |
| "eval_steps_per_second": 5.574, |
| "step": 47000 |
| }, |
| { |
| "epoch": 5.064040469271338, |
| "grad_norm": 0.6536048650741577, |
| "learning_rate": 0.0002965865747225514, |
| "loss": 3.3042, |
| "step": 47050 |
| }, |
| { |
| "epoch": 5.069422021310946, |
| "grad_norm": 0.6916722059249878, |
| "learning_rate": 0.00029626333369249, |
| "loss": 3.3182, |
| "step": 47100 |
| }, |
| { |
| "epoch": 5.074803573350554, |
| "grad_norm": 0.7070348858833313, |
| "learning_rate": 0.0002959400926624286, |
| "loss": 3.3146, |
| "step": 47150 |
| }, |
| { |
| "epoch": 5.080185125390162, |
| "grad_norm": 0.6372457146644592, |
| "learning_rate": 0.0002956168516323672, |
| "loss": 3.2856, |
| "step": 47200 |
| }, |
| { |
| "epoch": 5.085566677429771, |
| "grad_norm": 0.7483593821525574, |
| "learning_rate": 0.00029529361060230577, |
| "loss": 3.3038, |
| "step": 47250 |
| }, |
| { |
| "epoch": 5.090948229469379, |
| "grad_norm": 0.6939383149147034, |
| "learning_rate": 0.0002949703695722443, |
| "loss": 3.3055, |
| "step": 47300 |
| }, |
| { |
| "epoch": 5.096329781508987, |
| "grad_norm": 0.680477499961853, |
| "learning_rate": 0.00029464712854218296, |
| "loss": 3.3089, |
| "step": 47350 |
| }, |
| { |
| "epoch": 5.101711333548596, |
| "grad_norm": 0.7036064863204956, |
| "learning_rate": 0.0002943238875121215, |
| "loss": 3.3127, |
| "step": 47400 |
| }, |
| { |
| "epoch": 5.107092885588203, |
| "grad_norm": 0.674315869808197, |
| "learning_rate": 0.0002940006464820601, |
| "loss": 3.296, |
| "step": 47450 |
| }, |
| { |
| "epoch": 5.112474437627812, |
| "grad_norm": 0.6757225394248962, |
| "learning_rate": 0.0002936774054519987, |
| "loss": 3.3277, |
| "step": 47500 |
| }, |
| { |
| "epoch": 5.1178559896674205, |
| "grad_norm": 0.6784997582435608, |
| "learning_rate": 0.0002933541644219373, |
| "loss": 3.3212, |
| "step": 47550 |
| }, |
| { |
| "epoch": 5.123237541707028, |
| "grad_norm": 0.7136285305023193, |
| "learning_rate": 0.0002930309233918758, |
| "loss": 3.3031, |
| "step": 47600 |
| }, |
| { |
| "epoch": 5.128619093746637, |
| "grad_norm": 0.7315130829811096, |
| "learning_rate": 0.0002927076823618144, |
| "loss": 3.3216, |
| "step": 47650 |
| }, |
| { |
| "epoch": 5.134000645786244, |
| "grad_norm": 0.6805402636528015, |
| "learning_rate": 0.000292384441331753, |
| "loss": 3.3204, |
| "step": 47700 |
| }, |
| { |
| "epoch": 5.139382197825853, |
| "grad_norm": 0.6625795364379883, |
| "learning_rate": 0.0002920612003016916, |
| "loss": 3.3254, |
| "step": 47750 |
| }, |
| { |
| "epoch": 5.1447637498654615, |
| "grad_norm": 0.6752861738204956, |
| "learning_rate": 0.0002917379592716302, |
| "loss": 3.3083, |
| "step": 47800 |
| }, |
| { |
| "epoch": 5.150145301905069, |
| "grad_norm": 0.6586028337478638, |
| "learning_rate": 0.00029141471824156874, |
| "loss": 3.3197, |
| "step": 47850 |
| }, |
| { |
| "epoch": 5.155526853944678, |
| "grad_norm": 0.6836965680122375, |
| "learning_rate": 0.00029109147721150734, |
| "loss": 3.3259, |
| "step": 47900 |
| }, |
| { |
| "epoch": 5.160908405984286, |
| "grad_norm": 0.6927655935287476, |
| "learning_rate": 0.00029076823618144593, |
| "loss": 3.3055, |
| "step": 47950 |
| }, |
| { |
| "epoch": 5.166289958023894, |
| "grad_norm": 0.673652172088623, |
| "learning_rate": 0.00029044499515138453, |
| "loss": 3.3421, |
| "step": 48000 |
| }, |
| { |
| "epoch": 5.166289958023894, |
| "eval_accuracy": 0.3808159853605445, |
| "eval_loss": 3.41086483001709, |
| "eval_runtime": 189.1518, |
| "eval_samples_per_second": 95.22, |
| "eval_steps_per_second": 5.953, |
| "step": 48000 |
| }, |
| { |
| "epoch": 5.1716715100635025, |
| "grad_norm": 0.70196533203125, |
| "learning_rate": 0.0002901217541213231, |
| "loss": 3.2895, |
| "step": 48050 |
| }, |
| { |
| "epoch": 5.17705306210311, |
| "grad_norm": 0.6821349263191223, |
| "learning_rate": 0.0002897985130912617, |
| "loss": 3.3135, |
| "step": 48100 |
| }, |
| { |
| "epoch": 5.182434614142719, |
| "grad_norm": 0.6695942878723145, |
| "learning_rate": 0.00028947527206120026, |
| "loss": 3.2997, |
| "step": 48150 |
| }, |
| { |
| "epoch": 5.187816166182327, |
| "grad_norm": 0.6828153729438782, |
| "learning_rate": 0.00028915203103113885, |
| "loss": 3.3362, |
| "step": 48200 |
| }, |
| { |
| "epoch": 5.193197718221935, |
| "grad_norm": 0.6441693902015686, |
| "learning_rate": 0.00028882879000107745, |
| "loss": 3.3123, |
| "step": 48250 |
| }, |
| { |
| "epoch": 5.198579270261543, |
| "grad_norm": 0.7036584615707397, |
| "learning_rate": 0.00028850554897101604, |
| "loss": 3.3284, |
| "step": 48300 |
| }, |
| { |
| "epoch": 5.203960822301152, |
| "grad_norm": 0.6629340052604675, |
| "learning_rate": 0.00028818230794095464, |
| "loss": 3.3241, |
| "step": 48350 |
| }, |
| { |
| "epoch": 5.20934237434076, |
| "grad_norm": 0.6551874279975891, |
| "learning_rate": 0.0002878590669108932, |
| "loss": 3.3282, |
| "step": 48400 |
| }, |
| { |
| "epoch": 5.214723926380368, |
| "grad_norm": 0.6657053828239441, |
| "learning_rate": 0.00028753582588083177, |
| "loss": 3.3131, |
| "step": 48450 |
| }, |
| { |
| "epoch": 5.220105478419977, |
| "grad_norm": 0.7288995385169983, |
| "learning_rate": 0.00028721258485077037, |
| "loss": 3.3064, |
| "step": 48500 |
| }, |
| { |
| "epoch": 5.225487030459584, |
| "grad_norm": 0.706412672996521, |
| "learning_rate": 0.00028688934382070896, |
| "loss": 3.3179, |
| "step": 48550 |
| }, |
| { |
| "epoch": 5.230868582499193, |
| "grad_norm": 0.6880627870559692, |
| "learning_rate": 0.0002865661027906475, |
| "loss": 3.3231, |
| "step": 48600 |
| }, |
| { |
| "epoch": 5.236250134538801, |
| "grad_norm": 0.7488923072814941, |
| "learning_rate": 0.00028624286176058615, |
| "loss": 3.3192, |
| "step": 48650 |
| }, |
| { |
| "epoch": 5.241631686578409, |
| "grad_norm": 0.6686444878578186, |
| "learning_rate": 0.0002859196207305247, |
| "loss": 3.3058, |
| "step": 48700 |
| }, |
| { |
| "epoch": 5.247013238618018, |
| "grad_norm": 0.6534485220909119, |
| "learning_rate": 0.0002855963797004633, |
| "loss": 3.3363, |
| "step": 48750 |
| }, |
| { |
| "epoch": 5.252394790657625, |
| "grad_norm": 0.6567819714546204, |
| "learning_rate": 0.0002852731386704019, |
| "loss": 3.3279, |
| "step": 48800 |
| }, |
| { |
| "epoch": 5.257776342697234, |
| "grad_norm": 0.6784723997116089, |
| "learning_rate": 0.0002849498976403405, |
| "loss": 3.3252, |
| "step": 48850 |
| }, |
| { |
| "epoch": 5.2631578947368425, |
| "grad_norm": 0.6590015292167664, |
| "learning_rate": 0.000284626656610279, |
| "loss": 3.3222, |
| "step": 48900 |
| }, |
| { |
| "epoch": 5.26853944677645, |
| "grad_norm": 0.6960996985435486, |
| "learning_rate": 0.0002843034155802176, |
| "loss": 3.3329, |
| "step": 48950 |
| }, |
| { |
| "epoch": 5.273920998816059, |
| "grad_norm": 0.7198411822319031, |
| "learning_rate": 0.0002839801745501562, |
| "loss": 3.3313, |
| "step": 49000 |
| }, |
| { |
| "epoch": 5.273920998816059, |
| "eval_accuracy": 0.3814156406483926, |
| "eval_loss": 3.4050655364990234, |
| "eval_runtime": 193.7931, |
| "eval_samples_per_second": 92.939, |
| "eval_steps_per_second": 5.81, |
| "step": 49000 |
| }, |
| { |
| "epoch": 5.279302550855666, |
| "grad_norm": 0.6831629872322083, |
| "learning_rate": 0.0002836569335200948, |
| "loss": 3.3077, |
| "step": 49050 |
| }, |
| { |
| "epoch": 5.284684102895275, |
| "grad_norm": 0.6490628123283386, |
| "learning_rate": 0.0002833336924900334, |
| "loss": 3.3434, |
| "step": 49100 |
| }, |
| { |
| "epoch": 5.2900656549348835, |
| "grad_norm": 0.6766687035560608, |
| "learning_rate": 0.00028301045145997193, |
| "loss": 3.3244, |
| "step": 49150 |
| }, |
| { |
| "epoch": 5.295447206974491, |
| "grad_norm": 0.673887312412262, |
| "learning_rate": 0.0002826872104299106, |
| "loss": 3.3171, |
| "step": 49200 |
| }, |
| { |
| "epoch": 5.3008287590141, |
| "grad_norm": 0.7235074639320374, |
| "learning_rate": 0.0002823639693998491, |
| "loss": 3.3033, |
| "step": 49250 |
| }, |
| { |
| "epoch": 5.306210311053708, |
| "grad_norm": 0.6715757250785828, |
| "learning_rate": 0.0002820407283697877, |
| "loss": 3.3341, |
| "step": 49300 |
| }, |
| { |
| "epoch": 5.311591863093316, |
| "grad_norm": 0.7311149835586548, |
| "learning_rate": 0.0002817239521603275, |
| "loss": 3.3085, |
| "step": 49350 |
| }, |
| { |
| "epoch": 5.316973415132924, |
| "grad_norm": 0.6206410527229309, |
| "learning_rate": 0.0002814007111302661, |
| "loss": 3.335, |
| "step": 49400 |
| }, |
| { |
| "epoch": 5.322354967172533, |
| "grad_norm": 0.6753268241882324, |
| "learning_rate": 0.0002810774701002047, |
| "loss": 3.3259, |
| "step": 49450 |
| }, |
| { |
| "epoch": 5.327736519212141, |
| "grad_norm": 0.7057104110717773, |
| "learning_rate": 0.00028075422907014325, |
| "loss": 3.3277, |
| "step": 49500 |
| }, |
| { |
| "epoch": 5.333118071251749, |
| "grad_norm": 0.7210546731948853, |
| "learning_rate": 0.00028043098804008185, |
| "loss": 3.3208, |
| "step": 49550 |
| }, |
| { |
| "epoch": 5.338499623291357, |
| "grad_norm": 0.6837037205696106, |
| "learning_rate": 0.00028010774701002044, |
| "loss": 3.3161, |
| "step": 49600 |
| }, |
| { |
| "epoch": 5.343881175330965, |
| "grad_norm": 0.6846098303794861, |
| "learning_rate": 0.00027978450597995904, |
| "loss": 3.3289, |
| "step": 49650 |
| }, |
| { |
| "epoch": 5.349262727370574, |
| "grad_norm": 0.6509220600128174, |
| "learning_rate": 0.0002794612649498976, |
| "loss": 3.3072, |
| "step": 49700 |
| }, |
| { |
| "epoch": 5.354644279410182, |
| "grad_norm": 0.6819303631782532, |
| "learning_rate": 0.00027914448874043744, |
| "loss": 3.3168, |
| "step": 49750 |
| }, |
| { |
| "epoch": 5.36002583144979, |
| "grad_norm": 0.6943836212158203, |
| "learning_rate": 0.00027882124771037603, |
| "loss": 3.3241, |
| "step": 49800 |
| }, |
| { |
| "epoch": 5.365407383489399, |
| "grad_norm": 0.7299051284790039, |
| "learning_rate": 0.0002784980066803146, |
| "loss": 3.3447, |
| "step": 49850 |
| }, |
| { |
| "epoch": 5.370788935529006, |
| "grad_norm": 0.7343830466270447, |
| "learning_rate": 0.00027817476565025317, |
| "loss": 3.3466, |
| "step": 49900 |
| }, |
| { |
| "epoch": 5.376170487568615, |
| "grad_norm": 0.7176545262336731, |
| "learning_rate": 0.00027785152462019176, |
| "loss": 3.3147, |
| "step": 49950 |
| }, |
| { |
| "epoch": 5.3815520396082235, |
| "grad_norm": 0.6610731482505798, |
| "learning_rate": 0.00027752828359013036, |
| "loss": 3.334, |
| "step": 50000 |
| }, |
| { |
| "epoch": 5.3815520396082235, |
| "eval_accuracy": 0.38206538491752323, |
| "eval_loss": 3.400869369506836, |
| "eval_runtime": 229.8589, |
| "eval_samples_per_second": 78.357, |
| "eval_steps_per_second": 4.899, |
| "step": 50000 |
| }, |
| { |
| "epoch": 5.386933591647831, |
| "grad_norm": 0.7090919613838196, |
| "learning_rate": 0.0002772050425600689, |
| "loss": 3.3351, |
| "step": 50050 |
| }, |
| { |
| "epoch": 5.39231514368744, |
| "grad_norm": 0.7518377304077148, |
| "learning_rate": 0.00027688180153000755, |
| "loss": 3.3191, |
| "step": 50100 |
| }, |
| { |
| "epoch": 5.397696695727047, |
| "grad_norm": 0.7093546986579895, |
| "learning_rate": 0.0002765585604999461, |
| "loss": 3.3343, |
| "step": 50150 |
| }, |
| { |
| "epoch": 5.403078247766656, |
| "grad_norm": 0.6840599775314331, |
| "learning_rate": 0.0002762353194698847, |
| "loss": 3.32, |
| "step": 50200 |
| }, |
| { |
| "epoch": 5.4084597998062645, |
| "grad_norm": 0.6484826803207397, |
| "learning_rate": 0.0002759120784398233, |
| "loss": 3.3218, |
| "step": 50250 |
| }, |
| { |
| "epoch": 5.413841351845872, |
| "grad_norm": 0.6536992788314819, |
| "learning_rate": 0.00027558883740976187, |
| "loss": 3.3521, |
| "step": 50300 |
| }, |
| { |
| "epoch": 5.419222903885481, |
| "grad_norm": 0.6963332295417786, |
| "learning_rate": 0.0002752655963797004, |
| "loss": 3.3312, |
| "step": 50350 |
| }, |
| { |
| "epoch": 5.424604455925088, |
| "grad_norm": 0.6415053009986877, |
| "learning_rate": 0.000274942355349639, |
| "loss": 3.3309, |
| "step": 50400 |
| }, |
| { |
| "epoch": 5.429986007964697, |
| "grad_norm": 0.6596943736076355, |
| "learning_rate": 0.0002746191143195776, |
| "loss": 3.3303, |
| "step": 50450 |
| }, |
| { |
| "epoch": 5.435367560004305, |
| "grad_norm": 0.6950709819793701, |
| "learning_rate": 0.0002742958732895162, |
| "loss": 3.3397, |
| "step": 50500 |
| }, |
| { |
| "epoch": 5.440749112043913, |
| "grad_norm": 0.7178179025650024, |
| "learning_rate": 0.0002739726322594548, |
| "loss": 3.3321, |
| "step": 50550 |
| }, |
| { |
| "epoch": 5.446130664083522, |
| "grad_norm": 0.702781617641449, |
| "learning_rate": 0.00027364939122939333, |
| "loss": 3.3373, |
| "step": 50600 |
| }, |
| { |
| "epoch": 5.45151221612313, |
| "grad_norm": 0.7205773591995239, |
| "learning_rate": 0.0002733261501993319, |
| "loss": 3.3475, |
| "step": 50650 |
| }, |
| { |
| "epoch": 5.456893768162738, |
| "grad_norm": 0.787045955657959, |
| "learning_rate": 0.0002730029091692705, |
| "loss": 3.3291, |
| "step": 50700 |
| }, |
| { |
| "epoch": 5.462275320202346, |
| "grad_norm": 0.6911286115646362, |
| "learning_rate": 0.0002726796681392091, |
| "loss": 3.3394, |
| "step": 50750 |
| }, |
| { |
| "epoch": 5.467656872241955, |
| "grad_norm": 0.6939038634300232, |
| "learning_rate": 0.0002723564271091477, |
| "loss": 3.3394, |
| "step": 50800 |
| }, |
| { |
| "epoch": 5.473038424281563, |
| "grad_norm": 0.6775001287460327, |
| "learning_rate": 0.0002720331860790863, |
| "loss": 3.3299, |
| "step": 50850 |
| }, |
| { |
| "epoch": 5.478419976321171, |
| "grad_norm": 0.6703805923461914, |
| "learning_rate": 0.00027170994504902485, |
| "loss": 3.3365, |
| "step": 50900 |
| }, |
| { |
| "epoch": 5.483801528360779, |
| "grad_norm": 0.6991882920265198, |
| "learning_rate": 0.00027138670401896344, |
| "loss": 3.3393, |
| "step": 50950 |
| }, |
| { |
| "epoch": 5.489183080400387, |
| "grad_norm": 0.7240285277366638, |
| "learning_rate": 0.00027106346298890204, |
| "loss": 3.3365, |
| "step": 51000 |
| }, |
| { |
| "epoch": 5.489183080400387, |
| "eval_accuracy": 0.3827228435416887, |
| "eval_loss": 3.3959908485412598, |
| "eval_runtime": 191.5013, |
| "eval_samples_per_second": 94.052, |
| "eval_steps_per_second": 5.88, |
| "step": 51000 |
| }, |
| { |
| "epoch": 5.494564632439996, |
| "grad_norm": 0.7037093043327332, |
| "learning_rate": 0.00027074022195884063, |
| "loss": 3.3455, |
| "step": 51050 |
| }, |
| { |
| "epoch": 5.499946184479604, |
| "grad_norm": 0.7127552628517151, |
| "learning_rate": 0.0002704169809287792, |
| "loss": 3.3367, |
| "step": 51100 |
| }, |
| { |
| "epoch": 5.505327736519212, |
| "grad_norm": 0.6924493908882141, |
| "learning_rate": 0.00027009373989871776, |
| "loss": 3.3169, |
| "step": 51150 |
| }, |
| { |
| "epoch": 5.510709288558821, |
| "grad_norm": 0.6796975135803223, |
| "learning_rate": 0.00026977049886865636, |
| "loss": 3.3356, |
| "step": 51200 |
| }, |
| { |
| "epoch": 5.516090840598428, |
| "grad_norm": 0.7216967344284058, |
| "learning_rate": 0.00026944725783859495, |
| "loss": 3.3283, |
| "step": 51250 |
| }, |
| { |
| "epoch": 5.521472392638037, |
| "grad_norm": 0.6969133019447327, |
| "learning_rate": 0.00026912401680853355, |
| "loss": 3.3162, |
| "step": 51300 |
| }, |
| { |
| "epoch": 5.5268539446776455, |
| "grad_norm": 0.6896169185638428, |
| "learning_rate": 0.0002688007757784721, |
| "loss": 3.318, |
| "step": 51350 |
| }, |
| { |
| "epoch": 5.532235496717253, |
| "grad_norm": 0.6580575704574585, |
| "learning_rate": 0.00026847753474841074, |
| "loss": 3.3272, |
| "step": 51400 |
| }, |
| { |
| "epoch": 5.537617048756862, |
| "grad_norm": 0.7668164372444153, |
| "learning_rate": 0.0002681542937183493, |
| "loss": 3.3273, |
| "step": 51450 |
| }, |
| { |
| "epoch": 5.542998600796469, |
| "grad_norm": 0.6946510076522827, |
| "learning_rate": 0.0002678310526882879, |
| "loss": 3.351, |
| "step": 51500 |
| }, |
| { |
| "epoch": 5.548380152836078, |
| "grad_norm": 0.7242096662521362, |
| "learning_rate": 0.00026750781165822647, |
| "loss": 3.3435, |
| "step": 51550 |
| }, |
| { |
| "epoch": 5.553761704875686, |
| "grad_norm": 0.7032466530799866, |
| "learning_rate": 0.00026718457062816506, |
| "loss": 3.335, |
| "step": 51600 |
| }, |
| { |
| "epoch": 5.559143256915294, |
| "grad_norm": 0.7151451706886292, |
| "learning_rate": 0.00026686132959810366, |
| "loss": 3.3247, |
| "step": 51650 |
| }, |
| { |
| "epoch": 5.564524808954903, |
| "grad_norm": 0.7089542150497437, |
| "learning_rate": 0.0002665380885680422, |
| "loss": 3.3489, |
| "step": 51700 |
| }, |
| { |
| "epoch": 5.569906360994511, |
| "grad_norm": 0.7258995175361633, |
| "learning_rate": 0.0002662148475379808, |
| "loss": 3.3258, |
| "step": 51750 |
| }, |
| { |
| "epoch": 5.575287913034119, |
| "grad_norm": 0.6972107291221619, |
| "learning_rate": 0.0002658916065079194, |
| "loss": 3.3383, |
| "step": 51800 |
| }, |
| { |
| "epoch": 5.580669465073727, |
| "grad_norm": 0.6443345546722412, |
| "learning_rate": 0.000265568365477858, |
| "loss": 3.3137, |
| "step": 51850 |
| }, |
| { |
| "epoch": 5.586051017113336, |
| "grad_norm": 0.6882283091545105, |
| "learning_rate": 0.0002652451244477965, |
| "loss": 3.3367, |
| "step": 51900 |
| }, |
| { |
| "epoch": 5.591432569152944, |
| "grad_norm": 0.6775994300842285, |
| "learning_rate": 0.00026492188341773517, |
| "loss": 3.3426, |
| "step": 51950 |
| }, |
| { |
| "epoch": 5.596814121192552, |
| "grad_norm": 0.7083613276481628, |
| "learning_rate": 0.0002645986423876737, |
| "loss": 3.3222, |
| "step": 52000 |
| }, |
| { |
| "epoch": 5.596814121192552, |
| "eval_accuracy": 0.3826475470904332, |
| "eval_loss": 3.3929553031921387, |
| "eval_runtime": 191.8708, |
| "eval_samples_per_second": 93.87, |
| "eval_steps_per_second": 5.869, |
| "step": 52000 |
| }, |
| { |
| "epoch": 5.60219567323216, |
| "grad_norm": 0.6776838302612305, |
| "learning_rate": 0.0002642754013576123, |
| "loss": 3.357, |
| "step": 52050 |
| }, |
| { |
| "epoch": 5.607577225271768, |
| "grad_norm": 0.712910532951355, |
| "learning_rate": 0.0002639521603275509, |
| "loss": 3.3203, |
| "step": 52100 |
| }, |
| { |
| "epoch": 5.612958777311377, |
| "grad_norm": 0.6820148825645447, |
| "learning_rate": 0.0002636289192974895, |
| "loss": 3.3309, |
| "step": 52150 |
| }, |
| { |
| "epoch": 5.618340329350985, |
| "grad_norm": 0.7003359198570251, |
| "learning_rate": 0.00026330567826742804, |
| "loss": 3.3264, |
| "step": 52200 |
| }, |
| { |
| "epoch": 5.623721881390593, |
| "grad_norm": 0.7073283791542053, |
| "learning_rate": 0.00026298243723736663, |
| "loss": 3.3356, |
| "step": 52250 |
| }, |
| { |
| "epoch": 5.629103433430201, |
| "grad_norm": 0.7507100701332092, |
| "learning_rate": 0.0002626591962073052, |
| "loss": 3.3313, |
| "step": 52300 |
| }, |
| { |
| "epoch": 5.634484985469809, |
| "grad_norm": 0.6759254932403564, |
| "learning_rate": 0.0002623359551772438, |
| "loss": 3.3271, |
| "step": 52350 |
| }, |
| { |
| "epoch": 5.639866537509418, |
| "grad_norm": 0.7124132513999939, |
| "learning_rate": 0.0002620127141471824, |
| "loss": 3.3357, |
| "step": 52400 |
| }, |
| { |
| "epoch": 5.645248089549026, |
| "grad_norm": 0.732514500617981, |
| "learning_rate": 0.00026168947311712095, |
| "loss": 3.3328, |
| "step": 52450 |
| }, |
| { |
| "epoch": 5.650629641588634, |
| "grad_norm": 0.7036166191101074, |
| "learning_rate": 0.00026136623208705955, |
| "loss": 3.3299, |
| "step": 52500 |
| }, |
| { |
| "epoch": 5.656011193628243, |
| "grad_norm": 0.7753247618675232, |
| "learning_rate": 0.00026104299105699814, |
| "loss": 3.3339, |
| "step": 52550 |
| }, |
| { |
| "epoch": 5.66139274566785, |
| "grad_norm": 0.6865853667259216, |
| "learning_rate": 0.00026071975002693674, |
| "loss": 3.3415, |
| "step": 52600 |
| }, |
| { |
| "epoch": 5.666774297707459, |
| "grad_norm": 0.7263120412826538, |
| "learning_rate": 0.00026039650899687533, |
| "loss": 3.3287, |
| "step": 52650 |
| }, |
| { |
| "epoch": 5.672155849747067, |
| "grad_norm": 0.7010359168052673, |
| "learning_rate": 0.00026007326796681393, |
| "loss": 3.3402, |
| "step": 52700 |
| }, |
| { |
| "epoch": 5.677537401786675, |
| "grad_norm": 0.6875418424606323, |
| "learning_rate": 0.00025975002693675247, |
| "loss": 3.3312, |
| "step": 52750 |
| }, |
| { |
| "epoch": 5.682918953826284, |
| "grad_norm": 0.6930513978004456, |
| "learning_rate": 0.00025942678590669106, |
| "loss": 3.3351, |
| "step": 52800 |
| }, |
| { |
| "epoch": 5.688300505865891, |
| "grad_norm": 0.7597100138664246, |
| "learning_rate": 0.00025910354487662966, |
| "loss": 3.3443, |
| "step": 52850 |
| }, |
| { |
| "epoch": 5.6936820579055, |
| "grad_norm": 0.6796735525131226, |
| "learning_rate": 0.00025878030384656825, |
| "loss": 3.3383, |
| "step": 52900 |
| }, |
| { |
| "epoch": 5.699063609945108, |
| "grad_norm": 0.7502220869064331, |
| "learning_rate": 0.00025845706281650685, |
| "loss": 3.3484, |
| "step": 52950 |
| }, |
| { |
| "epoch": 5.704445161984716, |
| "grad_norm": 0.7178375720977783, |
| "learning_rate": 0.0002581338217864454, |
| "loss": 3.3533, |
| "step": 53000 |
| }, |
| { |
| "epoch": 5.704445161984716, |
| "eval_accuracy": 0.3829048371287779, |
| "eval_loss": 3.388993501663208, |
| "eval_runtime": 207.0381, |
| "eval_samples_per_second": 86.994, |
| "eval_steps_per_second": 5.439, |
| "step": 53000 |
| }, |
| { |
| "epoch": 5.709826714024325, |
| "grad_norm": 0.6738512516021729, |
| "learning_rate": 0.000257810580756384, |
| "loss": 3.3326, |
| "step": 53050 |
| }, |
| { |
| "epoch": 5.715208266063933, |
| "grad_norm": 0.694001317024231, |
| "learning_rate": 0.0002574873397263226, |
| "loss": 3.3229, |
| "step": 53100 |
| }, |
| { |
| "epoch": 5.720589818103541, |
| "grad_norm": 0.7215181589126587, |
| "learning_rate": 0.00025716409869626117, |
| "loss": 3.328, |
| "step": 53150 |
| }, |
| { |
| "epoch": 5.725971370143149, |
| "grad_norm": 0.716097891330719, |
| "learning_rate": 0.0002568408576661997, |
| "loss": 3.3222, |
| "step": 53200 |
| }, |
| { |
| "epoch": 5.731352922182758, |
| "grad_norm": 0.6920291185379028, |
| "learning_rate": 0.00025651761663613836, |
| "loss": 3.3171, |
| "step": 53250 |
| }, |
| { |
| "epoch": 5.736734474222366, |
| "grad_norm": 0.679693877696991, |
| "learning_rate": 0.0002561943756060769, |
| "loss": 3.3251, |
| "step": 53300 |
| }, |
| { |
| "epoch": 5.742116026261974, |
| "grad_norm": 0.6690770387649536, |
| "learning_rate": 0.0002558711345760155, |
| "loss": 3.3364, |
| "step": 53350 |
| }, |
| { |
| "epoch": 5.747497578301582, |
| "grad_norm": 0.6807551980018616, |
| "learning_rate": 0.0002555478935459541, |
| "loss": 3.332, |
| "step": 53400 |
| }, |
| { |
| "epoch": 5.75287913034119, |
| "grad_norm": 0.7079342603683472, |
| "learning_rate": 0.00025522465251589263, |
| "loss": 3.348, |
| "step": 53450 |
| }, |
| { |
| "epoch": 5.758260682380799, |
| "grad_norm": 0.6894396543502808, |
| "learning_rate": 0.0002549014114858312, |
| "loss": 3.3481, |
| "step": 53500 |
| }, |
| { |
| "epoch": 5.763642234420407, |
| "grad_norm": 0.751152515411377, |
| "learning_rate": 0.0002545781704557698, |
| "loss": 3.3212, |
| "step": 53550 |
| }, |
| { |
| "epoch": 5.769023786460015, |
| "grad_norm": 0.6932485103607178, |
| "learning_rate": 0.0002542549294257084, |
| "loss": 3.3442, |
| "step": 53600 |
| }, |
| { |
| "epoch": 5.774405338499624, |
| "grad_norm": 0.6937035918235779, |
| "learning_rate": 0.000253931688395647, |
| "loss": 3.3385, |
| "step": 53650 |
| }, |
| { |
| "epoch": 5.779786890539231, |
| "grad_norm": 0.7012552618980408, |
| "learning_rate": 0.0002536084473655856, |
| "loss": 3.3362, |
| "step": 53700 |
| }, |
| { |
| "epoch": 5.78516844257884, |
| "grad_norm": 0.7385281324386597, |
| "learning_rate": 0.00025328520633552415, |
| "loss": 3.335, |
| "step": 53750 |
| }, |
| { |
| "epoch": 5.790549994618448, |
| "grad_norm": 0.6543053388595581, |
| "learning_rate": 0.000252968430126064, |
| "loss": 3.3261, |
| "step": 53800 |
| }, |
| { |
| "epoch": 5.795931546658056, |
| "grad_norm": 0.6646652817726135, |
| "learning_rate": 0.00025264518909600255, |
| "loss": 3.3349, |
| "step": 53850 |
| }, |
| { |
| "epoch": 5.801313098697665, |
| "grad_norm": 0.6860446929931641, |
| "learning_rate": 0.00025232194806594114, |
| "loss": 3.3057, |
| "step": 53900 |
| }, |
| { |
| "epoch": 5.806694650737272, |
| "grad_norm": 0.7016067504882812, |
| "learning_rate": 0.00025199870703587974, |
| "loss": 3.3369, |
| "step": 53950 |
| }, |
| { |
| "epoch": 5.812076202776881, |
| "grad_norm": 0.7163733839988708, |
| "learning_rate": 0.00025167546600581833, |
| "loss": 3.3362, |
| "step": 54000 |
| }, |
| { |
| "epoch": 5.812076202776881, |
| "eval_accuracy": 0.3834984078549084, |
| "eval_loss": 3.3828744888305664, |
| "eval_runtime": 199.0745, |
| "eval_samples_per_second": 90.474, |
| "eval_steps_per_second": 5.656, |
| "step": 54000 |
| }, |
| { |
| "epoch": 5.817457754816489, |
| "grad_norm": 0.6933131814002991, |
| "learning_rate": 0.0002513522249757569, |
| "loss": 3.3542, |
| "step": 54050 |
| }, |
| { |
| "epoch": 5.822839306856097, |
| "grad_norm": 0.703668475151062, |
| "learning_rate": 0.00025102898394569547, |
| "loss": 3.3429, |
| "step": 54100 |
| }, |
| { |
| "epoch": 5.828220858895706, |
| "grad_norm": 0.7263962626457214, |
| "learning_rate": 0.00025070574291563406, |
| "loss": 3.3462, |
| "step": 54150 |
| }, |
| { |
| "epoch": 5.833602410935313, |
| "grad_norm": 0.7322703003883362, |
| "learning_rate": 0.00025038250188557265, |
| "loss": 3.3344, |
| "step": 54200 |
| }, |
| { |
| "epoch": 5.838983962974922, |
| "grad_norm": 0.6877895593643188, |
| "learning_rate": 0.00025006572567611246, |
| "loss": 3.3358, |
| "step": 54250 |
| }, |
| { |
| "epoch": 5.84436551501453, |
| "grad_norm": 0.6911699771881104, |
| "learning_rate": 0.00024974248464605106, |
| "loss": 3.3221, |
| "step": 54300 |
| }, |
| { |
| "epoch": 5.849747067054138, |
| "grad_norm": 0.7125555872917175, |
| "learning_rate": 0.00024941924361598965, |
| "loss": 3.3289, |
| "step": 54350 |
| }, |
| { |
| "epoch": 5.855128619093747, |
| "grad_norm": 0.72613924741745, |
| "learning_rate": 0.00024909600258592825, |
| "loss": 3.3338, |
| "step": 54400 |
| }, |
| { |
| "epoch": 5.860510171133355, |
| "grad_norm": 0.7772033214569092, |
| "learning_rate": 0.0002487727615558668, |
| "loss": 3.3315, |
| "step": 54450 |
| }, |
| { |
| "epoch": 5.865891723172963, |
| "grad_norm": 0.6914510130882263, |
| "learning_rate": 0.0002484495205258054, |
| "loss": 3.3368, |
| "step": 54500 |
| }, |
| { |
| "epoch": 5.871273275212571, |
| "grad_norm": 0.7719976305961609, |
| "learning_rate": 0.000248126279495744, |
| "loss": 3.3202, |
| "step": 54550 |
| }, |
| { |
| "epoch": 5.87665482725218, |
| "grad_norm": 0.7112765908241272, |
| "learning_rate": 0.00024780303846568257, |
| "loss": 3.3396, |
| "step": 54600 |
| }, |
| { |
| "epoch": 5.882036379291788, |
| "grad_norm": 0.7199580073356628, |
| "learning_rate": 0.0002474797974356211, |
| "loss": 3.3516, |
| "step": 54650 |
| }, |
| { |
| "epoch": 5.887417931331396, |
| "grad_norm": 0.7419936060905457, |
| "learning_rate": 0.00024715655640555976, |
| "loss": 3.3373, |
| "step": 54700 |
| }, |
| { |
| "epoch": 5.892799483371004, |
| "grad_norm": 0.7119138240814209, |
| "learning_rate": 0.0002468333153754983, |
| "loss": 3.3376, |
| "step": 54750 |
| }, |
| { |
| "epoch": 5.898181035410612, |
| "grad_norm": 0.7191426157951355, |
| "learning_rate": 0.0002465100743454369, |
| "loss": 3.3242, |
| "step": 54800 |
| }, |
| { |
| "epoch": 5.903562587450221, |
| "grad_norm": 0.780044436454773, |
| "learning_rate": 0.0002461868333153755, |
| "loss": 3.3281, |
| "step": 54850 |
| }, |
| { |
| "epoch": 5.9089441394898286, |
| "grad_norm": 0.6683967709541321, |
| "learning_rate": 0.0002458635922853141, |
| "loss": 3.3293, |
| "step": 54900 |
| }, |
| { |
| "epoch": 5.914325691529437, |
| "grad_norm": 0.7071850895881653, |
| "learning_rate": 0.0002455403512552526, |
| "loss": 3.3447, |
| "step": 54950 |
| }, |
| { |
| "epoch": 5.919707243569046, |
| "grad_norm": 0.7385892271995544, |
| "learning_rate": 0.0002452171102251912, |
| "loss": 3.3091, |
| "step": 55000 |
| }, |
| { |
| "epoch": 5.919707243569046, |
| "eval_accuracy": 0.38418694120498875, |
| "eval_loss": 3.377957820892334, |
| "eval_runtime": 194.0037, |
| "eval_samples_per_second": 92.838, |
| "eval_steps_per_second": 5.804, |
| "step": 55000 |
| }, |
| { |
| "epoch": 5.925088795608653, |
| "grad_norm": 0.7041518688201904, |
| "learning_rate": 0.0002448938691951298, |
| "loss": 3.3406, |
| "step": 55050 |
| }, |
| { |
| "epoch": 5.930470347648262, |
| "grad_norm": 0.7140253782272339, |
| "learning_rate": 0.0002445706281650684, |
| "loss": 3.3085, |
| "step": 55100 |
| }, |
| { |
| "epoch": 5.93585189968787, |
| "grad_norm": 0.6460371017456055, |
| "learning_rate": 0.000244247387135007, |
| "loss": 3.3373, |
| "step": 55150 |
| }, |
| { |
| "epoch": 5.941233451727478, |
| "grad_norm": 0.7184333801269531, |
| "learning_rate": 0.00024392414610494557, |
| "loss": 3.3167, |
| "step": 55200 |
| }, |
| { |
| "epoch": 5.946615003767087, |
| "grad_norm": 0.6763706803321838, |
| "learning_rate": 0.00024360090507488414, |
| "loss": 3.3296, |
| "step": 55250 |
| }, |
| { |
| "epoch": 5.951996555806694, |
| "grad_norm": 0.6709035634994507, |
| "learning_rate": 0.00024327766404482273, |
| "loss": 3.3416, |
| "step": 55300 |
| }, |
| { |
| "epoch": 5.957378107846303, |
| "grad_norm": 0.7073157429695129, |
| "learning_rate": 0.0002429544230147613, |
| "loss": 3.346, |
| "step": 55350 |
| }, |
| { |
| "epoch": 5.962759659885911, |
| "grad_norm": 0.7447193264961243, |
| "learning_rate": 0.00024263118198469992, |
| "loss": 3.3274, |
| "step": 55400 |
| }, |
| { |
| "epoch": 5.968141211925519, |
| "grad_norm": 0.7422886490821838, |
| "learning_rate": 0.0002423079409546385, |
| "loss": 3.3313, |
| "step": 55450 |
| }, |
| { |
| "epoch": 5.973522763965128, |
| "grad_norm": 0.6966865658760071, |
| "learning_rate": 0.00024198469992457706, |
| "loss": 3.3305, |
| "step": 55500 |
| }, |
| { |
| "epoch": 5.978904316004736, |
| "grad_norm": 0.6928249001502991, |
| "learning_rate": 0.00024166145889451568, |
| "loss": 3.3185, |
| "step": 55550 |
| }, |
| { |
| "epoch": 5.984285868044344, |
| "grad_norm": 0.6931163668632507, |
| "learning_rate": 0.00024133821786445425, |
| "loss": 3.3381, |
| "step": 55600 |
| }, |
| { |
| "epoch": 5.989667420083952, |
| "grad_norm": 0.6950192451477051, |
| "learning_rate": 0.0002410149768343928, |
| "loss": 3.3136, |
| "step": 55650 |
| }, |
| { |
| "epoch": 5.995048972123561, |
| "grad_norm": 0.7014208436012268, |
| "learning_rate": 0.0002406917358043314, |
| "loss": 3.3394, |
| "step": 55700 |
| }, |
| { |
| "epoch": 6.000430524163169, |
| "grad_norm": 0.787409245967865, |
| "learning_rate": 0.00024036849477427, |
| "loss": 3.3271, |
| "step": 55750 |
| }, |
| { |
| "epoch": 6.005812076202777, |
| "grad_norm": 0.6705772876739502, |
| "learning_rate": 0.00024004525374420857, |
| "loss": 3.2379, |
| "step": 55800 |
| }, |
| { |
| "epoch": 6.011193628242385, |
| "grad_norm": 0.7323542237281799, |
| "learning_rate": 0.00023972201271414716, |
| "loss": 3.2582, |
| "step": 55850 |
| }, |
| { |
| "epoch": 6.016575180281993, |
| "grad_norm": 0.7192012667655945, |
| "learning_rate": 0.00023939877168408573, |
| "loss": 3.2438, |
| "step": 55900 |
| }, |
| { |
| "epoch": 6.021956732321602, |
| "grad_norm": 0.7066397070884705, |
| "learning_rate": 0.00023907553065402433, |
| "loss": 3.2409, |
| "step": 55950 |
| }, |
| { |
| "epoch": 6.0273382843612096, |
| "grad_norm": 0.737398624420166, |
| "learning_rate": 0.00023875228962396292, |
| "loss": 3.2331, |
| "step": 56000 |
| }, |
| { |
| "epoch": 6.0273382843612096, |
| "eval_accuracy": 0.38383773082355305, |
| "eval_loss": 3.3844356536865234, |
| "eval_runtime": 199.0566, |
| "eval_samples_per_second": 90.482, |
| "eval_steps_per_second": 5.657, |
| "step": 56000 |
| }, |
| { |
| "epoch": 6.032719836400818, |
| "grad_norm": 0.7181048393249512, |
| "learning_rate": 0.0002384290485939015, |
| "loss": 3.2481, |
| "step": 56050 |
| }, |
| { |
| "epoch": 6.038101388440427, |
| "grad_norm": 0.750227153301239, |
| "learning_rate": 0.00023810580756384006, |
| "loss": 3.2575, |
| "step": 56100 |
| }, |
| { |
| "epoch": 6.043482940480034, |
| "grad_norm": 0.680108368396759, |
| "learning_rate": 0.00023778256653377868, |
| "loss": 3.2455, |
| "step": 56150 |
| }, |
| { |
| "epoch": 6.048864492519643, |
| "grad_norm": 0.7663739919662476, |
| "learning_rate": 0.00023745932550371725, |
| "loss": 3.2557, |
| "step": 56200 |
| }, |
| { |
| "epoch": 6.0542460445592505, |
| "grad_norm": 0.7118892669677734, |
| "learning_rate": 0.00023713608447365584, |
| "loss": 3.2405, |
| "step": 56250 |
| }, |
| { |
| "epoch": 6.059627596598859, |
| "grad_norm": 0.6887235641479492, |
| "learning_rate": 0.00023681284344359444, |
| "loss": 3.2547, |
| "step": 56300 |
| }, |
| { |
| "epoch": 6.065009148638468, |
| "grad_norm": 0.7353869676589966, |
| "learning_rate": 0.000236489602413533, |
| "loss": 3.2625, |
| "step": 56350 |
| }, |
| { |
| "epoch": 6.070390700678075, |
| "grad_norm": 0.7517368793487549, |
| "learning_rate": 0.0002361663613834716, |
| "loss": 3.2478, |
| "step": 56400 |
| }, |
| { |
| "epoch": 6.075772252717684, |
| "grad_norm": 0.7083318829536438, |
| "learning_rate": 0.00023584312035341017, |
| "loss": 3.2488, |
| "step": 56450 |
| }, |
| { |
| "epoch": 6.081153804757292, |
| "grad_norm": 0.7242905497550964, |
| "learning_rate": 0.00023551987932334876, |
| "loss": 3.2575, |
| "step": 56500 |
| }, |
| { |
| "epoch": 6.0865353567969, |
| "grad_norm": 0.8002662658691406, |
| "learning_rate": 0.00023519663829328735, |
| "loss": 3.2727, |
| "step": 56550 |
| }, |
| { |
| "epoch": 6.091916908836509, |
| "grad_norm": 0.6858273148536682, |
| "learning_rate": 0.00023487339726322592, |
| "loss": 3.271, |
| "step": 56600 |
| }, |
| { |
| "epoch": 6.097298460876116, |
| "grad_norm": 0.7602444887161255, |
| "learning_rate": 0.0002345501562331645, |
| "loss": 3.2524, |
| "step": 56650 |
| }, |
| { |
| "epoch": 6.102680012915725, |
| "grad_norm": 0.684025764465332, |
| "learning_rate": 0.0002342269152031031, |
| "loss": 3.2576, |
| "step": 56700 |
| }, |
| { |
| "epoch": 6.108061564955333, |
| "grad_norm": 0.725549042224884, |
| "learning_rate": 0.00023390367417304168, |
| "loss": 3.2571, |
| "step": 56750 |
| }, |
| { |
| "epoch": 6.113443116994941, |
| "grad_norm": 0.7334359884262085, |
| "learning_rate": 0.00023358043314298025, |
| "loss": 3.252, |
| "step": 56800 |
| }, |
| { |
| "epoch": 6.11882466903455, |
| "grad_norm": 0.7559038996696472, |
| "learning_rate": 0.00023326365693352008, |
| "loss": 3.2543, |
| "step": 56850 |
| }, |
| { |
| "epoch": 6.124206221074158, |
| "grad_norm": 0.7692900896072388, |
| "learning_rate": 0.00023294041590345865, |
| "loss": 3.2498, |
| "step": 56900 |
| }, |
| { |
| "epoch": 6.129587773113766, |
| "grad_norm": 0.7604845762252808, |
| "learning_rate": 0.00023261717487339724, |
| "loss": 3.2553, |
| "step": 56950 |
| }, |
| { |
| "epoch": 6.134969325153374, |
| "grad_norm": 0.7401818633079529, |
| "learning_rate": 0.0002322939338433358, |
| "loss": 3.2561, |
| "step": 57000 |
| }, |
| { |
| "epoch": 6.134969325153374, |
| "eval_accuracy": 0.38435328877623276, |
| "eval_loss": 3.3827803134918213, |
| "eval_runtime": 189.876, |
| "eval_samples_per_second": 94.857, |
| "eval_steps_per_second": 5.93, |
| "step": 57000 |
| }, |
| { |
| "epoch": 6.140350877192983, |
| "grad_norm": 0.7508281469345093, |
| "learning_rate": 0.0002319706928132744, |
| "loss": 3.2641, |
| "step": 57050 |
| }, |
| { |
| "epoch": 6.1457324292325906, |
| "grad_norm": 0.7007184028625488, |
| "learning_rate": 0.000231647451783213, |
| "loss": 3.25, |
| "step": 57100 |
| }, |
| { |
| "epoch": 6.151113981272199, |
| "grad_norm": 0.7512602806091309, |
| "learning_rate": 0.00023132421075315157, |
| "loss": 3.2556, |
| "step": 57150 |
| }, |
| { |
| "epoch": 6.156495533311807, |
| "grad_norm": 0.7906447649002075, |
| "learning_rate": 0.0002310009697230902, |
| "loss": 3.2777, |
| "step": 57200 |
| }, |
| { |
| "epoch": 6.161877085351415, |
| "grad_norm": 0.7162222862243652, |
| "learning_rate": 0.00023067772869302876, |
| "loss": 3.2594, |
| "step": 57250 |
| }, |
| { |
| "epoch": 6.167258637391024, |
| "grad_norm": 0.724038302898407, |
| "learning_rate": 0.00023035448766296732, |
| "loss": 3.2519, |
| "step": 57300 |
| }, |
| { |
| "epoch": 6.1726401894306315, |
| "grad_norm": 0.7300527691841125, |
| "learning_rate": 0.00023003124663290592, |
| "loss": 3.2576, |
| "step": 57350 |
| }, |
| { |
| "epoch": 6.17802174147024, |
| "grad_norm": 0.7151215076446533, |
| "learning_rate": 0.0002297080056028445, |
| "loss": 3.2758, |
| "step": 57400 |
| }, |
| { |
| "epoch": 6.183403293509849, |
| "grad_norm": 0.7065231204032898, |
| "learning_rate": 0.00022938476457278308, |
| "loss": 3.2777, |
| "step": 57450 |
| }, |
| { |
| "epoch": 6.188784845549456, |
| "grad_norm": 0.7321211695671082, |
| "learning_rate": 0.00022906152354272168, |
| "loss": 3.2625, |
| "step": 57500 |
| }, |
| { |
| "epoch": 6.194166397589065, |
| "grad_norm": 0.7722061276435852, |
| "learning_rate": 0.00022873828251266024, |
| "loss": 3.2614, |
| "step": 57550 |
| }, |
| { |
| "epoch": 6.1995479496286725, |
| "grad_norm": 0.7216559648513794, |
| "learning_rate": 0.00022841504148259884, |
| "loss": 3.2711, |
| "step": 57600 |
| }, |
| { |
| "epoch": 6.204929501668281, |
| "grad_norm": 0.7288267016410828, |
| "learning_rate": 0.00022809180045253743, |
| "loss": 3.2796, |
| "step": 57650 |
| }, |
| { |
| "epoch": 6.21031105370789, |
| "grad_norm": 0.7449484467506409, |
| "learning_rate": 0.000227768559422476, |
| "loss": 3.2711, |
| "step": 57700 |
| }, |
| { |
| "epoch": 6.215692605747497, |
| "grad_norm": 0.7095771431922913, |
| "learning_rate": 0.00022744531839241457, |
| "loss": 3.263, |
| "step": 57750 |
| }, |
| { |
| "epoch": 6.221074157787106, |
| "grad_norm": 0.6975909471511841, |
| "learning_rate": 0.0002271220773623532, |
| "loss": 3.263, |
| "step": 57800 |
| }, |
| { |
| "epoch": 6.226455709826714, |
| "grad_norm": 0.765916109085083, |
| "learning_rate": 0.00022679883633229176, |
| "loss": 3.2627, |
| "step": 57850 |
| }, |
| { |
| "epoch": 6.231837261866322, |
| "grad_norm": 0.7326292395591736, |
| "learning_rate": 0.00022647559530223032, |
| "loss": 3.2769, |
| "step": 57900 |
| }, |
| { |
| "epoch": 6.237218813905931, |
| "grad_norm": 0.7090906500816345, |
| "learning_rate": 0.00022615235427216895, |
| "loss": 3.2689, |
| "step": 57950 |
| }, |
| { |
| "epoch": 6.242600365945538, |
| "grad_norm": 0.7131697535514832, |
| "learning_rate": 0.0002258291132421075, |
| "loss": 3.2774, |
| "step": 58000 |
| }, |
| { |
| "epoch": 6.242600365945538, |
| "eval_accuracy": 0.3847193403552797, |
| "eval_loss": 3.3791263103485107, |
| "eval_runtime": 209.7354, |
| "eval_samples_per_second": 85.875, |
| "eval_steps_per_second": 5.369, |
| "step": 58000 |
| }, |
| { |
| "epoch": 6.247981917985147, |
| "grad_norm": 0.7514231204986572, |
| "learning_rate": 0.0002255058722120461, |
| "loss": 3.2866, |
| "step": 58050 |
| }, |
| { |
| "epoch": 6.253363470024755, |
| "grad_norm": 0.7404820322990417, |
| "learning_rate": 0.00022518263118198468, |
| "loss": 3.2731, |
| "step": 58100 |
| }, |
| { |
| "epoch": 6.258745022064363, |
| "grad_norm": 0.7129669785499573, |
| "learning_rate": 0.00022485939015192327, |
| "loss": 3.264, |
| "step": 58150 |
| }, |
| { |
| "epoch": 6.264126574103972, |
| "grad_norm": 0.7018173336982727, |
| "learning_rate": 0.00022453614912186186, |
| "loss": 3.2629, |
| "step": 58200 |
| }, |
| { |
| "epoch": 6.26950812614358, |
| "grad_norm": 0.7070589065551758, |
| "learning_rate": 0.00022421290809180043, |
| "loss": 3.2837, |
| "step": 58250 |
| }, |
| { |
| "epoch": 6.274889678183188, |
| "grad_norm": 0.7775071859359741, |
| "learning_rate": 0.000223889667061739, |
| "loss": 3.2507, |
| "step": 58300 |
| }, |
| { |
| "epoch": 6.280271230222796, |
| "grad_norm": 0.7200853824615479, |
| "learning_rate": 0.00022356642603167762, |
| "loss": 3.2699, |
| "step": 58350 |
| }, |
| { |
| "epoch": 6.285652782262405, |
| "grad_norm": 0.7718141078948975, |
| "learning_rate": 0.0002232431850016162, |
| "loss": 3.2714, |
| "step": 58400 |
| }, |
| { |
| "epoch": 6.2910343343020125, |
| "grad_norm": 0.7274107933044434, |
| "learning_rate": 0.00022291994397155476, |
| "loss": 3.2824, |
| "step": 58450 |
| }, |
| { |
| "epoch": 6.296415886341621, |
| "grad_norm": 0.6835294365882874, |
| "learning_rate": 0.00022259670294149338, |
| "loss": 3.2738, |
| "step": 58500 |
| }, |
| { |
| "epoch": 6.301797438381229, |
| "grad_norm": 0.7693158388137817, |
| "learning_rate": 0.00022227346191143195, |
| "loss": 3.2735, |
| "step": 58550 |
| }, |
| { |
| "epoch": 6.307178990420837, |
| "grad_norm": 0.724594235420227, |
| "learning_rate": 0.00022195022088137051, |
| "loss": 3.2632, |
| "step": 58600 |
| }, |
| { |
| "epoch": 6.312560542460446, |
| "grad_norm": 0.7053855061531067, |
| "learning_rate": 0.0002216269798513091, |
| "loss": 3.2815, |
| "step": 58650 |
| }, |
| { |
| "epoch": 6.3179420945000535, |
| "grad_norm": 0.7325628995895386, |
| "learning_rate": 0.00022130373882124768, |
| "loss": 3.2774, |
| "step": 58700 |
| }, |
| { |
| "epoch": 6.323323646539662, |
| "grad_norm": 0.7015441060066223, |
| "learning_rate": 0.00022098049779118627, |
| "loss": 3.2722, |
| "step": 58750 |
| }, |
| { |
| "epoch": 6.328705198579271, |
| "grad_norm": 0.6750187277793884, |
| "learning_rate": 0.00022065725676112487, |
| "loss": 3.2743, |
| "step": 58800 |
| }, |
| { |
| "epoch": 6.334086750618878, |
| "grad_norm": 0.7636710405349731, |
| "learning_rate": 0.00022033401573106343, |
| "loss": 3.2824, |
| "step": 58850 |
| }, |
| { |
| "epoch": 6.339468302658487, |
| "grad_norm": 0.7353969216346741, |
| "learning_rate": 0.000220010774701002, |
| "loss": 3.2635, |
| "step": 58900 |
| }, |
| { |
| "epoch": 6.344849854698095, |
| "grad_norm": 0.7475679516792297, |
| "learning_rate": 0.00021968753367094062, |
| "loss": 3.2819, |
| "step": 58950 |
| }, |
| { |
| "epoch": 6.350231406737703, |
| "grad_norm": 0.7494605183601379, |
| "learning_rate": 0.0002193642926408792, |
| "loss": 3.2758, |
| "step": 59000 |
| }, |
| { |
| "epoch": 6.350231406737703, |
| "eval_accuracy": 0.38515145289011793, |
| "eval_loss": 3.3769986629486084, |
| "eval_runtime": 199.2062, |
| "eval_samples_per_second": 90.414, |
| "eval_steps_per_second": 5.652, |
| "step": 59000 |
| }, |
| { |
| "epoch": 6.355612958777312, |
| "grad_norm": 0.6721672415733337, |
| "learning_rate": 0.00021904105161081778, |
| "loss": 3.289, |
| "step": 59050 |
| }, |
| { |
| "epoch": 6.360994510816919, |
| "grad_norm": 0.7832658886909485, |
| "learning_rate": 0.00021871781058075638, |
| "loss": 3.2721, |
| "step": 59100 |
| }, |
| { |
| "epoch": 6.366376062856528, |
| "grad_norm": 0.7826193571090698, |
| "learning_rate": 0.00021839456955069495, |
| "loss": 3.2809, |
| "step": 59150 |
| }, |
| { |
| "epoch": 6.371757614896136, |
| "grad_norm": 0.7247890830039978, |
| "learning_rate": 0.00021807132852063354, |
| "loss": 3.2753, |
| "step": 59200 |
| }, |
| { |
| "epoch": 6.377139166935744, |
| "grad_norm": 0.6926654577255249, |
| "learning_rate": 0.0002177480874905721, |
| "loss": 3.2722, |
| "step": 59250 |
| }, |
| { |
| "epoch": 6.382520718975353, |
| "grad_norm": 0.7651258111000061, |
| "learning_rate": 0.0002174248464605107, |
| "loss": 3.2964, |
| "step": 59300 |
| }, |
| { |
| "epoch": 6.387902271014961, |
| "grad_norm": 0.7821109890937805, |
| "learning_rate": 0.0002171016054304493, |
| "loss": 3.2794, |
| "step": 59350 |
| }, |
| { |
| "epoch": 6.393283823054569, |
| "grad_norm": 0.7029480934143066, |
| "learning_rate": 0.00021677836440038787, |
| "loss": 3.2882, |
| "step": 59400 |
| }, |
| { |
| "epoch": 6.398665375094177, |
| "grad_norm": 0.7246655225753784, |
| "learning_rate": 0.00021645512337032643, |
| "loss": 3.2761, |
| "step": 59450 |
| }, |
| { |
| "epoch": 6.404046927133785, |
| "grad_norm": 0.7272088527679443, |
| "learning_rate": 0.00021613188234026506, |
| "loss": 3.2803, |
| "step": 59500 |
| }, |
| { |
| "epoch": 6.4094284791733935, |
| "grad_norm": 0.7681124806404114, |
| "learning_rate": 0.00021580864131020362, |
| "loss": 3.268, |
| "step": 59550 |
| }, |
| { |
| "epoch": 6.414810031213002, |
| "grad_norm": 0.7238587737083435, |
| "learning_rate": 0.0002154854002801422, |
| "loss": 3.2894, |
| "step": 59600 |
| }, |
| { |
| "epoch": 6.42019158325261, |
| "grad_norm": 0.7431035041809082, |
| "learning_rate": 0.0002151621592500808, |
| "loss": 3.2836, |
| "step": 59650 |
| }, |
| { |
| "epoch": 6.425573135292218, |
| "grad_norm": 0.7501331567764282, |
| "learning_rate": 0.00021483891822001938, |
| "loss": 3.2989, |
| "step": 59700 |
| }, |
| { |
| "epoch": 6.430954687331827, |
| "grad_norm": 0.8023155927658081, |
| "learning_rate": 0.00021451567718995795, |
| "loss": 3.2746, |
| "step": 59750 |
| }, |
| { |
| "epoch": 6.4363362393714345, |
| "grad_norm": 0.7714266777038574, |
| "learning_rate": 0.00021419243615989654, |
| "loss": 3.2699, |
| "step": 59800 |
| }, |
| { |
| "epoch": 6.441717791411043, |
| "grad_norm": 0.7600786089897156, |
| "learning_rate": 0.00021386919512983514, |
| "loss": 3.2804, |
| "step": 59850 |
| }, |
| { |
| "epoch": 6.447099343450651, |
| "grad_norm": 0.7343586683273315, |
| "learning_rate": 0.0002135459540997737, |
| "loss": 3.2754, |
| "step": 59900 |
| }, |
| { |
| "epoch": 6.452480895490259, |
| "grad_norm": 0.7292217016220093, |
| "learning_rate": 0.0002132227130697123, |
| "loss": 3.2895, |
| "step": 59950 |
| }, |
| { |
| "epoch": 6.457862447529868, |
| "grad_norm": 0.7181167006492615, |
| "learning_rate": 0.00021289947203965087, |
| "loss": 3.2704, |
| "step": 60000 |
| }, |
| { |
| "epoch": 6.457862447529868, |
| "eval_accuracy": 0.3855310860801417, |
| "eval_loss": 3.370968818664551, |
| "eval_runtime": 189.017, |
| "eval_samples_per_second": 95.288, |
| "eval_steps_per_second": 5.957, |
| "step": 60000 |
| }, |
| { |
| "epoch": 6.4632439995694755, |
| "grad_norm": 0.7233908772468567, |
| "learning_rate": 0.0002125762310095895, |
| "loss": 3.2821, |
| "step": 60050 |
| }, |
| { |
| "epoch": 6.468625551609084, |
| "grad_norm": 0.7436235547065735, |
| "learning_rate": 0.00021225298997952806, |
| "loss": 3.2707, |
| "step": 60100 |
| }, |
| { |
| "epoch": 6.474007103648693, |
| "grad_norm": 0.7943134307861328, |
| "learning_rate": 0.00021192974894946662, |
| "loss": 3.278, |
| "step": 60150 |
| }, |
| { |
| "epoch": 6.4793886556883, |
| "grad_norm": 0.7859511971473694, |
| "learning_rate": 0.00021160650791940524, |
| "loss": 3.2665, |
| "step": 60200 |
| }, |
| { |
| "epoch": 6.484770207727909, |
| "grad_norm": 0.7386723160743713, |
| "learning_rate": 0.0002112832668893438, |
| "loss": 3.2707, |
| "step": 60250 |
| }, |
| { |
| "epoch": 6.490151759767517, |
| "grad_norm": 0.7917008399963379, |
| "learning_rate": 0.00021096002585928238, |
| "loss": 3.2784, |
| "step": 60300 |
| }, |
| { |
| "epoch": 6.495533311807125, |
| "grad_norm": 0.7543750405311584, |
| "learning_rate": 0.00021063678482922097, |
| "loss": 3.2963, |
| "step": 60350 |
| }, |
| { |
| "epoch": 6.500914863846734, |
| "grad_norm": 0.6963995695114136, |
| "learning_rate": 0.00021031354379915957, |
| "loss": 3.2862, |
| "step": 60400 |
| }, |
| { |
| "epoch": 6.506296415886341, |
| "grad_norm": 0.733762264251709, |
| "learning_rate": 0.00020999030276909814, |
| "loss": 3.2699, |
| "step": 60450 |
| }, |
| { |
| "epoch": 6.51167796792595, |
| "grad_norm": 0.7302635908126831, |
| "learning_rate": 0.00020966706173903673, |
| "loss": 3.2793, |
| "step": 60500 |
| }, |
| { |
| "epoch": 6.517059519965558, |
| "grad_norm": 0.7205909490585327, |
| "learning_rate": 0.0002093438207089753, |
| "loss": 3.2815, |
| "step": 60550 |
| }, |
| { |
| "epoch": 6.522441072005166, |
| "grad_norm": 0.7015230059623718, |
| "learning_rate": 0.00020902057967891387, |
| "loss": 3.2864, |
| "step": 60600 |
| }, |
| { |
| "epoch": 6.5278226240447745, |
| "grad_norm": 0.7541133165359497, |
| "learning_rate": 0.0002086973386488525, |
| "loss": 3.2772, |
| "step": 60650 |
| }, |
| { |
| "epoch": 6.533204176084383, |
| "grad_norm": 0.7334636449813843, |
| "learning_rate": 0.00020837409761879106, |
| "loss": 3.276, |
| "step": 60700 |
| }, |
| { |
| "epoch": 6.538585728123991, |
| "grad_norm": 0.7742342352867126, |
| "learning_rate": 0.00020805085658872962, |
| "loss": 3.2825, |
| "step": 60750 |
| }, |
| { |
| "epoch": 6.543967280163599, |
| "grad_norm": 0.7646075487136841, |
| "learning_rate": 0.00020772761555866825, |
| "loss": 3.28, |
| "step": 60800 |
| }, |
| { |
| "epoch": 6.549348832203208, |
| "grad_norm": 0.7698816657066345, |
| "learning_rate": 0.0002074043745286068, |
| "loss": 3.2784, |
| "step": 60850 |
| }, |
| { |
| "epoch": 6.5547303842428155, |
| "grad_norm": 0.7581663131713867, |
| "learning_rate": 0.00020708759831914662, |
| "loss": 3.2789, |
| "step": 60900 |
| }, |
| { |
| "epoch": 6.560111936282424, |
| "grad_norm": 0.7218497395515442, |
| "learning_rate": 0.00020676435728908521, |
| "loss": 3.2652, |
| "step": 60950 |
| }, |
| { |
| "epoch": 6.565493488322032, |
| "grad_norm": 0.7453391551971436, |
| "learning_rate": 0.0002064411162590238, |
| "loss": 3.2909, |
| "step": 61000 |
| }, |
| { |
| "epoch": 6.565493488322032, |
| "eval_accuracy": 0.3859504862271056, |
| "eval_loss": 3.368067502975464, |
| "eval_runtime": 188.6949, |
| "eval_samples_per_second": 95.45, |
| "eval_steps_per_second": 5.967, |
| "step": 61000 |
| }, |
| { |
| "epoch": 6.57087504036164, |
| "grad_norm": 0.760221004486084, |
| "learning_rate": 0.00020611787522896238, |
| "loss": 3.2769, |
| "step": 61050 |
| }, |
| { |
| "epoch": 6.576256592401249, |
| "grad_norm": 0.7161414623260498, |
| "learning_rate": 0.00020579463419890094, |
| "loss": 3.2877, |
| "step": 61100 |
| }, |
| { |
| "epoch": 6.5816381444408565, |
| "grad_norm": 0.7219290137290955, |
| "learning_rate": 0.00020547139316883957, |
| "loss": 3.2697, |
| "step": 61150 |
| }, |
| { |
| "epoch": 6.587019696480465, |
| "grad_norm": 0.7357455492019653, |
| "learning_rate": 0.00020514815213877813, |
| "loss": 3.2605, |
| "step": 61200 |
| }, |
| { |
| "epoch": 6.592401248520073, |
| "grad_norm": 0.7844370007514954, |
| "learning_rate": 0.0002048249111087167, |
| "loss": 3.2879, |
| "step": 61250 |
| }, |
| { |
| "epoch": 6.597782800559681, |
| "grad_norm": 0.7833907604217529, |
| "learning_rate": 0.00020450167007865532, |
| "loss": 3.2809, |
| "step": 61300 |
| }, |
| { |
| "epoch": 6.60316435259929, |
| "grad_norm": 0.7428874373435974, |
| "learning_rate": 0.0002041784290485939, |
| "loss": 3.2752, |
| "step": 61350 |
| }, |
| { |
| "epoch": 6.608545904638898, |
| "grad_norm": 0.7579718232154846, |
| "learning_rate": 0.00020385518801853246, |
| "loss": 3.292, |
| "step": 61400 |
| }, |
| { |
| "epoch": 6.613927456678506, |
| "grad_norm": 0.8111922740936279, |
| "learning_rate": 0.00020353194698847105, |
| "loss": 3.2782, |
| "step": 61450 |
| }, |
| { |
| "epoch": 6.619309008718115, |
| "grad_norm": 0.7089512348175049, |
| "learning_rate": 0.00020320870595840965, |
| "loss": 3.2841, |
| "step": 61500 |
| }, |
| { |
| "epoch": 6.624690560757722, |
| "grad_norm": 0.8609421849250793, |
| "learning_rate": 0.00020289192974894945, |
| "loss": 3.2795, |
| "step": 61550 |
| }, |
| { |
| "epoch": 6.630072112797331, |
| "grad_norm": 0.7742621302604675, |
| "learning_rate": 0.00020256868871888802, |
| "loss": 3.2916, |
| "step": 61600 |
| }, |
| { |
| "epoch": 6.635453664836939, |
| "grad_norm": 0.7429343461990356, |
| "learning_rate": 0.00020224544768882664, |
| "loss": 3.282, |
| "step": 61650 |
| }, |
| { |
| "epoch": 6.640835216876547, |
| "grad_norm": 0.7164751887321472, |
| "learning_rate": 0.0002019222066587652, |
| "loss": 3.2739, |
| "step": 61700 |
| }, |
| { |
| "epoch": 6.6462167689161555, |
| "grad_norm": 0.749731719493866, |
| "learning_rate": 0.00020159896562870378, |
| "loss": 3.2904, |
| "step": 61750 |
| }, |
| { |
| "epoch": 6.651598320955763, |
| "grad_norm": 0.800025463104248, |
| "learning_rate": 0.00020127572459864237, |
| "loss": 3.2888, |
| "step": 61800 |
| }, |
| { |
| "epoch": 6.656979872995372, |
| "grad_norm": 0.7650232911109924, |
| "learning_rate": 0.00020095248356858097, |
| "loss": 3.2782, |
| "step": 61850 |
| }, |
| { |
| "epoch": 6.66236142503498, |
| "grad_norm": 0.8025387525558472, |
| "learning_rate": 0.00020062924253851953, |
| "loss": 3.2902, |
| "step": 61900 |
| }, |
| { |
| "epoch": 6.667742977074588, |
| "grad_norm": 0.7375674247741699, |
| "learning_rate": 0.00020030600150845813, |
| "loss": 3.28, |
| "step": 61950 |
| }, |
| { |
| "epoch": 6.6731245291141965, |
| "grad_norm": 0.7235491275787354, |
| "learning_rate": 0.0001999827604783967, |
| "loss": 3.2517, |
| "step": 62000 |
| }, |
| { |
| "epoch": 6.6731245291141965, |
| "eval_accuracy": 0.38661424671876427, |
| "eval_loss": 3.3633713722229004, |
| "eval_runtime": 215.2915, |
| "eval_samples_per_second": 83.659, |
| "eval_steps_per_second": 5.23, |
| "step": 62000 |
| }, |
| { |
| "epoch": 6.678506081153805, |
| "grad_norm": 0.7825114727020264, |
| "learning_rate": 0.0001996595194483353, |
| "loss": 3.2896, |
| "step": 62050 |
| }, |
| { |
| "epoch": 6.683887633193413, |
| "grad_norm": 0.7919412851333618, |
| "learning_rate": 0.00019933627841827389, |
| "loss": 3.2721, |
| "step": 62100 |
| }, |
| { |
| "epoch": 6.689269185233021, |
| "grad_norm": 0.7695334553718567, |
| "learning_rate": 0.00019901303738821245, |
| "loss": 3.2796, |
| "step": 62150 |
| }, |
| { |
| "epoch": 6.69465073727263, |
| "grad_norm": 0.7713647484779358, |
| "learning_rate": 0.00019868979635815102, |
| "loss": 3.2701, |
| "step": 62200 |
| }, |
| { |
| "epoch": 6.7000322893122375, |
| "grad_norm": 0.7982776761054993, |
| "learning_rate": 0.00019836655532808964, |
| "loss": 3.2907, |
| "step": 62250 |
| }, |
| { |
| "epoch": 6.705413841351846, |
| "grad_norm": 0.7364196181297302, |
| "learning_rate": 0.0001980433142980282, |
| "loss": 3.3025, |
| "step": 62300 |
| }, |
| { |
| "epoch": 6.710795393391454, |
| "grad_norm": 0.7790904641151428, |
| "learning_rate": 0.00019772007326796678, |
| "loss": 3.2641, |
| "step": 62350 |
| }, |
| { |
| "epoch": 6.716176945431062, |
| "grad_norm": 0.7802610397338867, |
| "learning_rate": 0.0001973968322379054, |
| "loss": 3.3106, |
| "step": 62400 |
| }, |
| { |
| "epoch": 6.721558497470671, |
| "grad_norm": 0.7998891472816467, |
| "learning_rate": 0.00019707359120784397, |
| "loss": 3.3037, |
| "step": 62450 |
| }, |
| { |
| "epoch": 6.7269400495102785, |
| "grad_norm": 0.7798315286636353, |
| "learning_rate": 0.00019675035017778253, |
| "loss": 3.2986, |
| "step": 62500 |
| }, |
| { |
| "epoch": 6.732321601549887, |
| "grad_norm": 0.7587239146232605, |
| "learning_rate": 0.00019642710914772113, |
| "loss": 3.2855, |
| "step": 62550 |
| }, |
| { |
| "epoch": 6.737703153589496, |
| "grad_norm": 0.7729638814926147, |
| "learning_rate": 0.00019610386811765972, |
| "loss": 3.273, |
| "step": 62600 |
| }, |
| { |
| "epoch": 6.743084705629103, |
| "grad_norm": 0.7621460556983948, |
| "learning_rate": 0.00019578062708759832, |
| "loss": 3.2794, |
| "step": 62650 |
| }, |
| { |
| "epoch": 6.748466257668712, |
| "grad_norm": 0.7755793333053589, |
| "learning_rate": 0.00019545738605753689, |
| "loss": 3.2871, |
| "step": 62700 |
| }, |
| { |
| "epoch": 6.75384780970832, |
| "grad_norm": 0.7545557618141174, |
| "learning_rate": 0.00019513414502747545, |
| "loss": 3.2949, |
| "step": 62750 |
| }, |
| { |
| "epoch": 6.759229361747928, |
| "grad_norm": 0.8555020093917847, |
| "learning_rate": 0.00019481090399741408, |
| "loss": 3.294, |
| "step": 62800 |
| }, |
| { |
| "epoch": 6.7646109137875365, |
| "grad_norm": 0.8784904479980469, |
| "learning_rate": 0.00019448766296735264, |
| "loss": 3.289, |
| "step": 62850 |
| }, |
| { |
| "epoch": 6.769992465827144, |
| "grad_norm": 0.7961408495903015, |
| "learning_rate": 0.0001941644219372912, |
| "loss": 3.2854, |
| "step": 62900 |
| }, |
| { |
| "epoch": 6.775374017866753, |
| "grad_norm": 0.7516131401062012, |
| "learning_rate": 0.00019384118090722983, |
| "loss": 3.2826, |
| "step": 62950 |
| }, |
| { |
| "epoch": 6.780755569906361, |
| "grad_norm": 0.7225354313850403, |
| "learning_rate": 0.0001935179398771684, |
| "loss": 3.2907, |
| "step": 63000 |
| }, |
| { |
| "epoch": 6.780755569906361, |
| "eval_accuracy": 0.38661718034673526, |
| "eval_loss": 3.3603904247283936, |
| "eval_runtime": 186.1062, |
| "eval_samples_per_second": 96.778, |
| "eval_steps_per_second": 6.05, |
| "step": 63000 |
| }, |
| { |
| "epoch": 6.786137121945969, |
| "grad_norm": 0.7398602366447449, |
| "learning_rate": 0.00019319469884710697, |
| "loss": 3.2655, |
| "step": 63050 |
| }, |
| { |
| "epoch": 6.7915186739855775, |
| "grad_norm": 0.7899209260940552, |
| "learning_rate": 0.00019287145781704556, |
| "loss": 3.2906, |
| "step": 63100 |
| }, |
| { |
| "epoch": 6.796900226025185, |
| "grad_norm": 0.8090075850486755, |
| "learning_rate": 0.00019254821678698416, |
| "loss": 3.2966, |
| "step": 63150 |
| }, |
| { |
| "epoch": 6.802281778064794, |
| "grad_norm": 0.7782484889030457, |
| "learning_rate": 0.00019222497575692272, |
| "loss": 3.2666, |
| "step": 63200 |
| }, |
| { |
| "epoch": 6.807663330104402, |
| "grad_norm": 0.7428813576698303, |
| "learning_rate": 0.00019190173472686132, |
| "loss": 3.2765, |
| "step": 63250 |
| }, |
| { |
| "epoch": 6.813044882144011, |
| "grad_norm": 0.7877742052078247, |
| "learning_rate": 0.0001915784936967999, |
| "loss": 3.2876, |
| "step": 63300 |
| }, |
| { |
| "epoch": 6.8184264341836185, |
| "grad_norm": 0.7624988555908203, |
| "learning_rate": 0.00019125525266673845, |
| "loss": 3.2792, |
| "step": 63350 |
| }, |
| { |
| "epoch": 6.823807986223227, |
| "grad_norm": 0.7845025658607483, |
| "learning_rate": 0.00019093201163667708, |
| "loss": 3.2922, |
| "step": 63400 |
| }, |
| { |
| "epoch": 6.829189538262835, |
| "grad_norm": 0.7649354934692383, |
| "learning_rate": 0.00019060877060661564, |
| "loss": 3.278, |
| "step": 63450 |
| }, |
| { |
| "epoch": 6.834571090302443, |
| "grad_norm": 0.7455893754959106, |
| "learning_rate": 0.0001902855295765542, |
| "loss": 3.2766, |
| "step": 63500 |
| }, |
| { |
| "epoch": 6.839952642342052, |
| "grad_norm": 0.7284469604492188, |
| "learning_rate": 0.00018996228854649283, |
| "loss": 3.2942, |
| "step": 63550 |
| }, |
| { |
| "epoch": 6.8453341943816595, |
| "grad_norm": 0.7722787261009216, |
| "learning_rate": 0.0001896390475164314, |
| "loss": 3.2727, |
| "step": 63600 |
| }, |
| { |
| "epoch": 6.850715746421268, |
| "grad_norm": 0.7689815163612366, |
| "learning_rate": 0.00018931580648637, |
| "loss": 3.2934, |
| "step": 63650 |
| }, |
| { |
| "epoch": 6.856097298460876, |
| "grad_norm": 0.7457485795021057, |
| "learning_rate": 0.00018899256545630856, |
| "loss": 3.2886, |
| "step": 63700 |
| }, |
| { |
| "epoch": 6.861478850500484, |
| "grad_norm": 0.7541655898094177, |
| "learning_rate": 0.00018866932442624716, |
| "loss": 3.2864, |
| "step": 63750 |
| }, |
| { |
| "epoch": 6.866860402540093, |
| "grad_norm": 0.8029109835624695, |
| "learning_rate": 0.00018835254821678696, |
| "loss": 3.2789, |
| "step": 63800 |
| }, |
| { |
| "epoch": 6.8722419545797, |
| "grad_norm": 0.7254763841629028, |
| "learning_rate": 0.00018802930718672553, |
| "loss": 3.2982, |
| "step": 63850 |
| }, |
| { |
| "epoch": 6.877623506619309, |
| "grad_norm": 0.7659353017807007, |
| "learning_rate": 0.00018770606615666415, |
| "loss": 3.2749, |
| "step": 63900 |
| }, |
| { |
| "epoch": 6.8830050586589175, |
| "grad_norm": 0.7792415022850037, |
| "learning_rate": 0.00018738282512660272, |
| "loss": 3.2786, |
| "step": 63950 |
| }, |
| { |
| "epoch": 6.888386610698525, |
| "grad_norm": 0.7490355372428894, |
| "learning_rate": 0.0001870595840965413, |
| "loss": 3.2777, |
| "step": 64000 |
| }, |
| { |
| "epoch": 6.888386610698525, |
| "eval_accuracy": 0.38715718519917325, |
| "eval_loss": 3.355168342590332, |
| "eval_runtime": 185.7745, |
| "eval_samples_per_second": 96.951, |
| "eval_steps_per_second": 6.061, |
| "step": 64000 |
| }, |
| { |
| "epoch": 6.893768162738134, |
| "grad_norm": 0.7168199419975281, |
| "learning_rate": 0.0001867363430664799, |
| "loss": 3.2862, |
| "step": 64050 |
| }, |
| { |
| "epoch": 6.899149714777742, |
| "grad_norm": 0.7434327602386475, |
| "learning_rate": 0.00018641310203641848, |
| "loss": 3.2984, |
| "step": 64100 |
| }, |
| { |
| "epoch": 6.90453126681735, |
| "grad_norm": 0.7520498037338257, |
| "learning_rate": 0.00018608986100635705, |
| "loss": 3.2909, |
| "step": 64150 |
| }, |
| { |
| "epoch": 6.9099128188569585, |
| "grad_norm": 0.7482304573059082, |
| "learning_rate": 0.00018576661997629564, |
| "loss": 3.2872, |
| "step": 64200 |
| }, |
| { |
| "epoch": 6.915294370896566, |
| "grad_norm": 0.7380110621452332, |
| "learning_rate": 0.00018544337894623423, |
| "loss": 3.2777, |
| "step": 64250 |
| }, |
| { |
| "epoch": 6.920675922936175, |
| "grad_norm": 0.7658602595329285, |
| "learning_rate": 0.0001851201379161728, |
| "loss": 3.2829, |
| "step": 64300 |
| }, |
| { |
| "epoch": 6.926057474975783, |
| "grad_norm": 0.750186562538147, |
| "learning_rate": 0.0001847968968861114, |
| "loss": 3.2828, |
| "step": 64350 |
| }, |
| { |
| "epoch": 6.931439027015391, |
| "grad_norm": 0.7942940592765808, |
| "learning_rate": 0.00018447365585604996, |
| "loss": 3.2822, |
| "step": 64400 |
| }, |
| { |
| "epoch": 6.9368205790549995, |
| "grad_norm": 0.7206134796142578, |
| "learning_rate": 0.00018415041482598859, |
| "loss": 3.2783, |
| "step": 64450 |
| }, |
| { |
| "epoch": 6.942202131094608, |
| "grad_norm": 0.819940447807312, |
| "learning_rate": 0.00018382717379592715, |
| "loss": 3.2769, |
| "step": 64500 |
| }, |
| { |
| "epoch": 6.947583683134216, |
| "grad_norm": 0.7407217621803284, |
| "learning_rate": 0.00018350393276586572, |
| "loss": 3.2826, |
| "step": 64550 |
| }, |
| { |
| "epoch": 6.952965235173824, |
| "grad_norm": 0.7760574817657471, |
| "learning_rate": 0.00018318069173580434, |
| "loss": 3.2724, |
| "step": 64600 |
| }, |
| { |
| "epoch": 6.958346787213433, |
| "grad_norm": 0.773434042930603, |
| "learning_rate": 0.0001828574507057429, |
| "loss": 3.2757, |
| "step": 64650 |
| }, |
| { |
| "epoch": 6.9637283392530405, |
| "grad_norm": 0.761227548122406, |
| "learning_rate": 0.00018253420967568148, |
| "loss": 3.2878, |
| "step": 64700 |
| }, |
| { |
| "epoch": 6.969109891292649, |
| "grad_norm": 0.80083829164505, |
| "learning_rate": 0.00018221096864562007, |
| "loss": 3.2829, |
| "step": 64750 |
| }, |
| { |
| "epoch": 6.974491443332257, |
| "grad_norm": 0.7639033794403076, |
| "learning_rate": 0.00018188772761555867, |
| "loss": 3.2665, |
| "step": 64800 |
| }, |
| { |
| "epoch": 6.979872995371865, |
| "grad_norm": 0.769054651260376, |
| "learning_rate": 0.00018156448658549723, |
| "loss": 3.3075, |
| "step": 64850 |
| }, |
| { |
| "epoch": 6.985254547411474, |
| "grad_norm": 0.769734263420105, |
| "learning_rate": 0.00018124124555543583, |
| "loss": 3.2727, |
| "step": 64900 |
| }, |
| { |
| "epoch": 6.990636099451081, |
| "grad_norm": 0.7468952536582947, |
| "learning_rate": 0.0001809180045253744, |
| "loss": 3.2627, |
| "step": 64950 |
| }, |
| { |
| "epoch": 6.99601765149069, |
| "grad_norm": 0.7479400634765625, |
| "learning_rate": 0.00018059476349531296, |
| "loss": 3.2879, |
| "step": 65000 |
| }, |
| { |
| "epoch": 6.99601765149069, |
| "eval_accuracy": 0.38733972205070155, |
| "eval_loss": 3.3511366844177246, |
| "eval_runtime": 185.9727, |
| "eval_samples_per_second": 96.848, |
| "eval_steps_per_second": 6.055, |
| "step": 65000 |
| }, |
| { |
| "epoch": 7.0013992035302985, |
| "grad_norm": 0.7707948088645935, |
| "learning_rate": 0.00018027152246525159, |
| "loss": 3.2579, |
| "step": 65050 |
| }, |
| { |
| "epoch": 7.006780755569906, |
| "grad_norm": 0.8526560068130493, |
| "learning_rate": 0.00017994828143519015, |
| "loss": 3.2101, |
| "step": 65100 |
| }, |
| { |
| "epoch": 7.012162307609515, |
| "grad_norm": 0.7695967555046082, |
| "learning_rate": 0.00017962504040512872, |
| "loss": 3.1905, |
| "step": 65150 |
| }, |
| { |
| "epoch": 7.017543859649122, |
| "grad_norm": 0.769116222858429, |
| "learning_rate": 0.00017930179937506734, |
| "loss": 3.2002, |
| "step": 65200 |
| }, |
| { |
| "epoch": 7.022925411688731, |
| "grad_norm": 0.7766295671463013, |
| "learning_rate": 0.0001789785583450059, |
| "loss": 3.2006, |
| "step": 65250 |
| }, |
| { |
| "epoch": 7.0283069637283395, |
| "grad_norm": 0.7473772168159485, |
| "learning_rate": 0.00017865531731494448, |
| "loss": 3.1989, |
| "step": 65300 |
| }, |
| { |
| "epoch": 7.033688515767947, |
| "grad_norm": 0.8230901956558228, |
| "learning_rate": 0.00017833207628488307, |
| "loss": 3.2103, |
| "step": 65350 |
| }, |
| { |
| "epoch": 7.039070067807556, |
| "grad_norm": 0.7858790159225464, |
| "learning_rate": 0.00017800883525482167, |
| "loss": 3.201, |
| "step": 65400 |
| }, |
| { |
| "epoch": 7.044451619847164, |
| "grad_norm": 0.7520588040351868, |
| "learning_rate": 0.00017768559422476026, |
| "loss": 3.2173, |
| "step": 65450 |
| }, |
| { |
| "epoch": 7.049833171886772, |
| "grad_norm": 0.7921962738037109, |
| "learning_rate": 0.00017736235319469883, |
| "loss": 3.2317, |
| "step": 65500 |
| }, |
| { |
| "epoch": 7.0552147239263805, |
| "grad_norm": 0.8519015312194824, |
| "learning_rate": 0.0001770391121646374, |
| "loss": 3.2154, |
| "step": 65550 |
| }, |
| { |
| "epoch": 7.060596275965988, |
| "grad_norm": 0.8122455477714539, |
| "learning_rate": 0.00017671587113457602, |
| "loss": 3.2125, |
| "step": 65600 |
| }, |
| { |
| "epoch": 7.065977828005597, |
| "grad_norm": 0.7628065943717957, |
| "learning_rate": 0.0001763926301045146, |
| "loss": 3.204, |
| "step": 65650 |
| }, |
| { |
| "epoch": 7.071359380045205, |
| "grad_norm": 0.7457402944564819, |
| "learning_rate": 0.00017606938907445315, |
| "loss": 3.1995, |
| "step": 65700 |
| }, |
| { |
| "epoch": 7.076740932084813, |
| "grad_norm": 0.7427577972412109, |
| "learning_rate": 0.00017574614804439178, |
| "loss": 3.2073, |
| "step": 65750 |
| }, |
| { |
| "epoch": 7.0821224841244215, |
| "grad_norm": 0.7844288349151611, |
| "learning_rate": 0.00017542290701433034, |
| "loss": 3.2033, |
| "step": 65800 |
| }, |
| { |
| "epoch": 7.08750403616403, |
| "grad_norm": 0.7779414653778076, |
| "learning_rate": 0.0001750996659842689, |
| "loss": 3.205, |
| "step": 65850 |
| }, |
| { |
| "epoch": 7.092885588203638, |
| "grad_norm": 0.7393507361412048, |
| "learning_rate": 0.0001747764249542075, |
| "loss": 3.2114, |
| "step": 65900 |
| }, |
| { |
| "epoch": 7.098267140243246, |
| "grad_norm": 0.7399935722351074, |
| "learning_rate": 0.0001744531839241461, |
| "loss": 3.1996, |
| "step": 65950 |
| }, |
| { |
| "epoch": 7.103648692282855, |
| "grad_norm": 0.7557267546653748, |
| "learning_rate": 0.00017412994289408467, |
| "loss": 3.2072, |
| "step": 66000 |
| }, |
| { |
| "epoch": 7.103648692282855, |
| "eval_accuracy": 0.3874779285240016, |
| "eval_loss": 3.3563175201416016, |
| "eval_runtime": 185.9082, |
| "eval_samples_per_second": 96.881, |
| "eval_steps_per_second": 6.057, |
| "step": 66000 |
| }, |
| { |
| "epoch": 7.109030244322462, |
| "grad_norm": 0.8268268704414368, |
| "learning_rate": 0.00017380670186402326, |
| "loss": 3.2159, |
| "step": 66050 |
| }, |
| { |
| "epoch": 7.114411796362071, |
| "grad_norm": 0.7251294851303101, |
| "learning_rate": 0.00017348346083396183, |
| "loss": 3.2203, |
| "step": 66100 |
| }, |
| { |
| "epoch": 7.119793348401679, |
| "grad_norm": 0.8212281465530396, |
| "learning_rate": 0.00017316021980390042, |
| "loss": 3.2034, |
| "step": 66150 |
| }, |
| { |
| "epoch": 7.125174900441287, |
| "grad_norm": 0.7860719561576843, |
| "learning_rate": 0.00017283697877383902, |
| "loss": 3.212, |
| "step": 66200 |
| }, |
| { |
| "epoch": 7.130556452480896, |
| "grad_norm": 0.7960042357444763, |
| "learning_rate": 0.0001725137377437776, |
| "loss": 3.2086, |
| "step": 66250 |
| }, |
| { |
| "epoch": 7.135938004520503, |
| "grad_norm": 0.8287146687507629, |
| "learning_rate": 0.00017219049671371615, |
| "loss": 3.2168, |
| "step": 66300 |
| }, |
| { |
| "epoch": 7.141319556560112, |
| "grad_norm": 0.7764856219291687, |
| "learning_rate": 0.00017186725568365478, |
| "loss": 3.2132, |
| "step": 66350 |
| }, |
| { |
| "epoch": 7.1467011085997205, |
| "grad_norm": 0.7876928448677063, |
| "learning_rate": 0.00017154401465359334, |
| "loss": 3.2174, |
| "step": 66400 |
| }, |
| { |
| "epoch": 7.152082660639328, |
| "grad_norm": 0.7779263854026794, |
| "learning_rate": 0.00017122077362353194, |
| "loss": 3.2185, |
| "step": 66450 |
| }, |
| { |
| "epoch": 7.157464212678937, |
| "grad_norm": 0.7854360342025757, |
| "learning_rate": 0.00017089753259347053, |
| "loss": 3.2339, |
| "step": 66500 |
| }, |
| { |
| "epoch": 7.162845764718545, |
| "grad_norm": 0.7960987687110901, |
| "learning_rate": 0.0001705742915634091, |
| "loss": 3.2212, |
| "step": 66550 |
| }, |
| { |
| "epoch": 7.168227316758153, |
| "grad_norm": 0.7815688252449036, |
| "learning_rate": 0.0001702510505333477, |
| "loss": 3.2474, |
| "step": 66600 |
| }, |
| { |
| "epoch": 7.1736088687977615, |
| "grad_norm": 0.7685019969940186, |
| "learning_rate": 0.00016992780950328626, |
| "loss": 3.2106, |
| "step": 66650 |
| }, |
| { |
| "epoch": 7.178990420837369, |
| "grad_norm": 0.8606351613998413, |
| "learning_rate": 0.00016960456847322486, |
| "loss": 3.2008, |
| "step": 66700 |
| }, |
| { |
| "epoch": 7.184371972876978, |
| "grad_norm": 0.8394185304641724, |
| "learning_rate": 0.00016928132744316345, |
| "loss": 3.2173, |
| "step": 66750 |
| }, |
| { |
| "epoch": 7.189753524916586, |
| "grad_norm": 0.7554501891136169, |
| "learning_rate": 0.00016895808641310202, |
| "loss": 3.2126, |
| "step": 66800 |
| }, |
| { |
| "epoch": 7.195135076956194, |
| "grad_norm": 0.7635372877120972, |
| "learning_rate": 0.0001686348453830406, |
| "loss": 3.2201, |
| "step": 66850 |
| }, |
| { |
| "epoch": 7.2005166289958025, |
| "grad_norm": 0.7927452921867371, |
| "learning_rate": 0.0001683116043529792, |
| "loss": 3.2159, |
| "step": 66900 |
| }, |
| { |
| "epoch": 7.205898181035411, |
| "grad_norm": 0.8126789331436157, |
| "learning_rate": 0.00016798836332291778, |
| "loss": 3.2263, |
| "step": 66950 |
| }, |
| { |
| "epoch": 7.211279733075019, |
| "grad_norm": 0.787567675113678, |
| "learning_rate": 0.00016766512229285634, |
| "loss": 3.2178, |
| "step": 67000 |
| }, |
| { |
| "epoch": 7.211279733075019, |
| "eval_accuracy": 0.3875839737425085, |
| "eval_loss": 3.3573434352874756, |
| "eval_runtime": 185.8381, |
| "eval_samples_per_second": 96.918, |
| "eval_steps_per_second": 6.059, |
| "step": 67000 |
| }, |
| { |
| "epoch": 7.216661285114627, |
| "grad_norm": 0.7294377088546753, |
| "learning_rate": 0.00016734188126279494, |
| "loss": 3.226, |
| "step": 67050 |
| }, |
| { |
| "epoch": 7.222042837154235, |
| "grad_norm": 0.7648470401763916, |
| "learning_rate": 0.00016701864023273353, |
| "loss": 3.2196, |
| "step": 67100 |
| }, |
| { |
| "epoch": 7.2274243891938434, |
| "grad_norm": 0.8017888069152832, |
| "learning_rate": 0.0001666953992026721, |
| "loss": 3.2381, |
| "step": 67150 |
| }, |
| { |
| "epoch": 7.232805941233452, |
| "grad_norm": 0.7950610518455505, |
| "learning_rate": 0.0001663721581726107, |
| "loss": 3.2203, |
| "step": 67200 |
| }, |
| { |
| "epoch": 7.23818749327306, |
| "grad_norm": 0.7861783504486084, |
| "learning_rate": 0.00016604891714254926, |
| "loss": 3.2071, |
| "step": 67250 |
| }, |
| { |
| "epoch": 7.243569045312668, |
| "grad_norm": 0.8108430504798889, |
| "learning_rate": 0.00016572567611248786, |
| "loss": 3.215, |
| "step": 67300 |
| }, |
| { |
| "epoch": 7.248950597352277, |
| "grad_norm": 0.7876373529434204, |
| "learning_rate": 0.00016540243508242645, |
| "loss": 3.2135, |
| "step": 67350 |
| }, |
| { |
| "epoch": 7.254332149391884, |
| "grad_norm": 0.7766739130020142, |
| "learning_rate": 0.00016507919405236502, |
| "loss": 3.226, |
| "step": 67400 |
| }, |
| { |
| "epoch": 7.259713701431493, |
| "grad_norm": 0.7911031246185303, |
| "learning_rate": 0.00016475595302230364, |
| "loss": 3.2016, |
| "step": 67450 |
| }, |
| { |
| "epoch": 7.265095253471101, |
| "grad_norm": 0.9022691249847412, |
| "learning_rate": 0.0001644327119922422, |
| "loss": 3.238, |
| "step": 67500 |
| }, |
| { |
| "epoch": 7.270476805510709, |
| "grad_norm": 0.7787434458732605, |
| "learning_rate": 0.00016410947096218078, |
| "loss": 3.2115, |
| "step": 67550 |
| }, |
| { |
| "epoch": 7.275858357550318, |
| "grad_norm": 0.778066873550415, |
| "learning_rate": 0.00016378622993211937, |
| "loss": 3.2363, |
| "step": 67600 |
| }, |
| { |
| "epoch": 7.281239909589925, |
| "grad_norm": 0.7604992389678955, |
| "learning_rate": 0.00016346298890205797, |
| "loss": 3.2236, |
| "step": 67650 |
| }, |
| { |
| "epoch": 7.286621461629534, |
| "grad_norm": 0.8357788920402527, |
| "learning_rate": 0.00016313974787199653, |
| "loss": 3.2204, |
| "step": 67700 |
| }, |
| { |
| "epoch": 7.2920030136691425, |
| "grad_norm": 0.834628701210022, |
| "learning_rate": 0.00016281650684193513, |
| "loss": 3.2103, |
| "step": 67750 |
| }, |
| { |
| "epoch": 7.29738456570875, |
| "grad_norm": 0.8433746099472046, |
| "learning_rate": 0.00016249973063247494, |
| "loss": 3.2284, |
| "step": 67800 |
| }, |
| { |
| "epoch": 7.302766117748359, |
| "grad_norm": 0.8454276323318481, |
| "learning_rate": 0.00016217648960241353, |
| "loss": 3.2091, |
| "step": 67850 |
| }, |
| { |
| "epoch": 7.308147669787967, |
| "grad_norm": 0.8023149371147156, |
| "learning_rate": 0.0001618532485723521, |
| "loss": 3.2364, |
| "step": 67900 |
| }, |
| { |
| "epoch": 7.313529221827575, |
| "grad_norm": 0.7875514626502991, |
| "learning_rate": 0.00016153000754229067, |
| "loss": 3.2244, |
| "step": 67950 |
| }, |
| { |
| "epoch": 7.3189107738671835, |
| "grad_norm": 0.77104252576828, |
| "learning_rate": 0.0001612067665122293, |
| "loss": 3.2177, |
| "step": 68000 |
| }, |
| { |
| "epoch": 7.3189107738671835, |
| "eval_accuracy": 0.3879769712377335, |
| "eval_loss": 3.3529610633850098, |
| "eval_runtime": 185.8729, |
| "eval_samples_per_second": 96.9, |
| "eval_steps_per_second": 6.058, |
| "step": 68000 |
| }, |
| { |
| "epoch": 7.324292325906791, |
| "grad_norm": 0.7934383153915405, |
| "learning_rate": 0.00016088352548216785, |
| "loss": 3.239, |
| "step": 68050 |
| }, |
| { |
| "epoch": 7.3296738779464, |
| "grad_norm": 0.8406181335449219, |
| "learning_rate": 0.00016056028445210642, |
| "loss": 3.241, |
| "step": 68100 |
| }, |
| { |
| "epoch": 7.335055429986008, |
| "grad_norm": 0.7851194143295288, |
| "learning_rate": 0.00016023704342204504, |
| "loss": 3.2231, |
| "step": 68150 |
| }, |
| { |
| "epoch": 7.340436982025616, |
| "grad_norm": 0.8027125597000122, |
| "learning_rate": 0.0001599138023919836, |
| "loss": 3.2247, |
| "step": 68200 |
| }, |
| { |
| "epoch": 7.3458185340652244, |
| "grad_norm": 0.7888418436050415, |
| "learning_rate": 0.0001595905613619222, |
| "loss": 3.2297, |
| "step": 68250 |
| }, |
| { |
| "epoch": 7.351200086104833, |
| "grad_norm": 0.7823399901390076, |
| "learning_rate": 0.00015926732033186077, |
| "loss": 3.2208, |
| "step": 68300 |
| }, |
| { |
| "epoch": 7.356581638144441, |
| "grad_norm": 0.8531885147094727, |
| "learning_rate": 0.00015894407930179934, |
| "loss": 3.227, |
| "step": 68350 |
| }, |
| { |
| "epoch": 7.361963190184049, |
| "grad_norm": 0.7732324600219727, |
| "learning_rate": 0.00015862083827173796, |
| "loss": 3.2108, |
| "step": 68400 |
| }, |
| { |
| "epoch": 7.367344742223658, |
| "grad_norm": 0.8048567771911621, |
| "learning_rate": 0.00015829759724167653, |
| "loss": 3.2174, |
| "step": 68450 |
| }, |
| { |
| "epoch": 7.372726294263265, |
| "grad_norm": 0.8031445741653442, |
| "learning_rate": 0.0001579743562116151, |
| "loss": 3.2214, |
| "step": 68500 |
| }, |
| { |
| "epoch": 7.378107846302874, |
| "grad_norm": 0.8209540247917175, |
| "learning_rate": 0.00015765111518155372, |
| "loss": 3.2361, |
| "step": 68550 |
| }, |
| { |
| "epoch": 7.383489398342482, |
| "grad_norm": 0.8266004323959351, |
| "learning_rate": 0.0001573278741514923, |
| "loss": 3.2196, |
| "step": 68600 |
| }, |
| { |
| "epoch": 7.38887095038209, |
| "grad_norm": 0.7896647453308105, |
| "learning_rate": 0.00015700463312143085, |
| "loss": 3.2209, |
| "step": 68650 |
| }, |
| { |
| "epoch": 7.394252502421699, |
| "grad_norm": 0.7958690524101257, |
| "learning_rate": 0.00015668139209136945, |
| "loss": 3.2401, |
| "step": 68700 |
| }, |
| { |
| "epoch": 7.399634054461306, |
| "grad_norm": 0.792015790939331, |
| "learning_rate": 0.00015635815106130804, |
| "loss": 3.2112, |
| "step": 68750 |
| }, |
| { |
| "epoch": 7.405015606500915, |
| "grad_norm": 0.8137146830558777, |
| "learning_rate": 0.0001560349100312466, |
| "loss": 3.2205, |
| "step": 68800 |
| }, |
| { |
| "epoch": 7.4103971585405235, |
| "grad_norm": 0.8189206123352051, |
| "learning_rate": 0.0001557116690011852, |
| "loss": 3.237, |
| "step": 68850 |
| }, |
| { |
| "epoch": 7.415778710580131, |
| "grad_norm": 0.7857847809791565, |
| "learning_rate": 0.00015538842797112377, |
| "loss": 3.2141, |
| "step": 68900 |
| }, |
| { |
| "epoch": 7.42116026261974, |
| "grad_norm": 0.7994054555892944, |
| "learning_rate": 0.00015506518694106237, |
| "loss": 3.2391, |
| "step": 68950 |
| }, |
| { |
| "epoch": 7.426541814659347, |
| "grad_norm": 0.7537856101989746, |
| "learning_rate": 0.00015474194591100096, |
| "loss": 3.2299, |
| "step": 69000 |
| }, |
| { |
| "epoch": 7.426541814659347, |
| "eval_accuracy": 0.38847720913323136, |
| "eval_loss": 3.347830057144165, |
| "eval_runtime": 186.1507, |
| "eval_samples_per_second": 96.755, |
| "eval_steps_per_second": 6.049, |
| "step": 69000 |
| }, |
| { |
| "epoch": 7.431923366698956, |
| "grad_norm": 0.7814086675643921, |
| "learning_rate": 0.00015441870488093953, |
| "loss": 3.2528, |
| "step": 69050 |
| }, |
| { |
| "epoch": 7.4373049187385645, |
| "grad_norm": 0.7771836519241333, |
| "learning_rate": 0.0001540954638508781, |
| "loss": 3.2299, |
| "step": 69100 |
| }, |
| { |
| "epoch": 7.442686470778172, |
| "grad_norm": 0.814217209815979, |
| "learning_rate": 0.00015377222282081672, |
| "loss": 3.2437, |
| "step": 69150 |
| }, |
| { |
| "epoch": 7.448068022817781, |
| "grad_norm": 0.7949915528297424, |
| "learning_rate": 0.0001534489817907553, |
| "loss": 3.2338, |
| "step": 69200 |
| }, |
| { |
| "epoch": 7.453449574857389, |
| "grad_norm": 0.8176887035369873, |
| "learning_rate": 0.00015312574076069388, |
| "loss": 3.2269, |
| "step": 69250 |
| }, |
| { |
| "epoch": 7.458831126896997, |
| "grad_norm": 0.8128419518470764, |
| "learning_rate": 0.00015280249973063248, |
| "loss": 3.2314, |
| "step": 69300 |
| }, |
| { |
| "epoch": 7.4642126789366054, |
| "grad_norm": 0.7866790294647217, |
| "learning_rate": 0.00015247925870057104, |
| "loss": 3.2153, |
| "step": 69350 |
| }, |
| { |
| "epoch": 7.469594230976213, |
| "grad_norm": 0.7787197828292847, |
| "learning_rate": 0.00015215601767050964, |
| "loss": 3.2333, |
| "step": 69400 |
| }, |
| { |
| "epoch": 7.474975783015822, |
| "grad_norm": 0.7818750143051147, |
| "learning_rate": 0.0001518327766404482, |
| "loss": 3.2315, |
| "step": 69450 |
| }, |
| { |
| "epoch": 7.48035733505543, |
| "grad_norm": 0.7789158225059509, |
| "learning_rate": 0.0001515095356103868, |
| "loss": 3.2233, |
| "step": 69500 |
| }, |
| { |
| "epoch": 7.485738887095038, |
| "grad_norm": 0.7852670550346375, |
| "learning_rate": 0.0001511862945803254, |
| "loss": 3.2384, |
| "step": 69550 |
| }, |
| { |
| "epoch": 7.491120439134646, |
| "grad_norm": 0.81417316198349, |
| "learning_rate": 0.00015086305355026396, |
| "loss": 3.2101, |
| "step": 69600 |
| }, |
| { |
| "epoch": 7.496501991174255, |
| "grad_norm": 0.7992369532585144, |
| "learning_rate": 0.00015053981252020253, |
| "loss": 3.2412, |
| "step": 69650 |
| }, |
| { |
| "epoch": 7.501883543213863, |
| "grad_norm": 0.8337027430534363, |
| "learning_rate": 0.00015021657149014115, |
| "loss": 3.2409, |
| "step": 69700 |
| }, |
| { |
| "epoch": 7.507265095253471, |
| "grad_norm": 0.8439802527427673, |
| "learning_rate": 0.00014989333046007972, |
| "loss": 3.2365, |
| "step": 69750 |
| }, |
| { |
| "epoch": 7.51264664729308, |
| "grad_norm": 0.8039165139198303, |
| "learning_rate": 0.00014957655425061953, |
| "loss": 3.2262, |
| "step": 69800 |
| }, |
| { |
| "epoch": 7.518028199332687, |
| "grad_norm": 0.8090606927871704, |
| "learning_rate": 0.00014925331322055812, |
| "loss": 3.2364, |
| "step": 69850 |
| }, |
| { |
| "epoch": 7.523409751372296, |
| "grad_norm": 0.8324903845787048, |
| "learning_rate": 0.0001489300721904967, |
| "loss": 3.2249, |
| "step": 69900 |
| }, |
| { |
| "epoch": 7.528791303411904, |
| "grad_norm": 0.8116533756256104, |
| "learning_rate": 0.00014860683116043528, |
| "loss": 3.2277, |
| "step": 69950 |
| }, |
| { |
| "epoch": 7.534172855451512, |
| "grad_norm": 0.8552218079566956, |
| "learning_rate": 0.00014828359013037385, |
| "loss": 3.2027, |
| "step": 70000 |
| }, |
| { |
| "epoch": 7.534172855451512, |
| "eval_accuracy": 0.38886879414091474, |
| "eval_loss": 3.3467564582824707, |
| "eval_runtime": 185.6527, |
| "eval_samples_per_second": 97.014, |
| "eval_steps_per_second": 6.065, |
| "step": 70000 |
| }, |
| { |
| "epoch": 7.539554407491121, |
| "grad_norm": 0.7482637763023376, |
| "learning_rate": 0.00014796034910031245, |
| "loss": 3.2255, |
| "step": 70050 |
| }, |
| { |
| "epoch": 7.544935959530728, |
| "grad_norm": 0.7789427638053894, |
| "learning_rate": 0.00014763710807025104, |
| "loss": 3.2254, |
| "step": 70100 |
| }, |
| { |
| "epoch": 7.550317511570337, |
| "grad_norm": 0.8304659128189087, |
| "learning_rate": 0.0001473138670401896, |
| "loss": 3.248, |
| "step": 70150 |
| }, |
| { |
| "epoch": 7.5556990636099455, |
| "grad_norm": 0.8365060091018677, |
| "learning_rate": 0.0001469906260101282, |
| "loss": 3.2318, |
| "step": 70200 |
| }, |
| { |
| "epoch": 7.561080615649553, |
| "grad_norm": 0.8208428025245667, |
| "learning_rate": 0.0001466673849800668, |
| "loss": 3.2282, |
| "step": 70250 |
| }, |
| { |
| "epoch": 7.566462167689162, |
| "grad_norm": 0.8882009387016296, |
| "learning_rate": 0.0001463441439500054, |
| "loss": 3.2248, |
| "step": 70300 |
| }, |
| { |
| "epoch": 7.57184371972877, |
| "grad_norm": 0.7620612978935242, |
| "learning_rate": 0.00014602090291994396, |
| "loss": 3.2401, |
| "step": 70350 |
| }, |
| { |
| "epoch": 7.577225271768378, |
| "grad_norm": 0.8021471500396729, |
| "learning_rate": 0.00014569766188988255, |
| "loss": 3.2176, |
| "step": 70400 |
| }, |
| { |
| "epoch": 7.5826068238079865, |
| "grad_norm": 0.7981626987457275, |
| "learning_rate": 0.00014537442085982112, |
| "loss": 3.2207, |
| "step": 70450 |
| }, |
| { |
| "epoch": 7.587988375847594, |
| "grad_norm": 0.819978654384613, |
| "learning_rate": 0.00014505117982975972, |
| "loss": 3.2319, |
| "step": 70500 |
| }, |
| { |
| "epoch": 7.593369927887203, |
| "grad_norm": 0.7760841250419617, |
| "learning_rate": 0.00014472793879969828, |
| "loss": 3.2426, |
| "step": 70550 |
| }, |
| { |
| "epoch": 7.598751479926811, |
| "grad_norm": 0.8547475337982178, |
| "learning_rate": 0.00014440469776963688, |
| "loss": 3.2469, |
| "step": 70600 |
| }, |
| { |
| "epoch": 7.604133031966419, |
| "grad_norm": 0.7779892683029175, |
| "learning_rate": 0.00014408145673957545, |
| "loss": 3.225, |
| "step": 70650 |
| }, |
| { |
| "epoch": 7.609514584006027, |
| "grad_norm": 0.8118804097175598, |
| "learning_rate": 0.00014375821570951404, |
| "loss": 3.2477, |
| "step": 70700 |
| }, |
| { |
| "epoch": 7.614896136045635, |
| "grad_norm": 0.8520888686180115, |
| "learning_rate": 0.00014343497467945264, |
| "loss": 3.2387, |
| "step": 70750 |
| }, |
| { |
| "epoch": 7.620277688085244, |
| "grad_norm": 0.807848334312439, |
| "learning_rate": 0.00014311173364939123, |
| "loss": 3.2286, |
| "step": 70800 |
| }, |
| { |
| "epoch": 7.625659240124852, |
| "grad_norm": 0.791697084903717, |
| "learning_rate": 0.0001427884926193298, |
| "loss": 3.2344, |
| "step": 70850 |
| }, |
| { |
| "epoch": 7.63104079216446, |
| "grad_norm": 0.8143545985221863, |
| "learning_rate": 0.0001424652515892684, |
| "loss": 3.2503, |
| "step": 70900 |
| }, |
| { |
| "epoch": 7.636422344204068, |
| "grad_norm": 0.7878372669219971, |
| "learning_rate": 0.000142142010559207, |
| "loss": 3.2284, |
| "step": 70950 |
| }, |
| { |
| "epoch": 7.641803896243677, |
| "grad_norm": 0.8296144604682922, |
| "learning_rate": 0.00014181876952914555, |
| "loss": 3.2336, |
| "step": 71000 |
| }, |
| { |
| "epoch": 7.641803896243677, |
| "eval_accuracy": 0.38922202467919964, |
| "eval_loss": 3.3422555923461914, |
| "eval_runtime": 185.7263, |
| "eval_samples_per_second": 96.976, |
| "eval_steps_per_second": 6.063, |
| "step": 71000 |
| }, |
| { |
| "epoch": 7.647185448283285, |
| "grad_norm": 0.7863085269927979, |
| "learning_rate": 0.00014149552849908415, |
| "loss": 3.2313, |
| "step": 71050 |
| }, |
| { |
| "epoch": 7.652567000322893, |
| "grad_norm": 0.849492609500885, |
| "learning_rate": 0.00014117228746902272, |
| "loss": 3.2367, |
| "step": 71100 |
| }, |
| { |
| "epoch": 7.657948552362502, |
| "grad_norm": 0.7902466058731079, |
| "learning_rate": 0.0001408490464389613, |
| "loss": 3.2184, |
| "step": 71150 |
| }, |
| { |
| "epoch": 7.663330104402109, |
| "grad_norm": 0.7909029722213745, |
| "learning_rate": 0.00014052580540889988, |
| "loss": 3.2466, |
| "step": 71200 |
| }, |
| { |
| "epoch": 7.668711656441718, |
| "grad_norm": 0.7495254278182983, |
| "learning_rate": 0.00014020256437883847, |
| "loss": 3.2374, |
| "step": 71250 |
| }, |
| { |
| "epoch": 7.674093208481326, |
| "grad_norm": 0.8227121829986572, |
| "learning_rate": 0.00013987932334877707, |
| "loss": 3.2442, |
| "step": 71300 |
| }, |
| { |
| "epoch": 7.679474760520934, |
| "grad_norm": 0.8032054305076599, |
| "learning_rate": 0.00013955608231871564, |
| "loss": 3.2455, |
| "step": 71350 |
| }, |
| { |
| "epoch": 7.684856312560543, |
| "grad_norm": 0.7971029877662659, |
| "learning_rate": 0.00013923284128865423, |
| "loss": 3.2469, |
| "step": 71400 |
| }, |
| { |
| "epoch": 7.69023786460015, |
| "grad_norm": 0.7779977321624756, |
| "learning_rate": 0.00013890960025859283, |
| "loss": 3.2491, |
| "step": 71450 |
| }, |
| { |
| "epoch": 7.695619416639759, |
| "grad_norm": 0.8167675733566284, |
| "learning_rate": 0.00013859282404913263, |
| "loss": 3.2375, |
| "step": 71500 |
| }, |
| { |
| "epoch": 7.7010009686793675, |
| "grad_norm": 0.7902052998542786, |
| "learning_rate": 0.0001382695830190712, |
| "loss": 3.2397, |
| "step": 71550 |
| }, |
| { |
| "epoch": 7.706382520718975, |
| "grad_norm": 0.8054006695747375, |
| "learning_rate": 0.0001379463419890098, |
| "loss": 3.2359, |
| "step": 71600 |
| }, |
| { |
| "epoch": 7.711764072758584, |
| "grad_norm": 0.8425346612930298, |
| "learning_rate": 0.00013762310095894836, |
| "loss": 3.2315, |
| "step": 71650 |
| }, |
| { |
| "epoch": 7.717145624798192, |
| "grad_norm": 0.8496072888374329, |
| "learning_rate": 0.00013729985992888696, |
| "loss": 3.2229, |
| "step": 71700 |
| }, |
| { |
| "epoch": 7.7225271768378, |
| "grad_norm": 0.7817051410675049, |
| "learning_rate": 0.00013697661889882555, |
| "loss": 3.2259, |
| "step": 71750 |
| }, |
| { |
| "epoch": 7.727908728877408, |
| "grad_norm": 0.9599478840827942, |
| "learning_rate": 0.00013665337786876412, |
| "loss": 3.2099, |
| "step": 71800 |
| }, |
| { |
| "epoch": 7.733290280917016, |
| "grad_norm": 0.821431577205658, |
| "learning_rate": 0.0001363301368387027, |
| "loss": 3.2438, |
| "step": 71850 |
| }, |
| { |
| "epoch": 7.738671832956625, |
| "grad_norm": 0.8411420583724976, |
| "learning_rate": 0.0001360068958086413, |
| "loss": 3.2373, |
| "step": 71900 |
| }, |
| { |
| "epoch": 7.744053384996233, |
| "grad_norm": 0.8518005013465881, |
| "learning_rate": 0.00013568365477857988, |
| "loss": 3.2403, |
| "step": 71950 |
| }, |
| { |
| "epoch": 7.749434937035841, |
| "grad_norm": 0.7894284129142761, |
| "learning_rate": 0.00013536041374851847, |
| "loss": 3.2134, |
| "step": 72000 |
| }, |
| { |
| "epoch": 7.749434937035841, |
| "eval_accuracy": 0.3896006799932331, |
| "eval_loss": 3.3371217250823975, |
| "eval_runtime": 186.2179, |
| "eval_samples_per_second": 96.72, |
| "eval_steps_per_second": 6.047, |
| "step": 72000 |
| }, |
| { |
| "epoch": 7.754816489075449, |
| "grad_norm": 0.8548874855041504, |
| "learning_rate": 0.00013503717271845706, |
| "loss": 3.2289, |
| "step": 72050 |
| }, |
| { |
| "epoch": 7.760198041115058, |
| "grad_norm": 0.806556224822998, |
| "learning_rate": 0.00013471393168839563, |
| "loss": 3.2182, |
| "step": 72100 |
| }, |
| { |
| "epoch": 7.765579593154666, |
| "grad_norm": 0.7773876190185547, |
| "learning_rate": 0.00013439069065833423, |
| "loss": 3.2241, |
| "step": 72150 |
| }, |
| { |
| "epoch": 7.770961145194274, |
| "grad_norm": 0.7830033898353577, |
| "learning_rate": 0.0001340674496282728, |
| "loss": 3.2288, |
| "step": 72200 |
| }, |
| { |
| "epoch": 7.776342697233883, |
| "grad_norm": 0.8270848393440247, |
| "learning_rate": 0.0001337442085982114, |
| "loss": 3.2351, |
| "step": 72250 |
| }, |
| { |
| "epoch": 7.78172424927349, |
| "grad_norm": 0.7873373627662659, |
| "learning_rate": 0.00013342096756814996, |
| "loss": 3.2349, |
| "step": 72300 |
| }, |
| { |
| "epoch": 7.787105801313099, |
| "grad_norm": 0.802685022354126, |
| "learning_rate": 0.00013309772653808855, |
| "loss": 3.2417, |
| "step": 72350 |
| }, |
| { |
| "epoch": 7.792487353352707, |
| "grad_norm": 0.882526159286499, |
| "learning_rate": 0.00013277448550802715, |
| "loss": 3.2415, |
| "step": 72400 |
| }, |
| { |
| "epoch": 7.797868905392315, |
| "grad_norm": 0.8275822997093201, |
| "learning_rate": 0.0001324512444779657, |
| "loss": 3.2452, |
| "step": 72450 |
| }, |
| { |
| "epoch": 7.803250457431924, |
| "grad_norm": 0.8188118934631348, |
| "learning_rate": 0.0001321280034479043, |
| "loss": 3.2419, |
| "step": 72500 |
| }, |
| { |
| "epoch": 7.808632009471531, |
| "grad_norm": 0.8010212779045105, |
| "learning_rate": 0.0001318047624178429, |
| "loss": 3.2306, |
| "step": 72550 |
| }, |
| { |
| "epoch": 7.81401356151114, |
| "grad_norm": 0.8207672238349915, |
| "learning_rate": 0.0001314815213877815, |
| "loss": 3.234, |
| "step": 72600 |
| }, |
| { |
| "epoch": 7.819395113550748, |
| "grad_norm": 0.8084502816200256, |
| "learning_rate": 0.00013115828035772007, |
| "loss": 3.2568, |
| "step": 72650 |
| }, |
| { |
| "epoch": 7.824776665590356, |
| "grad_norm": 0.7942692041397095, |
| "learning_rate": 0.00013083503932765866, |
| "loss": 3.256, |
| "step": 72700 |
| }, |
| { |
| "epoch": 7.830158217629965, |
| "grad_norm": 0.8268451690673828, |
| "learning_rate": 0.00013051179829759723, |
| "loss": 3.224, |
| "step": 72750 |
| }, |
| { |
| "epoch": 7.835539769669572, |
| "grad_norm": 0.8491590619087219, |
| "learning_rate": 0.00013018855726753582, |
| "loss": 3.2455, |
| "step": 72800 |
| }, |
| { |
| "epoch": 7.840921321709181, |
| "grad_norm": 0.8598081469535828, |
| "learning_rate": 0.0001298653162374744, |
| "loss": 3.2229, |
| "step": 72850 |
| }, |
| { |
| "epoch": 7.846302873748789, |
| "grad_norm": 0.7725291848182678, |
| "learning_rate": 0.00012954207520741298, |
| "loss": 3.2246, |
| "step": 72900 |
| }, |
| { |
| "epoch": 7.851684425788397, |
| "grad_norm": 0.8496323823928833, |
| "learning_rate": 0.00012921883417735155, |
| "loss": 3.2413, |
| "step": 72950 |
| }, |
| { |
| "epoch": 7.857065977828006, |
| "grad_norm": 0.7936690449714661, |
| "learning_rate": 0.00012889559314729015, |
| "loss": 3.23, |
| "step": 73000 |
| }, |
| { |
| "epoch": 7.857065977828006, |
| "eval_accuracy": 0.38996260276254313, |
| "eval_loss": 3.3337481021881104, |
| "eval_runtime": 185.6466, |
| "eval_samples_per_second": 97.018, |
| "eval_steps_per_second": 6.065, |
| "step": 73000 |
| }, |
| { |
| "epoch": 7.862447529867614, |
| "grad_norm": 0.8138206601142883, |
| "learning_rate": 0.00012857235211722874, |
| "loss": 3.2289, |
| "step": 73050 |
| }, |
| { |
| "epoch": 7.867829081907222, |
| "grad_norm": 0.7915546894073486, |
| "learning_rate": 0.00012824911108716734, |
| "loss": 3.2306, |
| "step": 73100 |
| }, |
| { |
| "epoch": 7.87321063394683, |
| "grad_norm": 0.8671497702598572, |
| "learning_rate": 0.0001279258700571059, |
| "loss": 3.2313, |
| "step": 73150 |
| }, |
| { |
| "epoch": 7.878592185986438, |
| "grad_norm": 0.8411927223205566, |
| "learning_rate": 0.0001276026290270445, |
| "loss": 3.2546, |
| "step": 73200 |
| }, |
| { |
| "epoch": 7.883973738026047, |
| "grad_norm": 0.835565984249115, |
| "learning_rate": 0.00012727938799698307, |
| "loss": 3.2212, |
| "step": 73250 |
| }, |
| { |
| "epoch": 7.889355290065655, |
| "grad_norm": 0.9108295440673828, |
| "learning_rate": 0.00012695614696692166, |
| "loss": 3.2152, |
| "step": 73300 |
| }, |
| { |
| "epoch": 7.894736842105263, |
| "grad_norm": 0.8257512450218201, |
| "learning_rate": 0.00012663290593686023, |
| "loss": 3.2287, |
| "step": 73350 |
| }, |
| { |
| "epoch": 7.900118394144871, |
| "grad_norm": 0.8217402696609497, |
| "learning_rate": 0.00012630966490679882, |
| "loss": 3.2314, |
| "step": 73400 |
| }, |
| { |
| "epoch": 7.90549994618448, |
| "grad_norm": 0.8362021446228027, |
| "learning_rate": 0.0001259864238767374, |
| "loss": 3.2338, |
| "step": 73450 |
| }, |
| { |
| "epoch": 7.910881498224088, |
| "grad_norm": 0.8642450571060181, |
| "learning_rate": 0.00012566318284667598, |
| "loss": 3.2452, |
| "step": 73500 |
| }, |
| { |
| "epoch": 7.916263050263696, |
| "grad_norm": 0.8016989231109619, |
| "learning_rate": 0.00012533994181661458, |
| "loss": 3.2237, |
| "step": 73550 |
| }, |
| { |
| "epoch": 7.921644602303305, |
| "grad_norm": 0.8322981595993042, |
| "learning_rate": 0.00012501670078655317, |
| "loss": 3.2353, |
| "step": 73600 |
| }, |
| { |
| "epoch": 7.927026154342912, |
| "grad_norm": 0.8199933171272278, |
| "learning_rate": 0.00012469345975649174, |
| "loss": 3.2255, |
| "step": 73650 |
| }, |
| { |
| "epoch": 7.932407706382521, |
| "grad_norm": 0.8421011567115784, |
| "learning_rate": 0.00012437021872643034, |
| "loss": 3.2271, |
| "step": 73700 |
| }, |
| { |
| "epoch": 7.937789258422129, |
| "grad_norm": 0.8250933289527893, |
| "learning_rate": 0.00012404697769636893, |
| "loss": 3.2086, |
| "step": 73750 |
| }, |
| { |
| "epoch": 7.943170810461737, |
| "grad_norm": 0.810928463935852, |
| "learning_rate": 0.0001237237366663075, |
| "loss": 3.2514, |
| "step": 73800 |
| }, |
| { |
| "epoch": 7.948552362501346, |
| "grad_norm": 0.8243696689605713, |
| "learning_rate": 0.0001234004956362461, |
| "loss": 3.2394, |
| "step": 73850 |
| }, |
| { |
| "epoch": 7.953933914540953, |
| "grad_norm": 0.8388850688934326, |
| "learning_rate": 0.00012307725460618466, |
| "loss": 3.2373, |
| "step": 73900 |
| }, |
| { |
| "epoch": 7.959315466580562, |
| "grad_norm": 0.8117256164550781, |
| "learning_rate": 0.00012275401357612326, |
| "loss": 3.2246, |
| "step": 73950 |
| }, |
| { |
| "epoch": 7.96469701862017, |
| "grad_norm": 0.7929599285125732, |
| "learning_rate": 0.00012243077254606182, |
| "loss": 3.245, |
| "step": 74000 |
| }, |
| { |
| "epoch": 7.96469701862017, |
| "eval_accuracy": 0.39027650095543914, |
| "eval_loss": 3.330167055130005, |
| "eval_runtime": 185.7364, |
| "eval_samples_per_second": 96.971, |
| "eval_steps_per_second": 6.062, |
| "step": 74000 |
| }, |
| { |
| "epoch": 7.970078570659778, |
| "grad_norm": 0.8007392287254333, |
| "learning_rate": 0.00012210753151600042, |
| "loss": 3.2335, |
| "step": 74050 |
| }, |
| { |
| "epoch": 7.975460122699387, |
| "grad_norm": 0.7954262495040894, |
| "learning_rate": 0.00012178429048593901, |
| "loss": 3.2309, |
| "step": 74100 |
| }, |
| { |
| "epoch": 7.980841674738995, |
| "grad_norm": 0.8516155481338501, |
| "learning_rate": 0.00012146104945587758, |
| "loss": 3.2089, |
| "step": 74150 |
| }, |
| { |
| "epoch": 7.986223226778603, |
| "grad_norm": 0.8081137537956238, |
| "learning_rate": 0.00012113780842581617, |
| "loss": 3.2366, |
| "step": 74200 |
| }, |
| { |
| "epoch": 7.991604778818211, |
| "grad_norm": 0.778059720993042, |
| "learning_rate": 0.00012081456739575477, |
| "loss": 3.2437, |
| "step": 74250 |
| }, |
| { |
| "epoch": 7.996986330857819, |
| "grad_norm": 0.7741621732711792, |
| "learning_rate": 0.00012049132636569334, |
| "loss": 3.2247, |
| "step": 74300 |
| }, |
| { |
| "epoch": 8.002367882897428, |
| "grad_norm": 0.8703892230987549, |
| "learning_rate": 0.00012016808533563193, |
| "loss": 3.2053, |
| "step": 74350 |
| }, |
| { |
| "epoch": 8.007749434937036, |
| "grad_norm": 0.8257153034210205, |
| "learning_rate": 0.00011984484430557051, |
| "loss": 3.1457, |
| "step": 74400 |
| }, |
| { |
| "epoch": 8.013130986976645, |
| "grad_norm": 0.8422123789787292, |
| "learning_rate": 0.0001195216032755091, |
| "loss": 3.1526, |
| "step": 74450 |
| }, |
| { |
| "epoch": 8.018512539016251, |
| "grad_norm": 0.8423177599906921, |
| "learning_rate": 0.00011919836224544767, |
| "loss": 3.1494, |
| "step": 74500 |
| }, |
| { |
| "epoch": 8.02389409105586, |
| "grad_norm": 0.8235529661178589, |
| "learning_rate": 0.00011887512121538627, |
| "loss": 3.1522, |
| "step": 74550 |
| }, |
| { |
| "epoch": 8.029275643095469, |
| "grad_norm": 0.844817042350769, |
| "learning_rate": 0.00011855188018532485, |
| "loss": 3.1629, |
| "step": 74600 |
| }, |
| { |
| "epoch": 8.034657195135077, |
| "grad_norm": 0.8099653124809265, |
| "learning_rate": 0.00011822863915526343, |
| "loss": 3.1663, |
| "step": 74650 |
| }, |
| { |
| "epoch": 8.040038747174686, |
| "grad_norm": 0.8309894800186157, |
| "learning_rate": 0.00011790539812520201, |
| "loss": 3.1511, |
| "step": 74700 |
| }, |
| { |
| "epoch": 8.045420299214294, |
| "grad_norm": 0.8725168108940125, |
| "learning_rate": 0.00011758215709514061, |
| "loss": 3.1611, |
| "step": 74750 |
| }, |
| { |
| "epoch": 8.050801851253901, |
| "grad_norm": 0.8431879281997681, |
| "learning_rate": 0.00011725891606507917, |
| "loss": 3.1564, |
| "step": 74800 |
| }, |
| { |
| "epoch": 8.05618340329351, |
| "grad_norm": 0.8405344486236572, |
| "learning_rate": 0.00011693567503501777, |
| "loss": 3.1532, |
| "step": 74850 |
| }, |
| { |
| "epoch": 8.061564955333118, |
| "grad_norm": 0.9406002163887024, |
| "learning_rate": 0.00011661243400495635, |
| "loss": 3.1556, |
| "step": 74900 |
| }, |
| { |
| "epoch": 8.066946507372727, |
| "grad_norm": 0.8277288675308228, |
| "learning_rate": 0.00011628919297489493, |
| "loss": 3.1641, |
| "step": 74950 |
| }, |
| { |
| "epoch": 8.072328059412335, |
| "grad_norm": 0.8266585469245911, |
| "learning_rate": 0.00011596595194483351, |
| "loss": 3.1517, |
| "step": 75000 |
| }, |
| { |
| "epoch": 8.072328059412335, |
| "eval_accuracy": 0.39001682055356257, |
| "eval_loss": 3.338434934616089, |
| "eval_runtime": 186.1434, |
| "eval_samples_per_second": 96.759, |
| "eval_steps_per_second": 6.049, |
| "step": 75000 |
| }, |
| { |
| "epoch": 8.077709611451942, |
| "grad_norm": 0.7777183651924133, |
| "learning_rate": 0.00011564271091477211, |
| "loss": 3.1594, |
| "step": 75050 |
| }, |
| { |
| "epoch": 8.08309116349155, |
| "grad_norm": 0.7924986481666565, |
| "learning_rate": 0.0001153194698847107, |
| "loss": 3.1616, |
| "step": 75100 |
| }, |
| { |
| "epoch": 8.088472715531159, |
| "grad_norm": 0.8581122159957886, |
| "learning_rate": 0.00011499622885464927, |
| "loss": 3.1624, |
| "step": 75150 |
| }, |
| { |
| "epoch": 8.093854267570768, |
| "grad_norm": 0.8415508270263672, |
| "learning_rate": 0.00011467298782458786, |
| "loss": 3.1725, |
| "step": 75200 |
| }, |
| { |
| "epoch": 8.099235819610376, |
| "grad_norm": 0.8101992607116699, |
| "learning_rate": 0.00011434974679452645, |
| "loss": 3.175, |
| "step": 75250 |
| }, |
| { |
| "epoch": 8.104617371649983, |
| "grad_norm": 0.8518224358558655, |
| "learning_rate": 0.00011402650576446503, |
| "loss": 3.1844, |
| "step": 75300 |
| }, |
| { |
| "epoch": 8.109998923689592, |
| "grad_norm": 0.8583261370658875, |
| "learning_rate": 0.00011370326473440361, |
| "loss": 3.1692, |
| "step": 75350 |
| }, |
| { |
| "epoch": 8.1153804757292, |
| "grad_norm": 0.895036518573761, |
| "learning_rate": 0.0001133800237043422, |
| "loss": 3.1721, |
| "step": 75400 |
| }, |
| { |
| "epoch": 8.120762027768809, |
| "grad_norm": 0.8301292657852173, |
| "learning_rate": 0.00011305678267428077, |
| "loss": 3.1696, |
| "step": 75450 |
| }, |
| { |
| "epoch": 8.126143579808417, |
| "grad_norm": 0.8367131352424622, |
| "learning_rate": 0.00011274000646482059, |
| "loss": 3.1967, |
| "step": 75500 |
| }, |
| { |
| "epoch": 8.131525131848026, |
| "grad_norm": 0.8633970618247986, |
| "learning_rate": 0.00011241676543475918, |
| "loss": 3.1675, |
| "step": 75550 |
| }, |
| { |
| "epoch": 8.136906683887632, |
| "grad_norm": 0.8824740648269653, |
| "learning_rate": 0.00011209352440469775, |
| "loss": 3.1774, |
| "step": 75600 |
| }, |
| { |
| "epoch": 8.142288235927241, |
| "grad_norm": 0.7940886616706848, |
| "learning_rate": 0.00011177028337463635, |
| "loss": 3.1691, |
| "step": 75650 |
| }, |
| { |
| "epoch": 8.14766978796685, |
| "grad_norm": 0.8604269623756409, |
| "learning_rate": 0.00011144704234457493, |
| "loss": 3.1641, |
| "step": 75700 |
| }, |
| { |
| "epoch": 8.153051340006458, |
| "grad_norm": 0.847507119178772, |
| "learning_rate": 0.00011112380131451351, |
| "loss": 3.1801, |
| "step": 75750 |
| }, |
| { |
| "epoch": 8.158432892046067, |
| "grad_norm": 0.781295120716095, |
| "learning_rate": 0.00011080056028445209, |
| "loss": 3.162, |
| "step": 75800 |
| }, |
| { |
| "epoch": 8.163814444085673, |
| "grad_norm": 0.8339664936065674, |
| "learning_rate": 0.00011047731925439068, |
| "loss": 3.1682, |
| "step": 75850 |
| }, |
| { |
| "epoch": 8.169195996125282, |
| "grad_norm": 0.8284735083580017, |
| "learning_rate": 0.00011015407822432928, |
| "loss": 3.1613, |
| "step": 75900 |
| }, |
| { |
| "epoch": 8.17457754816489, |
| "grad_norm": 0.8000165224075317, |
| "learning_rate": 0.00010983083719426785, |
| "loss": 3.1589, |
| "step": 75950 |
| }, |
| { |
| "epoch": 8.1799591002045, |
| "grad_norm": 0.8093355894088745, |
| "learning_rate": 0.00010950759616420644, |
| "loss": 3.1701, |
| "step": 76000 |
| }, |
| { |
| "epoch": 8.1799591002045, |
| "eval_accuracy": 0.3903008392023096, |
| "eval_loss": 3.336153030395508, |
| "eval_runtime": 185.6996, |
| "eval_samples_per_second": 96.99, |
| "eval_steps_per_second": 6.064, |
| "step": 76000 |
| }, |
| { |
| "epoch": 8.185340652244108, |
| "grad_norm": 0.8471051454544067, |
| "learning_rate": 0.00010918435513414502, |
| "loss": 3.1699, |
| "step": 76050 |
| }, |
| { |
| "epoch": 8.190722204283716, |
| "grad_norm": 0.8457047939300537, |
| "learning_rate": 0.00010886111410408359, |
| "loss": 3.1689, |
| "step": 76100 |
| }, |
| { |
| "epoch": 8.196103756323323, |
| "grad_norm": 0.8495464324951172, |
| "learning_rate": 0.00010853787307402218, |
| "loss": 3.179, |
| "step": 76150 |
| }, |
| { |
| "epoch": 8.201485308362932, |
| "grad_norm": 0.8045263886451721, |
| "learning_rate": 0.00010821463204396078, |
| "loss": 3.1642, |
| "step": 76200 |
| }, |
| { |
| "epoch": 8.20686686040254, |
| "grad_norm": 0.8181670904159546, |
| "learning_rate": 0.00010789139101389935, |
| "loss": 3.1862, |
| "step": 76250 |
| }, |
| { |
| "epoch": 8.212248412442149, |
| "grad_norm": 0.8407565951347351, |
| "learning_rate": 0.00010756814998383794, |
| "loss": 3.1579, |
| "step": 76300 |
| }, |
| { |
| "epoch": 8.217629964481757, |
| "grad_norm": 0.8352681994438171, |
| "learning_rate": 0.00010724490895377652, |
| "loss": 3.1699, |
| "step": 76350 |
| }, |
| { |
| "epoch": 8.223011516521364, |
| "grad_norm": 0.8680731654167175, |
| "learning_rate": 0.00010692166792371512, |
| "loss": 3.1789, |
| "step": 76400 |
| }, |
| { |
| "epoch": 8.228393068560973, |
| "grad_norm": 0.8959930539131165, |
| "learning_rate": 0.00010659842689365368, |
| "loss": 3.172, |
| "step": 76450 |
| }, |
| { |
| "epoch": 8.233774620600581, |
| "grad_norm": 0.8152377605438232, |
| "learning_rate": 0.00010627518586359228, |
| "loss": 3.1748, |
| "step": 76500 |
| }, |
| { |
| "epoch": 8.23915617264019, |
| "grad_norm": 0.7709901928901672, |
| "learning_rate": 0.00010595194483353086, |
| "loss": 3.1382, |
| "step": 76550 |
| }, |
| { |
| "epoch": 8.244537724679798, |
| "grad_norm": 0.8352434635162354, |
| "learning_rate": 0.00010562870380346944, |
| "loss": 3.184, |
| "step": 76600 |
| }, |
| { |
| "epoch": 8.249919276719407, |
| "grad_norm": 0.8120669722557068, |
| "learning_rate": 0.00010530546277340802, |
| "loss": 3.1914, |
| "step": 76650 |
| }, |
| { |
| "epoch": 8.255300828759013, |
| "grad_norm": 0.8267314434051514, |
| "learning_rate": 0.00010498222174334662, |
| "loss": 3.1672, |
| "step": 76700 |
| }, |
| { |
| "epoch": 8.260682380798622, |
| "grad_norm": 0.841647744178772, |
| "learning_rate": 0.00010465898071328519, |
| "loss": 3.1636, |
| "step": 76750 |
| }, |
| { |
| "epoch": 8.26606393283823, |
| "grad_norm": 0.8355528712272644, |
| "learning_rate": 0.00010433573968322378, |
| "loss": 3.1645, |
| "step": 76800 |
| }, |
| { |
| "epoch": 8.27144548487784, |
| "grad_norm": 0.8447277545928955, |
| "learning_rate": 0.00010401249865316237, |
| "loss": 3.1683, |
| "step": 76850 |
| }, |
| { |
| "epoch": 8.276827036917448, |
| "grad_norm": 0.8199051022529602, |
| "learning_rate": 0.00010368925762310096, |
| "loss": 3.1838, |
| "step": 76900 |
| }, |
| { |
| "epoch": 8.282208588957054, |
| "grad_norm": 0.821168065071106, |
| "learning_rate": 0.00010336601659303954, |
| "loss": 3.1707, |
| "step": 76950 |
| }, |
| { |
| "epoch": 8.287590140996663, |
| "grad_norm": 0.8731397390365601, |
| "learning_rate": 0.00010304277556297812, |
| "loss": 3.1486, |
| "step": 77000 |
| }, |
| { |
| "epoch": 8.287590140996663, |
| "eval_accuracy": 0.39075457366182287, |
| "eval_loss": 3.33301043510437, |
| "eval_runtime": 185.9715, |
| "eval_samples_per_second": 96.848, |
| "eval_steps_per_second": 6.055, |
| "step": 77000 |
| }, |
| { |
| "epoch": 8.292971693036272, |
| "grad_norm": 0.807433545589447, |
| "learning_rate": 0.00010271953453291671, |
| "loss": 3.1849, |
| "step": 77050 |
| }, |
| { |
| "epoch": 8.29835324507588, |
| "grad_norm": 0.8317826986312866, |
| "learning_rate": 0.00010239629350285528, |
| "loss": 3.1815, |
| "step": 77100 |
| }, |
| { |
| "epoch": 8.303734797115489, |
| "grad_norm": 0.8155573010444641, |
| "learning_rate": 0.00010207305247279387, |
| "loss": 3.1795, |
| "step": 77150 |
| }, |
| { |
| "epoch": 8.309116349155097, |
| "grad_norm": 0.8386462330818176, |
| "learning_rate": 0.00010174981144273246, |
| "loss": 3.1716, |
| "step": 77200 |
| }, |
| { |
| "epoch": 8.314497901194704, |
| "grad_norm": 0.8625231385231018, |
| "learning_rate": 0.00010142657041267104, |
| "loss": 3.1796, |
| "step": 77250 |
| }, |
| { |
| "epoch": 8.319879453234313, |
| "grad_norm": 0.8375402688980103, |
| "learning_rate": 0.00010110332938260962, |
| "loss": 3.1799, |
| "step": 77300 |
| }, |
| { |
| "epoch": 8.325261005273921, |
| "grad_norm": 0.8287211656570435, |
| "learning_rate": 0.00010078008835254821, |
| "loss": 3.1765, |
| "step": 77350 |
| }, |
| { |
| "epoch": 8.33064255731353, |
| "grad_norm": 0.8305663466453552, |
| "learning_rate": 0.0001004568473224868, |
| "loss": 3.1687, |
| "step": 77400 |
| }, |
| { |
| "epoch": 8.336024109353138, |
| "grad_norm": 0.8254111409187317, |
| "learning_rate": 0.00010013360629242537, |
| "loss": 3.1747, |
| "step": 77450 |
| }, |
| { |
| "epoch": 8.341405661392745, |
| "grad_norm": 0.8871572613716125, |
| "learning_rate": 9.98168300829652e-05, |
| "loss": 3.1621, |
| "step": 77500 |
| }, |
| { |
| "epoch": 8.346787213432354, |
| "grad_norm": 0.8878978490829468, |
| "learning_rate": 9.949358905290376e-05, |
| "loss": 3.1887, |
| "step": 77550 |
| }, |
| { |
| "epoch": 8.352168765471962, |
| "grad_norm": 0.8454150557518005, |
| "learning_rate": 9.917034802284236e-05, |
| "loss": 3.1821, |
| "step": 77600 |
| }, |
| { |
| "epoch": 8.35755031751157, |
| "grad_norm": 0.8997962474822998, |
| "learning_rate": 9.884710699278094e-05, |
| "loss": 3.2012, |
| "step": 77650 |
| }, |
| { |
| "epoch": 8.36293186955118, |
| "grad_norm": 0.8641318082809448, |
| "learning_rate": 9.852386596271953e-05, |
| "loss": 3.1719, |
| "step": 77700 |
| }, |
| { |
| "epoch": 8.368313421590786, |
| "grad_norm": 0.8282453417778015, |
| "learning_rate": 9.82006249326581e-05, |
| "loss": 3.1844, |
| "step": 77750 |
| }, |
| { |
| "epoch": 8.373694973630395, |
| "grad_norm": 0.8555848002433777, |
| "learning_rate": 9.78773839025967e-05, |
| "loss": 3.1653, |
| "step": 77800 |
| }, |
| { |
| "epoch": 8.379076525670003, |
| "grad_norm": 0.8081738948822021, |
| "learning_rate": 9.755414287253529e-05, |
| "loss": 3.1816, |
| "step": 77850 |
| }, |
| { |
| "epoch": 8.384458077709612, |
| "grad_norm": 0.8319929838180542, |
| "learning_rate": 9.723090184247386e-05, |
| "loss": 3.1643, |
| "step": 77900 |
| }, |
| { |
| "epoch": 8.38983962974922, |
| "grad_norm": 0.8342401385307312, |
| "learning_rate": 9.690766081241245e-05, |
| "loss": 3.1614, |
| "step": 77950 |
| }, |
| { |
| "epoch": 8.395221181788829, |
| "grad_norm": 0.8482891917228699, |
| "learning_rate": 9.658441978235103e-05, |
| "loss": 3.1946, |
| "step": 78000 |
| }, |
| { |
| "epoch": 8.395221181788829, |
| "eval_accuracy": 0.3911348587691736, |
| "eval_loss": 3.3295955657958984, |
| "eval_runtime": 186.3735, |
| "eval_samples_per_second": 96.639, |
| "eval_steps_per_second": 6.042, |
| "step": 78000 |
| }, |
| { |
| "epoch": 8.400602733828435, |
| "grad_norm": 0.9109727740287781, |
| "learning_rate": 9.626117875228961e-05, |
| "loss": 3.1706, |
| "step": 78050 |
| }, |
| { |
| "epoch": 8.405984285868044, |
| "grad_norm": 0.8868828415870667, |
| "learning_rate": 9.59379377222282e-05, |
| "loss": 3.1801, |
| "step": 78100 |
| }, |
| { |
| "epoch": 8.411365837907653, |
| "grad_norm": 0.7975767850875854, |
| "learning_rate": 9.561469669216679e-05, |
| "loss": 3.1766, |
| "step": 78150 |
| }, |
| { |
| "epoch": 8.416747389947261, |
| "grad_norm": 0.8645923137664795, |
| "learning_rate": 9.529145566210537e-05, |
| "loss": 3.1801, |
| "step": 78200 |
| }, |
| { |
| "epoch": 8.42212894198687, |
| "grad_norm": 0.8716831803321838, |
| "learning_rate": 9.496821463204395e-05, |
| "loss": 3.1758, |
| "step": 78250 |
| }, |
| { |
| "epoch": 8.427510494026476, |
| "grad_norm": 0.8328187465667725, |
| "learning_rate": 9.464497360198253e-05, |
| "loss": 3.1723, |
| "step": 78300 |
| }, |
| { |
| "epoch": 8.432892046066085, |
| "grad_norm": 0.8371261954307556, |
| "learning_rate": 9.432173257192113e-05, |
| "loss": 3.1773, |
| "step": 78350 |
| }, |
| { |
| "epoch": 8.438273598105694, |
| "grad_norm": 0.834830105304718, |
| "learning_rate": 9.39984915418597e-05, |
| "loss": 3.1754, |
| "step": 78400 |
| }, |
| { |
| "epoch": 8.443655150145302, |
| "grad_norm": 0.9049352407455444, |
| "learning_rate": 9.367525051179829e-05, |
| "loss": 3.1824, |
| "step": 78450 |
| }, |
| { |
| "epoch": 8.44903670218491, |
| "grad_norm": 0.8162443041801453, |
| "learning_rate": 9.335200948173688e-05, |
| "loss": 3.1844, |
| "step": 78500 |
| }, |
| { |
| "epoch": 8.45441825422452, |
| "grad_norm": 0.9018626809120178, |
| "learning_rate": 9.302876845167545e-05, |
| "loss": 3.1696, |
| "step": 78550 |
| }, |
| { |
| "epoch": 8.459799806264126, |
| "grad_norm": 0.8505458831787109, |
| "learning_rate": 9.271199224221527e-05, |
| "loss": 3.1821, |
| "step": 78600 |
| }, |
| { |
| "epoch": 8.465181358303735, |
| "grad_norm": 0.8124887943267822, |
| "learning_rate": 9.238875121215387e-05, |
| "loss": 3.1801, |
| "step": 78650 |
| }, |
| { |
| "epoch": 8.470562910343343, |
| "grad_norm": 0.8661438822746277, |
| "learning_rate": 9.206551018209243e-05, |
| "loss": 3.1655, |
| "step": 78700 |
| }, |
| { |
| "epoch": 8.475944462382952, |
| "grad_norm": 0.8392370939254761, |
| "learning_rate": 9.174226915203103e-05, |
| "loss": 3.1835, |
| "step": 78750 |
| }, |
| { |
| "epoch": 8.48132601442256, |
| "grad_norm": 0.8550435900688171, |
| "learning_rate": 9.141902812196961e-05, |
| "loss": 3.1858, |
| "step": 78800 |
| }, |
| { |
| "epoch": 8.486707566462167, |
| "grad_norm": 0.8661965727806091, |
| "learning_rate": 9.109578709190818e-05, |
| "loss": 3.1893, |
| "step": 78850 |
| }, |
| { |
| "epoch": 8.492089118501776, |
| "grad_norm": 0.837314248085022, |
| "learning_rate": 9.077254606184677e-05, |
| "loss": 3.1782, |
| "step": 78900 |
| }, |
| { |
| "epoch": 8.497470670541384, |
| "grad_norm": 0.8997771739959717, |
| "learning_rate": 9.044930503178537e-05, |
| "loss": 3.1867, |
| "step": 78950 |
| }, |
| { |
| "epoch": 8.502852222580993, |
| "grad_norm": 0.8326417803764343, |
| "learning_rate": 9.012606400172395e-05, |
| "loss": 3.1793, |
| "step": 79000 |
| }, |
| { |
| "epoch": 8.502852222580993, |
| "eval_accuracy": 0.3914402820368201, |
| "eval_loss": 3.3243117332458496, |
| "eval_runtime": 185.6677, |
| "eval_samples_per_second": 97.007, |
| "eval_steps_per_second": 6.065, |
| "step": 79000 |
| }, |
| { |
| "epoch": 8.508233774620601, |
| "grad_norm": 0.8277132511138916, |
| "learning_rate": 8.980282297166253e-05, |
| "loss": 3.1903, |
| "step": 79050 |
| }, |
| { |
| "epoch": 8.513615326660208, |
| "grad_norm": 0.8385311365127563, |
| "learning_rate": 8.947958194160111e-05, |
| "loss": 3.1856, |
| "step": 79100 |
| }, |
| { |
| "epoch": 8.518996878699816, |
| "grad_norm": 0.8503491282463074, |
| "learning_rate": 8.91563409115397e-05, |
| "loss": 3.175, |
| "step": 79150 |
| }, |
| { |
| "epoch": 8.524378430739425, |
| "grad_norm": 0.8473013043403625, |
| "learning_rate": 8.883309988147827e-05, |
| "loss": 3.2004, |
| "step": 79200 |
| }, |
| { |
| "epoch": 8.529759982779034, |
| "grad_norm": 0.8553803563117981, |
| "learning_rate": 8.850985885141687e-05, |
| "loss": 3.1872, |
| "step": 79250 |
| }, |
| { |
| "epoch": 8.535141534818642, |
| "grad_norm": 0.8805926442146301, |
| "learning_rate": 8.818661782135545e-05, |
| "loss": 3.1751, |
| "step": 79300 |
| }, |
| { |
| "epoch": 8.54052308685825, |
| "grad_norm": 0.8522555232048035, |
| "learning_rate": 8.786337679129403e-05, |
| "loss": 3.1679, |
| "step": 79350 |
| }, |
| { |
| "epoch": 8.545904638897857, |
| "grad_norm": 0.8566330671310425, |
| "learning_rate": 8.754013576123261e-05, |
| "loss": 3.175, |
| "step": 79400 |
| }, |
| { |
| "epoch": 8.551286190937466, |
| "grad_norm": 0.8199010491371155, |
| "learning_rate": 8.72168947311712e-05, |
| "loss": 3.1743, |
| "step": 79450 |
| }, |
| { |
| "epoch": 8.556667742977075, |
| "grad_norm": 0.885802149772644, |
| "learning_rate": 8.68936537011098e-05, |
| "loss": 3.1698, |
| "step": 79500 |
| }, |
| { |
| "epoch": 8.562049295016683, |
| "grad_norm": 0.8471908569335938, |
| "learning_rate": 8.657041267104837e-05, |
| "loss": 3.1837, |
| "step": 79550 |
| }, |
| { |
| "epoch": 8.567430847056292, |
| "grad_norm": 0.8292319774627686, |
| "learning_rate": 8.624717164098696e-05, |
| "loss": 3.1857, |
| "step": 79600 |
| }, |
| { |
| "epoch": 8.572812399095898, |
| "grad_norm": 0.8388420939445496, |
| "learning_rate": 8.592393061092554e-05, |
| "loss": 3.1936, |
| "step": 79650 |
| }, |
| { |
| "epoch": 8.578193951135507, |
| "grad_norm": 0.8944175839424133, |
| "learning_rate": 8.560068958086412e-05, |
| "loss": 3.1745, |
| "step": 79700 |
| }, |
| { |
| "epoch": 8.583575503175116, |
| "grad_norm": 0.8031978607177734, |
| "learning_rate": 8.52774485508027e-05, |
| "loss": 3.1718, |
| "step": 79750 |
| }, |
| { |
| "epoch": 8.588957055214724, |
| "grad_norm": 0.8663561344146729, |
| "learning_rate": 8.49542075207413e-05, |
| "loss": 3.2032, |
| "step": 79800 |
| }, |
| { |
| "epoch": 8.594338607254333, |
| "grad_norm": 0.8733053803443909, |
| "learning_rate": 8.463096649067987e-05, |
| "loss": 3.1841, |
| "step": 79850 |
| }, |
| { |
| "epoch": 8.599720159293941, |
| "grad_norm": 0.9112555980682373, |
| "learning_rate": 8.430772546061846e-05, |
| "loss": 3.2002, |
| "step": 79900 |
| }, |
| { |
| "epoch": 8.605101711333548, |
| "grad_norm": 0.8888176679611206, |
| "learning_rate": 8.398448443055704e-05, |
| "loss": 3.1842, |
| "step": 79950 |
| }, |
| { |
| "epoch": 8.610483263373157, |
| "grad_norm": 0.8602695465087891, |
| "learning_rate": 8.366124340049564e-05, |
| "loss": 3.1722, |
| "step": 80000 |
| }, |
| { |
| "epoch": 8.610483263373157, |
| "eval_accuracy": 0.3919845243518828, |
| "eval_loss": 3.3207170963287354, |
| "eval_runtime": 186.1198, |
| "eval_samples_per_second": 96.771, |
| "eval_steps_per_second": 6.05, |
| "step": 80000 |
| }, |
| { |
| "epoch": 8.615864815412765, |
| "grad_norm": 0.8392812609672546, |
| "learning_rate": 8.33380023704342e-05, |
| "loss": 3.1921, |
| "step": 80050 |
| }, |
| { |
| "epoch": 8.621246367452374, |
| "grad_norm": 0.8568989038467407, |
| "learning_rate": 8.30147613403728e-05, |
| "loss": 3.1665, |
| "step": 80100 |
| }, |
| { |
| "epoch": 8.626627919491982, |
| "grad_norm": 0.8770465850830078, |
| "learning_rate": 8.269152031031138e-05, |
| "loss": 3.1882, |
| "step": 80150 |
| }, |
| { |
| "epoch": 8.632009471531589, |
| "grad_norm": 0.9001599550247192, |
| "learning_rate": 8.236827928024996e-05, |
| "loss": 3.1943, |
| "step": 80200 |
| }, |
| { |
| "epoch": 8.637391023571197, |
| "grad_norm": 0.8278911113739014, |
| "learning_rate": 8.204503825018854e-05, |
| "loss": 3.1922, |
| "step": 80250 |
| }, |
| { |
| "epoch": 8.642772575610806, |
| "grad_norm": 0.8092060089111328, |
| "learning_rate": 8.172179722012714e-05, |
| "loss": 3.1764, |
| "step": 80300 |
| }, |
| { |
| "epoch": 8.648154127650415, |
| "grad_norm": 0.806548535823822, |
| "learning_rate": 8.13985561900657e-05, |
| "loss": 3.1823, |
| "step": 80350 |
| }, |
| { |
| "epoch": 8.653535679690023, |
| "grad_norm": 0.8273184895515442, |
| "learning_rate": 8.10753151600043e-05, |
| "loss": 3.1737, |
| "step": 80400 |
| }, |
| { |
| "epoch": 8.658917231729632, |
| "grad_norm": 0.8822018504142761, |
| "learning_rate": 8.07520741299429e-05, |
| "loss": 3.1948, |
| "step": 80450 |
| }, |
| { |
| "epoch": 8.664298783769238, |
| "grad_norm": 0.851418137550354, |
| "learning_rate": 8.042883309988148e-05, |
| "loss": 3.1999, |
| "step": 80500 |
| }, |
| { |
| "epoch": 8.669680335808847, |
| "grad_norm": 0.9021140336990356, |
| "learning_rate": 8.010559206982006e-05, |
| "loss": 3.182, |
| "step": 80550 |
| }, |
| { |
| "epoch": 8.675061887848456, |
| "grad_norm": 0.8763213157653809, |
| "learning_rate": 7.978235103975864e-05, |
| "loss": 3.1891, |
| "step": 80600 |
| }, |
| { |
| "epoch": 8.680443439888064, |
| "grad_norm": 0.8623907566070557, |
| "learning_rate": 7.945911000969723e-05, |
| "loss": 3.2095, |
| "step": 80650 |
| }, |
| { |
| "epoch": 8.685824991927673, |
| "grad_norm": 0.8749169111251831, |
| "learning_rate": 7.91358689796358e-05, |
| "loss": 3.1801, |
| "step": 80700 |
| }, |
| { |
| "epoch": 8.69120654396728, |
| "grad_norm": 0.8788124918937683, |
| "learning_rate": 7.88126279495744e-05, |
| "loss": 3.1642, |
| "step": 80750 |
| }, |
| { |
| "epoch": 8.696588096006888, |
| "grad_norm": 0.9627377390861511, |
| "learning_rate": 7.848938691951298e-05, |
| "loss": 3.1865, |
| "step": 80800 |
| }, |
| { |
| "epoch": 8.701969648046497, |
| "grad_norm": 0.8454668521881104, |
| "learning_rate": 7.816614588945156e-05, |
| "loss": 3.1678, |
| "step": 80850 |
| }, |
| { |
| "epoch": 8.707351200086105, |
| "grad_norm": 0.8614948391914368, |
| "learning_rate": 7.784290485939014e-05, |
| "loss": 3.1932, |
| "step": 80900 |
| }, |
| { |
| "epoch": 8.712732752125714, |
| "grad_norm": 0.8596400022506714, |
| "learning_rate": 7.751966382932873e-05, |
| "loss": 3.1824, |
| "step": 80950 |
| }, |
| { |
| "epoch": 8.718114304165322, |
| "grad_norm": 0.8495746850967407, |
| "learning_rate": 7.719642279926731e-05, |
| "loss": 3.177, |
| "step": 81000 |
| }, |
| { |
| "epoch": 8.718114304165322, |
| "eval_accuracy": 0.39226636994287356, |
| "eval_loss": 3.318680763244629, |
| "eval_runtime": 186.2188, |
| "eval_samples_per_second": 96.72, |
| "eval_steps_per_second": 6.047, |
| "step": 81000 |
| }, |
| { |
| "epoch": 8.723495856204929, |
| "grad_norm": 0.9119624495506287, |
| "learning_rate": 7.68731817692059e-05, |
| "loss": 3.1911, |
| "step": 81050 |
| }, |
| { |
| "epoch": 8.728877408244538, |
| "grad_norm": 0.8198921084403992, |
| "learning_rate": 7.654994073914448e-05, |
| "loss": 3.1873, |
| "step": 81100 |
| }, |
| { |
| "epoch": 8.734258960284146, |
| "grad_norm": 0.8648779988288879, |
| "learning_rate": 7.622669970908307e-05, |
| "loss": 3.1686, |
| "step": 81150 |
| }, |
| { |
| "epoch": 8.739640512323755, |
| "grad_norm": 0.8918686509132385, |
| "learning_rate": 7.590345867902164e-05, |
| "loss": 3.2035, |
| "step": 81200 |
| }, |
| { |
| "epoch": 8.745022064363363, |
| "grad_norm": 0.9003249406814575, |
| "learning_rate": 7.558021764896023e-05, |
| "loss": 3.1672, |
| "step": 81250 |
| }, |
| { |
| "epoch": 8.75040361640297, |
| "grad_norm": 0.862156867980957, |
| "learning_rate": 7.525697661889883e-05, |
| "loss": 3.1904, |
| "step": 81300 |
| }, |
| { |
| "epoch": 8.755785168442578, |
| "grad_norm": 0.8787533640861511, |
| "learning_rate": 7.493373558883741e-05, |
| "loss": 3.1835, |
| "step": 81350 |
| }, |
| { |
| "epoch": 8.761166720482187, |
| "grad_norm": 0.814907968044281, |
| "learning_rate": 7.461049455877599e-05, |
| "loss": 3.1656, |
| "step": 81400 |
| }, |
| { |
| "epoch": 8.766548272521796, |
| "grad_norm": 0.8895700573921204, |
| "learning_rate": 7.428725352871457e-05, |
| "loss": 3.1981, |
| "step": 81450 |
| }, |
| { |
| "epoch": 8.771929824561404, |
| "grad_norm": 0.8487426042556763, |
| "learning_rate": 7.396401249865315e-05, |
| "loss": 3.1724, |
| "step": 81500 |
| }, |
| { |
| "epoch": 8.777311376601011, |
| "grad_norm": 0.8261008262634277, |
| "learning_rate": 7.364077146859173e-05, |
| "loss": 3.1975, |
| "step": 81550 |
| }, |
| { |
| "epoch": 8.78269292864062, |
| "grad_norm": 0.8884170651435852, |
| "learning_rate": 7.331753043853033e-05, |
| "loss": 3.2006, |
| "step": 81600 |
| }, |
| { |
| "epoch": 8.788074480680228, |
| "grad_norm": 0.8335497379302979, |
| "learning_rate": 7.299428940846891e-05, |
| "loss": 3.1789, |
| "step": 81650 |
| }, |
| { |
| "epoch": 8.793456032719837, |
| "grad_norm": 0.830715000629425, |
| "learning_rate": 7.267104837840749e-05, |
| "loss": 3.1875, |
| "step": 81700 |
| }, |
| { |
| "epoch": 8.798837584759445, |
| "grad_norm": 0.8754333853721619, |
| "learning_rate": 7.234780734834607e-05, |
| "loss": 3.1759, |
| "step": 81750 |
| }, |
| { |
| "epoch": 8.804219136799054, |
| "grad_norm": 0.8991921544075012, |
| "learning_rate": 7.202456631828465e-05, |
| "loss": 3.1968, |
| "step": 81800 |
| }, |
| { |
| "epoch": 8.80960068883866, |
| "grad_norm": 0.8619662523269653, |
| "learning_rate": 7.170132528822325e-05, |
| "loss": 3.2101, |
| "step": 81850 |
| }, |
| { |
| "epoch": 8.814982240878269, |
| "grad_norm": 0.8826781511306763, |
| "learning_rate": 7.137808425816183e-05, |
| "loss": 3.18, |
| "step": 81900 |
| }, |
| { |
| "epoch": 8.820363792917878, |
| "grad_norm": 0.8799011707305908, |
| "learning_rate": 7.105484322810041e-05, |
| "loss": 3.1976, |
| "step": 81950 |
| }, |
| { |
| "epoch": 8.825745344957486, |
| "grad_norm": 0.8722858428955078, |
| "learning_rate": 7.073160219803899e-05, |
| "loss": 3.1673, |
| "step": 82000 |
| }, |
| { |
| "epoch": 8.825745344957486, |
| "eval_accuracy": 0.3925598413928605, |
| "eval_loss": 3.314213514328003, |
| "eval_runtime": 185.8502, |
| "eval_samples_per_second": 96.911, |
| "eval_steps_per_second": 6.059, |
| "step": 82000 |
| }, |
| { |
| "epoch": 8.831126896997095, |
| "grad_norm": 0.9026461839675903, |
| "learning_rate": 7.040836116797757e-05, |
| "loss": 3.1666, |
| "step": 82050 |
| }, |
| { |
| "epoch": 8.836508449036701, |
| "grad_norm": 0.8400061726570129, |
| "learning_rate": 7.008512013791617e-05, |
| "loss": 3.2031, |
| "step": 82100 |
| }, |
| { |
| "epoch": 8.84189000107631, |
| "grad_norm": 0.851011335849762, |
| "learning_rate": 6.976187910785475e-05, |
| "loss": 3.1718, |
| "step": 82150 |
| }, |
| { |
| "epoch": 8.847271553115919, |
| "grad_norm": 0.9252976775169373, |
| "learning_rate": 6.943863807779334e-05, |
| "loss": 3.172, |
| "step": 82200 |
| }, |
| { |
| "epoch": 8.852653105155527, |
| "grad_norm": 0.8581783175468445, |
| "learning_rate": 6.911539704773192e-05, |
| "loss": 3.1783, |
| "step": 82250 |
| }, |
| { |
| "epoch": 8.858034657195136, |
| "grad_norm": 0.8723549842834473, |
| "learning_rate": 6.87921560176705e-05, |
| "loss": 3.1758, |
| "step": 82300 |
| }, |
| { |
| "epoch": 8.863416209234742, |
| "grad_norm": 0.8365177512168884, |
| "learning_rate": 6.846891498760909e-05, |
| "loss": 3.1784, |
| "step": 82350 |
| }, |
| { |
| "epoch": 8.868797761274351, |
| "grad_norm": 0.8695857524871826, |
| "learning_rate": 6.814567395754767e-05, |
| "loss": 3.1792, |
| "step": 82400 |
| }, |
| { |
| "epoch": 8.87417931331396, |
| "grad_norm": 0.91620272397995, |
| "learning_rate": 6.782243292748626e-05, |
| "loss": 3.1696, |
| "step": 82450 |
| }, |
| { |
| "epoch": 8.879560865353568, |
| "grad_norm": 0.8370681405067444, |
| "learning_rate": 6.749919189742484e-05, |
| "loss": 3.2113, |
| "step": 82500 |
| }, |
| { |
| "epoch": 8.884942417393177, |
| "grad_norm": 0.8928407430648804, |
| "learning_rate": 6.718241568796465e-05, |
| "loss": 3.1842, |
| "step": 82550 |
| }, |
| { |
| "epoch": 8.890323969432785, |
| "grad_norm": 0.8692743182182312, |
| "learning_rate": 6.685917465790323e-05, |
| "loss": 3.1856, |
| "step": 82600 |
| }, |
| { |
| "epoch": 8.895705521472392, |
| "grad_norm": 0.8894014358520508, |
| "learning_rate": 6.653593362784182e-05, |
| "loss": 3.1813, |
| "step": 82650 |
| }, |
| { |
| "epoch": 8.901087073512, |
| "grad_norm": 0.8594149351119995, |
| "learning_rate": 6.62126925977804e-05, |
| "loss": 3.1993, |
| "step": 82700 |
| }, |
| { |
| "epoch": 8.906468625551609, |
| "grad_norm": 0.8474794030189514, |
| "learning_rate": 6.588945156771899e-05, |
| "loss": 3.1777, |
| "step": 82750 |
| }, |
| { |
| "epoch": 8.911850177591218, |
| "grad_norm": 0.9214169979095459, |
| "learning_rate": 6.556621053765757e-05, |
| "loss": 3.1681, |
| "step": 82800 |
| }, |
| { |
| "epoch": 8.917231729630826, |
| "grad_norm": 0.8759368658065796, |
| "learning_rate": 6.524296950759615e-05, |
| "loss": 3.164, |
| "step": 82850 |
| }, |
| { |
| "epoch": 8.922613281670433, |
| "grad_norm": 0.8734903335571289, |
| "learning_rate": 6.491972847753474e-05, |
| "loss": 3.1812, |
| "step": 82900 |
| }, |
| { |
| "epoch": 8.927994833710041, |
| "grad_norm": 0.8701024055480957, |
| "learning_rate": 6.459648744747333e-05, |
| "loss": 3.1713, |
| "step": 82950 |
| }, |
| { |
| "epoch": 8.93337638574965, |
| "grad_norm": 0.8649294972419739, |
| "learning_rate": 6.427324641741192e-05, |
| "loss": 3.1787, |
| "step": 83000 |
| }, |
| { |
| "epoch": 8.93337638574965, |
| "eval_accuracy": 0.3928902548247043, |
| "eval_loss": 3.310454845428467, |
| "eval_runtime": 186.1177, |
| "eval_samples_per_second": 96.772, |
| "eval_steps_per_second": 6.05, |
| "step": 83000 |
| }, |
| { |
| "epoch": 8.938757937789259, |
| "grad_norm": 0.8301261067390442, |
| "learning_rate": 6.39500053873505e-05, |
| "loss": 3.1759, |
| "step": 83050 |
| }, |
| { |
| "epoch": 8.944139489828867, |
| "grad_norm": 0.8773009181022644, |
| "learning_rate": 6.362676435728908e-05, |
| "loss": 3.1792, |
| "step": 83100 |
| }, |
| { |
| "epoch": 8.949521041868476, |
| "grad_norm": 0.8822053074836731, |
| "learning_rate": 6.330352332722766e-05, |
| "loss": 3.1861, |
| "step": 83150 |
| }, |
| { |
| "epoch": 8.954902593908082, |
| "grad_norm": 0.8993712663650513, |
| "learning_rate": 6.298028229716624e-05, |
| "loss": 3.1945, |
| "step": 83200 |
| }, |
| { |
| "epoch": 8.960284145947691, |
| "grad_norm": 0.848084032535553, |
| "learning_rate": 6.265704126710484e-05, |
| "loss": 3.1828, |
| "step": 83250 |
| }, |
| { |
| "epoch": 8.9656656979873, |
| "grad_norm": 0.8800722360610962, |
| "learning_rate": 6.233380023704342e-05, |
| "loss": 3.1887, |
| "step": 83300 |
| }, |
| { |
| "epoch": 8.971047250026908, |
| "grad_norm": 0.8553746342658997, |
| "learning_rate": 6.2010559206982e-05, |
| "loss": 3.1643, |
| "step": 83350 |
| }, |
| { |
| "epoch": 8.976428802066517, |
| "grad_norm": 0.8592532873153687, |
| "learning_rate": 6.168731817692058e-05, |
| "loss": 3.1819, |
| "step": 83400 |
| }, |
| { |
| "epoch": 8.981810354106123, |
| "grad_norm": 0.8659167885780334, |
| "learning_rate": 6.136407714685916e-05, |
| "loss": 3.1781, |
| "step": 83450 |
| }, |
| { |
| "epoch": 8.987191906145732, |
| "grad_norm": 0.887120246887207, |
| "learning_rate": 6.104083611679776e-05, |
| "loss": 3.1797, |
| "step": 83500 |
| }, |
| { |
| "epoch": 8.99257345818534, |
| "grad_norm": 0.829015851020813, |
| "learning_rate": 6.071759508673634e-05, |
| "loss": 3.1903, |
| "step": 83550 |
| }, |
| { |
| "epoch": 8.997955010224949, |
| "grad_norm": 0.8530725836753845, |
| "learning_rate": 6.039435405667492e-05, |
| "loss": 3.1816, |
| "step": 83600 |
| }, |
| { |
| "epoch": 9.003336562264558, |
| "grad_norm": 0.8461193442344666, |
| "learning_rate": 6.007111302661351e-05, |
| "loss": 3.1462, |
| "step": 83650 |
| }, |
| { |
| "epoch": 9.008718114304166, |
| "grad_norm": 0.857636034488678, |
| "learning_rate": 5.974787199655209e-05, |
| "loss": 3.1079, |
| "step": 83700 |
| }, |
| { |
| "epoch": 9.014099666343773, |
| "grad_norm": 0.9047430753707886, |
| "learning_rate": 5.942463096649068e-05, |
| "loss": 3.1454, |
| "step": 83750 |
| }, |
| { |
| "epoch": 9.019481218383381, |
| "grad_norm": 0.8626530766487122, |
| "learning_rate": 5.910138993642926e-05, |
| "loss": 3.1256, |
| "step": 83800 |
| }, |
| { |
| "epoch": 9.02486277042299, |
| "grad_norm": 0.8821343779563904, |
| "learning_rate": 5.877814890636784e-05, |
| "loss": 3.116, |
| "step": 83850 |
| }, |
| { |
| "epoch": 9.030244322462599, |
| "grad_norm": 0.8559414744377136, |
| "learning_rate": 5.845490787630643e-05, |
| "loss": 3.1238, |
| "step": 83900 |
| }, |
| { |
| "epoch": 9.035625874502207, |
| "grad_norm": 0.8904216885566711, |
| "learning_rate": 5.813166684624501e-05, |
| "loss": 3.1017, |
| "step": 83950 |
| }, |
| { |
| "epoch": 9.041007426541814, |
| "grad_norm": 0.8306416273117065, |
| "learning_rate": 5.7808425816183596e-05, |
| "loss": 3.1438, |
| "step": 84000 |
| }, |
| { |
| "epoch": 9.041007426541814, |
| "eval_accuracy": 0.3928628742969751, |
| "eval_loss": 3.3142924308776855, |
| "eval_runtime": 185.9073, |
| "eval_samples_per_second": 96.882, |
| "eval_steps_per_second": 6.057, |
| "step": 84000 |
| }, |
| { |
| "epoch": 9.046388978581422, |
| "grad_norm": 0.8466785550117493, |
| "learning_rate": 5.748518478612218e-05, |
| "loss": 3.1146, |
| "step": 84050 |
| }, |
| { |
| "epoch": 9.051770530621031, |
| "grad_norm": 0.8146861791610718, |
| "learning_rate": 5.716194375606076e-05, |
| "loss": 3.123, |
| "step": 84100 |
| }, |
| { |
| "epoch": 9.05715208266064, |
| "grad_norm": 0.8775056004524231, |
| "learning_rate": 5.6838702725999346e-05, |
| "loss": 3.1288, |
| "step": 84150 |
| }, |
| { |
| "epoch": 9.062533634700248, |
| "grad_norm": 0.8910180926322937, |
| "learning_rate": 5.651546169593793e-05, |
| "loss": 3.1265, |
| "step": 84200 |
| }, |
| { |
| "epoch": 9.067915186739857, |
| "grad_norm": 0.8813162446022034, |
| "learning_rate": 5.619222066587652e-05, |
| "loss": 3.1091, |
| "step": 84250 |
| }, |
| { |
| "epoch": 9.073296738779463, |
| "grad_norm": 0.8770124912261963, |
| "learning_rate": 5.58689796358151e-05, |
| "loss": 3.1376, |
| "step": 84300 |
| }, |
| { |
| "epoch": 9.078678290819072, |
| "grad_norm": 0.8976263999938965, |
| "learning_rate": 5.554573860575369e-05, |
| "loss": 3.1274, |
| "step": 84350 |
| }, |
| { |
| "epoch": 9.08405984285868, |
| "grad_norm": 0.8833991885185242, |
| "learning_rate": 5.522249757569227e-05, |
| "loss": 3.0987, |
| "step": 84400 |
| }, |
| { |
| "epoch": 9.089441394898289, |
| "grad_norm": 0.8468970060348511, |
| "learning_rate": 5.489925654563085e-05, |
| "loss": 3.1198, |
| "step": 84450 |
| }, |
| { |
| "epoch": 9.094822946937898, |
| "grad_norm": 0.9312103986740112, |
| "learning_rate": 5.457601551556944e-05, |
| "loss": 3.1262, |
| "step": 84500 |
| }, |
| { |
| "epoch": 9.100204498977504, |
| "grad_norm": 0.919991672039032, |
| "learning_rate": 5.425277448550802e-05, |
| "loss": 3.1314, |
| "step": 84550 |
| }, |
| { |
| "epoch": 9.105586051017113, |
| "grad_norm": 0.9003528952598572, |
| "learning_rate": 5.392953345544661e-05, |
| "loss": 3.1279, |
| "step": 84600 |
| }, |
| { |
| "epoch": 9.110967603056721, |
| "grad_norm": 0.8960425853729248, |
| "learning_rate": 5.360629242538519e-05, |
| "loss": 3.1115, |
| "step": 84650 |
| }, |
| { |
| "epoch": 9.11634915509633, |
| "grad_norm": 0.898668646812439, |
| "learning_rate": 5.328305139532377e-05, |
| "loss": 3.1274, |
| "step": 84700 |
| }, |
| { |
| "epoch": 9.121730707135939, |
| "grad_norm": 0.8424252271652222, |
| "learning_rate": 5.295981036526236e-05, |
| "loss": 3.1187, |
| "step": 84750 |
| }, |
| { |
| "epoch": 9.127112259175545, |
| "grad_norm": 0.8712086081504822, |
| "learning_rate": 5.263656933520094e-05, |
| "loss": 3.1323, |
| "step": 84800 |
| }, |
| { |
| "epoch": 9.132493811215154, |
| "grad_norm": 0.8652164936065674, |
| "learning_rate": 5.231332830513953e-05, |
| "loss": 3.1193, |
| "step": 84850 |
| }, |
| { |
| "epoch": 9.137875363254762, |
| "grad_norm": 0.8246886134147644, |
| "learning_rate": 5.199008727507811e-05, |
| "loss": 3.1129, |
| "step": 84900 |
| }, |
| { |
| "epoch": 9.143256915294371, |
| "grad_norm": 0.8619574904441833, |
| "learning_rate": 5.166684624501669e-05, |
| "loss": 3.1119, |
| "step": 84950 |
| }, |
| { |
| "epoch": 9.14863846733398, |
| "grad_norm": 0.8570842742919922, |
| "learning_rate": 5.1343605214955286e-05, |
| "loss": 3.1517, |
| "step": 85000 |
| }, |
| { |
| "epoch": 9.14863846733398, |
| "eval_accuracy": 0.3930500832226794, |
| "eval_loss": 3.312678813934326, |
| "eval_runtime": 186.0671, |
| "eval_samples_per_second": 96.798, |
| "eval_steps_per_second": 6.052, |
| "step": 85000 |
| }, |
| { |
| "epoch": 9.154020019373588, |
| "grad_norm": 0.8676272034645081, |
| "learning_rate": 5.102036418489387e-05, |
| "loss": 3.1165, |
| "step": 85050 |
| }, |
| { |
| "epoch": 9.159401571413195, |
| "grad_norm": 0.8815665245056152, |
| "learning_rate": 5.0697123154832455e-05, |
| "loss": 3.1423, |
| "step": 85100 |
| }, |
| { |
| "epoch": 9.164783123452803, |
| "grad_norm": 0.9012113809585571, |
| "learning_rate": 5.0373882124771036e-05, |
| "loss": 3.1273, |
| "step": 85150 |
| }, |
| { |
| "epoch": 9.170164675492412, |
| "grad_norm": 0.8648649454116821, |
| "learning_rate": 5.005064109470962e-05, |
| "loss": 3.1311, |
| "step": 85200 |
| }, |
| { |
| "epoch": 9.17554622753202, |
| "grad_norm": 0.8902011513710022, |
| "learning_rate": 4.9727400064648205e-05, |
| "loss": 3.1323, |
| "step": 85250 |
| }, |
| { |
| "epoch": 9.180927779571629, |
| "grad_norm": 0.8550026416778564, |
| "learning_rate": 4.9404159034586786e-05, |
| "loss": 3.1492, |
| "step": 85300 |
| }, |
| { |
| "epoch": 9.186309331611236, |
| "grad_norm": 0.880626916885376, |
| "learning_rate": 4.9080918004525374e-05, |
| "loss": 3.139, |
| "step": 85350 |
| }, |
| { |
| "epoch": 9.191690883650844, |
| "grad_norm": 0.8435592651367188, |
| "learning_rate": 4.8757676974463955e-05, |
| "loss": 3.1416, |
| "step": 85400 |
| }, |
| { |
| "epoch": 9.197072435690453, |
| "grad_norm": 0.9266993403434753, |
| "learning_rate": 4.8434435944402536e-05, |
| "loss": 3.1275, |
| "step": 85450 |
| }, |
| { |
| "epoch": 9.202453987730062, |
| "grad_norm": 0.8847249746322632, |
| "learning_rate": 4.8111194914341124e-05, |
| "loss": 3.1317, |
| "step": 85500 |
| }, |
| { |
| "epoch": 9.20783553976967, |
| "grad_norm": 0.8901268243789673, |
| "learning_rate": 4.7787953884279705e-05, |
| "loss": 3.1337, |
| "step": 85550 |
| }, |
| { |
| "epoch": 9.213217091809279, |
| "grad_norm": 0.8637431263923645, |
| "learning_rate": 4.746471285421829e-05, |
| "loss": 3.1347, |
| "step": 85600 |
| }, |
| { |
| "epoch": 9.218598643848885, |
| "grad_norm": 0.898429274559021, |
| "learning_rate": 4.7141471824156874e-05, |
| "loss": 3.1265, |
| "step": 85650 |
| }, |
| { |
| "epoch": 9.223980195888494, |
| "grad_norm": 0.8722035884857178, |
| "learning_rate": 4.6818230794095455e-05, |
| "loss": 3.136, |
| "step": 85700 |
| }, |
| { |
| "epoch": 9.229361747928102, |
| "grad_norm": 0.8962967991828918, |
| "learning_rate": 4.649498976403404e-05, |
| "loss": 3.124, |
| "step": 85750 |
| }, |
| { |
| "epoch": 9.234743299967711, |
| "grad_norm": 0.8888386487960815, |
| "learning_rate": 4.6171748733972624e-05, |
| "loss": 3.1325, |
| "step": 85800 |
| }, |
| { |
| "epoch": 9.24012485200732, |
| "grad_norm": 0.8526327610015869, |
| "learning_rate": 4.584850770391122e-05, |
| "loss": 3.129, |
| "step": 85850 |
| }, |
| { |
| "epoch": 9.245506404046926, |
| "grad_norm": 0.8576886653900146, |
| "learning_rate": 4.55252666738498e-05, |
| "loss": 3.1395, |
| "step": 85900 |
| }, |
| { |
| "epoch": 9.250887956086535, |
| "grad_norm": 0.8933677673339844, |
| "learning_rate": 4.520202564378838e-05, |
| "loss": 3.1428, |
| "step": 85950 |
| }, |
| { |
| "epoch": 9.256269508126143, |
| "grad_norm": 0.8917112350463867, |
| "learning_rate": 4.487878461372697e-05, |
| "loss": 3.1461, |
| "step": 86000 |
| }, |
| { |
| "epoch": 9.256269508126143, |
| "eval_accuracy": 0.39323099028089054, |
| "eval_loss": 3.3114166259765625, |
| "eval_runtime": 185.9806, |
| "eval_samples_per_second": 96.843, |
| "eval_steps_per_second": 6.054, |
| "step": 86000 |
| }, |
| { |
| "epoch": 9.261651060165752, |
| "grad_norm": 0.885558545589447, |
| "learning_rate": 4.455554358366555e-05, |
| "loss": 3.1165, |
| "step": 86050 |
| }, |
| { |
| "epoch": 9.26703261220536, |
| "grad_norm": 0.9123818874359131, |
| "learning_rate": 4.423230255360414e-05, |
| "loss": 3.1254, |
| "step": 86100 |
| }, |
| { |
| "epoch": 9.272414164244967, |
| "grad_norm": 0.8465166091918945, |
| "learning_rate": 4.390906152354272e-05, |
| "loss": 3.1268, |
| "step": 86150 |
| }, |
| { |
| "epoch": 9.277795716284576, |
| "grad_norm": 0.850324809551239, |
| "learning_rate": 4.35858204934813e-05, |
| "loss": 3.132, |
| "step": 86200 |
| }, |
| { |
| "epoch": 9.283177268324184, |
| "grad_norm": 0.8691790103912354, |
| "learning_rate": 4.326257946341989e-05, |
| "loss": 3.1241, |
| "step": 86250 |
| }, |
| { |
| "epoch": 9.288558820363793, |
| "grad_norm": 0.885235071182251, |
| "learning_rate": 4.293933843335847e-05, |
| "loss": 3.1222, |
| "step": 86300 |
| }, |
| { |
| "epoch": 9.293940372403402, |
| "grad_norm": 0.8591457009315491, |
| "learning_rate": 4.261609740329706e-05, |
| "loss": 3.1465, |
| "step": 86350 |
| }, |
| { |
| "epoch": 9.29932192444301, |
| "grad_norm": 0.8532686829566956, |
| "learning_rate": 4.229285637323564e-05, |
| "loss": 3.1219, |
| "step": 86400 |
| }, |
| { |
| "epoch": 9.304703476482617, |
| "grad_norm": 0.8285564184188843, |
| "learning_rate": 4.196961534317422e-05, |
| "loss": 3.1346, |
| "step": 86450 |
| }, |
| { |
| "epoch": 9.310085028522225, |
| "grad_norm": 0.8536093831062317, |
| "learning_rate": 4.164637431311281e-05, |
| "loss": 3.1264, |
| "step": 86500 |
| }, |
| { |
| "epoch": 9.315466580561834, |
| "grad_norm": 0.9113935828208923, |
| "learning_rate": 4.132313328305139e-05, |
| "loss": 3.1086, |
| "step": 86550 |
| }, |
| { |
| "epoch": 9.320848132601443, |
| "grad_norm": 0.8574223518371582, |
| "learning_rate": 4.10063570735912e-05, |
| "loss": 3.1431, |
| "step": 86600 |
| }, |
| { |
| "epoch": 9.326229684641051, |
| "grad_norm": 0.8522573113441467, |
| "learning_rate": 4.068311604352979e-05, |
| "loss": 3.1585, |
| "step": 86650 |
| }, |
| { |
| "epoch": 9.331611236680658, |
| "grad_norm": 0.927192747592926, |
| "learning_rate": 4.035987501346837e-05, |
| "loss": 3.1198, |
| "step": 86700 |
| }, |
| { |
| "epoch": 9.336992788720266, |
| "grad_norm": 0.8413094878196716, |
| "learning_rate": 4.003663398340695e-05, |
| "loss": 3.1246, |
| "step": 86750 |
| }, |
| { |
| "epoch": 9.342374340759875, |
| "grad_norm": 0.8902986645698547, |
| "learning_rate": 3.9713392953345546e-05, |
| "loss": 3.1197, |
| "step": 86800 |
| }, |
| { |
| "epoch": 9.347755892799483, |
| "grad_norm": 0.8831755518913269, |
| "learning_rate": 3.939015192328413e-05, |
| "loss": 3.1259, |
| "step": 86850 |
| }, |
| { |
| "epoch": 9.353137444839092, |
| "grad_norm": 0.8932204246520996, |
| "learning_rate": 3.9066910893222715e-05, |
| "loss": 3.1298, |
| "step": 86900 |
| }, |
| { |
| "epoch": 9.3585189968787, |
| "grad_norm": 0.90253746509552, |
| "learning_rate": 3.8743669863161296e-05, |
| "loss": 3.1282, |
| "step": 86950 |
| }, |
| { |
| "epoch": 9.363900548918307, |
| "grad_norm": 0.8865585923194885, |
| "learning_rate": 3.842042883309988e-05, |
| "loss": 3.1319, |
| "step": 87000 |
| }, |
| { |
| "epoch": 9.363900548918307, |
| "eval_accuracy": 0.39351620411140353, |
| "eval_loss": 3.309441328048706, |
| "eval_runtime": 186.037, |
| "eval_samples_per_second": 96.814, |
| "eval_steps_per_second": 6.053, |
| "step": 87000 |
| }, |
| { |
| "epoch": 9.369282100957916, |
| "grad_norm": 0.852715790271759, |
| "learning_rate": 3.8097187803038465e-05, |
| "loss": 3.1281, |
| "step": 87050 |
| }, |
| { |
| "epoch": 9.374663652997524, |
| "grad_norm": 0.9123461842536926, |
| "learning_rate": 3.7773946772977047e-05, |
| "loss": 3.1327, |
| "step": 87100 |
| }, |
| { |
| "epoch": 9.380045205037133, |
| "grad_norm": 0.9890989661216736, |
| "learning_rate": 3.745070574291563e-05, |
| "loss": 3.1357, |
| "step": 87150 |
| }, |
| { |
| "epoch": 9.385426757076742, |
| "grad_norm": 0.8498582243919373, |
| "learning_rate": 3.7127464712854216e-05, |
| "loss": 3.1352, |
| "step": 87200 |
| }, |
| { |
| "epoch": 9.390808309116348, |
| "grad_norm": 0.8595276474952698, |
| "learning_rate": 3.6804223682792803e-05, |
| "loss": 3.146, |
| "step": 87250 |
| }, |
| { |
| "epoch": 9.396189861155957, |
| "grad_norm": 0.868246853351593, |
| "learning_rate": 3.6480982652731385e-05, |
| "loss": 3.1357, |
| "step": 87300 |
| }, |
| { |
| "epoch": 9.401571413195565, |
| "grad_norm": 0.9030839800834656, |
| "learning_rate": 3.6157741622669966e-05, |
| "loss": 3.1199, |
| "step": 87350 |
| }, |
| { |
| "epoch": 9.406952965235174, |
| "grad_norm": 0.9103978872299194, |
| "learning_rate": 3.5834500592608554e-05, |
| "loss": 3.1231, |
| "step": 87400 |
| }, |
| { |
| "epoch": 9.412334517274783, |
| "grad_norm": 0.8952512741088867, |
| "learning_rate": 3.5511259562547135e-05, |
| "loss": 3.122, |
| "step": 87450 |
| }, |
| { |
| "epoch": 9.417716069314391, |
| "grad_norm": 0.8824374079704285, |
| "learning_rate": 3.518801853248572e-05, |
| "loss": 3.1133, |
| "step": 87500 |
| }, |
| { |
| "epoch": 9.423097621353998, |
| "grad_norm": 0.8795512318611145, |
| "learning_rate": 3.4864777502424304e-05, |
| "loss": 3.1181, |
| "step": 87550 |
| }, |
| { |
| "epoch": 9.428479173393606, |
| "grad_norm": 0.8722243309020996, |
| "learning_rate": 3.4541536472362885e-05, |
| "loss": 3.1459, |
| "step": 87600 |
| }, |
| { |
| "epoch": 9.433860725433215, |
| "grad_norm": 0.923505425453186, |
| "learning_rate": 3.421829544230147e-05, |
| "loss": 3.1229, |
| "step": 87650 |
| }, |
| { |
| "epoch": 9.439242277472824, |
| "grad_norm": 0.8457745313644409, |
| "learning_rate": 3.389505441224006e-05, |
| "loss": 3.1395, |
| "step": 87700 |
| }, |
| { |
| "epoch": 9.444623829512432, |
| "grad_norm": 0.8823767304420471, |
| "learning_rate": 3.357181338217864e-05, |
| "loss": 3.1251, |
| "step": 87750 |
| }, |
| { |
| "epoch": 9.450005381552039, |
| "grad_norm": 0.8751040697097778, |
| "learning_rate": 3.324857235211722e-05, |
| "loss": 3.1209, |
| "step": 87800 |
| }, |
| { |
| "epoch": 9.455386933591647, |
| "grad_norm": 0.9281714558601379, |
| "learning_rate": 3.292533132205581e-05, |
| "loss": 3.1344, |
| "step": 87850 |
| }, |
| { |
| "epoch": 9.460768485631256, |
| "grad_norm": 0.9198567867279053, |
| "learning_rate": 3.2608555112595624e-05, |
| "loss": 3.1408, |
| "step": 87900 |
| }, |
| { |
| "epoch": 9.466150037670864, |
| "grad_norm": 0.9540517926216125, |
| "learning_rate": 3.2285314082534205e-05, |
| "loss": 3.1292, |
| "step": 87950 |
| }, |
| { |
| "epoch": 9.471531589710473, |
| "grad_norm": 0.8667344450950623, |
| "learning_rate": 3.196207305247279e-05, |
| "loss": 3.1143, |
| "step": 88000 |
| }, |
| { |
| "epoch": 9.471531589710473, |
| "eval_accuracy": 0.3938747586411913, |
| "eval_loss": 3.3071906566619873, |
| "eval_runtime": 185.9575, |
| "eval_samples_per_second": 96.855, |
| "eval_steps_per_second": 6.055, |
| "step": 88000 |
| }, |
| { |
| "epoch": 9.476913141750082, |
| "grad_norm": 0.8748806715011597, |
| "learning_rate": 3.1638832022411374e-05, |
| "loss": 3.1209, |
| "step": 88050 |
| }, |
| { |
| "epoch": 9.482294693789688, |
| "grad_norm": 0.8563083410263062, |
| "learning_rate": 3.1315590992349955e-05, |
| "loss": 3.1301, |
| "step": 88100 |
| }, |
| { |
| "epoch": 9.487676245829297, |
| "grad_norm": 0.8622967600822449, |
| "learning_rate": 3.099234996228854e-05, |
| "loss": 3.1433, |
| "step": 88150 |
| }, |
| { |
| "epoch": 9.493057797868905, |
| "grad_norm": 0.8674555420875549, |
| "learning_rate": 3.066910893222713e-05, |
| "loss": 3.131, |
| "step": 88200 |
| }, |
| { |
| "epoch": 9.498439349908514, |
| "grad_norm": 0.9057973623275757, |
| "learning_rate": 3.0345867902165712e-05, |
| "loss": 3.1329, |
| "step": 88250 |
| }, |
| { |
| "epoch": 9.503820901948123, |
| "grad_norm": 0.8870735764503479, |
| "learning_rate": 3.00226268721043e-05, |
| "loss": 3.1298, |
| "step": 88300 |
| }, |
| { |
| "epoch": 9.50920245398773, |
| "grad_norm": 0.8787422180175781, |
| "learning_rate": 2.9699385842042878e-05, |
| "loss": 3.1122, |
| "step": 88350 |
| }, |
| { |
| "epoch": 9.514584006027338, |
| "grad_norm": 0.8897624611854553, |
| "learning_rate": 2.9376144811981465e-05, |
| "loss": 3.1107, |
| "step": 88400 |
| }, |
| { |
| "epoch": 9.519965558066946, |
| "grad_norm": 0.8597399592399597, |
| "learning_rate": 2.905290378192005e-05, |
| "loss": 3.1504, |
| "step": 88450 |
| }, |
| { |
| "epoch": 9.525347110106555, |
| "grad_norm": 0.9135892391204834, |
| "learning_rate": 2.8729662751858634e-05, |
| "loss": 3.1326, |
| "step": 88500 |
| }, |
| { |
| "epoch": 9.530728662146164, |
| "grad_norm": 0.8865475058555603, |
| "learning_rate": 2.840642172179722e-05, |
| "loss": 3.1247, |
| "step": 88550 |
| }, |
| { |
| "epoch": 9.536110214185772, |
| "grad_norm": 0.8653778433799744, |
| "learning_rate": 2.80831806917358e-05, |
| "loss": 3.1294, |
| "step": 88600 |
| }, |
| { |
| "epoch": 9.541491766225379, |
| "grad_norm": 0.8692387938499451, |
| "learning_rate": 2.7759939661674384e-05, |
| "loss": 3.1322, |
| "step": 88650 |
| }, |
| { |
| "epoch": 9.546873318264987, |
| "grad_norm": 0.9058132767677307, |
| "learning_rate": 2.743669863161297e-05, |
| "loss": 3.1255, |
| "step": 88700 |
| }, |
| { |
| "epoch": 9.552254870304596, |
| "grad_norm": 0.9091250896453857, |
| "learning_rate": 2.7113457601551557e-05, |
| "loss": 3.1458, |
| "step": 88750 |
| }, |
| { |
| "epoch": 9.557636422344205, |
| "grad_norm": 0.8677462935447693, |
| "learning_rate": 2.679021657149014e-05, |
| "loss": 3.1379, |
| "step": 88800 |
| }, |
| { |
| "epoch": 9.563017974383813, |
| "grad_norm": 0.8408787846565247, |
| "learning_rate": 2.6466975541428722e-05, |
| "loss": 3.1336, |
| "step": 88850 |
| }, |
| { |
| "epoch": 9.56839952642342, |
| "grad_norm": 0.8599417805671692, |
| "learning_rate": 2.6143734511367307e-05, |
| "loss": 3.142, |
| "step": 88900 |
| }, |
| { |
| "epoch": 9.573781078463028, |
| "grad_norm": 0.8730495572090149, |
| "learning_rate": 2.582049348130589e-05, |
| "loss": 3.1248, |
| "step": 88950 |
| }, |
| { |
| "epoch": 9.579162630502637, |
| "grad_norm": 0.923576831817627, |
| "learning_rate": 2.5497252451244476e-05, |
| "loss": 3.1376, |
| "step": 89000 |
| }, |
| { |
| "epoch": 9.579162630502637, |
| "eval_accuracy": 0.3940191583290968, |
| "eval_loss": 3.304161548614502, |
| "eval_runtime": 186.0114, |
| "eval_samples_per_second": 96.827, |
| "eval_steps_per_second": 6.053, |
| "step": 89000 |
| }, |
| { |
| "epoch": 9.584544182542245, |
| "grad_norm": 0.9327528476715088, |
| "learning_rate": 2.517401142118306e-05, |
| "loss": 3.1491, |
| "step": 89050 |
| }, |
| { |
| "epoch": 9.589925734581854, |
| "grad_norm": 0.8553284406661987, |
| "learning_rate": 2.485077039112164e-05, |
| "loss": 3.1268, |
| "step": 89100 |
| }, |
| { |
| "epoch": 9.59530728662146, |
| "grad_norm": 0.886030375957489, |
| "learning_rate": 2.4527529361060226e-05, |
| "loss": 3.123, |
| "step": 89150 |
| }, |
| { |
| "epoch": 9.60068883866107, |
| "grad_norm": 0.9254641532897949, |
| "learning_rate": 2.4204288330998814e-05, |
| "loss": 3.1243, |
| "step": 89200 |
| }, |
| { |
| "epoch": 9.606070390700678, |
| "grad_norm": 0.8575250506401062, |
| "learning_rate": 2.38810473009374e-05, |
| "loss": 3.1321, |
| "step": 89250 |
| }, |
| { |
| "epoch": 9.611451942740286, |
| "grad_norm": 0.8573437333106995, |
| "learning_rate": 2.3557806270875983e-05, |
| "loss": 3.1294, |
| "step": 89300 |
| }, |
| { |
| "epoch": 9.616833494779895, |
| "grad_norm": 0.8967459201812744, |
| "learning_rate": 2.3234565240814564e-05, |
| "loss": 3.1273, |
| "step": 89350 |
| }, |
| { |
| "epoch": 9.622215046819504, |
| "grad_norm": 0.8906954526901245, |
| "learning_rate": 2.291132421075315e-05, |
| "loss": 3.1165, |
| "step": 89400 |
| }, |
| { |
| "epoch": 9.62759659885911, |
| "grad_norm": 0.8561041951179504, |
| "learning_rate": 2.2588083180691733e-05, |
| "loss": 3.1289, |
| "step": 89450 |
| }, |
| { |
| "epoch": 9.632978150898719, |
| "grad_norm": 0.9139636754989624, |
| "learning_rate": 2.2264842150630318e-05, |
| "loss": 3.1181, |
| "step": 89500 |
| }, |
| { |
| "epoch": 9.638359702938327, |
| "grad_norm": 0.8472151756286621, |
| "learning_rate": 2.1941601120568905e-05, |
| "loss": 3.1104, |
| "step": 89550 |
| }, |
| { |
| "epoch": 9.643741254977936, |
| "grad_norm": 0.8977211117744446, |
| "learning_rate": 2.1618360090507483e-05, |
| "loss": 3.1108, |
| "step": 89600 |
| }, |
| { |
| "epoch": 9.649122807017545, |
| "grad_norm": 0.8994197249412537, |
| "learning_rate": 2.129511906044607e-05, |
| "loss": 3.1327, |
| "step": 89650 |
| }, |
| { |
| "epoch": 9.654504359057151, |
| "grad_norm": 0.9048464894294739, |
| "learning_rate": 2.0971878030384655e-05, |
| "loss": 3.1281, |
| "step": 89700 |
| }, |
| { |
| "epoch": 9.65988591109676, |
| "grad_norm": 0.9332903027534485, |
| "learning_rate": 2.064863700032324e-05, |
| "loss": 3.1466, |
| "step": 89750 |
| }, |
| { |
| "epoch": 9.665267463136368, |
| "grad_norm": 0.8656063675880432, |
| "learning_rate": 2.0325395970261824e-05, |
| "loss": 3.123, |
| "step": 89800 |
| }, |
| { |
| "epoch": 9.670649015175977, |
| "grad_norm": 0.891152560710907, |
| "learning_rate": 2.0002154940200406e-05, |
| "loss": 3.1296, |
| "step": 89850 |
| }, |
| { |
| "epoch": 9.676030567215586, |
| "grad_norm": 0.8883286714553833, |
| "learning_rate": 1.968537873074022e-05, |
| "loss": 3.1299, |
| "step": 89900 |
| }, |
| { |
| "epoch": 9.681412119255192, |
| "grad_norm": 0.9736307263374329, |
| "learning_rate": 1.9362137700678803e-05, |
| "loss": 3.127, |
| "step": 89950 |
| }, |
| { |
| "epoch": 9.6867936712948, |
| "grad_norm": 0.8929668068885803, |
| "learning_rate": 1.9038896670617388e-05, |
| "loss": 3.1218, |
| "step": 90000 |
| }, |
| { |
| "epoch": 9.6867936712948, |
| "eval_accuracy": 0.39444659878975896, |
| "eval_loss": 3.3022682666778564, |
| "eval_runtime": 185.8935, |
| "eval_samples_per_second": 96.889, |
| "eval_steps_per_second": 6.057, |
| "step": 90000 |
| }, |
| { |
| "epoch": 9.69217522333441, |
| "grad_norm": 0.8568660616874695, |
| "learning_rate": 1.8715655640555972e-05, |
| "loss": 3.1413, |
| "step": 90050 |
| }, |
| { |
| "epoch": 9.697556775374018, |
| "grad_norm": 0.8788077235221863, |
| "learning_rate": 1.8392414610494557e-05, |
| "loss": 3.1215, |
| "step": 90100 |
| }, |
| { |
| "epoch": 9.702938327413626, |
| "grad_norm": 0.8740991353988647, |
| "learning_rate": 1.806917358043314e-05, |
| "loss": 3.1236, |
| "step": 90150 |
| }, |
| { |
| "epoch": 9.708319879453235, |
| "grad_norm": 0.8861961960792542, |
| "learning_rate": 1.7745932550371726e-05, |
| "loss": 3.1314, |
| "step": 90200 |
| }, |
| { |
| "epoch": 9.713701431492842, |
| "grad_norm": 0.8763524293899536, |
| "learning_rate": 1.742269152031031e-05, |
| "loss": 3.1248, |
| "step": 90250 |
| }, |
| { |
| "epoch": 9.71908298353245, |
| "grad_norm": 0.8509135246276855, |
| "learning_rate": 1.7099450490248895e-05, |
| "loss": 3.1177, |
| "step": 90300 |
| }, |
| { |
| "epoch": 9.724464535572059, |
| "grad_norm": 0.8852295875549316, |
| "learning_rate": 1.677620946018748e-05, |
| "loss": 3.1215, |
| "step": 90350 |
| }, |
| { |
| "epoch": 9.729846087611667, |
| "grad_norm": 0.9037942886352539, |
| "learning_rate": 1.6452968430126064e-05, |
| "loss": 3.1429, |
| "step": 90400 |
| }, |
| { |
| "epoch": 9.735227639651276, |
| "grad_norm": 0.8635242581367493, |
| "learning_rate": 1.6129727400064645e-05, |
| "loss": 3.1151, |
| "step": 90450 |
| }, |
| { |
| "epoch": 9.740609191690883, |
| "grad_norm": 0.8843039870262146, |
| "learning_rate": 1.580648637000323e-05, |
| "loss": 3.1263, |
| "step": 90500 |
| }, |
| { |
| "epoch": 9.745990743730491, |
| "grad_norm": 0.8762742280960083, |
| "learning_rate": 1.5483245339941817e-05, |
| "loss": 3.1338, |
| "step": 90550 |
| }, |
| { |
| "epoch": 9.7513722957701, |
| "grad_norm": 0.8784039616584778, |
| "learning_rate": 1.5160004309880398e-05, |
| "loss": 3.1324, |
| "step": 90600 |
| }, |
| { |
| "epoch": 9.756753847809708, |
| "grad_norm": 0.8572136759757996, |
| "learning_rate": 1.4836763279818985e-05, |
| "loss": 3.1241, |
| "step": 90650 |
| }, |
| { |
| "epoch": 9.762135399849317, |
| "grad_norm": 0.8844181895256042, |
| "learning_rate": 1.4513522249757567e-05, |
| "loss": 3.1463, |
| "step": 90700 |
| }, |
| { |
| "epoch": 9.767516951888926, |
| "grad_norm": 0.8686134815216064, |
| "learning_rate": 1.4190281219696152e-05, |
| "loss": 3.1185, |
| "step": 90750 |
| }, |
| { |
| "epoch": 9.772898503928532, |
| "grad_norm": 0.8900553584098816, |
| "learning_rate": 1.3867040189634736e-05, |
| "loss": 3.1164, |
| "step": 90800 |
| }, |
| { |
| "epoch": 9.77828005596814, |
| "grad_norm": 0.8739273548126221, |
| "learning_rate": 1.3543799159573321e-05, |
| "loss": 3.1549, |
| "step": 90850 |
| }, |
| { |
| "epoch": 9.78366160800775, |
| "grad_norm": 0.848564863204956, |
| "learning_rate": 1.3220558129511905e-05, |
| "loss": 3.1221, |
| "step": 90900 |
| }, |
| { |
| "epoch": 9.789043160047358, |
| "grad_norm": 0.8485873937606812, |
| "learning_rate": 1.2897317099450488e-05, |
| "loss": 3.1266, |
| "step": 90950 |
| }, |
| { |
| "epoch": 9.794424712086967, |
| "grad_norm": 0.9386956691741943, |
| "learning_rate": 1.2574076069389073e-05, |
| "loss": 3.1415, |
| "step": 91000 |
| }, |
| { |
| "epoch": 9.794424712086967, |
| "eval_accuracy": 0.39463771921942464, |
| "eval_loss": 3.2997336387634277, |
| "eval_runtime": 185.898, |
| "eval_samples_per_second": 96.886, |
| "eval_steps_per_second": 6.057, |
| "step": 91000 |
| }, |
| { |
| "epoch": 9.799806264126573, |
| "grad_norm": 0.939706563949585, |
| "learning_rate": 1.2250835039327659e-05, |
| "loss": 3.1332, |
| "step": 91050 |
| }, |
| { |
| "epoch": 9.805187816166182, |
| "grad_norm": 0.869918942451477, |
| "learning_rate": 1.1927594009266242e-05, |
| "loss": 3.1249, |
| "step": 91100 |
| }, |
| { |
| "epoch": 9.81056936820579, |
| "grad_norm": 0.8955085873603821, |
| "learning_rate": 1.1604352979204826e-05, |
| "loss": 3.1243, |
| "step": 91150 |
| }, |
| { |
| "epoch": 9.815950920245399, |
| "grad_norm": 0.9001854658126831, |
| "learning_rate": 1.1281111949143409e-05, |
| "loss": 3.123, |
| "step": 91200 |
| }, |
| { |
| "epoch": 9.821332472285007, |
| "grad_norm": 0.8752421140670776, |
| "learning_rate": 1.0957870919081995e-05, |
| "loss": 3.1207, |
| "step": 91250 |
| }, |
| { |
| "epoch": 9.826714024324616, |
| "grad_norm": 0.8942108154296875, |
| "learning_rate": 1.063462988902058e-05, |
| "loss": 3.1239, |
| "step": 91300 |
| }, |
| { |
| "epoch": 9.832095576364223, |
| "grad_norm": 0.9161922931671143, |
| "learning_rate": 1.0311388858959162e-05, |
| "loss": 3.1254, |
| "step": 91350 |
| }, |
| { |
| "epoch": 9.837477128403831, |
| "grad_norm": 0.8643301725387573, |
| "learning_rate": 9.988147828897747e-06, |
| "loss": 3.1307, |
| "step": 91400 |
| }, |
| { |
| "epoch": 9.84285868044344, |
| "grad_norm": 0.8793330788612366, |
| "learning_rate": 9.66490679883633e-06, |
| "loss": 3.1239, |
| "step": 91450 |
| }, |
| { |
| "epoch": 9.848240232483048, |
| "grad_norm": 0.9255681037902832, |
| "learning_rate": 9.341665768774916e-06, |
| "loss": 3.1246, |
| "step": 91500 |
| }, |
| { |
| "epoch": 9.853621784522657, |
| "grad_norm": 0.8642120361328125, |
| "learning_rate": 9.018424738713499e-06, |
| "loss": 3.1171, |
| "step": 91550 |
| }, |
| { |
| "epoch": 9.859003336562264, |
| "grad_norm": 0.9059169292449951, |
| "learning_rate": 8.695183708652085e-06, |
| "loss": 3.1352, |
| "step": 91600 |
| }, |
| { |
| "epoch": 9.864384888601872, |
| "grad_norm": 0.8789724111557007, |
| "learning_rate": 8.37194267859067e-06, |
| "loss": 3.1299, |
| "step": 91650 |
| }, |
| { |
| "epoch": 9.869766440641481, |
| "grad_norm": 0.8953512907028198, |
| "learning_rate": 8.048701648529252e-06, |
| "loss": 3.1268, |
| "step": 91700 |
| }, |
| { |
| "epoch": 9.87514799268109, |
| "grad_norm": 0.8603357076644897, |
| "learning_rate": 7.725460618467837e-06, |
| "loss": 3.122, |
| "step": 91750 |
| }, |
| { |
| "epoch": 9.880529544720698, |
| "grad_norm": 0.9423066973686218, |
| "learning_rate": 7.40221958840642e-06, |
| "loss": 3.1405, |
| "step": 91800 |
| }, |
| { |
| "epoch": 9.885911096760307, |
| "grad_norm": 0.8285483121871948, |
| "learning_rate": 7.078978558345006e-06, |
| "loss": 3.1308, |
| "step": 91850 |
| }, |
| { |
| "epoch": 9.891292648799913, |
| "grad_norm": 0.906309187412262, |
| "learning_rate": 6.75573752828359e-06, |
| "loss": 3.1369, |
| "step": 91900 |
| }, |
| { |
| "epoch": 9.896674200839522, |
| "grad_norm": 0.8861557245254517, |
| "learning_rate": 6.432496498222174e-06, |
| "loss": 3.1106, |
| "step": 91950 |
| }, |
| { |
| "epoch": 9.90205575287913, |
| "grad_norm": 0.8721465468406677, |
| "learning_rate": 6.1092554681607575e-06, |
| "loss": 3.1329, |
| "step": 92000 |
| }, |
| { |
| "epoch": 9.90205575287913, |
| "eval_accuracy": 0.3947629959990747, |
| "eval_loss": 3.298814296722412, |
| "eval_runtime": 186.1463, |
| "eval_samples_per_second": 96.757, |
| "eval_steps_per_second": 6.049, |
| "step": 92000 |
| }, |
| { |
| "epoch": 9.907437304918739, |
| "grad_norm": 0.8620793223381042, |
| "learning_rate": 5.786014438099342e-06, |
| "loss": 3.1232, |
| "step": 92050 |
| }, |
| { |
| "epoch": 9.912818856958348, |
| "grad_norm": 0.8785669207572937, |
| "learning_rate": 5.4627734080379264e-06, |
| "loss": 3.1137, |
| "step": 92100 |
| }, |
| { |
| "epoch": 9.918200408997954, |
| "grad_norm": 0.8959348797798157, |
| "learning_rate": 5.139532377976511e-06, |
| "loss": 3.1243, |
| "step": 92150 |
| }, |
| { |
| "epoch": 9.923581961037563, |
| "grad_norm": 0.9237845540046692, |
| "learning_rate": 4.816291347915095e-06, |
| "loss": 3.1295, |
| "step": 92200 |
| }, |
| { |
| "epoch": 9.928963513077171, |
| "grad_norm": 0.9364736080169678, |
| "learning_rate": 4.493050317853679e-06, |
| "loss": 3.114, |
| "step": 92250 |
| }, |
| { |
| "epoch": 9.93434506511678, |
| "grad_norm": 0.8466960191726685, |
| "learning_rate": 4.169809287792264e-06, |
| "loss": 3.1173, |
| "step": 92300 |
| }, |
| { |
| "epoch": 9.939726617156388, |
| "grad_norm": 0.8846635222434998, |
| "learning_rate": 3.846568257730847e-06, |
| "loss": 3.1419, |
| "step": 92350 |
| }, |
| { |
| "epoch": 9.945108169195997, |
| "grad_norm": 0.8732933402061462, |
| "learning_rate": 3.523327227669432e-06, |
| "loss": 3.1209, |
| "step": 92400 |
| }, |
| { |
| "epoch": 9.950489721235604, |
| "grad_norm": 0.8332459926605225, |
| "learning_rate": 3.2000861976080162e-06, |
| "loss": 3.1354, |
| "step": 92450 |
| }, |
| { |
| "epoch": 9.955871273275212, |
| "grad_norm": 0.8753027319908142, |
| "learning_rate": 2.8768451675466007e-06, |
| "loss": 3.1346, |
| "step": 92500 |
| }, |
| { |
| "epoch": 9.961252825314821, |
| "grad_norm": 0.868958592414856, |
| "learning_rate": 2.5536041374851848e-06, |
| "loss": 3.136, |
| "step": 92550 |
| }, |
| { |
| "epoch": 9.96663437735443, |
| "grad_norm": 0.8361853957176208, |
| "learning_rate": 2.230363107423769e-06, |
| "loss": 3.1265, |
| "step": 92600 |
| }, |
| { |
| "epoch": 9.972015929394038, |
| "grad_norm": 0.8855199217796326, |
| "learning_rate": 1.9071220773623531e-06, |
| "loss": 3.1248, |
| "step": 92650 |
| }, |
| { |
| "epoch": 9.977397481433645, |
| "grad_norm": 0.9217408895492554, |
| "learning_rate": 1.5838810473009372e-06, |
| "loss": 3.1303, |
| "step": 92700 |
| }, |
| { |
| "epoch": 9.982779033473253, |
| "grad_norm": 0.8892512321472168, |
| "learning_rate": 1.2606400172395215e-06, |
| "loss": 3.1246, |
| "step": 92750 |
| }, |
| { |
| "epoch": 9.988160585512862, |
| "grad_norm": 0.8720786571502686, |
| "learning_rate": 9.373989871781058e-07, |
| "loss": 3.1269, |
| "step": 92800 |
| }, |
| { |
| "epoch": 9.99354213755247, |
| "grad_norm": 0.909217894077301, |
| "learning_rate": 6.1415795711669e-07, |
| "loss": 3.1374, |
| "step": 92850 |
| }, |
| { |
| "epoch": 9.998923689592079, |
| "grad_norm": 0.8751625418663025, |
| "learning_rate": 2.909169270552742e-07, |
| "loss": 3.1104, |
| "step": 92900 |
| }, |
| { |
| "epoch": 10.0, |
| "step": 92910, |
| "total_flos": 7.7681598529536e+17, |
| "train_loss": 3.4542547713105125, |
| "train_runtime": 80895.6915, |
| "train_samples_per_second": 36.751, |
| "train_steps_per_second": 1.149 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 92910, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 10000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 7.7681598529536e+17, |
| "train_batch_size": 32, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|