| { |
| "best_metric": 3.3076210021972656, |
| "best_model_checkpoint": "/scratch/cl5625/exceptions/models/100M_8397/checkpoint-90000", |
| "epoch": 10.0, |
| "eval_steps": 1000, |
| "global_step": 92910, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.005381552039608223, |
| "grad_norm": 1.1363136768341064, |
| "learning_rate": 0.0003, |
| "loss": 8.4632, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.010763104079216447, |
| "grad_norm": 3.25538969039917, |
| "learning_rate": 0.0006, |
| "loss": 6.8415, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.01614465611882467, |
| "grad_norm": 3.1040453910827637, |
| "learning_rate": 0.0005996767589699385, |
| "loss": 6.4387, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.021526208158432893, |
| "grad_norm": 3.3955957889556885, |
| "learning_rate": 0.0005993535179398771, |
| "loss": 6.1922, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.026907760198041114, |
| "grad_norm": 2.7429745197296143, |
| "learning_rate": 0.0005990302769098158, |
| "loss": 6.0811, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.03228931223764934, |
| "grad_norm": 1.5651514530181885, |
| "learning_rate": 0.0005987070358797543, |
| "loss": 5.9862, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.03767086427725756, |
| "grad_norm": 1.5071296691894531, |
| "learning_rate": 0.0005983837948496929, |
| "loss": 5.8565, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.04305241631686579, |
| "grad_norm": 1.3275337219238281, |
| "learning_rate": 0.0005980605538196314, |
| "loss": 5.8096, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.048433968356474004, |
| "grad_norm": 1.523363709449768, |
| "learning_rate": 0.0005977373127895701, |
| "loss": 5.7397, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.05381552039608223, |
| "grad_norm": 1.786197304725647, |
| "learning_rate": 0.0005974140717595086, |
| "loss": 5.6499, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.05919707243569045, |
| "grad_norm": 1.4374873638153076, |
| "learning_rate": 0.0005970908307294472, |
| "loss": 5.5871, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.06457862447529868, |
| "grad_norm": 1.401210069656372, |
| "learning_rate": 0.0005967675896993858, |
| "loss": 5.5076, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.0699601765149069, |
| "grad_norm": 1.2165864706039429, |
| "learning_rate": 0.0005964443486693243, |
| "loss": 5.4396, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.07534172855451512, |
| "grad_norm": 1.5090405941009521, |
| "learning_rate": 0.000596121107639263, |
| "loss": 5.3853, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.08072328059412334, |
| "grad_norm": 1.6507272720336914, |
| "learning_rate": 0.0005957978666092015, |
| "loss": 5.3282, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.08610483263373157, |
| "grad_norm": 1.5006401538848877, |
| "learning_rate": 0.0005954746255791401, |
| "loss": 5.2854, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.09148638467333979, |
| "grad_norm": 1.1712360382080078, |
| "learning_rate": 0.0005951513845490787, |
| "loss": 5.265, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.09686793671294801, |
| "grad_norm": 1.2654850482940674, |
| "learning_rate": 0.0005948281435190174, |
| "loss": 5.1858, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.10224948875255624, |
| "grad_norm": 1.3258107900619507, |
| "learning_rate": 0.0005945049024889559, |
| "loss": 5.1631, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.10763104079216446, |
| "grad_norm": 1.1906688213348389, |
| "learning_rate": 0.0005941816614588944, |
| "loss": 5.1052, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.10763104079216446, |
| "eval_accuracy": 0.22547332185886404, |
| "eval_loss": 5.034008502960205, |
| "eval_runtime": 183.4102, |
| "eval_samples_per_second": 98.201, |
| "eval_steps_per_second": 6.139, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.11301259283177269, |
| "grad_norm": 0.88250333070755, |
| "learning_rate": 0.000593858420428833, |
| "loss": 5.0516, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.1183941448713809, |
| "grad_norm": 0.965934693813324, |
| "learning_rate": 0.0005935351793987716, |
| "loss": 5.0322, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.12377569691098914, |
| "grad_norm": 1.1457676887512207, |
| "learning_rate": 0.0005932119383687103, |
| "loss": 5.0043, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.12915724895059735, |
| "grad_norm": 1.3504283428192139, |
| "learning_rate": 0.0005928886973386488, |
| "loss": 4.9543, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.13453880099020557, |
| "grad_norm": 1.02091383934021, |
| "learning_rate": 0.0005925654563085874, |
| "loss": 4.9343, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.1399203530298138, |
| "grad_norm": 0.9588813781738281, |
| "learning_rate": 0.000592242215278526, |
| "loss": 4.8984, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.14530190506942203, |
| "grad_norm": 0.8982344269752502, |
| "learning_rate": 0.0005919189742484645, |
| "loss": 4.8657, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.15068345710903025, |
| "grad_norm": 1.053356647491455, |
| "learning_rate": 0.0005915957332184032, |
| "loss": 4.8735, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.15606500914863847, |
| "grad_norm": 1.0207793712615967, |
| "learning_rate": 0.0005912724921883417, |
| "loss": 4.8455, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.16144656118824668, |
| "grad_norm": 0.9741514325141907, |
| "learning_rate": 0.0005909492511582803, |
| "loss": 4.8106, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.1668281132278549, |
| "grad_norm": 1.1351131200790405, |
| "learning_rate": 0.0005906260101282189, |
| "loss": 4.7854, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.17220966526746315, |
| "grad_norm": 0.8748419880867004, |
| "learning_rate": 0.0005903027690981575, |
| "loss": 4.7504, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.17759121730707136, |
| "grad_norm": 0.7746927738189697, |
| "learning_rate": 0.000589979528068096, |
| "loss": 4.7232, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.18297276934667958, |
| "grad_norm": 1.0927244424819946, |
| "learning_rate": 0.0005896562870380347, |
| "loss": 4.7024, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.1883543213862878, |
| "grad_norm": 1.1120930910110474, |
| "learning_rate": 0.0005893330460079732, |
| "loss": 4.7158, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.19373587342589602, |
| "grad_norm": 0.7173694372177124, |
| "learning_rate": 0.0005890098049779118, |
| "loss": 4.684, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.19911742546550426, |
| "grad_norm": 0.964622974395752, |
| "learning_rate": 0.0005886865639478504, |
| "loss": 4.668, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.20449897750511248, |
| "grad_norm": 0.8760665059089661, |
| "learning_rate": 0.0005883633229177889, |
| "loss": 4.64, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.2098805295447207, |
| "grad_norm": 1.0075942277908325, |
| "learning_rate": 0.0005880400818877276, |
| "loss": 4.6187, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.2152620815843289, |
| "grad_norm": 1.170619249343872, |
| "learning_rate": 0.0005877168408576662, |
| "loss": 4.6124, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.2152620815843289, |
| "eval_accuracy": 0.2689746785531477, |
| "eval_loss": 4.52186918258667, |
| "eval_runtime": 182.9507, |
| "eval_samples_per_second": 98.447, |
| "eval_steps_per_second": 6.155, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.22064363362393713, |
| "grad_norm": 1.1905301809310913, |
| "learning_rate": 0.0005873935998276048, |
| "loss": 4.5808, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.22602518566354537, |
| "grad_norm": 0.7720122933387756, |
| "learning_rate": 0.0005870703587975433, |
| "loss": 4.5765, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.2314067377031536, |
| "grad_norm": 0.9038923978805542, |
| "learning_rate": 0.0005867471177674818, |
| "loss": 4.5495, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.2367882897427618, |
| "grad_norm": 0.954218327999115, |
| "learning_rate": 0.0005864238767374205, |
| "loss": 4.5188, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.24216984178237003, |
| "grad_norm": 0.890593409538269, |
| "learning_rate": 0.0005861006357073591, |
| "loss": 4.5113, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.24755139382197827, |
| "grad_norm": 1.0826386213302612, |
| "learning_rate": 0.0005857773946772977, |
| "loss": 4.5139, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.2529329458615865, |
| "grad_norm": 1.0157872438430786, |
| "learning_rate": 0.0005854541536472362, |
| "loss": 4.4663, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.2583144979011947, |
| "grad_norm": 1.0314841270446777, |
| "learning_rate": 0.0005851309126171749, |
| "loss": 4.4576, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.2636960499408029, |
| "grad_norm": 1.0260419845581055, |
| "learning_rate": 0.0005848076715871134, |
| "loss": 4.4447, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.26907760198041114, |
| "grad_norm": 0.8484777212142944, |
| "learning_rate": 0.000584484430557052, |
| "loss": 4.4135, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.27445915402001936, |
| "grad_norm": 0.9077085256576538, |
| "learning_rate": 0.0005841611895269906, |
| "loss": 4.4272, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.2798407060596276, |
| "grad_norm": 0.7669521570205688, |
| "learning_rate": 0.0005838379484969291, |
| "loss": 4.402, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.2852222580992358, |
| "grad_norm": 0.7865301966667175, |
| "learning_rate": 0.0005835147074668678, |
| "loss": 4.406, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.29060381013884407, |
| "grad_norm": 0.9062381386756897, |
| "learning_rate": 0.0005831914664368063, |
| "loss": 4.3869, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.2959853621784523, |
| "grad_norm": 0.8867931962013245, |
| "learning_rate": 0.0005828682254067449, |
| "loss": 4.367, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.3013669142180605, |
| "grad_norm": 0.7799649834632874, |
| "learning_rate": 0.0005825449843766835, |
| "loss": 4.3764, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.3067484662576687, |
| "grad_norm": 0.998032808303833, |
| "learning_rate": 0.0005822217433466221, |
| "loss": 4.3453, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.31213001829727693, |
| "grad_norm": 0.7040951251983643, |
| "learning_rate": 0.0005818985023165607, |
| "loss": 4.3335, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.31751157033688515, |
| "grad_norm": 0.8604708313941956, |
| "learning_rate": 0.0005815752612864992, |
| "loss": 4.3286, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.32289312237649337, |
| "grad_norm": 0.6292029619216919, |
| "learning_rate": 0.0005812520202564378, |
| "loss": 4.3286, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.32289312237649337, |
| "eval_accuracy": 0.2978722179020627, |
| "eval_loss": 4.24153995513916, |
| "eval_runtime": 183.0117, |
| "eval_samples_per_second": 98.414, |
| "eval_steps_per_second": 6.153, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.3282746744161016, |
| "grad_norm": 0.9364116787910461, |
| "learning_rate": 0.0005809287792263764, |
| "loss": 4.3113, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.3336562264557098, |
| "grad_norm": 0.6825273633003235, |
| "learning_rate": 0.0005806055381963151, |
| "loss": 4.3019, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.3390377784953181, |
| "grad_norm": 0.6769887208938599, |
| "learning_rate": 0.0005802822971662536, |
| "loss": 4.287, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.3444193305349263, |
| "grad_norm": 0.7461885809898376, |
| "learning_rate": 0.0005799590561361922, |
| "loss": 4.2488, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.3498008825745345, |
| "grad_norm": 0.8458229899406433, |
| "learning_rate": 0.0005796358151061307, |
| "loss": 4.2704, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.35518243461414273, |
| "grad_norm": 0.7842735052108765, |
| "learning_rate": 0.0005793125740760694, |
| "loss": 4.2467, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.36056398665375095, |
| "grad_norm": 0.6246998310089111, |
| "learning_rate": 0.0005789893330460079, |
| "loss": 4.2382, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.36594553869335916, |
| "grad_norm": 0.7485172748565674, |
| "learning_rate": 0.0005786660920159465, |
| "loss": 4.2417, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.3713270907329674, |
| "grad_norm": 0.8092179894447327, |
| "learning_rate": 0.0005783428509858851, |
| "loss": 4.2616, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.3767086427725756, |
| "grad_norm": 1.0175601243972778, |
| "learning_rate": 0.0005780196099558237, |
| "loss": 4.2304, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.3820901948121838, |
| "grad_norm": 0.6826550960540771, |
| "learning_rate": 0.0005776963689257623, |
| "loss": 4.2206, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.38747174685179203, |
| "grad_norm": 0.7783133387565613, |
| "learning_rate": 0.0005773731278957008, |
| "loss": 4.2282, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.3928532988914003, |
| "grad_norm": 0.6485751867294312, |
| "learning_rate": 0.0005770498868656394, |
| "loss": 4.2179, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.3982348509310085, |
| "grad_norm": 0.778927206993103, |
| "learning_rate": 0.000576726645835578, |
| "loss": 4.1895, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.40361640297061674, |
| "grad_norm": 0.8899291753768921, |
| "learning_rate": 0.0005764034048055167, |
| "loss": 4.2006, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.40899795501022496, |
| "grad_norm": 0.6624945998191833, |
| "learning_rate": 0.0005760801637754552, |
| "loss": 4.2022, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.4143795070498332, |
| "grad_norm": 0.7508805394172668, |
| "learning_rate": 0.0005757569227453937, |
| "loss": 4.1884, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.4197610590894414, |
| "grad_norm": 0.6273350119590759, |
| "learning_rate": 0.0005754336817153324, |
| "loss": 4.1653, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.4251426111290496, |
| "grad_norm": 0.7216036319732666, |
| "learning_rate": 0.0005751104406852709, |
| "loss": 4.168, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.4305241631686578, |
| "grad_norm": 0.6389856934547424, |
| "learning_rate": 0.0005747871996552096, |
| "loss": 4.1493, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.4305241631686578, |
| "eval_accuracy": 0.31214225357606534, |
| "eval_loss": 4.097915172576904, |
| "eval_runtime": 182.793, |
| "eval_samples_per_second": 98.532, |
| "eval_steps_per_second": 6.16, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.43590571520826604, |
| "grad_norm": 0.8230230212211609, |
| "learning_rate": 0.0005744639586251481, |
| "loss": 4.1597, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.44128726724787426, |
| "grad_norm": 0.7011985182762146, |
| "learning_rate": 0.0005741407175950867, |
| "loss": 4.1713, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.44666881928748253, |
| "grad_norm": 0.6618315577507019, |
| "learning_rate": 0.0005738174765650253, |
| "loss": 4.156, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.45205037132709075, |
| "grad_norm": 0.652692973613739, |
| "learning_rate": 0.0005734942355349638, |
| "loss": 4.1499, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.45743192336669897, |
| "grad_norm": 0.6924708485603333, |
| "learning_rate": 0.0005731709945049025, |
| "loss": 4.1527, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.4628134754063072, |
| "grad_norm": 0.7289921641349792, |
| "learning_rate": 0.000572847753474841, |
| "loss": 4.1386, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.4681950274459154, |
| "grad_norm": 0.6634368896484375, |
| "learning_rate": 0.0005725245124447796, |
| "loss": 4.1338, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.4735765794855236, |
| "grad_norm": 0.6518203616142273, |
| "learning_rate": 0.0005722012714147182, |
| "loss": 4.1474, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.47895813152513184, |
| "grad_norm": 0.5596990585327148, |
| "learning_rate": 0.0005718780303846568, |
| "loss": 4.115, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.48433968356474005, |
| "grad_norm": 0.5837390422821045, |
| "learning_rate": 0.0005715547893545953, |
| "loss": 4.0993, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.48972123560434827, |
| "grad_norm": 0.5811730027198792, |
| "learning_rate": 0.000571231548324534, |
| "loss": 4.099, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.49510278764395654, |
| "grad_norm": 0.5709436535835266, |
| "learning_rate": 0.0005709083072944725, |
| "loss": 4.1202, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.5004843396835648, |
| "grad_norm": 0.6790245175361633, |
| "learning_rate": 0.0005705850662644111, |
| "loss": 4.0862, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.505865891723173, |
| "grad_norm": 0.629574179649353, |
| "learning_rate": 0.0005702618252343497, |
| "loss": 4.1059, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.5112474437627812, |
| "grad_norm": 0.5848677754402161, |
| "learning_rate": 0.0005699385842042882, |
| "loss": 4.0903, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.5166289958023894, |
| "grad_norm": 0.6429191827774048, |
| "learning_rate": 0.0005696153431742269, |
| "loss": 4.0856, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.5220105478419976, |
| "grad_norm": 0.6784705519676208, |
| "learning_rate": 0.0005692921021441655, |
| "loss": 4.0779, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.5273920998816058, |
| "grad_norm": 0.6737756133079529, |
| "learning_rate": 0.0005689688611141041, |
| "loss": 4.0737, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.5327736519212141, |
| "grad_norm": 0.614524781703949, |
| "learning_rate": 0.0005686456200840426, |
| "loss": 4.055, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.5381552039608223, |
| "grad_norm": 0.5657824277877808, |
| "learning_rate": 0.0005683223790539811, |
| "loss": 4.0522, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.5381552039608223, |
| "eval_accuracy": 0.3199428790038182, |
| "eval_loss": 4.0021162033081055, |
| "eval_runtime": 183.0646, |
| "eval_samples_per_second": 98.386, |
| "eval_steps_per_second": 6.151, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.5435367560004305, |
| "grad_norm": 0.7021715044975281, |
| "learning_rate": 0.0005679991380239198, |
| "loss": 4.0655, |
| "step": 5050 |
| }, |
| { |
| "epoch": 0.5489183080400387, |
| "grad_norm": 0.796134352684021, |
| "learning_rate": 0.0005676758969938584, |
| "loss": 4.0578, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.5542998600796469, |
| "grad_norm": 0.6416186094284058, |
| "learning_rate": 0.000567352655963797, |
| "loss": 4.0523, |
| "step": 5150 |
| }, |
| { |
| "epoch": 0.5596814121192552, |
| "grad_norm": 0.6666920185089111, |
| "learning_rate": 0.0005670294149337355, |
| "loss": 4.0536, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.5650629641588634, |
| "grad_norm": 0.6438349485397339, |
| "learning_rate": 0.0005667061739036742, |
| "loss": 4.0504, |
| "step": 5250 |
| }, |
| { |
| "epoch": 0.5704445161984716, |
| "grad_norm": 0.5683118104934692, |
| "learning_rate": 0.0005663829328736127, |
| "loss": 4.0581, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.5758260682380799, |
| "grad_norm": 0.6448424458503723, |
| "learning_rate": 0.0005660596918435512, |
| "loss": 4.0476, |
| "step": 5350 |
| }, |
| { |
| "epoch": 0.5812076202776881, |
| "grad_norm": 0.6892325282096863, |
| "learning_rate": 0.0005657364508134899, |
| "loss": 4.0357, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.5865891723172963, |
| "grad_norm": 0.6151754260063171, |
| "learning_rate": 0.0005654132097834284, |
| "loss": 4.0408, |
| "step": 5450 |
| }, |
| { |
| "epoch": 0.5919707243569046, |
| "grad_norm": 0.5521050095558167, |
| "learning_rate": 0.0005650899687533671, |
| "loss": 4.0438, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.5973522763965128, |
| "grad_norm": 0.5622237324714661, |
| "learning_rate": 0.0005647667277233056, |
| "loss": 4.0191, |
| "step": 5550 |
| }, |
| { |
| "epoch": 0.602733828436121, |
| "grad_norm": 0.6457247734069824, |
| "learning_rate": 0.0005644434866932442, |
| "loss": 4.0255, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.6081153804757292, |
| "grad_norm": 0.6344576478004456, |
| "learning_rate": 0.0005641202456631828, |
| "loss": 4.0171, |
| "step": 5650 |
| }, |
| { |
| "epoch": 0.6134969325153374, |
| "grad_norm": 0.8287795186042786, |
| "learning_rate": 0.0005637970046331214, |
| "loss": 3.9951, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.6188784845549457, |
| "grad_norm": 0.6326580047607422, |
| "learning_rate": 0.00056347376360306, |
| "loss": 4.0027, |
| "step": 5750 |
| }, |
| { |
| "epoch": 0.6242600365945539, |
| "grad_norm": 0.7404974102973938, |
| "learning_rate": 0.0005631505225729985, |
| "loss": 4.0087, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.6296415886341621, |
| "grad_norm": 0.6510661244392395, |
| "learning_rate": 0.0005628272815429371, |
| "loss": 3.9942, |
| "step": 5850 |
| }, |
| { |
| "epoch": 0.6350231406737703, |
| "grad_norm": 0.6300317049026489, |
| "learning_rate": 0.0005625040405128757, |
| "loss": 4.0002, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.6404046927133785, |
| "grad_norm": 0.5819694399833679, |
| "learning_rate": 0.0005621807994828143, |
| "loss": 3.9987, |
| "step": 5950 |
| }, |
| { |
| "epoch": 0.6457862447529867, |
| "grad_norm": 0.7095755338668823, |
| "learning_rate": 0.0005618575584527529, |
| "loss": 3.9857, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.6457862447529867, |
| "eval_accuracy": 0.32717677096873066, |
| "eval_loss": 3.925938606262207, |
| "eval_runtime": 182.691, |
| "eval_samples_per_second": 98.587, |
| "eval_steps_per_second": 6.163, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.651167796792595, |
| "grad_norm": 0.6090443134307861, |
| "learning_rate": 0.0005615343174226915, |
| "loss": 3.9941, |
| "step": 6050 |
| }, |
| { |
| "epoch": 0.6565493488322032, |
| "grad_norm": 0.5509337186813354, |
| "learning_rate": 0.00056121107639263, |
| "loss": 3.988, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.6619309008718114, |
| "grad_norm": 0.681161642074585, |
| "learning_rate": 0.0005608943001831699, |
| "loss": 3.989, |
| "step": 6150 |
| }, |
| { |
| "epoch": 0.6673124529114196, |
| "grad_norm": 0.7362217903137207, |
| "learning_rate": 0.0005605710591531085, |
| "loss": 3.9711, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.6726940049510278, |
| "grad_norm": 0.718980610370636, |
| "learning_rate": 0.000560247818123047, |
| "loss": 3.9902, |
| "step": 6250 |
| }, |
| { |
| "epoch": 0.6780755569906362, |
| "grad_norm": 0.6289951205253601, |
| "learning_rate": 0.0005599245770929855, |
| "loss": 3.9774, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.6834571090302444, |
| "grad_norm": 0.6267488598823547, |
| "learning_rate": 0.0005596013360629242, |
| "loss": 3.964, |
| "step": 6350 |
| }, |
| { |
| "epoch": 0.6888386610698526, |
| "grad_norm": 0.681925892829895, |
| "learning_rate": 0.0005592780950328628, |
| "loss": 3.9841, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.6942202131094608, |
| "grad_norm": 0.6765555143356323, |
| "learning_rate": 0.0005589548540028014, |
| "loss": 3.9582, |
| "step": 6450 |
| }, |
| { |
| "epoch": 0.699601765149069, |
| "grad_norm": 0.6029344201087952, |
| "learning_rate": 0.0005586316129727399, |
| "loss": 3.9729, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.7049833171886772, |
| "grad_norm": 0.5697076320648193, |
| "learning_rate": 0.0005583083719426786, |
| "loss": 3.9529, |
| "step": 6550 |
| }, |
| { |
| "epoch": 0.7103648692282855, |
| "grad_norm": 0.6241387724876404, |
| "learning_rate": 0.0005579851309126171, |
| "loss": 3.941, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.7157464212678937, |
| "grad_norm": 0.5610880851745605, |
| "learning_rate": 0.0005576618898825558, |
| "loss": 3.948, |
| "step": 6650 |
| }, |
| { |
| "epoch": 0.7211279733075019, |
| "grad_norm": 0.5819376111030579, |
| "learning_rate": 0.0005573386488524943, |
| "loss": 3.9528, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.7265095253471101, |
| "grad_norm": 0.5621766448020935, |
| "learning_rate": 0.0005570154078224328, |
| "loss": 3.9367, |
| "step": 6750 |
| }, |
| { |
| "epoch": 0.7318910773867183, |
| "grad_norm": 0.6376521587371826, |
| "learning_rate": 0.0005566921667923715, |
| "loss": 3.9523, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.7372726294263265, |
| "grad_norm": 0.5738341212272644, |
| "learning_rate": 0.00055636892576231, |
| "loss": 3.9213, |
| "step": 6850 |
| }, |
| { |
| "epoch": 0.7426541814659348, |
| "grad_norm": 0.5444178581237793, |
| "learning_rate": 0.0005560456847322487, |
| "loss": 3.9256, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.748035733505543, |
| "grad_norm": 0.6856753826141357, |
| "learning_rate": 0.0005557224437021872, |
| "loss": 3.9396, |
| "step": 6950 |
| }, |
| { |
| "epoch": 0.7534172855451512, |
| "grad_norm": 0.6148762106895447, |
| "learning_rate": 0.0005553992026721258, |
| "loss": 3.932, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.7534172855451512, |
| "eval_accuracy": 0.33273132389958254, |
| "eval_loss": 3.8695037364959717, |
| "eval_runtime": 183.1477, |
| "eval_samples_per_second": 98.341, |
| "eval_steps_per_second": 6.148, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.7587988375847594, |
| "grad_norm": 0.6166629791259766, |
| "learning_rate": 0.0005550759616420644, |
| "loss": 3.9215, |
| "step": 7050 |
| }, |
| { |
| "epoch": 0.7641803896243676, |
| "grad_norm": 0.5288543701171875, |
| "learning_rate": 0.000554752720612003, |
| "loss": 3.9322, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.7695619416639758, |
| "grad_norm": 0.650581955909729, |
| "learning_rate": 0.0005544294795819415, |
| "loss": 3.9267, |
| "step": 7150 |
| }, |
| { |
| "epoch": 0.7749434937035841, |
| "grad_norm": 0.5915279984474182, |
| "learning_rate": 0.0005541062385518801, |
| "loss": 3.9138, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.7803250457431924, |
| "grad_norm": 0.645673930644989, |
| "learning_rate": 0.0005537829975218188, |
| "loss": 3.9376, |
| "step": 7250 |
| }, |
| { |
| "epoch": 0.7857065977828006, |
| "grad_norm": 0.710521936416626, |
| "learning_rate": 0.0005534597564917573, |
| "loss": 3.9161, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.7910881498224088, |
| "grad_norm": 0.6567710041999817, |
| "learning_rate": 0.0005531365154616959, |
| "loss": 3.9225, |
| "step": 7350 |
| }, |
| { |
| "epoch": 0.796469701862017, |
| "grad_norm": 0.6403442621231079, |
| "learning_rate": 0.0005528132744316344, |
| "loss": 3.9261, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.8018512539016253, |
| "grad_norm": 0.5535980463027954, |
| "learning_rate": 0.0005524900334015731, |
| "loss": 3.9177, |
| "step": 7450 |
| }, |
| { |
| "epoch": 0.8072328059412335, |
| "grad_norm": 0.5908585786819458, |
| "learning_rate": 0.0005521667923715117, |
| "loss": 3.9195, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.8126143579808417, |
| "grad_norm": 0.5708103775978088, |
| "learning_rate": 0.0005518435513414502, |
| "loss": 3.9055, |
| "step": 7550 |
| }, |
| { |
| "epoch": 0.8179959100204499, |
| "grad_norm": 0.604663074016571, |
| "learning_rate": 0.0005515203103113888, |
| "loss": 3.9147, |
| "step": 7600 |
| }, |
| { |
| "epoch": 0.8233774620600581, |
| "grad_norm": 0.5916433930397034, |
| "learning_rate": 0.0005511970692813274, |
| "loss": 3.9041, |
| "step": 7650 |
| }, |
| { |
| "epoch": 0.8287590140996663, |
| "grad_norm": 0.74225914478302, |
| "learning_rate": 0.000550873828251266, |
| "loss": 3.9067, |
| "step": 7700 |
| }, |
| { |
| "epoch": 0.8341405661392746, |
| "grad_norm": 0.5728737115859985, |
| "learning_rate": 0.0005505505872212045, |
| "loss": 3.9031, |
| "step": 7750 |
| }, |
| { |
| "epoch": 0.8395221181788828, |
| "grad_norm": 0.6314652562141418, |
| "learning_rate": 0.0005502273461911432, |
| "loss": 3.9059, |
| "step": 7800 |
| }, |
| { |
| "epoch": 0.844903670218491, |
| "grad_norm": 0.6741981506347656, |
| "learning_rate": 0.0005499041051610817, |
| "loss": 3.8996, |
| "step": 7850 |
| }, |
| { |
| "epoch": 0.8502852222580992, |
| "grad_norm": 0.5826407074928284, |
| "learning_rate": 0.0005495808641310204, |
| "loss": 3.8754, |
| "step": 7900 |
| }, |
| { |
| "epoch": 0.8556667742977074, |
| "grad_norm": 0.5577662587165833, |
| "learning_rate": 0.0005492576231009589, |
| "loss": 3.8799, |
| "step": 7950 |
| }, |
| { |
| "epoch": 0.8610483263373157, |
| "grad_norm": 0.6278002858161926, |
| "learning_rate": 0.0005489343820708974, |
| "loss": 3.8959, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.8610483263373157, |
| "eval_accuracy": 0.33684915969486356, |
| "eval_loss": 3.826174020767212, |
| "eval_runtime": 183.0754, |
| "eval_samples_per_second": 98.38, |
| "eval_steps_per_second": 6.15, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.8664298783769239, |
| "grad_norm": 0.6210545301437378, |
| "learning_rate": 0.0005486111410408361, |
| "loss": 3.8848, |
| "step": 8050 |
| }, |
| { |
| "epoch": 0.8718114304165321, |
| "grad_norm": 0.622451901435852, |
| "learning_rate": 0.0005482879000107746, |
| "loss": 3.8772, |
| "step": 8100 |
| }, |
| { |
| "epoch": 0.8771929824561403, |
| "grad_norm": 0.604086697101593, |
| "learning_rate": 0.0005479646589807133, |
| "loss": 3.8865, |
| "step": 8150 |
| }, |
| { |
| "epoch": 0.8825745344957485, |
| "grad_norm": 0.5613585710525513, |
| "learning_rate": 0.0005476414179506518, |
| "loss": 3.8691, |
| "step": 8200 |
| }, |
| { |
| "epoch": 0.8879560865353568, |
| "grad_norm": 0.6614847779273987, |
| "learning_rate": 0.0005473181769205904, |
| "loss": 3.861, |
| "step": 8250 |
| }, |
| { |
| "epoch": 0.8933376385749651, |
| "grad_norm": 0.551531970500946, |
| "learning_rate": 0.000546994935890529, |
| "loss": 3.8908, |
| "step": 8300 |
| }, |
| { |
| "epoch": 0.8987191906145733, |
| "grad_norm": 0.7161784768104553, |
| "learning_rate": 0.0005466716948604677, |
| "loss": 3.8724, |
| "step": 8350 |
| }, |
| { |
| "epoch": 0.9041007426541815, |
| "grad_norm": 0.6812533140182495, |
| "learning_rate": 0.0005463484538304062, |
| "loss": 3.8675, |
| "step": 8400 |
| }, |
| { |
| "epoch": 0.9094822946937897, |
| "grad_norm": 0.5462357401847839, |
| "learning_rate": 0.000546031677620946, |
| "loss": 3.8776, |
| "step": 8450 |
| }, |
| { |
| "epoch": 0.9148638467333979, |
| "grad_norm": 0.5445178747177124, |
| "learning_rate": 0.0005457084365908845, |
| "loss": 3.8794, |
| "step": 8500 |
| }, |
| { |
| "epoch": 0.9202453987730062, |
| "grad_norm": 0.5628532767295837, |
| "learning_rate": 0.0005453851955608232, |
| "loss": 3.865, |
| "step": 8550 |
| }, |
| { |
| "epoch": 0.9256269508126144, |
| "grad_norm": 0.590034544467926, |
| "learning_rate": 0.0005450619545307617, |
| "loss": 3.8665, |
| "step": 8600 |
| }, |
| { |
| "epoch": 0.9310085028522226, |
| "grad_norm": 0.5842359662055969, |
| "learning_rate": 0.0005447387135007003, |
| "loss": 3.8516, |
| "step": 8650 |
| }, |
| { |
| "epoch": 0.9363900548918308, |
| "grad_norm": 0.5638464093208313, |
| "learning_rate": 0.0005444154724706389, |
| "loss": 3.8689, |
| "step": 8700 |
| }, |
| { |
| "epoch": 0.941771606931439, |
| "grad_norm": 0.5643623471260071, |
| "learning_rate": 0.0005440922314405775, |
| "loss": 3.8532, |
| "step": 8750 |
| }, |
| { |
| "epoch": 0.9471531589710472, |
| "grad_norm": 0.5182409882545471, |
| "learning_rate": 0.0005437689904105161, |
| "loss": 3.8537, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.9525347110106555, |
| "grad_norm": 0.6119173765182495, |
| "learning_rate": 0.0005434457493804546, |
| "loss": 3.8483, |
| "step": 8850 |
| }, |
| { |
| "epoch": 0.9579162630502637, |
| "grad_norm": 0.609887957572937, |
| "learning_rate": 0.0005431225083503932, |
| "loss": 3.8477, |
| "step": 8900 |
| }, |
| { |
| "epoch": 0.9632978150898719, |
| "grad_norm": 0.6084715127944946, |
| "learning_rate": 0.0005427992673203318, |
| "loss": 3.8383, |
| "step": 8950 |
| }, |
| { |
| "epoch": 0.9686793671294801, |
| "grad_norm": 0.6143397092819214, |
| "learning_rate": 0.0005424760262902704, |
| "loss": 3.8478, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.9686793671294801, |
| "eval_accuracy": 0.3404499163970355, |
| "eval_loss": 3.785356044769287, |
| "eval_runtime": 182.7059, |
| "eval_samples_per_second": 98.579, |
| "eval_steps_per_second": 6.163, |
| "step": 9000 |
| }, |
| { |
| "epoch": 0.9740609191690883, |
| "grad_norm": 0.5996887683868408, |
| "learning_rate": 0.000542152785260209, |
| "loss": 3.8303, |
| "step": 9050 |
| }, |
| { |
| "epoch": 0.9794424712086965, |
| "grad_norm": 0.5821929574012756, |
| "learning_rate": 0.0005418295442301476, |
| "loss": 3.8485, |
| "step": 9100 |
| }, |
| { |
| "epoch": 0.9848240232483048, |
| "grad_norm": 0.5654467940330505, |
| "learning_rate": 0.0005415063032000861, |
| "loss": 3.8499, |
| "step": 9150 |
| }, |
| { |
| "epoch": 0.9902055752879131, |
| "grad_norm": 0.6263448596000671, |
| "learning_rate": 0.0005411830621700248, |
| "loss": 3.8372, |
| "step": 9200 |
| }, |
| { |
| "epoch": 0.9955871273275213, |
| "grad_norm": 0.5956529974937439, |
| "learning_rate": 0.0005408598211399633, |
| "loss": 3.8455, |
| "step": 9250 |
| }, |
| { |
| "epoch": 1.0009686793671295, |
| "grad_norm": 0.5786877274513245, |
| "learning_rate": 0.0005405365801099019, |
| "loss": 3.8393, |
| "step": 9300 |
| }, |
| { |
| "epoch": 1.0063502314067376, |
| "grad_norm": 0.6130419373512268, |
| "learning_rate": 0.0005402133390798405, |
| "loss": 3.7682, |
| "step": 9350 |
| }, |
| { |
| "epoch": 1.011731783446346, |
| "grad_norm": 0.6110179424285889, |
| "learning_rate": 0.000539890098049779, |
| "loss": 3.7743, |
| "step": 9400 |
| }, |
| { |
| "epoch": 1.017113335485954, |
| "grad_norm": 0.5443481206893921, |
| "learning_rate": 0.0005395668570197177, |
| "loss": 3.7773, |
| "step": 9450 |
| }, |
| { |
| "epoch": 1.0224948875255624, |
| "grad_norm": 0.5167891383171082, |
| "learning_rate": 0.0005392436159896562, |
| "loss": 3.7604, |
| "step": 9500 |
| }, |
| { |
| "epoch": 1.0278764395651705, |
| "grad_norm": 0.647746741771698, |
| "learning_rate": 0.0005389203749595948, |
| "loss": 3.7733, |
| "step": 9550 |
| }, |
| { |
| "epoch": 1.0332579916047788, |
| "grad_norm": 0.5472586750984192, |
| "learning_rate": 0.0005385971339295334, |
| "loss": 3.7681, |
| "step": 9600 |
| }, |
| { |
| "epoch": 1.0386395436443872, |
| "grad_norm": 0.5735807418823242, |
| "learning_rate": 0.000538273892899472, |
| "loss": 3.7828, |
| "step": 9650 |
| }, |
| { |
| "epoch": 1.0440210956839953, |
| "grad_norm": 0.6225168704986572, |
| "learning_rate": 0.0005379506518694106, |
| "loss": 3.7848, |
| "step": 9700 |
| }, |
| { |
| "epoch": 1.0494026477236036, |
| "grad_norm": 0.5470787286758423, |
| "learning_rate": 0.0005376274108393491, |
| "loss": 3.7819, |
| "step": 9750 |
| }, |
| { |
| "epoch": 1.0547841997632117, |
| "grad_norm": 0.5520187616348267, |
| "learning_rate": 0.0005373041698092877, |
| "loss": 3.7574, |
| "step": 9800 |
| }, |
| { |
| "epoch": 1.06016575180282, |
| "grad_norm": 0.5762578845024109, |
| "learning_rate": 0.0005369809287792263, |
| "loss": 3.7761, |
| "step": 9850 |
| }, |
| { |
| "epoch": 1.0655473038424281, |
| "grad_norm": 0.5505574345588684, |
| "learning_rate": 0.000536657687749165, |
| "loss": 3.7765, |
| "step": 9900 |
| }, |
| { |
| "epoch": 1.0709288558820365, |
| "grad_norm": 0.6215469837188721, |
| "learning_rate": 0.0005363344467191035, |
| "loss": 3.7648, |
| "step": 9950 |
| }, |
| { |
| "epoch": 1.0763104079216446, |
| "grad_norm": 0.5857725739479065, |
| "learning_rate": 0.000536011205689042, |
| "loss": 3.7833, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.0763104079216446, |
| "eval_accuracy": 0.3440022139112421, |
| "eval_loss": 3.7528064250946045, |
| "eval_runtime": 182.9389, |
| "eval_samples_per_second": 98.454, |
| "eval_steps_per_second": 6.155, |
| "step": 10000 |
| }, |
| { |
| "epoch": 1.081691959961253, |
| "grad_norm": 0.5563129782676697, |
| "learning_rate": 0.0005356879646589807, |
| "loss": 3.7952, |
| "step": 10050 |
| }, |
| { |
| "epoch": 1.087073512000861, |
| "grad_norm": 0.5878000259399414, |
| "learning_rate": 0.0005353647236289192, |
| "loss": 3.7564, |
| "step": 10100 |
| }, |
| { |
| "epoch": 1.0924550640404693, |
| "grad_norm": 0.5479933619499207, |
| "learning_rate": 0.0005350414825988579, |
| "loss": 3.7828, |
| "step": 10150 |
| }, |
| { |
| "epoch": 1.0978366160800774, |
| "grad_norm": 0.6343529224395752, |
| "learning_rate": 0.0005347182415687964, |
| "loss": 3.7892, |
| "step": 10200 |
| }, |
| { |
| "epoch": 1.1032181681196858, |
| "grad_norm": 0.6030763387680054, |
| "learning_rate": 0.000534395000538735, |
| "loss": 3.779, |
| "step": 10250 |
| }, |
| { |
| "epoch": 1.1085997201592939, |
| "grad_norm": 0.6497897505760193, |
| "learning_rate": 0.0005340717595086736, |
| "loss": 3.7651, |
| "step": 10300 |
| }, |
| { |
| "epoch": 1.1139812721989022, |
| "grad_norm": 0.7304083704948425, |
| "learning_rate": 0.0005337549832992134, |
| "loss": 3.7863, |
| "step": 10350 |
| }, |
| { |
| "epoch": 1.1193628242385103, |
| "grad_norm": 0.5949697494506836, |
| "learning_rate": 0.000533431742269152, |
| "loss": 3.7632, |
| "step": 10400 |
| }, |
| { |
| "epoch": 1.1247443762781186, |
| "grad_norm": 0.6299553513526917, |
| "learning_rate": 0.0005331085012390905, |
| "loss": 3.7438, |
| "step": 10450 |
| }, |
| { |
| "epoch": 1.1301259283177267, |
| "grad_norm": 0.6024655103683472, |
| "learning_rate": 0.0005327852602090292, |
| "loss": 3.7659, |
| "step": 10500 |
| }, |
| { |
| "epoch": 1.135507480357335, |
| "grad_norm": 0.6806287169456482, |
| "learning_rate": 0.0005324620191789678, |
| "loss": 3.7551, |
| "step": 10550 |
| }, |
| { |
| "epoch": 1.1408890323969434, |
| "grad_norm": 0.5712878108024597, |
| "learning_rate": 0.0005321387781489063, |
| "loss": 3.7723, |
| "step": 10600 |
| }, |
| { |
| "epoch": 1.1462705844365515, |
| "grad_norm": 0.5776671171188354, |
| "learning_rate": 0.0005318155371188449, |
| "loss": 3.7644, |
| "step": 10650 |
| }, |
| { |
| "epoch": 1.1516521364761596, |
| "grad_norm": 0.5746056437492371, |
| "learning_rate": 0.0005314922960887834, |
| "loss": 3.7811, |
| "step": 10700 |
| }, |
| { |
| "epoch": 1.157033688515768, |
| "grad_norm": 0.5841644406318665, |
| "learning_rate": 0.0005311690550587221, |
| "loss": 3.767, |
| "step": 10750 |
| }, |
| { |
| "epoch": 1.1624152405553763, |
| "grad_norm": 0.6027348041534424, |
| "learning_rate": 0.0005308458140286607, |
| "loss": 3.7558, |
| "step": 10800 |
| }, |
| { |
| "epoch": 1.1677967925949844, |
| "grad_norm": 0.6047888994216919, |
| "learning_rate": 0.0005305225729985993, |
| "loss": 3.772, |
| "step": 10850 |
| }, |
| { |
| "epoch": 1.1731783446345927, |
| "grad_norm": 0.5457755327224731, |
| "learning_rate": 0.0005301993319685378, |
| "loss": 3.7738, |
| "step": 10900 |
| }, |
| { |
| "epoch": 1.1785598966742008, |
| "grad_norm": 0.6240300536155701, |
| "learning_rate": 0.0005298760909384765, |
| "loss": 3.7453, |
| "step": 10950 |
| }, |
| { |
| "epoch": 1.1839414487138091, |
| "grad_norm": 0.5848184823989868, |
| "learning_rate": 0.000529552849908415, |
| "loss": 3.7495, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.1839414487138091, |
| "eval_accuracy": 0.3466185754498148, |
| "eval_loss": 3.728104591369629, |
| "eval_runtime": 182.5387, |
| "eval_samples_per_second": 98.67, |
| "eval_steps_per_second": 6.169, |
| "step": 11000 |
| }, |
| { |
| "epoch": 1.1893230007534172, |
| "grad_norm": 0.6290469765663147, |
| "learning_rate": 0.0005292296088783535, |
| "loss": 3.7552, |
| "step": 11050 |
| }, |
| { |
| "epoch": 1.1947045527930256, |
| "grad_norm": 0.5949827432632446, |
| "learning_rate": 0.0005289063678482922, |
| "loss": 3.7403, |
| "step": 11100 |
| }, |
| { |
| "epoch": 1.2000861048326337, |
| "grad_norm": 0.5734910368919373, |
| "learning_rate": 0.0005285831268182307, |
| "loss": 3.7353, |
| "step": 11150 |
| }, |
| { |
| "epoch": 1.205467656872242, |
| "grad_norm": 0.5530065894126892, |
| "learning_rate": 0.0005282598857881694, |
| "loss": 3.7436, |
| "step": 11200 |
| }, |
| { |
| "epoch": 1.21084920891185, |
| "grad_norm": 0.6502180695533752, |
| "learning_rate": 0.0005279366447581079, |
| "loss": 3.7462, |
| "step": 11250 |
| }, |
| { |
| "epoch": 1.2162307609514584, |
| "grad_norm": 0.546893298625946, |
| "learning_rate": 0.0005276134037280465, |
| "loss": 3.7476, |
| "step": 11300 |
| }, |
| { |
| "epoch": 1.2216123129910665, |
| "grad_norm": 0.5351948738098145, |
| "learning_rate": 0.0005272901626979851, |
| "loss": 3.768, |
| "step": 11350 |
| }, |
| { |
| "epoch": 1.2269938650306749, |
| "grad_norm": 0.6159573793411255, |
| "learning_rate": 0.0005269669216679236, |
| "loss": 3.7397, |
| "step": 11400 |
| }, |
| { |
| "epoch": 1.232375417070283, |
| "grad_norm": 0.5443019270896912, |
| "learning_rate": 0.0005266436806378623, |
| "loss": 3.7555, |
| "step": 11450 |
| }, |
| { |
| "epoch": 1.2377569691098913, |
| "grad_norm": 0.547874927520752, |
| "learning_rate": 0.0005263204396078008, |
| "loss": 3.7532, |
| "step": 11500 |
| }, |
| { |
| "epoch": 1.2431385211494996, |
| "grad_norm": 0.672290563583374, |
| "learning_rate": 0.0005259971985777394, |
| "loss": 3.7461, |
| "step": 11550 |
| }, |
| { |
| "epoch": 1.2485200731891077, |
| "grad_norm": 0.6494690775871277, |
| "learning_rate": 0.000525673957547678, |
| "loss": 3.7317, |
| "step": 11600 |
| }, |
| { |
| "epoch": 1.2539016252287158, |
| "grad_norm": 0.5554096698760986, |
| "learning_rate": 0.0005253507165176167, |
| "loss": 3.7485, |
| "step": 11650 |
| }, |
| { |
| "epoch": 1.2592831772683242, |
| "grad_norm": 0.5914068222045898, |
| "learning_rate": 0.0005250274754875552, |
| "loss": 3.744, |
| "step": 11700 |
| }, |
| { |
| "epoch": 1.2646647293079325, |
| "grad_norm": 0.5564110279083252, |
| "learning_rate": 0.0005247042344574938, |
| "loss": 3.734, |
| "step": 11750 |
| }, |
| { |
| "epoch": 1.2700462813475406, |
| "grad_norm": 0.5499497056007385, |
| "learning_rate": 0.0005243809934274323, |
| "loss": 3.7457, |
| "step": 11800 |
| }, |
| { |
| "epoch": 1.275427833387149, |
| "grad_norm": 0.5675379633903503, |
| "learning_rate": 0.0005240577523973709, |
| "loss": 3.7338, |
| "step": 11850 |
| }, |
| { |
| "epoch": 1.280809385426757, |
| "grad_norm": 0.5705437660217285, |
| "learning_rate": 0.0005237345113673095, |
| "loss": 3.759, |
| "step": 11900 |
| }, |
| { |
| "epoch": 1.2861909374663654, |
| "grad_norm": 0.5672619342803955, |
| "learning_rate": 0.0005234112703372481, |
| "loss": 3.7245, |
| "step": 11950 |
| }, |
| { |
| "epoch": 1.2915724895059735, |
| "grad_norm": 0.553011417388916, |
| "learning_rate": 0.0005230880293071867, |
| "loss": 3.7503, |
| "step": 12000 |
| }, |
| { |
| "epoch": 1.2915724895059735, |
| "eval_accuracy": 0.34867276694683536, |
| "eval_loss": 3.701185703277588, |
| "eval_runtime": 183.0501, |
| "eval_samples_per_second": 98.394, |
| "eval_steps_per_second": 6.151, |
| "step": 12000 |
| }, |
| { |
| "epoch": 1.2969540415455818, |
| "grad_norm": 0.5194526314735413, |
| "learning_rate": 0.0005227647882771253, |
| "loss": 3.7416, |
| "step": 12050 |
| }, |
| { |
| "epoch": 1.30233559358519, |
| "grad_norm": 0.5847457051277161, |
| "learning_rate": 0.0005224415472470639, |
| "loss": 3.7315, |
| "step": 12100 |
| }, |
| { |
| "epoch": 1.3077171456247982, |
| "grad_norm": 0.6009708046913147, |
| "learning_rate": 0.0005221183062170024, |
| "loss": 3.7389, |
| "step": 12150 |
| }, |
| { |
| "epoch": 1.3130986976644063, |
| "grad_norm": 0.5650305151939392, |
| "learning_rate": 0.0005217950651869409, |
| "loss": 3.7334, |
| "step": 12200 |
| }, |
| { |
| "epoch": 1.3184802497040147, |
| "grad_norm": 0.4992525279521942, |
| "learning_rate": 0.0005214718241568796, |
| "loss": 3.7386, |
| "step": 12250 |
| }, |
| { |
| "epoch": 1.3238618017436228, |
| "grad_norm": 0.5877363681793213, |
| "learning_rate": 0.0005211485831268182, |
| "loss": 3.7243, |
| "step": 12300 |
| }, |
| { |
| "epoch": 1.329243353783231, |
| "grad_norm": 0.6050459146499634, |
| "learning_rate": 0.0005208253420967568, |
| "loss": 3.7248, |
| "step": 12350 |
| }, |
| { |
| "epoch": 1.3346249058228392, |
| "grad_norm": 0.5322745442390442, |
| "learning_rate": 0.0005205021010666953, |
| "loss": 3.7197, |
| "step": 12400 |
| }, |
| { |
| "epoch": 1.3400064578624475, |
| "grad_norm": 0.5668430328369141, |
| "learning_rate": 0.0005201788600366339, |
| "loss": 3.7243, |
| "step": 12450 |
| }, |
| { |
| "epoch": 1.3453880099020559, |
| "grad_norm": 0.6442492604255676, |
| "learning_rate": 0.0005198556190065725, |
| "loss": 3.7402, |
| "step": 12500 |
| }, |
| { |
| "epoch": 1.350769561941664, |
| "grad_norm": 0.593773365020752, |
| "learning_rate": 0.0005195323779765112, |
| "loss": 3.7295, |
| "step": 12550 |
| }, |
| { |
| "epoch": 1.356151113981272, |
| "grad_norm": 0.6152236461639404, |
| "learning_rate": 0.0005192091369464497, |
| "loss": 3.7121, |
| "step": 12600 |
| }, |
| { |
| "epoch": 1.3615326660208804, |
| "grad_norm": 0.579685628414154, |
| "learning_rate": 0.0005188858959163882, |
| "loss": 3.7207, |
| "step": 12650 |
| }, |
| { |
| "epoch": 1.3669142180604887, |
| "grad_norm": 0.5556007623672485, |
| "learning_rate": 0.0005185626548863269, |
| "loss": 3.7207, |
| "step": 12700 |
| }, |
| { |
| "epoch": 1.3722957701000968, |
| "grad_norm": 0.6135968565940857, |
| "learning_rate": 0.0005182394138562654, |
| "loss": 3.7287, |
| "step": 12750 |
| }, |
| { |
| "epoch": 1.3776773221397052, |
| "grad_norm": 0.5700423121452332, |
| "learning_rate": 0.0005179161728262041, |
| "loss": 3.7336, |
| "step": 12800 |
| }, |
| { |
| "epoch": 1.3830588741793133, |
| "grad_norm": 0.5285822749137878, |
| "learning_rate": 0.0005175929317961426, |
| "loss": 3.7093, |
| "step": 12850 |
| }, |
| { |
| "epoch": 1.3884404262189216, |
| "grad_norm": 0.5742802023887634, |
| "learning_rate": 0.0005172696907660812, |
| "loss": 3.7164, |
| "step": 12900 |
| }, |
| { |
| "epoch": 1.3938219782585297, |
| "grad_norm": 0.6038579940795898, |
| "learning_rate": 0.0005169464497360198, |
| "loss": 3.7429, |
| "step": 12950 |
| }, |
| { |
| "epoch": 1.399203530298138, |
| "grad_norm": 0.5283321738243103, |
| "learning_rate": 0.0005166232087059583, |
| "loss": 3.7205, |
| "step": 13000 |
| }, |
| { |
| "epoch": 1.399203530298138, |
| "eval_accuracy": 0.3511695016559243, |
| "eval_loss": 3.6827125549316406, |
| "eval_runtime": 183.2259, |
| "eval_samples_per_second": 98.299, |
| "eval_steps_per_second": 6.145, |
| "step": 13000 |
| }, |
| { |
| "epoch": 1.4045850823377461, |
| "grad_norm": 0.5694897770881653, |
| "learning_rate": 0.0005162999676758969, |
| "loss": 3.7338, |
| "step": 13050 |
| }, |
| { |
| "epoch": 1.4099666343773545, |
| "grad_norm": 0.5415697693824768, |
| "learning_rate": 0.0005159767266458355, |
| "loss": 3.7092, |
| "step": 13100 |
| }, |
| { |
| "epoch": 1.4153481864169626, |
| "grad_norm": 0.5408352017402649, |
| "learning_rate": 0.0005156534856157741, |
| "loss": 3.7065, |
| "step": 13150 |
| }, |
| { |
| "epoch": 1.420729738456571, |
| "grad_norm": 0.5147499442100525, |
| "learning_rate": 0.0005153302445857127, |
| "loss": 3.7059, |
| "step": 13200 |
| }, |
| { |
| "epoch": 1.426111290496179, |
| "grad_norm": 0.553383469581604, |
| "learning_rate": 0.0005150070035556513, |
| "loss": 3.7238, |
| "step": 13250 |
| }, |
| { |
| "epoch": 1.4314928425357873, |
| "grad_norm": 0.6054697632789612, |
| "learning_rate": 0.0005146837625255898, |
| "loss": 3.7054, |
| "step": 13300 |
| }, |
| { |
| "epoch": 1.4368743945753955, |
| "grad_norm": 0.5648375749588013, |
| "learning_rate": 0.0005143605214955285, |
| "loss": 3.7171, |
| "step": 13350 |
| }, |
| { |
| "epoch": 1.4422559466150038, |
| "grad_norm": 0.6254073977470398, |
| "learning_rate": 0.0005140372804654671, |
| "loss": 3.705, |
| "step": 13400 |
| }, |
| { |
| "epoch": 1.447637498654612, |
| "grad_norm": 0.6315209865570068, |
| "learning_rate": 0.0005137140394354056, |
| "loss": 3.7054, |
| "step": 13450 |
| }, |
| { |
| "epoch": 1.4530190506942202, |
| "grad_norm": 0.5474207997322083, |
| "learning_rate": 0.0005133907984053442, |
| "loss": 3.7168, |
| "step": 13500 |
| }, |
| { |
| "epoch": 1.4584006027338283, |
| "grad_norm": 0.5616752505302429, |
| "learning_rate": 0.0005130675573752827, |
| "loss": 3.7246, |
| "step": 13550 |
| }, |
| { |
| "epoch": 1.4637821547734367, |
| "grad_norm": 0.5460011959075928, |
| "learning_rate": 0.0005127443163452214, |
| "loss": 3.6915, |
| "step": 13600 |
| }, |
| { |
| "epoch": 1.469163706813045, |
| "grad_norm": 0.5980861186981201, |
| "learning_rate": 0.00051242107531516, |
| "loss": 3.7136, |
| "step": 13650 |
| }, |
| { |
| "epoch": 1.474545258852653, |
| "grad_norm": 0.6376727223396301, |
| "learning_rate": 0.0005120978342850986, |
| "loss": 3.6905, |
| "step": 13700 |
| }, |
| { |
| "epoch": 1.4799268108922612, |
| "grad_norm": 0.5472317337989807, |
| "learning_rate": 0.0005117745932550371, |
| "loss": 3.683, |
| "step": 13750 |
| }, |
| { |
| "epoch": 1.4853083629318695, |
| "grad_norm": 0.5763005614280701, |
| "learning_rate": 0.0005114513522249758, |
| "loss": 3.6773, |
| "step": 13800 |
| }, |
| { |
| "epoch": 1.4906899149714778, |
| "grad_norm": 0.5707553625106812, |
| "learning_rate": 0.0005111281111949143, |
| "loss": 3.6924, |
| "step": 13850 |
| }, |
| { |
| "epoch": 1.496071467011086, |
| "grad_norm": 0.5680550932884216, |
| "learning_rate": 0.0005108048701648528, |
| "loss": 3.7102, |
| "step": 13900 |
| }, |
| { |
| "epoch": 1.501453019050694, |
| "grad_norm": 0.6450196504592896, |
| "learning_rate": 0.0005104816291347915, |
| "loss": 3.6982, |
| "step": 13950 |
| }, |
| { |
| "epoch": 1.5068345710903024, |
| "grad_norm": 0.5898568630218506, |
| "learning_rate": 0.00051015838810473, |
| "loss": 3.6956, |
| "step": 14000 |
| }, |
| { |
| "epoch": 1.5068345710903024, |
| "eval_accuracy": 0.3528064660637373, |
| "eval_loss": 3.6613011360168457, |
| "eval_runtime": 182.9869, |
| "eval_samples_per_second": 98.428, |
| "eval_steps_per_second": 6.153, |
| "step": 14000 |
| }, |
| { |
| "epoch": 1.5122161231299107, |
| "grad_norm": 0.61032634973526, |
| "learning_rate": 0.0005098351470746687, |
| "loss": 3.7066, |
| "step": 14050 |
| }, |
| { |
| "epoch": 1.5175976751695188, |
| "grad_norm": 0.5454217791557312, |
| "learning_rate": 0.0005095119060446072, |
| "loss": 3.7021, |
| "step": 14100 |
| }, |
| { |
| "epoch": 1.5229792272091272, |
| "grad_norm": 0.5414226055145264, |
| "learning_rate": 0.0005091886650145458, |
| "loss": 3.7104, |
| "step": 14150 |
| }, |
| { |
| "epoch": 1.5283607792487355, |
| "grad_norm": 0.5376770496368408, |
| "learning_rate": 0.0005088654239844844, |
| "loss": 3.6958, |
| "step": 14200 |
| }, |
| { |
| "epoch": 1.5337423312883436, |
| "grad_norm": 0.5370742082595825, |
| "learning_rate": 0.0005085421829544229, |
| "loss": 3.6932, |
| "step": 14250 |
| }, |
| { |
| "epoch": 1.5391238833279517, |
| "grad_norm": 0.6070656776428223, |
| "learning_rate": 0.0005082189419243616, |
| "loss": 3.6957, |
| "step": 14300 |
| }, |
| { |
| "epoch": 1.54450543536756, |
| "grad_norm": 0.5474653244018555, |
| "learning_rate": 0.0005079021657149014, |
| "loss": 3.6808, |
| "step": 14350 |
| }, |
| { |
| "epoch": 1.5498869874071683, |
| "grad_norm": 0.5363737344741821, |
| "learning_rate": 0.0005075789246848399, |
| "loss": 3.6977, |
| "step": 14400 |
| }, |
| { |
| "epoch": 1.5552685394467765, |
| "grad_norm": 0.5836102366447449, |
| "learning_rate": 0.0005072556836547785, |
| "loss": 3.693, |
| "step": 14450 |
| }, |
| { |
| "epoch": 1.5606500914863846, |
| "grad_norm": 0.5627076625823975, |
| "learning_rate": 0.0005069324426247171, |
| "loss": 3.6993, |
| "step": 14500 |
| }, |
| { |
| "epoch": 1.566031643525993, |
| "grad_norm": 0.5487858057022095, |
| "learning_rate": 0.0005066092015946557, |
| "loss": 3.6824, |
| "step": 14550 |
| }, |
| { |
| "epoch": 1.5714131955656012, |
| "grad_norm": 0.5511041879653931, |
| "learning_rate": 0.0005062859605645943, |
| "loss": 3.6905, |
| "step": 14600 |
| }, |
| { |
| "epoch": 1.5767947476052093, |
| "grad_norm": 0.5165522694587708, |
| "learning_rate": 0.0005059627195345329, |
| "loss": 3.6881, |
| "step": 14650 |
| }, |
| { |
| "epoch": 1.5821762996448174, |
| "grad_norm": 0.5816965103149414, |
| "learning_rate": 0.0005056394785044715, |
| "loss": 3.697, |
| "step": 14700 |
| }, |
| { |
| "epoch": 1.5875578516844258, |
| "grad_norm": 0.55551677942276, |
| "learning_rate": 0.00050531623747441, |
| "loss": 3.6841, |
| "step": 14750 |
| }, |
| { |
| "epoch": 1.592939403724034, |
| "grad_norm": 0.5633440017700195, |
| "learning_rate": 0.0005049929964443486, |
| "loss": 3.7111, |
| "step": 14800 |
| }, |
| { |
| "epoch": 1.5983209557636422, |
| "grad_norm": 0.534885048866272, |
| "learning_rate": 0.0005046697554142871, |
| "loss": 3.6722, |
| "step": 14850 |
| }, |
| { |
| "epoch": 1.6037025078032503, |
| "grad_norm": 0.5363733768463135, |
| "learning_rate": 0.0005043465143842258, |
| "loss": 3.6866, |
| "step": 14900 |
| }, |
| { |
| "epoch": 1.6090840598428586, |
| "grad_norm": 0.7027775049209595, |
| "learning_rate": 0.0005040232733541644, |
| "loss": 3.6653, |
| "step": 14950 |
| }, |
| { |
| "epoch": 1.614465611882467, |
| "grad_norm": 0.5722295045852661, |
| "learning_rate": 0.000503700032324103, |
| "loss": 3.6897, |
| "step": 15000 |
| }, |
| { |
| "epoch": 1.614465611882467, |
| "eval_accuracy": 0.3547539604249241, |
| "eval_loss": 3.641711950302124, |
| "eval_runtime": 182.9321, |
| "eval_samples_per_second": 98.457, |
| "eval_steps_per_second": 6.155, |
| "step": 15000 |
| }, |
| { |
| "epoch": 1.619847163922075, |
| "grad_norm": 0.5616506934165955, |
| "learning_rate": 0.0005033767912940415, |
| "loss": 3.6878, |
| "step": 15050 |
| }, |
| { |
| "epoch": 1.6252287159616834, |
| "grad_norm": 0.5472654700279236, |
| "learning_rate": 0.0005030535502639802, |
| "loss": 3.7087, |
| "step": 15100 |
| }, |
| { |
| "epoch": 1.6306102680012917, |
| "grad_norm": 0.6255720853805542, |
| "learning_rate": 0.0005027303092339187, |
| "loss": 3.6824, |
| "step": 15150 |
| }, |
| { |
| "epoch": 1.6359918200408998, |
| "grad_norm": 0.49615591764450073, |
| "learning_rate": 0.0005024070682038573, |
| "loss": 3.6515, |
| "step": 15200 |
| }, |
| { |
| "epoch": 1.641373372080508, |
| "grad_norm": 0.559258759021759, |
| "learning_rate": 0.0005020838271737959, |
| "loss": 3.6834, |
| "step": 15250 |
| }, |
| { |
| "epoch": 1.6467549241201163, |
| "grad_norm": 0.572930097579956, |
| "learning_rate": 0.0005017605861437344, |
| "loss": 3.689, |
| "step": 15300 |
| }, |
| { |
| "epoch": 1.6521364761597246, |
| "grad_norm": 0.5224685668945312, |
| "learning_rate": 0.0005014373451136731, |
| "loss": 3.6909, |
| "step": 15350 |
| }, |
| { |
| "epoch": 1.6575180281993327, |
| "grad_norm": 0.591471791267395, |
| "learning_rate": 0.0005011141040836116, |
| "loss": 3.6753, |
| "step": 15400 |
| }, |
| { |
| "epoch": 1.6628995802389408, |
| "grad_norm": 0.5919022560119629, |
| "learning_rate": 0.0005007908630535503, |
| "loss": 3.687, |
| "step": 15450 |
| }, |
| { |
| "epoch": 1.6682811322785491, |
| "grad_norm": 0.5808023810386658, |
| "learning_rate": 0.0005004676220234888, |
| "loss": 3.6756, |
| "step": 15500 |
| }, |
| { |
| "epoch": 1.6736626843181575, |
| "grad_norm": 0.5336684584617615, |
| "learning_rate": 0.0005001443809934273, |
| "loss": 3.6766, |
| "step": 15550 |
| }, |
| { |
| "epoch": 1.6790442363577656, |
| "grad_norm": 0.622586190700531, |
| "learning_rate": 0.000499821139963366, |
| "loss": 3.689, |
| "step": 15600 |
| }, |
| { |
| "epoch": 1.6844257883973737, |
| "grad_norm": 0.5987013578414917, |
| "learning_rate": 0.0004994978989333045, |
| "loss": 3.6845, |
| "step": 15650 |
| }, |
| { |
| "epoch": 1.689807340436982, |
| "grad_norm": 0.6158627271652222, |
| "learning_rate": 0.0004991746579032431, |
| "loss": 3.6704, |
| "step": 15700 |
| }, |
| { |
| "epoch": 1.6951888924765903, |
| "grad_norm": 0.6078811287879944, |
| "learning_rate": 0.0004988514168731817, |
| "loss": 3.6819, |
| "step": 15750 |
| }, |
| { |
| "epoch": 1.7005704445161984, |
| "grad_norm": 0.5467185974121094, |
| "learning_rate": 0.0004985281758431204, |
| "loss": 3.6727, |
| "step": 15800 |
| }, |
| { |
| "epoch": 1.7059519965558065, |
| "grad_norm": 0.585032045841217, |
| "learning_rate": 0.0004982049348130589, |
| "loss": 3.6765, |
| "step": 15850 |
| }, |
| { |
| "epoch": 1.7113335485954149, |
| "grad_norm": 0.6013224720954895, |
| "learning_rate": 0.0004978816937829975, |
| "loss": 3.6628, |
| "step": 15900 |
| }, |
| { |
| "epoch": 1.7167151006350232, |
| "grad_norm": 0.5670937299728394, |
| "learning_rate": 0.000497558452752936, |
| "loss": 3.6655, |
| "step": 15950 |
| }, |
| { |
| "epoch": 1.7220966526746313, |
| "grad_norm": 0.5367864966392517, |
| "learning_rate": 0.0004972352117228746, |
| "loss": 3.6712, |
| "step": 16000 |
| }, |
| { |
| "epoch": 1.7220966526746313, |
| "eval_accuracy": 0.3568852954722929, |
| "eval_loss": 3.62510085105896, |
| "eval_runtime": 182.7792, |
| "eval_samples_per_second": 98.54, |
| "eval_steps_per_second": 6.16, |
| "step": 16000 |
| }, |
| { |
| "epoch": 1.7274782047142396, |
| "grad_norm": 0.5834771394729614, |
| "learning_rate": 0.0004969119706928133, |
| "loss": 3.6616, |
| "step": 16050 |
| }, |
| { |
| "epoch": 1.732859756753848, |
| "grad_norm": 0.552348256111145, |
| "learning_rate": 0.0004965887296627518, |
| "loss": 3.6731, |
| "step": 16100 |
| }, |
| { |
| "epoch": 1.738241308793456, |
| "grad_norm": 0.5560649037361145, |
| "learning_rate": 0.0004962654886326904, |
| "loss": 3.6797, |
| "step": 16150 |
| }, |
| { |
| "epoch": 1.7436228608330642, |
| "grad_norm": 0.5562814474105835, |
| "learning_rate": 0.000495942247602629, |
| "loss": 3.6658, |
| "step": 16200 |
| }, |
| { |
| "epoch": 1.7490044128726725, |
| "grad_norm": 0.5309078097343445, |
| "learning_rate": 0.0004956190065725676, |
| "loss": 3.6545, |
| "step": 16250 |
| }, |
| { |
| "epoch": 1.7543859649122808, |
| "grad_norm": 0.5403803586959839, |
| "learning_rate": 0.0004952957655425062, |
| "loss": 3.661, |
| "step": 16300 |
| }, |
| { |
| "epoch": 1.759767516951889, |
| "grad_norm": 0.5780831575393677, |
| "learning_rate": 0.0004949789893330459, |
| "loss": 3.6783, |
| "step": 16350 |
| }, |
| { |
| "epoch": 1.765149068991497, |
| "grad_norm": 0.5269186496734619, |
| "learning_rate": 0.0004946557483029846, |
| "loss": 3.6483, |
| "step": 16400 |
| }, |
| { |
| "epoch": 1.7705306210311054, |
| "grad_norm": 0.598962128162384, |
| "learning_rate": 0.0004943325072729231, |
| "loss": 3.6521, |
| "step": 16450 |
| }, |
| { |
| "epoch": 1.7759121730707137, |
| "grad_norm": 0.5480334758758545, |
| "learning_rate": 0.0004940092662428617, |
| "loss": 3.6613, |
| "step": 16500 |
| }, |
| { |
| "epoch": 1.7812937251103218, |
| "grad_norm": 0.5782729983329773, |
| "learning_rate": 0.0004936860252128003, |
| "loss": 3.6506, |
| "step": 16550 |
| }, |
| { |
| "epoch": 1.78667527714993, |
| "grad_norm": 0.5823274850845337, |
| "learning_rate": 0.0004933627841827388, |
| "loss": 3.6526, |
| "step": 16600 |
| }, |
| { |
| "epoch": 1.7920568291895382, |
| "grad_norm": 0.5701773762702942, |
| "learning_rate": 0.0004930395431526775, |
| "loss": 3.6625, |
| "step": 16650 |
| }, |
| { |
| "epoch": 1.7974383812291466, |
| "grad_norm": 0.5889811515808105, |
| "learning_rate": 0.0004927163021226161, |
| "loss": 3.6565, |
| "step": 16700 |
| }, |
| { |
| "epoch": 1.8028199332687547, |
| "grad_norm": 0.5805728435516357, |
| "learning_rate": 0.0004923930610925547, |
| "loss": 3.6663, |
| "step": 16750 |
| }, |
| { |
| "epoch": 1.8082014853083628, |
| "grad_norm": 0.5075141191482544, |
| "learning_rate": 0.0004920698200624932, |
| "loss": 3.6442, |
| "step": 16800 |
| }, |
| { |
| "epoch": 1.813583037347971, |
| "grad_norm": 0.5800615549087524, |
| "learning_rate": 0.0004917465790324317, |
| "loss": 3.6608, |
| "step": 16850 |
| }, |
| { |
| "epoch": 1.8189645893875794, |
| "grad_norm": 0.5941094160079956, |
| "learning_rate": 0.0004914233380023704, |
| "loss": 3.66, |
| "step": 16900 |
| }, |
| { |
| "epoch": 1.8243461414271875, |
| "grad_norm": 0.5344933867454529, |
| "learning_rate": 0.0004911000969723089, |
| "loss": 3.6616, |
| "step": 16950 |
| }, |
| { |
| "epoch": 1.8297276934667959, |
| "grad_norm": 0.6212116479873657, |
| "learning_rate": 0.0004907768559422476, |
| "loss": 3.6428, |
| "step": 17000 |
| }, |
| { |
| "epoch": 1.8297276934667959, |
| "eval_accuracy": 0.3578398110917431, |
| "eval_loss": 3.610550880432129, |
| "eval_runtime": 182.6501, |
| "eval_samples_per_second": 98.609, |
| "eval_steps_per_second": 6.165, |
| "step": 17000 |
| }, |
| { |
| "epoch": 1.8351092455064042, |
| "grad_norm": 0.6466174125671387, |
| "learning_rate": 0.0004904536149121861, |
| "loss": 3.6452, |
| "step": 17050 |
| }, |
| { |
| "epoch": 1.8404907975460123, |
| "grad_norm": 0.6004204750061035, |
| "learning_rate": 0.0004901303738821248, |
| "loss": 3.6527, |
| "step": 17100 |
| }, |
| { |
| "epoch": 1.8458723495856204, |
| "grad_norm": 0.5209426283836365, |
| "learning_rate": 0.0004898071328520633, |
| "loss": 3.6282, |
| "step": 17150 |
| }, |
| { |
| "epoch": 1.8512539016252287, |
| "grad_norm": 0.5604954361915588, |
| "learning_rate": 0.0004894838918220019, |
| "loss": 3.6619, |
| "step": 17200 |
| }, |
| { |
| "epoch": 1.856635453664837, |
| "grad_norm": 0.5392171144485474, |
| "learning_rate": 0.0004891606507919405, |
| "loss": 3.6596, |
| "step": 17250 |
| }, |
| { |
| "epoch": 1.8620170057044452, |
| "grad_norm": 0.5408145785331726, |
| "learning_rate": 0.000488837409761879, |
| "loss": 3.6552, |
| "step": 17300 |
| }, |
| { |
| "epoch": 1.8673985577440533, |
| "grad_norm": 0.5370055437088013, |
| "learning_rate": 0.0004885141687318177, |
| "loss": 3.6433, |
| "step": 17350 |
| }, |
| { |
| "epoch": 1.8727801097836616, |
| "grad_norm": 0.5374568700790405, |
| "learning_rate": 0.00048819092770175623, |
| "loss": 3.6328, |
| "step": 17400 |
| }, |
| { |
| "epoch": 1.87816166182327, |
| "grad_norm": 0.5178585648536682, |
| "learning_rate": 0.0004878676866716948, |
| "loss": 3.6565, |
| "step": 17450 |
| }, |
| { |
| "epoch": 1.883543213862878, |
| "grad_norm": 0.5513762831687927, |
| "learning_rate": 0.00048754444564163337, |
| "loss": 3.675, |
| "step": 17500 |
| }, |
| { |
| "epoch": 1.8889247659024861, |
| "grad_norm": 0.5339059829711914, |
| "learning_rate": 0.000487221204611572, |
| "loss": 3.6502, |
| "step": 17550 |
| }, |
| { |
| "epoch": 1.8943063179420945, |
| "grad_norm": 0.6063997745513916, |
| "learning_rate": 0.00048689796358151056, |
| "loss": 3.6437, |
| "step": 17600 |
| }, |
| { |
| "epoch": 1.8996878699817028, |
| "grad_norm": 0.5742690563201904, |
| "learning_rate": 0.00048657472255144915, |
| "loss": 3.646, |
| "step": 17650 |
| }, |
| { |
| "epoch": 1.905069422021311, |
| "grad_norm": 0.5425062775611877, |
| "learning_rate": 0.00048625148152138775, |
| "loss": 3.6322, |
| "step": 17700 |
| }, |
| { |
| "epoch": 1.910450974060919, |
| "grad_norm": 0.5563521981239319, |
| "learning_rate": 0.0004859282404913263, |
| "loss": 3.6437, |
| "step": 17750 |
| }, |
| { |
| "epoch": 1.9158325261005273, |
| "grad_norm": 0.5626557469367981, |
| "learning_rate": 0.0004856049994612649, |
| "loss": 3.6403, |
| "step": 17800 |
| }, |
| { |
| "epoch": 1.9212140781401357, |
| "grad_norm": 0.5019347071647644, |
| "learning_rate": 0.00048528175843120353, |
| "loss": 3.6476, |
| "step": 17850 |
| }, |
| { |
| "epoch": 1.9265956301797438, |
| "grad_norm": 0.6100844144821167, |
| "learning_rate": 0.0004849585174011421, |
| "loss": 3.6504, |
| "step": 17900 |
| }, |
| { |
| "epoch": 1.931977182219352, |
| "grad_norm": 0.5230750441551208, |
| "learning_rate": 0.00048463527637108067, |
| "loss": 3.6293, |
| "step": 17950 |
| }, |
| { |
| "epoch": 1.9373587342589604, |
| "grad_norm": 0.5869380831718445, |
| "learning_rate": 0.0004843120353410192, |
| "loss": 3.6477, |
| "step": 18000 |
| }, |
| { |
| "epoch": 1.9373587342589604, |
| "eval_accuracy": 0.35974362699202844, |
| "eval_loss": 3.5920450687408447, |
| "eval_runtime": 182.6734, |
| "eval_samples_per_second": 98.597, |
| "eval_steps_per_second": 6.164, |
| "step": 18000 |
| }, |
| { |
| "epoch": 1.9427402862985685, |
| "grad_norm": 0.5855605006217957, |
| "learning_rate": 0.0004839887943109578, |
| "loss": 3.6606, |
| "step": 18050 |
| }, |
| { |
| "epoch": 1.9481218383381766, |
| "grad_norm": 0.6179500222206116, |
| "learning_rate": 0.00048366555328089645, |
| "loss": 3.6605, |
| "step": 18100 |
| }, |
| { |
| "epoch": 1.953503390377785, |
| "grad_norm": 0.5851153135299683, |
| "learning_rate": 0.000483342312250835, |
| "loss": 3.6513, |
| "step": 18150 |
| }, |
| { |
| "epoch": 1.9588849424173933, |
| "grad_norm": 0.610054612159729, |
| "learning_rate": 0.0004830190712207736, |
| "loss": 3.6214, |
| "step": 18200 |
| }, |
| { |
| "epoch": 1.9642664944570014, |
| "grad_norm": 0.6112696528434753, |
| "learning_rate": 0.0004826958301907122, |
| "loss": 3.6286, |
| "step": 18250 |
| }, |
| { |
| "epoch": 1.9696480464966095, |
| "grad_norm": 0.5437807440757751, |
| "learning_rate": 0.0004823725891606507, |
| "loss": 3.6459, |
| "step": 18300 |
| }, |
| { |
| "epoch": 1.9750295985362178, |
| "grad_norm": 0.5649225115776062, |
| "learning_rate": 0.0004820493481305893, |
| "loss": 3.6386, |
| "step": 18350 |
| }, |
| { |
| "epoch": 1.9804111505758262, |
| "grad_norm": 0.5825486183166504, |
| "learning_rate": 0.00048172610710052797, |
| "loss": 3.6294, |
| "step": 18400 |
| }, |
| { |
| "epoch": 1.9857927026154343, |
| "grad_norm": 0.5508699417114258, |
| "learning_rate": 0.0004814093308910677, |
| "loss": 3.6338, |
| "step": 18450 |
| }, |
| { |
| "epoch": 1.9911742546550424, |
| "grad_norm": 0.5655311942100525, |
| "learning_rate": 0.00048108608986100637, |
| "loss": 3.6588, |
| "step": 18500 |
| }, |
| { |
| "epoch": 1.9965558066946507, |
| "grad_norm": 0.5678587555885315, |
| "learning_rate": 0.0004807628488309449, |
| "loss": 3.6348, |
| "step": 18550 |
| }, |
| { |
| "epoch": 2.001937358734259, |
| "grad_norm": 0.5703258514404297, |
| "learning_rate": 0.0004804396078008835, |
| "loss": 3.6138, |
| "step": 18600 |
| }, |
| { |
| "epoch": 2.007318910773867, |
| "grad_norm": 0.5701274871826172, |
| "learning_rate": 0.00048011636677082204, |
| "loss": 3.5586, |
| "step": 18650 |
| }, |
| { |
| "epoch": 2.0127004628134753, |
| "grad_norm": 0.567638635635376, |
| "learning_rate": 0.00047979312574076064, |
| "loss": 3.5602, |
| "step": 18700 |
| }, |
| { |
| "epoch": 2.018082014853084, |
| "grad_norm": 0.5551114678382874, |
| "learning_rate": 0.0004794698847106992, |
| "loss": 3.5546, |
| "step": 18750 |
| }, |
| { |
| "epoch": 2.023463566892692, |
| "grad_norm": 0.6258234977722168, |
| "learning_rate": 0.0004791466436806378, |
| "loss": 3.5536, |
| "step": 18800 |
| }, |
| { |
| "epoch": 2.0288451189323, |
| "grad_norm": 0.7323607206344604, |
| "learning_rate": 0.0004788234026505764, |
| "loss": 3.5522, |
| "step": 18850 |
| }, |
| { |
| "epoch": 2.034226670971908, |
| "grad_norm": 0.5760945677757263, |
| "learning_rate": 0.00047850016162051496, |
| "loss": 3.5633, |
| "step": 18900 |
| }, |
| { |
| "epoch": 2.0396082230115167, |
| "grad_norm": 0.561445951461792, |
| "learning_rate": 0.00047817692059045356, |
| "loss": 3.5486, |
| "step": 18950 |
| }, |
| { |
| "epoch": 2.044989775051125, |
| "grad_norm": 0.5980292558670044, |
| "learning_rate": 0.00047786014438099336, |
| "loss": 3.5527, |
| "step": 19000 |
| }, |
| { |
| "epoch": 2.044989775051125, |
| "eval_accuracy": 0.36137385492079693, |
| "eval_loss": 3.583522081375122, |
| "eval_runtime": 182.8509, |
| "eval_samples_per_second": 98.501, |
| "eval_steps_per_second": 6.158, |
| "step": 19000 |
| }, |
| { |
| "epoch": 2.050371327090733, |
| "grad_norm": 0.5768479108810425, |
| "learning_rate": 0.00047753690335093196, |
| "loss": 3.5578, |
| "step": 19050 |
| }, |
| { |
| "epoch": 2.055752879130341, |
| "grad_norm": 0.5865322947502136, |
| "learning_rate": 0.00047721366232087055, |
| "loss": 3.5501, |
| "step": 19100 |
| }, |
| { |
| "epoch": 2.0611344311699495, |
| "grad_norm": 0.5691093802452087, |
| "learning_rate": 0.00047689042129080915, |
| "loss": 3.544, |
| "step": 19150 |
| }, |
| { |
| "epoch": 2.0665159832095576, |
| "grad_norm": 0.6482433080673218, |
| "learning_rate": 0.00047656718026074774, |
| "loss": 3.5627, |
| "step": 19200 |
| }, |
| { |
| "epoch": 2.0718975352491658, |
| "grad_norm": 0.5731157064437866, |
| "learning_rate": 0.00047624393923068634, |
| "loss": 3.548, |
| "step": 19250 |
| }, |
| { |
| "epoch": 2.0772790872887743, |
| "grad_norm": 0.5512408018112183, |
| "learning_rate": 0.0004759206982006249, |
| "loss": 3.5753, |
| "step": 19300 |
| }, |
| { |
| "epoch": 2.0826606393283824, |
| "grad_norm": 0.5742067098617554, |
| "learning_rate": 0.00047559745717056347, |
| "loss": 3.5479, |
| "step": 19350 |
| }, |
| { |
| "epoch": 2.0880421913679905, |
| "grad_norm": 0.5789666175842285, |
| "learning_rate": 0.000475274216140502, |
| "loss": 3.5566, |
| "step": 19400 |
| }, |
| { |
| "epoch": 2.0934237434075986, |
| "grad_norm": 0.631552517414093, |
| "learning_rate": 0.00047495097511044066, |
| "loss": 3.5637, |
| "step": 19450 |
| }, |
| { |
| "epoch": 2.098805295447207, |
| "grad_norm": 0.5673854351043701, |
| "learning_rate": 0.00047462773408037925, |
| "loss": 3.5659, |
| "step": 19500 |
| }, |
| { |
| "epoch": 2.1041868474868153, |
| "grad_norm": 0.6008737683296204, |
| "learning_rate": 0.0004743044930503178, |
| "loss": 3.5755, |
| "step": 19550 |
| }, |
| { |
| "epoch": 2.1095683995264234, |
| "grad_norm": 0.5127817392349243, |
| "learning_rate": 0.0004739812520202564, |
| "loss": 3.576, |
| "step": 19600 |
| }, |
| { |
| "epoch": 2.1149499515660315, |
| "grad_norm": 0.5636597871780396, |
| "learning_rate": 0.000473658010990195, |
| "loss": 3.5646, |
| "step": 19650 |
| }, |
| { |
| "epoch": 2.12033150360564, |
| "grad_norm": 0.566287636756897, |
| "learning_rate": 0.0004733347699601336, |
| "loss": 3.5496, |
| "step": 19700 |
| }, |
| { |
| "epoch": 2.125713055645248, |
| "grad_norm": 0.5994743704795837, |
| "learning_rate": 0.0004730115289300722, |
| "loss": 3.558, |
| "step": 19750 |
| }, |
| { |
| "epoch": 2.1310946076848563, |
| "grad_norm": 0.5499207377433777, |
| "learning_rate": 0.00047268828790001077, |
| "loss": 3.5618, |
| "step": 19800 |
| }, |
| { |
| "epoch": 2.1364761597244644, |
| "grad_norm": 0.5498175621032715, |
| "learning_rate": 0.0004723650468699493, |
| "loss": 3.5618, |
| "step": 19850 |
| }, |
| { |
| "epoch": 2.141857711764073, |
| "grad_norm": 0.5704463720321655, |
| "learning_rate": 0.0004720418058398879, |
| "loss": 3.5423, |
| "step": 19900 |
| }, |
| { |
| "epoch": 2.147239263803681, |
| "grad_norm": 0.5655784606933594, |
| "learning_rate": 0.00047171856480982644, |
| "loss": 3.5586, |
| "step": 19950 |
| }, |
| { |
| "epoch": 2.152620815843289, |
| "grad_norm": 0.5610454678535461, |
| "learning_rate": 0.0004713953237797651, |
| "loss": 3.5691, |
| "step": 20000 |
| }, |
| { |
| "epoch": 2.152620815843289, |
| "eval_accuracy": 0.36227817290607683, |
| "eval_loss": 3.5729849338531494, |
| "eval_runtime": 182.9497, |
| "eval_samples_per_second": 98.448, |
| "eval_steps_per_second": 6.155, |
| "step": 20000 |
| }, |
| { |
| "epoch": 2.1580023678828972, |
| "grad_norm": 0.5196320414543152, |
| "learning_rate": 0.0004710720827497037, |
| "loss": 3.5614, |
| "step": 20050 |
| }, |
| { |
| "epoch": 2.163383919922506, |
| "grad_norm": 0.5847726464271545, |
| "learning_rate": 0.00047074884171964223, |
| "loss": 3.5468, |
| "step": 20100 |
| }, |
| { |
| "epoch": 2.168765471962114, |
| "grad_norm": 0.5680404901504517, |
| "learning_rate": 0.0004704256006895808, |
| "loss": 3.5578, |
| "step": 20150 |
| }, |
| { |
| "epoch": 2.174147024001722, |
| "grad_norm": 0.5938388705253601, |
| "learning_rate": 0.00047010235965951936, |
| "loss": 3.5858, |
| "step": 20200 |
| }, |
| { |
| "epoch": 2.1795285760413305, |
| "grad_norm": 0.6427321434020996, |
| "learning_rate": 0.00046977911862945796, |
| "loss": 3.5649, |
| "step": 20250 |
| }, |
| { |
| "epoch": 2.1849101280809387, |
| "grad_norm": 0.578403651714325, |
| "learning_rate": 0.0004694558775993966, |
| "loss": 3.5625, |
| "step": 20300 |
| }, |
| { |
| "epoch": 2.1902916801205468, |
| "grad_norm": 0.5661942362785339, |
| "learning_rate": 0.00046913263656933515, |
| "loss": 3.5531, |
| "step": 20350 |
| }, |
| { |
| "epoch": 2.195673232160155, |
| "grad_norm": 0.603402853012085, |
| "learning_rate": 0.00046880939553927374, |
| "loss": 3.5608, |
| "step": 20400 |
| }, |
| { |
| "epoch": 2.2010547841997634, |
| "grad_norm": 0.5676315426826477, |
| "learning_rate": 0.00046848615450921234, |
| "loss": 3.5555, |
| "step": 20450 |
| }, |
| { |
| "epoch": 2.2064363362393715, |
| "grad_norm": 0.592079222202301, |
| "learning_rate": 0.0004681629134791509, |
| "loss": 3.5641, |
| "step": 20500 |
| }, |
| { |
| "epoch": 2.2118178882789796, |
| "grad_norm": 0.5975874662399292, |
| "learning_rate": 0.00046783967244908947, |
| "loss": 3.5477, |
| "step": 20550 |
| }, |
| { |
| "epoch": 2.2171994403185877, |
| "grad_norm": 0.6059330701828003, |
| "learning_rate": 0.0004675164314190281, |
| "loss": 3.5673, |
| "step": 20600 |
| }, |
| { |
| "epoch": 2.2225809923581963, |
| "grad_norm": 0.5840981602668762, |
| "learning_rate": 0.00046719319038896666, |
| "loss": 3.5702, |
| "step": 20650 |
| }, |
| { |
| "epoch": 2.2279625443978044, |
| "grad_norm": 0.5514042377471924, |
| "learning_rate": 0.00046686994935890526, |
| "loss": 3.5504, |
| "step": 20700 |
| }, |
| { |
| "epoch": 2.2333440964374125, |
| "grad_norm": 0.5639162659645081, |
| "learning_rate": 0.0004665467083288438, |
| "loss": 3.5866, |
| "step": 20750 |
| }, |
| { |
| "epoch": 2.2387256484770206, |
| "grad_norm": 0.5674893260002136, |
| "learning_rate": 0.0004662234672987824, |
| "loss": 3.5668, |
| "step": 20800 |
| }, |
| { |
| "epoch": 2.244107200516629, |
| "grad_norm": 0.6036685109138489, |
| "learning_rate": 0.00046590022626872104, |
| "loss": 3.5557, |
| "step": 20850 |
| }, |
| { |
| "epoch": 2.2494887525562373, |
| "grad_norm": 0.5608086585998535, |
| "learning_rate": 0.0004655769852386596, |
| "loss": 3.5617, |
| "step": 20900 |
| }, |
| { |
| "epoch": 2.2548703045958454, |
| "grad_norm": 0.6411144137382507, |
| "learning_rate": 0.0004652537442085982, |
| "loss": 3.5523, |
| "step": 20950 |
| }, |
| { |
| "epoch": 2.2602518566354535, |
| "grad_norm": 0.5787631273269653, |
| "learning_rate": 0.00046493050317853677, |
| "loss": 3.5718, |
| "step": 21000 |
| }, |
| { |
| "epoch": 2.2602518566354535, |
| "eval_accuracy": 0.3634697691267383, |
| "eval_loss": 3.564570665359497, |
| "eval_runtime": 183.029, |
| "eval_samples_per_second": 98.405, |
| "eval_steps_per_second": 6.152, |
| "step": 21000 |
| }, |
| { |
| "epoch": 2.265633408675062, |
| "grad_norm": 0.6253136992454529, |
| "learning_rate": 0.0004646072621484753, |
| "loss": 3.5701, |
| "step": 21050 |
| }, |
| { |
| "epoch": 2.27101496071467, |
| "grad_norm": 0.6177113652229309, |
| "learning_rate": 0.0004642840211184139, |
| "loss": 3.575, |
| "step": 21100 |
| }, |
| { |
| "epoch": 2.2763965127542782, |
| "grad_norm": 0.6227148771286011, |
| "learning_rate": 0.00046396078008835255, |
| "loss": 3.5739, |
| "step": 21150 |
| }, |
| { |
| "epoch": 2.281778064793887, |
| "grad_norm": 0.6199923157691956, |
| "learning_rate": 0.0004636375390582911, |
| "loss": 3.5579, |
| "step": 21200 |
| }, |
| { |
| "epoch": 2.287159616833495, |
| "grad_norm": 0.5631394982337952, |
| "learning_rate": 0.0004633142980282297, |
| "loss": 3.5603, |
| "step": 21250 |
| }, |
| { |
| "epoch": 2.292541168873103, |
| "grad_norm": 0.5839282870292664, |
| "learning_rate": 0.00046299105699816823, |
| "loss": 3.5685, |
| "step": 21300 |
| }, |
| { |
| "epoch": 2.297922720912711, |
| "grad_norm": 0.5705982446670532, |
| "learning_rate": 0.0004626678159681068, |
| "loss": 3.5749, |
| "step": 21350 |
| }, |
| { |
| "epoch": 2.303304272952319, |
| "grad_norm": 0.5807977914810181, |
| "learning_rate": 0.0004623445749380454, |
| "loss": 3.5513, |
| "step": 21400 |
| }, |
| { |
| "epoch": 2.3086858249919278, |
| "grad_norm": 0.5916268229484558, |
| "learning_rate": 0.000462021333907984, |
| "loss": 3.5706, |
| "step": 21450 |
| }, |
| { |
| "epoch": 2.314067377031536, |
| "grad_norm": 0.5938501358032227, |
| "learning_rate": 0.0004616980928779226, |
| "loss": 3.5709, |
| "step": 21500 |
| }, |
| { |
| "epoch": 2.319448929071144, |
| "grad_norm": 0.5628405809402466, |
| "learning_rate": 0.0004613748518478612, |
| "loss": 3.5559, |
| "step": 21550 |
| }, |
| { |
| "epoch": 2.3248304811107525, |
| "grad_norm": 0.5705186128616333, |
| "learning_rate": 0.00046105161081779974, |
| "loss": 3.5768, |
| "step": 21600 |
| }, |
| { |
| "epoch": 2.3302120331503606, |
| "grad_norm": 0.5919448137283325, |
| "learning_rate": 0.00046072836978773834, |
| "loss": 3.5586, |
| "step": 21650 |
| }, |
| { |
| "epoch": 2.3355935851899687, |
| "grad_norm": 0.5722348690032959, |
| "learning_rate": 0.000460405128757677, |
| "loss": 3.5473, |
| "step": 21700 |
| }, |
| { |
| "epoch": 2.340975137229577, |
| "grad_norm": 0.5616813898086548, |
| "learning_rate": 0.0004600818877276155, |
| "loss": 3.5644, |
| "step": 21750 |
| }, |
| { |
| "epoch": 2.3463566892691854, |
| "grad_norm": 0.5710437297821045, |
| "learning_rate": 0.0004597586466975541, |
| "loss": 3.5366, |
| "step": 21800 |
| }, |
| { |
| "epoch": 2.3517382413087935, |
| "grad_norm": 0.5971595644950867, |
| "learning_rate": 0.00045943540566749266, |
| "loss": 3.5574, |
| "step": 21850 |
| }, |
| { |
| "epoch": 2.3571197933484016, |
| "grad_norm": 0.6181142330169678, |
| "learning_rate": 0.00045911216463743126, |
| "loss": 3.5667, |
| "step": 21900 |
| }, |
| { |
| "epoch": 2.3625013453880097, |
| "grad_norm": 0.6226775646209717, |
| "learning_rate": 0.00045878892360736985, |
| "loss": 3.5504, |
| "step": 21950 |
| }, |
| { |
| "epoch": 2.3678828974276183, |
| "grad_norm": 0.6260030269622803, |
| "learning_rate": 0.00045846568257730845, |
| "loss": 3.5632, |
| "step": 22000 |
| }, |
| { |
| "epoch": 2.3678828974276183, |
| "eval_accuracy": 0.3642475064977143, |
| "eval_loss": 3.552438497543335, |
| "eval_runtime": 183.1631, |
| "eval_samples_per_second": 98.333, |
| "eval_steps_per_second": 6.148, |
| "step": 22000 |
| }, |
| { |
| "epoch": 2.3732644494672264, |
| "grad_norm": 0.5650054216384888, |
| "learning_rate": 0.00045814244154724704, |
| "loss": 3.5589, |
| "step": 22050 |
| }, |
| { |
| "epoch": 2.3786460015068345, |
| "grad_norm": 0.5674408078193665, |
| "learning_rate": 0.00045781920051718563, |
| "loss": 3.5569, |
| "step": 22100 |
| }, |
| { |
| "epoch": 2.384027553546443, |
| "grad_norm": 0.6000884175300598, |
| "learning_rate": 0.0004574959594871242, |
| "loss": 3.5644, |
| "step": 22150 |
| }, |
| { |
| "epoch": 2.389409105586051, |
| "grad_norm": 0.6201997399330139, |
| "learning_rate": 0.00045717271845706277, |
| "loss": 3.5686, |
| "step": 22200 |
| }, |
| { |
| "epoch": 2.3947906576256592, |
| "grad_norm": 0.5663855075836182, |
| "learning_rate": 0.0004568494774270013, |
| "loss": 3.5715, |
| "step": 22250 |
| }, |
| { |
| "epoch": 2.4001722096652673, |
| "grad_norm": 0.6110810041427612, |
| "learning_rate": 0.00045652623639693996, |
| "loss": 3.5599, |
| "step": 22300 |
| }, |
| { |
| "epoch": 2.4055537617048754, |
| "grad_norm": 0.5673940181732178, |
| "learning_rate": 0.00045620299536687855, |
| "loss": 3.5529, |
| "step": 22350 |
| }, |
| { |
| "epoch": 2.410935313744484, |
| "grad_norm": 0.5478566288948059, |
| "learning_rate": 0.0004558797543368171, |
| "loss": 3.5546, |
| "step": 22400 |
| }, |
| { |
| "epoch": 2.416316865784092, |
| "grad_norm": 0.5602691769599915, |
| "learning_rate": 0.0004555565133067557, |
| "loss": 3.5553, |
| "step": 22450 |
| }, |
| { |
| "epoch": 2.4216984178237, |
| "grad_norm": 0.6060113906860352, |
| "learning_rate": 0.0004552332722766943, |
| "loss": 3.5723, |
| "step": 22500 |
| }, |
| { |
| "epoch": 2.4270799698633088, |
| "grad_norm": 0.607200562953949, |
| "learning_rate": 0.0004549100312466328, |
| "loss": 3.5494, |
| "step": 22550 |
| }, |
| { |
| "epoch": 2.432461521902917, |
| "grad_norm": 0.5753543376922607, |
| "learning_rate": 0.0004545867902165715, |
| "loss": 3.5594, |
| "step": 22600 |
| }, |
| { |
| "epoch": 2.437843073942525, |
| "grad_norm": 1.2295790910720825, |
| "learning_rate": 0.00045426354918651007, |
| "loss": 3.5365, |
| "step": 22650 |
| }, |
| { |
| "epoch": 2.443224625982133, |
| "grad_norm": 0.6025099754333496, |
| "learning_rate": 0.0004539403081564486, |
| "loss": 3.5705, |
| "step": 22700 |
| }, |
| { |
| "epoch": 2.4486061780217416, |
| "grad_norm": 0.7182654142379761, |
| "learning_rate": 0.0004536170671263872, |
| "loss": 3.5582, |
| "step": 22750 |
| }, |
| { |
| "epoch": 2.4539877300613497, |
| "grad_norm": 0.5721381306648254, |
| "learning_rate": 0.00045329382609632574, |
| "loss": 3.5563, |
| "step": 22800 |
| }, |
| { |
| "epoch": 2.459369282100958, |
| "grad_norm": 0.5419154763221741, |
| "learning_rate": 0.0004529705850662644, |
| "loss": 3.562, |
| "step": 22850 |
| }, |
| { |
| "epoch": 2.464750834140566, |
| "grad_norm": 0.5342304706573486, |
| "learning_rate": 0.000452647344036203, |
| "loss": 3.5658, |
| "step": 22900 |
| }, |
| { |
| "epoch": 2.4701323861801745, |
| "grad_norm": 0.6328591108322144, |
| "learning_rate": 0.00045232410300614153, |
| "loss": 3.5421, |
| "step": 22950 |
| }, |
| { |
| "epoch": 2.4755139382197826, |
| "grad_norm": 0.556786060333252, |
| "learning_rate": 0.0004520008619760801, |
| "loss": 3.5554, |
| "step": 23000 |
| }, |
| { |
| "epoch": 2.4755139382197826, |
| "eval_accuracy": 0.3653008962450757, |
| "eval_loss": 3.5433034896850586, |
| "eval_runtime": 182.589, |
| "eval_samples_per_second": 98.642, |
| "eval_steps_per_second": 6.167, |
| "step": 23000 |
| }, |
| { |
| "epoch": 2.4808954902593907, |
| "grad_norm": 0.5863376259803772, |
| "learning_rate": 0.00045168408576661993, |
| "loss": 3.5371, |
| "step": 23050 |
| }, |
| { |
| "epoch": 2.4862770422989993, |
| "grad_norm": 0.5524567365646362, |
| "learning_rate": 0.0004513608447365585, |
| "loss": 3.5489, |
| "step": 23100 |
| }, |
| { |
| "epoch": 2.4916585943386074, |
| "grad_norm": 0.5922284722328186, |
| "learning_rate": 0.00045103760370649706, |
| "loss": 3.5695, |
| "step": 23150 |
| }, |
| { |
| "epoch": 2.4970401463782155, |
| "grad_norm": 0.5904964804649353, |
| "learning_rate": 0.00045071436267643566, |
| "loss": 3.551, |
| "step": 23200 |
| }, |
| { |
| "epoch": 2.5024216984178236, |
| "grad_norm": 0.5709808468818665, |
| "learning_rate": 0.0004503911216463743, |
| "loss": 3.5497, |
| "step": 23250 |
| }, |
| { |
| "epoch": 2.5078032504574317, |
| "grad_norm": 0.6329324841499329, |
| "learning_rate": 0.00045006788061631285, |
| "loss": 3.546, |
| "step": 23300 |
| }, |
| { |
| "epoch": 2.5131848024970402, |
| "grad_norm": 0.5848966836929321, |
| "learning_rate": 0.00044974463958625144, |
| "loss": 3.5475, |
| "step": 23350 |
| }, |
| { |
| "epoch": 2.5185663545366483, |
| "grad_norm": 0.6118052005767822, |
| "learning_rate": 0.00044942139855619004, |
| "loss": 3.5342, |
| "step": 23400 |
| }, |
| { |
| "epoch": 2.5239479065762565, |
| "grad_norm": 0.5881749391555786, |
| "learning_rate": 0.0004490981575261286, |
| "loss": 3.5452, |
| "step": 23450 |
| }, |
| { |
| "epoch": 2.529329458615865, |
| "grad_norm": 0.5847213864326477, |
| "learning_rate": 0.0004487749164960672, |
| "loss": 3.5521, |
| "step": 23500 |
| }, |
| { |
| "epoch": 2.534711010655473, |
| "grad_norm": 0.6213960647583008, |
| "learning_rate": 0.0004484516754660058, |
| "loss": 3.5339, |
| "step": 23550 |
| }, |
| { |
| "epoch": 2.540092562695081, |
| "grad_norm": 0.6048945784568787, |
| "learning_rate": 0.00044812843443594436, |
| "loss": 3.5714, |
| "step": 23600 |
| }, |
| { |
| "epoch": 2.5454741147346893, |
| "grad_norm": 0.6373373866081238, |
| "learning_rate": 0.00044780519340588296, |
| "loss": 3.5608, |
| "step": 23650 |
| }, |
| { |
| "epoch": 2.550855666774298, |
| "grad_norm": 0.5849354863166809, |
| "learning_rate": 0.0004474819523758215, |
| "loss": 3.5378, |
| "step": 23700 |
| }, |
| { |
| "epoch": 2.556237218813906, |
| "grad_norm": 0.5594618916511536, |
| "learning_rate": 0.0004471587113457601, |
| "loss": 3.5356, |
| "step": 23750 |
| }, |
| { |
| "epoch": 2.561618770853514, |
| "grad_norm": 0.6139757633209229, |
| "learning_rate": 0.00044683547031569874, |
| "loss": 3.5414, |
| "step": 23800 |
| }, |
| { |
| "epoch": 2.567000322893122, |
| "grad_norm": 0.5510112643241882, |
| "learning_rate": 0.0004465122292856373, |
| "loss": 3.5638, |
| "step": 23850 |
| }, |
| { |
| "epoch": 2.5723818749327307, |
| "grad_norm": 0.6046518087387085, |
| "learning_rate": 0.0004461889882555759, |
| "loss": 3.5382, |
| "step": 23900 |
| }, |
| { |
| "epoch": 2.577763426972339, |
| "grad_norm": 0.5786885619163513, |
| "learning_rate": 0.00044586574722551447, |
| "loss": 3.571, |
| "step": 23950 |
| }, |
| { |
| "epoch": 2.583144979011947, |
| "grad_norm": 0.5536140203475952, |
| "learning_rate": 0.000445542506195453, |
| "loss": 3.5367, |
| "step": 24000 |
| }, |
| { |
| "epoch": 2.583144979011947, |
| "eval_accuracy": 0.36611677077967464, |
| "eval_loss": 3.534451484680176, |
| "eval_runtime": 183.1729, |
| "eval_samples_per_second": 98.328, |
| "eval_steps_per_second": 6.147, |
| "step": 24000 |
| }, |
| { |
| "epoch": 2.5885265310515555, |
| "grad_norm": 0.5413175225257874, |
| "learning_rate": 0.0004452192651653916, |
| "loss": 3.5572, |
| "step": 24050 |
| }, |
| { |
| "epoch": 2.5939080830911636, |
| "grad_norm": 0.614355206489563, |
| "learning_rate": 0.00044489602413533025, |
| "loss": 3.551, |
| "step": 24100 |
| }, |
| { |
| "epoch": 2.5992896351307717, |
| "grad_norm": 0.5778051018714905, |
| "learning_rate": 0.0004445727831052688, |
| "loss": 3.551, |
| "step": 24150 |
| }, |
| { |
| "epoch": 2.60467118717038, |
| "grad_norm": 0.5734228491783142, |
| "learning_rate": 0.0004442495420752074, |
| "loss": 3.5781, |
| "step": 24200 |
| }, |
| { |
| "epoch": 2.610052739209988, |
| "grad_norm": 0.6527189016342163, |
| "learning_rate": 0.00044392630104514593, |
| "loss": 3.5606, |
| "step": 24250 |
| }, |
| { |
| "epoch": 2.6154342912495965, |
| "grad_norm": 0.5808097720146179, |
| "learning_rate": 0.0004436030600150845, |
| "loss": 3.539, |
| "step": 24300 |
| }, |
| { |
| "epoch": 2.6208158432892046, |
| "grad_norm": 0.578321099281311, |
| "learning_rate": 0.0004432798189850231, |
| "loss": 3.5517, |
| "step": 24350 |
| }, |
| { |
| "epoch": 2.6261973953288127, |
| "grad_norm": 0.5915061831474304, |
| "learning_rate": 0.0004429565779549617, |
| "loss": 3.5437, |
| "step": 24400 |
| }, |
| { |
| "epoch": 2.6315789473684212, |
| "grad_norm": 0.6535822749137878, |
| "learning_rate": 0.0004426333369249003, |
| "loss": 3.5527, |
| "step": 24450 |
| }, |
| { |
| "epoch": 2.6369604994080293, |
| "grad_norm": 0.5836544036865234, |
| "learning_rate": 0.0004423100958948389, |
| "loss": 3.5435, |
| "step": 24500 |
| }, |
| { |
| "epoch": 2.6423420514476375, |
| "grad_norm": 0.6255151629447937, |
| "learning_rate": 0.00044198685486477744, |
| "loss": 3.5353, |
| "step": 24550 |
| }, |
| { |
| "epoch": 2.6477236034872456, |
| "grad_norm": 0.5904508829116821, |
| "learning_rate": 0.00044166361383471604, |
| "loss": 3.5546, |
| "step": 24600 |
| }, |
| { |
| "epoch": 2.653105155526854, |
| "grad_norm": 0.5963026285171509, |
| "learning_rate": 0.0004413403728046547, |
| "loss": 3.5417, |
| "step": 24650 |
| }, |
| { |
| "epoch": 2.658486707566462, |
| "grad_norm": 0.5733145475387573, |
| "learning_rate": 0.0004410171317745932, |
| "loss": 3.5371, |
| "step": 24700 |
| }, |
| { |
| "epoch": 2.6638682596060703, |
| "grad_norm": 0.574026346206665, |
| "learning_rate": 0.0004406938907445318, |
| "loss": 3.5503, |
| "step": 24750 |
| }, |
| { |
| "epoch": 2.6692498116456784, |
| "grad_norm": 0.5410118103027344, |
| "learning_rate": 0.00044037064971447036, |
| "loss": 3.5505, |
| "step": 24800 |
| }, |
| { |
| "epoch": 2.674631363685287, |
| "grad_norm": 0.5905002951622009, |
| "learning_rate": 0.00044004740868440896, |
| "loss": 3.5496, |
| "step": 24850 |
| }, |
| { |
| "epoch": 2.680012915724895, |
| "grad_norm": 0.5615861415863037, |
| "learning_rate": 0.00043972416765434755, |
| "loss": 3.5361, |
| "step": 24900 |
| }, |
| { |
| "epoch": 2.685394467764503, |
| "grad_norm": 0.5930156111717224, |
| "learning_rate": 0.00043940092662428615, |
| "loss": 3.5658, |
| "step": 24950 |
| }, |
| { |
| "epoch": 2.6907760198041117, |
| "grad_norm": 0.636413037776947, |
| "learning_rate": 0.00043907768559422474, |
| "loss": 3.543, |
| "step": 25000 |
| }, |
| { |
| "epoch": 2.6907760198041117, |
| "eval_accuracy": 0.36713582621448665, |
| "eval_loss": 3.526466131210327, |
| "eval_runtime": 182.6832, |
| "eval_samples_per_second": 98.591, |
| "eval_steps_per_second": 6.164, |
| "step": 25000 |
| }, |
| { |
| "epoch": 2.69615757184372, |
| "grad_norm": 0.5980111360549927, |
| "learning_rate": 0.00043875444456416334, |
| "loss": 3.5362, |
| "step": 25050 |
| }, |
| { |
| "epoch": 2.701539123883328, |
| "grad_norm": 0.5899256467819214, |
| "learning_rate": 0.0004384312035341019, |
| "loss": 3.5305, |
| "step": 25100 |
| }, |
| { |
| "epoch": 2.706920675922936, |
| "grad_norm": 0.6567667722702026, |
| "learning_rate": 0.00043810796250404047, |
| "loss": 3.5296, |
| "step": 25150 |
| }, |
| { |
| "epoch": 2.712302227962544, |
| "grad_norm": 0.6192686557769775, |
| "learning_rate": 0.0004377911862945803, |
| "loss": 3.5487, |
| "step": 25200 |
| }, |
| { |
| "epoch": 2.7176837800021527, |
| "grad_norm": 0.5786051154136658, |
| "learning_rate": 0.00043746794526451887, |
| "loss": 3.5591, |
| "step": 25250 |
| }, |
| { |
| "epoch": 2.723065332041761, |
| "grad_norm": 0.6026502847671509, |
| "learning_rate": 0.00043714470423445747, |
| "loss": 3.5304, |
| "step": 25300 |
| }, |
| { |
| "epoch": 2.728446884081369, |
| "grad_norm": 0.6267446279525757, |
| "learning_rate": 0.00043682146320439606, |
| "loss": 3.5329, |
| "step": 25350 |
| }, |
| { |
| "epoch": 2.7338284361209775, |
| "grad_norm": 0.617989182472229, |
| "learning_rate": 0.00043649822217433466, |
| "loss": 3.5411, |
| "step": 25400 |
| }, |
| { |
| "epoch": 2.7392099881605856, |
| "grad_norm": 0.5602372288703918, |
| "learning_rate": 0.0004361749811442732, |
| "loss": 3.5651, |
| "step": 25450 |
| }, |
| { |
| "epoch": 2.7445915402001937, |
| "grad_norm": 0.5438847541809082, |
| "learning_rate": 0.0004358517401142118, |
| "loss": 3.5549, |
| "step": 25500 |
| }, |
| { |
| "epoch": 2.749973092239802, |
| "grad_norm": 0.5827850103378296, |
| "learning_rate": 0.00043552849908415033, |
| "loss": 3.5294, |
| "step": 25550 |
| }, |
| { |
| "epoch": 2.7553546442794103, |
| "grad_norm": 0.5704156160354614, |
| "learning_rate": 0.000435205258054089, |
| "loss": 3.5352, |
| "step": 25600 |
| }, |
| { |
| "epoch": 2.7607361963190185, |
| "grad_norm": 0.5868292450904846, |
| "learning_rate": 0.0004348820170240276, |
| "loss": 3.5492, |
| "step": 25650 |
| }, |
| { |
| "epoch": 2.7661177483586266, |
| "grad_norm": 0.5794882774353027, |
| "learning_rate": 0.0004345587759939661, |
| "loss": 3.5491, |
| "step": 25700 |
| }, |
| { |
| "epoch": 2.7714993003982347, |
| "grad_norm": 0.6541237235069275, |
| "learning_rate": 0.0004342355349639047, |
| "loss": 3.5236, |
| "step": 25750 |
| }, |
| { |
| "epoch": 2.776880852437843, |
| "grad_norm": 0.6571041345596313, |
| "learning_rate": 0.0004339122939338433, |
| "loss": 3.5395, |
| "step": 25800 |
| }, |
| { |
| "epoch": 2.7822624044774513, |
| "grad_norm": 0.5382871627807617, |
| "learning_rate": 0.00043358905290378184, |
| "loss": 3.5449, |
| "step": 25850 |
| }, |
| { |
| "epoch": 2.7876439565170594, |
| "grad_norm": 0.5958513021469116, |
| "learning_rate": 0.0004332658118737205, |
| "loss": 3.5299, |
| "step": 25900 |
| }, |
| { |
| "epoch": 2.793025508556668, |
| "grad_norm": 0.5741668939590454, |
| "learning_rate": 0.0004329425708436591, |
| "loss": 3.5432, |
| "step": 25950 |
| }, |
| { |
| "epoch": 2.798407060596276, |
| "grad_norm": 0.6117798686027527, |
| "learning_rate": 0.00043261932981359763, |
| "loss": 3.542, |
| "step": 26000 |
| }, |
| { |
| "epoch": 2.798407060596276, |
| "eval_accuracy": 0.3677167932056307, |
| "eval_loss": 3.520785093307495, |
| "eval_runtime": 182.753, |
| "eval_samples_per_second": 98.554, |
| "eval_steps_per_second": 6.161, |
| "step": 26000 |
| }, |
| { |
| "epoch": 2.803788612635884, |
| "grad_norm": 0.5519882440567017, |
| "learning_rate": 0.0004322960887835362, |
| "loss": 3.5522, |
| "step": 26050 |
| }, |
| { |
| "epoch": 2.8091701646754923, |
| "grad_norm": 0.5597295761108398, |
| "learning_rate": 0.00043197284775347476, |
| "loss": 3.5286, |
| "step": 26100 |
| }, |
| { |
| "epoch": 2.8145517167151004, |
| "grad_norm": 0.6364133358001709, |
| "learning_rate": 0.00043164960672341336, |
| "loss": 3.5472, |
| "step": 26150 |
| }, |
| { |
| "epoch": 2.819933268754709, |
| "grad_norm": 0.6041819453239441, |
| "learning_rate": 0.000431326365693352, |
| "loss": 3.5536, |
| "step": 26200 |
| }, |
| { |
| "epoch": 2.825314820794317, |
| "grad_norm": 0.5981535911560059, |
| "learning_rate": 0.00043100312466329055, |
| "loss": 3.5325, |
| "step": 26250 |
| }, |
| { |
| "epoch": 2.830696372833925, |
| "grad_norm": 0.6039505004882812, |
| "learning_rate": 0.00043067988363322914, |
| "loss": 3.5477, |
| "step": 26300 |
| }, |
| { |
| "epoch": 2.8360779248735337, |
| "grad_norm": 0.5583112835884094, |
| "learning_rate": 0.00043035664260316774, |
| "loss": 3.5385, |
| "step": 26350 |
| }, |
| { |
| "epoch": 2.841459476913142, |
| "grad_norm": 0.6340571045875549, |
| "learning_rate": 0.0004300334015731063, |
| "loss": 3.5228, |
| "step": 26400 |
| }, |
| { |
| "epoch": 2.84684102895275, |
| "grad_norm": 0.5811586380004883, |
| "learning_rate": 0.0004297101605430449, |
| "loss": 3.5302, |
| "step": 26450 |
| }, |
| { |
| "epoch": 2.852222580992358, |
| "grad_norm": 0.5881547331809998, |
| "learning_rate": 0.0004293869195129835, |
| "loss": 3.5351, |
| "step": 26500 |
| }, |
| { |
| "epoch": 2.857604133031966, |
| "grad_norm": 0.6151056289672852, |
| "learning_rate": 0.00042906367848292206, |
| "loss": 3.5292, |
| "step": 26550 |
| }, |
| { |
| "epoch": 2.8629856850715747, |
| "grad_norm": 0.606568455696106, |
| "learning_rate": 0.00042874043745286066, |
| "loss": 3.5441, |
| "step": 26600 |
| }, |
| { |
| "epoch": 2.868367237111183, |
| "grad_norm": 0.535579264163971, |
| "learning_rate": 0.0004284171964227992, |
| "loss": 3.5116, |
| "step": 26650 |
| }, |
| { |
| "epoch": 2.873748789150791, |
| "grad_norm": 0.5709394216537476, |
| "learning_rate": 0.0004280939553927378, |
| "loss": 3.5527, |
| "step": 26700 |
| }, |
| { |
| "epoch": 2.8791303411903995, |
| "grad_norm": 0.6123267412185669, |
| "learning_rate": 0.00042777071436267644, |
| "loss": 3.5198, |
| "step": 26750 |
| }, |
| { |
| "epoch": 2.8845118932300076, |
| "grad_norm": 0.6574706435203552, |
| "learning_rate": 0.000427447473332615, |
| "loss": 3.5197, |
| "step": 26800 |
| }, |
| { |
| "epoch": 2.8898934452696157, |
| "grad_norm": 0.5446913838386536, |
| "learning_rate": 0.0004271242323025536, |
| "loss": 3.5375, |
| "step": 26850 |
| }, |
| { |
| "epoch": 2.895274997309224, |
| "grad_norm": 0.5856238007545471, |
| "learning_rate": 0.00042680099127249217, |
| "loss": 3.5379, |
| "step": 26900 |
| }, |
| { |
| "epoch": 2.9006565493488323, |
| "grad_norm": 0.5693891048431396, |
| "learning_rate": 0.0004264777502424307, |
| "loss": 3.5373, |
| "step": 26950 |
| }, |
| { |
| "epoch": 2.9060381013884404, |
| "grad_norm": 0.5685888528823853, |
| "learning_rate": 0.0004261545092123693, |
| "loss": 3.5247, |
| "step": 27000 |
| }, |
| { |
| "epoch": 2.9060381013884404, |
| "eval_accuracy": 0.3690562660066076, |
| "eval_loss": 3.5081608295440674, |
| "eval_runtime": 182.7742, |
| "eval_samples_per_second": 98.542, |
| "eval_steps_per_second": 6.161, |
| "step": 27000 |
| }, |
| { |
| "epoch": 2.9114196534280485, |
| "grad_norm": 0.5613411664962769, |
| "learning_rate": 0.00042583126818230795, |
| "loss": 3.5421, |
| "step": 27050 |
| }, |
| { |
| "epoch": 2.9168012054676566, |
| "grad_norm": 0.5848773717880249, |
| "learning_rate": 0.0004255080271522465, |
| "loss": 3.5342, |
| "step": 27100 |
| }, |
| { |
| "epoch": 2.922182757507265, |
| "grad_norm": 0.6449905037879944, |
| "learning_rate": 0.0004251847861221851, |
| "loss": 3.5196, |
| "step": 27150 |
| }, |
| { |
| "epoch": 2.9275643095468733, |
| "grad_norm": 0.6094990372657776, |
| "learning_rate": 0.00042486154509212363, |
| "loss": 3.5496, |
| "step": 27200 |
| }, |
| { |
| "epoch": 2.9329458615864814, |
| "grad_norm": 0.5689534544944763, |
| "learning_rate": 0.0004245447688826635, |
| "loss": 3.5291, |
| "step": 27250 |
| }, |
| { |
| "epoch": 2.93832741362609, |
| "grad_norm": 0.6069585084915161, |
| "learning_rate": 0.00042422152785260203, |
| "loss": 3.5095, |
| "step": 27300 |
| }, |
| { |
| "epoch": 2.943708965665698, |
| "grad_norm": 0.6799548268318176, |
| "learning_rate": 0.0004238982868225406, |
| "loss": 3.5521, |
| "step": 27350 |
| }, |
| { |
| "epoch": 2.949090517705306, |
| "grad_norm": 0.5622816681861877, |
| "learning_rate": 0.0004235750457924793, |
| "loss": 3.5134, |
| "step": 27400 |
| }, |
| { |
| "epoch": 2.9544720697449143, |
| "grad_norm": 0.5812619924545288, |
| "learning_rate": 0.0004232518047624178, |
| "loss": 3.534, |
| "step": 27450 |
| }, |
| { |
| "epoch": 2.9598536217845224, |
| "grad_norm": 0.5781611800193787, |
| "learning_rate": 0.0004229285637323564, |
| "loss": 3.5307, |
| "step": 27500 |
| }, |
| { |
| "epoch": 2.965235173824131, |
| "grad_norm": 0.6182583570480347, |
| "learning_rate": 0.00042260532270229495, |
| "loss": 3.5355, |
| "step": 27550 |
| }, |
| { |
| "epoch": 2.970616725863739, |
| "grad_norm": 0.6099269390106201, |
| "learning_rate": 0.00042228208167223354, |
| "loss": 3.5141, |
| "step": 27600 |
| }, |
| { |
| "epoch": 2.975998277903347, |
| "grad_norm": 0.6089410185813904, |
| "learning_rate": 0.00042195884064217214, |
| "loss": 3.5328, |
| "step": 27650 |
| }, |
| { |
| "epoch": 2.9813798299429557, |
| "grad_norm": 0.577688992023468, |
| "learning_rate": 0.00042163559961211073, |
| "loss": 3.5638, |
| "step": 27700 |
| }, |
| { |
| "epoch": 2.986761381982564, |
| "grad_norm": 0.6033281683921814, |
| "learning_rate": 0.00042131235858204933, |
| "loss": 3.524, |
| "step": 27750 |
| }, |
| { |
| "epoch": 2.992142934022172, |
| "grad_norm": 0.5998152494430542, |
| "learning_rate": 0.0004209891175519879, |
| "loss": 3.5123, |
| "step": 27800 |
| }, |
| { |
| "epoch": 2.9975244860617805, |
| "grad_norm": 0.676117479801178, |
| "learning_rate": 0.00042066587652192646, |
| "loss": 3.5311, |
| "step": 27850 |
| }, |
| { |
| "epoch": 3.0029060381013886, |
| "grad_norm": 0.5953896641731262, |
| "learning_rate": 0.00042034263549186506, |
| "loss": 3.494, |
| "step": 27900 |
| }, |
| { |
| "epoch": 3.0082875901409967, |
| "grad_norm": 0.5766949653625488, |
| "learning_rate": 0.0004200193944618036, |
| "loss": 3.419, |
| "step": 27950 |
| }, |
| { |
| "epoch": 3.0136691421806048, |
| "grad_norm": 0.5596526265144348, |
| "learning_rate": 0.00041969615343174225, |
| "loss": 3.4332, |
| "step": 28000 |
| }, |
| { |
| "epoch": 3.0136691421806048, |
| "eval_accuracy": 0.3699133199856882, |
| "eval_loss": 3.5029404163360596, |
| "eval_runtime": 182.639, |
| "eval_samples_per_second": 98.615, |
| "eval_steps_per_second": 6.165, |
| "step": 28000 |
| }, |
| { |
| "epoch": 3.0190506942202133, |
| "grad_norm": 0.581177830696106, |
| "learning_rate": 0.00041937291240168084, |
| "loss": 3.4454, |
| "step": 28050 |
| }, |
| { |
| "epoch": 3.0244322462598214, |
| "grad_norm": 0.5963121652603149, |
| "learning_rate": 0.0004190496713716194, |
| "loss": 3.4537, |
| "step": 28100 |
| }, |
| { |
| "epoch": 3.0298137982994295, |
| "grad_norm": 0.5734453201293945, |
| "learning_rate": 0.000418726430341558, |
| "loss": 3.4422, |
| "step": 28150 |
| }, |
| { |
| "epoch": 3.0351953503390376, |
| "grad_norm": 0.6005439758300781, |
| "learning_rate": 0.00041840318931149657, |
| "loss": 3.4446, |
| "step": 28200 |
| }, |
| { |
| "epoch": 3.040576902378646, |
| "grad_norm": 0.6174030303955078, |
| "learning_rate": 0.00041807994828143517, |
| "loss": 3.4522, |
| "step": 28250 |
| }, |
| { |
| "epoch": 3.0459584544182543, |
| "grad_norm": 0.6408053636550903, |
| "learning_rate": 0.00041775670725137376, |
| "loss": 3.4324, |
| "step": 28300 |
| }, |
| { |
| "epoch": 3.0513400064578624, |
| "grad_norm": 0.6011624336242676, |
| "learning_rate": 0.00041743346622131236, |
| "loss": 3.4556, |
| "step": 28350 |
| }, |
| { |
| "epoch": 3.0567215584974705, |
| "grad_norm": 0.6361274719238281, |
| "learning_rate": 0.0004171102251912509, |
| "loss": 3.4448, |
| "step": 28400 |
| }, |
| { |
| "epoch": 3.062103110537079, |
| "grad_norm": 0.602817952632904, |
| "learning_rate": 0.0004167869841611895, |
| "loss": 3.4486, |
| "step": 28450 |
| }, |
| { |
| "epoch": 3.067484662576687, |
| "grad_norm": 0.5594189167022705, |
| "learning_rate": 0.00041646374313112803, |
| "loss": 3.4733, |
| "step": 28500 |
| }, |
| { |
| "epoch": 3.0728662146162953, |
| "grad_norm": 0.7374558448791504, |
| "learning_rate": 0.0004161405021010667, |
| "loss": 3.4482, |
| "step": 28550 |
| }, |
| { |
| "epoch": 3.0782477666559034, |
| "grad_norm": 0.6085382699966431, |
| "learning_rate": 0.0004158172610710053, |
| "loss": 3.4641, |
| "step": 28600 |
| }, |
| { |
| "epoch": 3.083629318695512, |
| "grad_norm": 0.5731576681137085, |
| "learning_rate": 0.0004154940200409438, |
| "loss": 3.4374, |
| "step": 28650 |
| }, |
| { |
| "epoch": 3.08901087073512, |
| "grad_norm": 0.5917696356773376, |
| "learning_rate": 0.0004151707790108824, |
| "loss": 3.4511, |
| "step": 28700 |
| }, |
| { |
| "epoch": 3.094392422774728, |
| "grad_norm": 0.5805858373641968, |
| "learning_rate": 0.00041484753798082095, |
| "loss": 3.4436, |
| "step": 28750 |
| }, |
| { |
| "epoch": 3.0997739748143363, |
| "grad_norm": 0.575162947177887, |
| "learning_rate": 0.00041452429695075955, |
| "loss": 3.4435, |
| "step": 28800 |
| }, |
| { |
| "epoch": 3.105155526853945, |
| "grad_norm": 0.5958883166313171, |
| "learning_rate": 0.0004142010559206982, |
| "loss": 3.4675, |
| "step": 28850 |
| }, |
| { |
| "epoch": 3.110537078893553, |
| "grad_norm": 0.6094611287117004, |
| "learning_rate": 0.00041387781489063673, |
| "loss": 3.4567, |
| "step": 28900 |
| }, |
| { |
| "epoch": 3.115918630933161, |
| "grad_norm": 0.5872287750244141, |
| "learning_rate": 0.00041355457386057533, |
| "loss": 3.439, |
| "step": 28950 |
| }, |
| { |
| "epoch": 3.121300182972769, |
| "grad_norm": 0.6196814775466919, |
| "learning_rate": 0.0004132313328305139, |
| "loss": 3.444, |
| "step": 29000 |
| }, |
| { |
| "epoch": 3.121300182972769, |
| "eval_accuracy": 0.3706220627729021, |
| "eval_loss": 3.4992904663085938, |
| "eval_runtime": 182.5954, |
| "eval_samples_per_second": 98.639, |
| "eval_steps_per_second": 6.167, |
| "step": 29000 |
| }, |
| { |
| "epoch": 3.1266817350123777, |
| "grad_norm": 0.619157612323761, |
| "learning_rate": 0.00041290809180045246, |
| "loss": 3.4508, |
| "step": 29050 |
| }, |
| { |
| "epoch": 3.132063287051986, |
| "grad_norm": 0.5508050918579102, |
| "learning_rate": 0.0004125848507703911, |
| "loss": 3.4648, |
| "step": 29100 |
| }, |
| { |
| "epoch": 3.137444839091594, |
| "grad_norm": 0.5980096459388733, |
| "learning_rate": 0.0004122616097403297, |
| "loss": 3.4646, |
| "step": 29150 |
| }, |
| { |
| "epoch": 3.1428263911312024, |
| "grad_norm": 0.6267610192298889, |
| "learning_rate": 0.00041193836871026825, |
| "loss": 3.453, |
| "step": 29200 |
| }, |
| { |
| "epoch": 3.1482079431708105, |
| "grad_norm": 0.6516902446746826, |
| "learning_rate": 0.00041161512768020684, |
| "loss": 3.441, |
| "step": 29250 |
| }, |
| { |
| "epoch": 3.1535894952104186, |
| "grad_norm": 0.5599831938743591, |
| "learning_rate": 0.0004112918866501454, |
| "loss": 3.4536, |
| "step": 29300 |
| }, |
| { |
| "epoch": 3.1589710472500268, |
| "grad_norm": 0.5650069713592529, |
| "learning_rate": 0.00041097511044068524, |
| "loss": 3.462, |
| "step": 29350 |
| }, |
| { |
| "epoch": 3.1643525992896353, |
| "grad_norm": 0.597890317440033, |
| "learning_rate": 0.0004106518694106238, |
| "loss": 3.4729, |
| "step": 29400 |
| }, |
| { |
| "epoch": 3.1697341513292434, |
| "grad_norm": 0.6053642630577087, |
| "learning_rate": 0.0004103286283805624, |
| "loss": 3.447, |
| "step": 29450 |
| }, |
| { |
| "epoch": 3.1751157033688515, |
| "grad_norm": 0.6315788626670837, |
| "learning_rate": 0.00041000538735050103, |
| "loss": 3.4505, |
| "step": 29500 |
| }, |
| { |
| "epoch": 3.1804972554084596, |
| "grad_norm": 0.5334057211875916, |
| "learning_rate": 0.00040968214632043957, |
| "loss": 3.4588, |
| "step": 29550 |
| }, |
| { |
| "epoch": 3.185878807448068, |
| "grad_norm": 0.5461787581443787, |
| "learning_rate": 0.00040935890529037816, |
| "loss": 3.4605, |
| "step": 29600 |
| }, |
| { |
| "epoch": 3.1912603594876763, |
| "grad_norm": 0.6308553814888, |
| "learning_rate": 0.00040903566426031676, |
| "loss": 3.4494, |
| "step": 29650 |
| }, |
| { |
| "epoch": 3.1966419115272844, |
| "grad_norm": 0.5765513777732849, |
| "learning_rate": 0.0004087124232302553, |
| "loss": 3.4575, |
| "step": 29700 |
| }, |
| { |
| "epoch": 3.2020234635668925, |
| "grad_norm": 0.5815839171409607, |
| "learning_rate": 0.0004083891822001939, |
| "loss": 3.4578, |
| "step": 29750 |
| }, |
| { |
| "epoch": 3.207405015606501, |
| "grad_norm": 0.5885655879974365, |
| "learning_rate": 0.00040806594117013254, |
| "loss": 3.4475, |
| "step": 29800 |
| }, |
| { |
| "epoch": 3.212786567646109, |
| "grad_norm": 0.6800254583358765, |
| "learning_rate": 0.0004077427001400711, |
| "loss": 3.4626, |
| "step": 29850 |
| }, |
| { |
| "epoch": 3.2181681196857173, |
| "grad_norm": 0.5856930613517761, |
| "learning_rate": 0.0004074194591100097, |
| "loss": 3.454, |
| "step": 29900 |
| }, |
| { |
| "epoch": 3.2235496717253254, |
| "grad_norm": 0.6474602222442627, |
| "learning_rate": 0.0004070962180799482, |
| "loss": 3.4509, |
| "step": 29950 |
| }, |
| { |
| "epoch": 3.228931223764934, |
| "grad_norm": 0.5948309302330017, |
| "learning_rate": 0.0004067729770498868, |
| "loss": 3.4617, |
| "step": 30000 |
| }, |
| { |
| "epoch": 3.228931223764934, |
| "eval_accuracy": 0.3714459776211993, |
| "eval_loss": 3.4921934604644775, |
| "eval_runtime": 182.5164, |
| "eval_samples_per_second": 98.682, |
| "eval_steps_per_second": 6.169, |
| "step": 30000 |
| }, |
| { |
| "epoch": 3.234312775804542, |
| "grad_norm": 0.5894820094108582, |
| "learning_rate": 0.00040644973601982546, |
| "loss": 3.4608, |
| "step": 30050 |
| }, |
| { |
| "epoch": 3.23969432784415, |
| "grad_norm": 0.5914416313171387, |
| "learning_rate": 0.000406126494989764, |
| "loss": 3.4591, |
| "step": 30100 |
| }, |
| { |
| "epoch": 3.2450758798837587, |
| "grad_norm": 0.5802457928657532, |
| "learning_rate": 0.0004058032539597026, |
| "loss": 3.4654, |
| "step": 30150 |
| }, |
| { |
| "epoch": 3.250457431923367, |
| "grad_norm": 0.584543764591217, |
| "learning_rate": 0.00040548001292964114, |
| "loss": 3.4611, |
| "step": 30200 |
| }, |
| { |
| "epoch": 3.255838983962975, |
| "grad_norm": 0.5898339748382568, |
| "learning_rate": 0.00040515677189957973, |
| "loss": 3.4736, |
| "step": 30250 |
| }, |
| { |
| "epoch": 3.261220536002583, |
| "grad_norm": 0.5681132078170776, |
| "learning_rate": 0.0004048335308695183, |
| "loss": 3.4583, |
| "step": 30300 |
| }, |
| { |
| "epoch": 3.2666020880421915, |
| "grad_norm": 0.6399067640304565, |
| "learning_rate": 0.0004045102898394569, |
| "loss": 3.4683, |
| "step": 30350 |
| }, |
| { |
| "epoch": 3.2719836400817996, |
| "grad_norm": 0.6160270571708679, |
| "learning_rate": 0.0004041870488093955, |
| "loss": 3.4713, |
| "step": 30400 |
| }, |
| { |
| "epoch": 3.2773651921214078, |
| "grad_norm": 0.6204001307487488, |
| "learning_rate": 0.0004038638077793341, |
| "loss": 3.4577, |
| "step": 30450 |
| }, |
| { |
| "epoch": 3.282746744161016, |
| "grad_norm": 0.5580645203590393, |
| "learning_rate": 0.00040354056674927265, |
| "loss": 3.4456, |
| "step": 30500 |
| }, |
| { |
| "epoch": 3.2881282962006244, |
| "grad_norm": 0.5606558322906494, |
| "learning_rate": 0.00040321732571921124, |
| "loss": 3.449, |
| "step": 30550 |
| }, |
| { |
| "epoch": 3.2935098482402325, |
| "grad_norm": 0.6012607216835022, |
| "learning_rate": 0.0004028940846891498, |
| "loss": 3.4587, |
| "step": 30600 |
| }, |
| { |
| "epoch": 3.2988914002798406, |
| "grad_norm": 0.5980702638626099, |
| "learning_rate": 0.00040257084365908843, |
| "loss": 3.4565, |
| "step": 30650 |
| }, |
| { |
| "epoch": 3.304272952319449, |
| "grad_norm": 0.5933027267456055, |
| "learning_rate": 0.00040224760262902703, |
| "loss": 3.4612, |
| "step": 30700 |
| }, |
| { |
| "epoch": 3.3096545043590573, |
| "grad_norm": 0.6065646409988403, |
| "learning_rate": 0.00040192436159896557, |
| "loss": 3.4632, |
| "step": 30750 |
| }, |
| { |
| "epoch": 3.3150360563986654, |
| "grad_norm": 0.6052440404891968, |
| "learning_rate": 0.00040160112056890416, |
| "loss": 3.4554, |
| "step": 30800 |
| }, |
| { |
| "epoch": 3.3204176084382735, |
| "grad_norm": 0.5899329781532288, |
| "learning_rate": 0.00040127787953884276, |
| "loss": 3.4619, |
| "step": 30850 |
| }, |
| { |
| "epoch": 3.3257991604778816, |
| "grad_norm": 0.6436607837677002, |
| "learning_rate": 0.00040095463850878135, |
| "loss": 3.4441, |
| "step": 30900 |
| }, |
| { |
| "epoch": 3.33118071251749, |
| "grad_norm": 0.5700132250785828, |
| "learning_rate": 0.00040063139747871995, |
| "loss": 3.4669, |
| "step": 30950 |
| }, |
| { |
| "epoch": 3.3365622645570983, |
| "grad_norm": 0.5744343996047974, |
| "learning_rate": 0.00040030815644865854, |
| "loss": 3.4675, |
| "step": 31000 |
| }, |
| { |
| "epoch": 3.3365622645570983, |
| "eval_accuracy": 0.37178160639165825, |
| "eval_loss": 3.485490083694458, |
| "eval_runtime": 182.4931, |
| "eval_samples_per_second": 98.694, |
| "eval_steps_per_second": 6.17, |
| "step": 31000 |
| }, |
| { |
| "epoch": 3.3419438165967064, |
| "grad_norm": 0.5764909982681274, |
| "learning_rate": 0.0003999849154185971, |
| "loss": 3.4516, |
| "step": 31050 |
| }, |
| { |
| "epoch": 3.347325368636315, |
| "grad_norm": 0.6199538111686707, |
| "learning_rate": 0.0003996616743885357, |
| "loss": 3.4754, |
| "step": 31100 |
| }, |
| { |
| "epoch": 3.352706920675923, |
| "grad_norm": 0.6050508618354797, |
| "learning_rate": 0.0003993384333584742, |
| "loss": 3.4551, |
| "step": 31150 |
| }, |
| { |
| "epoch": 3.358088472715531, |
| "grad_norm": 0.591788113117218, |
| "learning_rate": 0.00039901519232841287, |
| "loss": 3.4839, |
| "step": 31200 |
| }, |
| { |
| "epoch": 3.3634700247551392, |
| "grad_norm": 0.6040419340133667, |
| "learning_rate": 0.00039869195129835146, |
| "loss": 3.4702, |
| "step": 31250 |
| }, |
| { |
| "epoch": 3.368851576794748, |
| "grad_norm": 0.680150032043457, |
| "learning_rate": 0.00039836871026829, |
| "loss": 3.4417, |
| "step": 31300 |
| }, |
| { |
| "epoch": 3.374233128834356, |
| "grad_norm": 0.5792616605758667, |
| "learning_rate": 0.0003980454692382286, |
| "loss": 3.4799, |
| "step": 31350 |
| }, |
| { |
| "epoch": 3.379614680873964, |
| "grad_norm": 0.5896565914154053, |
| "learning_rate": 0.0003977286930287684, |
| "loss": 3.4703, |
| "step": 31400 |
| }, |
| { |
| "epoch": 3.384996232913572, |
| "grad_norm": 0.5844624042510986, |
| "learning_rate": 0.000397405451998707, |
| "loss": 3.4721, |
| "step": 31450 |
| }, |
| { |
| "epoch": 3.3903777849531807, |
| "grad_norm": 0.5530960559844971, |
| "learning_rate": 0.00039708221096864554, |
| "loss": 3.4579, |
| "step": 31500 |
| }, |
| { |
| "epoch": 3.3957593369927888, |
| "grad_norm": 0.5991605520248413, |
| "learning_rate": 0.00039675896993858413, |
| "loss": 3.477, |
| "step": 31550 |
| }, |
| { |
| "epoch": 3.401140889032397, |
| "grad_norm": 0.5415410399436951, |
| "learning_rate": 0.0003964357289085228, |
| "loss": 3.4624, |
| "step": 31600 |
| }, |
| { |
| "epoch": 3.4065224410720054, |
| "grad_norm": 0.6166115403175354, |
| "learning_rate": 0.0003961124878784613, |
| "loss": 3.4492, |
| "step": 31650 |
| }, |
| { |
| "epoch": 3.4119039931116135, |
| "grad_norm": 0.5882399082183838, |
| "learning_rate": 0.0003957892468483999, |
| "loss": 3.4693, |
| "step": 31700 |
| }, |
| { |
| "epoch": 3.4172855451512216, |
| "grad_norm": 0.6354446411132812, |
| "learning_rate": 0.0003954660058183385, |
| "loss": 3.4692, |
| "step": 31750 |
| }, |
| { |
| "epoch": 3.4226670971908297, |
| "grad_norm": 0.5936651229858398, |
| "learning_rate": 0.00039514276478827705, |
| "loss": 3.4842, |
| "step": 31800 |
| }, |
| { |
| "epoch": 3.428048649230438, |
| "grad_norm": 0.7023192644119263, |
| "learning_rate": 0.0003948195237582157, |
| "loss": 3.4645, |
| "step": 31850 |
| }, |
| { |
| "epoch": 3.4334302012700464, |
| "grad_norm": 0.6037172675132751, |
| "learning_rate": 0.0003944962827281543, |
| "loss": 3.4535, |
| "step": 31900 |
| }, |
| { |
| "epoch": 3.4388117533096545, |
| "grad_norm": 0.6036386489868164, |
| "learning_rate": 0.00039417304169809284, |
| "loss": 3.48, |
| "step": 31950 |
| }, |
| { |
| "epoch": 3.4441933053492626, |
| "grad_norm": 0.6221677660942078, |
| "learning_rate": 0.00039384980066803143, |
| "loss": 3.4746, |
| "step": 32000 |
| }, |
| { |
| "epoch": 3.4441933053492626, |
| "eval_accuracy": 0.37269016183956294, |
| "eval_loss": 3.4805102348327637, |
| "eval_runtime": 182.9051, |
| "eval_samples_per_second": 98.472, |
| "eval_steps_per_second": 6.156, |
| "step": 32000 |
| }, |
| { |
| "epoch": 3.449574857388871, |
| "grad_norm": 0.5549188852310181, |
| "learning_rate": 0.00039352655963796997, |
| "loss": 3.4655, |
| "step": 32050 |
| }, |
| { |
| "epoch": 3.4549564094284793, |
| "grad_norm": 0.6282884478569031, |
| "learning_rate": 0.00039320331860790857, |
| "loss": 3.473, |
| "step": 32100 |
| }, |
| { |
| "epoch": 3.4603379614680874, |
| "grad_norm": 0.5992251634597778, |
| "learning_rate": 0.0003928800775778472, |
| "loss": 3.4684, |
| "step": 32150 |
| }, |
| { |
| "epoch": 3.4657195135076955, |
| "grad_norm": 0.6112256050109863, |
| "learning_rate": 0.00039255683654778576, |
| "loss": 3.4635, |
| "step": 32200 |
| }, |
| { |
| "epoch": 3.471101065547304, |
| "grad_norm": 0.6498141288757324, |
| "learning_rate": 0.00039223359551772435, |
| "loss": 3.4595, |
| "step": 32250 |
| }, |
| { |
| "epoch": 3.476482617586912, |
| "grad_norm": 0.8926645517349243, |
| "learning_rate": 0.00039191035448766294, |
| "loss": 3.4693, |
| "step": 32300 |
| }, |
| { |
| "epoch": 3.4818641696265202, |
| "grad_norm": 0.5939327478408813, |
| "learning_rate": 0.0003915871134576015, |
| "loss": 3.4674, |
| "step": 32350 |
| }, |
| { |
| "epoch": 3.4872457216661283, |
| "grad_norm": 0.5974373817443848, |
| "learning_rate": 0.0003912638724275401, |
| "loss": 3.4627, |
| "step": 32400 |
| }, |
| { |
| "epoch": 3.492627273705737, |
| "grad_norm": 0.6161805987358093, |
| "learning_rate": 0.00039094063139747873, |
| "loss": 3.4778, |
| "step": 32450 |
| }, |
| { |
| "epoch": 3.498008825745345, |
| "grad_norm": 0.626426637172699, |
| "learning_rate": 0.00039061739036741727, |
| "loss": 3.4582, |
| "step": 32500 |
| }, |
| { |
| "epoch": 3.503390377784953, |
| "grad_norm": 0.6194849014282227, |
| "learning_rate": 0.00039029414933735586, |
| "loss": 3.4596, |
| "step": 32550 |
| }, |
| { |
| "epoch": 3.5087719298245617, |
| "grad_norm": 0.5703500509262085, |
| "learning_rate": 0.0003899709083072944, |
| "loss": 3.4511, |
| "step": 32600 |
| }, |
| { |
| "epoch": 3.5141534818641698, |
| "grad_norm": 0.5987148880958557, |
| "learning_rate": 0.000389647667277233, |
| "loss": 3.4513, |
| "step": 32650 |
| }, |
| { |
| "epoch": 3.519535033903778, |
| "grad_norm": 0.6004769802093506, |
| "learning_rate": 0.00038932442624717165, |
| "loss": 3.4605, |
| "step": 32700 |
| }, |
| { |
| "epoch": 3.524916585943386, |
| "grad_norm": 0.6045653820037842, |
| "learning_rate": 0.0003890011852171102, |
| "loss": 3.4566, |
| "step": 32750 |
| }, |
| { |
| "epoch": 3.530298137982994, |
| "grad_norm": 0.6004464030265808, |
| "learning_rate": 0.0003886779441870488, |
| "loss": 3.4689, |
| "step": 32800 |
| }, |
| { |
| "epoch": 3.5356796900226026, |
| "grad_norm": 0.6028228998184204, |
| "learning_rate": 0.0003883547031569874, |
| "loss": 3.4587, |
| "step": 32850 |
| }, |
| { |
| "epoch": 3.5410612420622107, |
| "grad_norm": 0.6320363879203796, |
| "learning_rate": 0.0003880314621269259, |
| "loss": 3.4762, |
| "step": 32900 |
| }, |
| { |
| "epoch": 3.546442794101819, |
| "grad_norm": 0.6261252164840698, |
| "learning_rate": 0.0003877082210968645, |
| "loss": 3.4774, |
| "step": 32950 |
| }, |
| { |
| "epoch": 3.5518243461414274, |
| "grad_norm": 0.5780692100524902, |
| "learning_rate": 0.00038738498006680316, |
| "loss": 3.468, |
| "step": 33000 |
| }, |
| { |
| "epoch": 3.5518243461414274, |
| "eval_accuracy": 0.3728782399883698, |
| "eval_loss": 3.4731545448303223, |
| "eval_runtime": 182.9321, |
| "eval_samples_per_second": 98.457, |
| "eval_steps_per_second": 6.155, |
| "step": 33000 |
| }, |
| { |
| "epoch": 3.5572058981810355, |
| "grad_norm": 0.5741873979568481, |
| "learning_rate": 0.0003870617390367417, |
| "loss": 3.4358, |
| "step": 33050 |
| }, |
| { |
| "epoch": 3.5625874502206436, |
| "grad_norm": 0.587976336479187, |
| "learning_rate": 0.0003867384980066803, |
| "loss": 3.4524, |
| "step": 33100 |
| }, |
| { |
| "epoch": 3.5679690022602517, |
| "grad_norm": 0.5886296033859253, |
| "learning_rate": 0.00038641525697661884, |
| "loss": 3.4588, |
| "step": 33150 |
| }, |
| { |
| "epoch": 3.57335055429986, |
| "grad_norm": 0.6252454519271851, |
| "learning_rate": 0.00038609201594655743, |
| "loss": 3.4569, |
| "step": 33200 |
| }, |
| { |
| "epoch": 3.5787321063394684, |
| "grad_norm": 0.6233946681022644, |
| "learning_rate": 0.000385768774916496, |
| "loss": 3.4602, |
| "step": 33250 |
| }, |
| { |
| "epoch": 3.5841136583790765, |
| "grad_norm": 0.6176490783691406, |
| "learning_rate": 0.0003854455338864346, |
| "loss": 3.4731, |
| "step": 33300 |
| }, |
| { |
| "epoch": 3.5894952104186846, |
| "grad_norm": 0.585235059261322, |
| "learning_rate": 0.0003851222928563732, |
| "loss": 3.4542, |
| "step": 33350 |
| }, |
| { |
| "epoch": 3.594876762458293, |
| "grad_norm": 0.6597333550453186, |
| "learning_rate": 0.0003847990518263118, |
| "loss": 3.4511, |
| "step": 33400 |
| }, |
| { |
| "epoch": 3.6002583144979012, |
| "grad_norm": 0.5512192249298096, |
| "learning_rate": 0.0003844822756168516, |
| "loss": 3.471, |
| "step": 33450 |
| }, |
| { |
| "epoch": 3.6056398665375093, |
| "grad_norm": 0.5989030003547668, |
| "learning_rate": 0.00038415903458679016, |
| "loss": 3.4706, |
| "step": 33500 |
| }, |
| { |
| "epoch": 3.611021418577118, |
| "grad_norm": 0.643202543258667, |
| "learning_rate": 0.00038383579355672875, |
| "loss": 3.4446, |
| "step": 33550 |
| }, |
| { |
| "epoch": 3.616402970616726, |
| "grad_norm": 0.6083823442459106, |
| "learning_rate": 0.00038351255252666735, |
| "loss": 3.4702, |
| "step": 33600 |
| }, |
| { |
| "epoch": 3.621784522656334, |
| "grad_norm": 0.6128966212272644, |
| "learning_rate": 0.00038318931149660594, |
| "loss": 3.4584, |
| "step": 33650 |
| }, |
| { |
| "epoch": 3.627166074695942, |
| "grad_norm": 0.6018299460411072, |
| "learning_rate": 0.00038286607046654454, |
| "loss": 3.4597, |
| "step": 33700 |
| }, |
| { |
| "epoch": 3.6325476267355503, |
| "grad_norm": 0.6082707643508911, |
| "learning_rate": 0.00038254282943648313, |
| "loss": 3.4459, |
| "step": 33750 |
| }, |
| { |
| "epoch": 3.637929178775159, |
| "grad_norm": 0.5940393805503845, |
| "learning_rate": 0.00038221958840642167, |
| "loss": 3.4514, |
| "step": 33800 |
| }, |
| { |
| "epoch": 3.643310730814767, |
| "grad_norm": 0.6136729121208191, |
| "learning_rate": 0.00038189634737636027, |
| "loss": 3.4703, |
| "step": 33850 |
| }, |
| { |
| "epoch": 3.648692282854375, |
| "grad_norm": 0.571419358253479, |
| "learning_rate": 0.0003815731063462988, |
| "loss": 3.4434, |
| "step": 33900 |
| }, |
| { |
| "epoch": 3.6540738348939836, |
| "grad_norm": 0.5937557220458984, |
| "learning_rate": 0.00038124986531623745, |
| "loss": 3.4589, |
| "step": 33950 |
| }, |
| { |
| "epoch": 3.6594553869335917, |
| "grad_norm": 0.6121259331703186, |
| "learning_rate": 0.00038092662428617605, |
| "loss": 3.4691, |
| "step": 34000 |
| }, |
| { |
| "epoch": 3.6594553869335917, |
| "eval_accuracy": 0.3735422177858041, |
| "eval_loss": 3.4664924144744873, |
| "eval_runtime": 183.1266, |
| "eval_samples_per_second": 98.353, |
| "eval_steps_per_second": 6.149, |
| "step": 34000 |
| }, |
| { |
| "epoch": 3.6648369389732, |
| "grad_norm": 0.6286929249763489, |
| "learning_rate": 0.0003806033832561146, |
| "loss": 3.4675, |
| "step": 34050 |
| }, |
| { |
| "epoch": 3.670218491012808, |
| "grad_norm": 0.6216891407966614, |
| "learning_rate": 0.0003802801422260532, |
| "loss": 3.4353, |
| "step": 34100 |
| }, |
| { |
| "epoch": 3.675600043052416, |
| "grad_norm": 0.5747387409210205, |
| "learning_rate": 0.0003799569011959918, |
| "loss": 3.4438, |
| "step": 34150 |
| }, |
| { |
| "epoch": 3.6809815950920246, |
| "grad_norm": 0.6043647527694702, |
| "learning_rate": 0.0003796336601659303, |
| "loss": 3.4652, |
| "step": 34200 |
| }, |
| { |
| "epoch": 3.6863631471316327, |
| "grad_norm": 0.6015361547470093, |
| "learning_rate": 0.00037931041913586897, |
| "loss": 3.4511, |
| "step": 34250 |
| }, |
| { |
| "epoch": 3.691744699171241, |
| "grad_norm": 0.5804011225700378, |
| "learning_rate": 0.00037898717810580756, |
| "loss": 3.4713, |
| "step": 34300 |
| }, |
| { |
| "epoch": 3.6971262512108494, |
| "grad_norm": 0.5761457681655884, |
| "learning_rate": 0.0003786639370757461, |
| "loss": 3.4658, |
| "step": 34350 |
| }, |
| { |
| "epoch": 3.7025078032504575, |
| "grad_norm": 0.5895703434944153, |
| "learning_rate": 0.0003783406960456847, |
| "loss": 3.4654, |
| "step": 34400 |
| }, |
| { |
| "epoch": 3.7078893552900656, |
| "grad_norm": 0.6082561016082764, |
| "learning_rate": 0.00037801745501562324, |
| "loss": 3.4621, |
| "step": 34450 |
| }, |
| { |
| "epoch": 3.713270907329674, |
| "grad_norm": 0.6229493021965027, |
| "learning_rate": 0.0003776942139855619, |
| "loss": 3.4451, |
| "step": 34500 |
| }, |
| { |
| "epoch": 3.7186524593692822, |
| "grad_norm": 0.5866677165031433, |
| "learning_rate": 0.0003773709729555005, |
| "loss": 3.4595, |
| "step": 34550 |
| }, |
| { |
| "epoch": 3.7240340114088903, |
| "grad_norm": 0.6240717172622681, |
| "learning_rate": 0.000377047731925439, |
| "loss": 3.4571, |
| "step": 34600 |
| }, |
| { |
| "epoch": 3.7294155634484984, |
| "grad_norm": 0.6172893047332764, |
| "learning_rate": 0.0003767244908953776, |
| "loss": 3.4676, |
| "step": 34650 |
| }, |
| { |
| "epoch": 3.7347971154881066, |
| "grad_norm": 0.6289220452308655, |
| "learning_rate": 0.0003764012498653162, |
| "loss": 3.4464, |
| "step": 34700 |
| }, |
| { |
| "epoch": 3.740178667527715, |
| "grad_norm": 0.6364248991012573, |
| "learning_rate": 0.00037607800883525475, |
| "loss": 3.465, |
| "step": 34750 |
| }, |
| { |
| "epoch": 3.745560219567323, |
| "grad_norm": 0.6611288189888, |
| "learning_rate": 0.0003757547678051934, |
| "loss": 3.4465, |
| "step": 34800 |
| }, |
| { |
| "epoch": 3.7509417716069313, |
| "grad_norm": 0.5711609125137329, |
| "learning_rate": 0.000375431526775132, |
| "loss": 3.4531, |
| "step": 34850 |
| }, |
| { |
| "epoch": 3.75632332364654, |
| "grad_norm": 0.6151444315910339, |
| "learning_rate": 0.00037510828574507054, |
| "loss": 3.4685, |
| "step": 34900 |
| }, |
| { |
| "epoch": 3.761704875686148, |
| "grad_norm": 0.6230692267417908, |
| "learning_rate": 0.00037478504471500913, |
| "loss": 3.446, |
| "step": 34950 |
| }, |
| { |
| "epoch": 3.767086427725756, |
| "grad_norm": 0.6537262797355652, |
| "learning_rate": 0.00037446180368494767, |
| "loss": 3.4681, |
| "step": 35000 |
| }, |
| { |
| "epoch": 3.767086427725756, |
| "eval_accuracy": 0.37434853086627534, |
| "eval_loss": 3.460052728652954, |
| "eval_runtime": 182.9222, |
| "eval_samples_per_second": 98.463, |
| "eval_steps_per_second": 6.156, |
| "step": 35000 |
| }, |
| { |
| "epoch": 3.772467979765364, |
| "grad_norm": 0.6269912123680115, |
| "learning_rate": 0.00037413856265488627, |
| "loss": 3.4745, |
| "step": 35050 |
| }, |
| { |
| "epoch": 3.7778495318049723, |
| "grad_norm": 0.6322194337844849, |
| "learning_rate": 0.0003738153216248249, |
| "loss": 3.4693, |
| "step": 35100 |
| }, |
| { |
| "epoch": 3.783231083844581, |
| "grad_norm": 0.632526695728302, |
| "learning_rate": 0.00037349208059476346, |
| "loss": 3.4557, |
| "step": 35150 |
| }, |
| { |
| "epoch": 3.788612635884189, |
| "grad_norm": 0.5771685242652893, |
| "learning_rate": 0.00037316883956470205, |
| "loss": 3.4585, |
| "step": 35200 |
| }, |
| { |
| "epoch": 3.793994187923797, |
| "grad_norm": 0.5918084979057312, |
| "learning_rate": 0.00037284559853464064, |
| "loss": 3.4503, |
| "step": 35250 |
| }, |
| { |
| "epoch": 3.7993757399634056, |
| "grad_norm": 0.6038704514503479, |
| "learning_rate": 0.0003725223575045792, |
| "loss": 3.4631, |
| "step": 35300 |
| }, |
| { |
| "epoch": 3.8047572920030137, |
| "grad_norm": 0.5990985631942749, |
| "learning_rate": 0.0003721991164745178, |
| "loss": 3.4527, |
| "step": 35350 |
| }, |
| { |
| "epoch": 3.810138844042622, |
| "grad_norm": 0.6696856021881104, |
| "learning_rate": 0.00037187587544445643, |
| "loss": 3.4572, |
| "step": 35400 |
| }, |
| { |
| "epoch": 3.8155203960822304, |
| "grad_norm": 0.5810059309005737, |
| "learning_rate": 0.00037155909923499624, |
| "loss": 3.4439, |
| "step": 35450 |
| }, |
| { |
| "epoch": 3.8209019481218385, |
| "grad_norm": 0.6126786470413208, |
| "learning_rate": 0.0003712358582049348, |
| "loss": 3.4521, |
| "step": 35500 |
| }, |
| { |
| "epoch": 3.8262835001614466, |
| "grad_norm": 0.5759022235870361, |
| "learning_rate": 0.00037091261717487337, |
| "loss": 3.439, |
| "step": 35550 |
| }, |
| { |
| "epoch": 3.8316650522010547, |
| "grad_norm": 0.6269648671150208, |
| "learning_rate": 0.00037058937614481197, |
| "loss": 3.4444, |
| "step": 35600 |
| }, |
| { |
| "epoch": 3.837046604240663, |
| "grad_norm": 0.6104834079742432, |
| "learning_rate": 0.0003702661351147505, |
| "loss": 3.4494, |
| "step": 35650 |
| }, |
| { |
| "epoch": 3.8424281562802713, |
| "grad_norm": 0.623383641242981, |
| "learning_rate": 0.0003699428940846891, |
| "loss": 3.4771, |
| "step": 35700 |
| }, |
| { |
| "epoch": 3.8478097083198795, |
| "grad_norm": 0.6111641526222229, |
| "learning_rate": 0.00036961965305462775, |
| "loss": 3.4553, |
| "step": 35750 |
| }, |
| { |
| "epoch": 3.8531912603594876, |
| "grad_norm": 0.6193336844444275, |
| "learning_rate": 0.0003692964120245663, |
| "loss": 3.4714, |
| "step": 35800 |
| }, |
| { |
| "epoch": 3.858572812399096, |
| "grad_norm": 0.6333847641944885, |
| "learning_rate": 0.0003689731709945049, |
| "loss": 3.4516, |
| "step": 35850 |
| }, |
| { |
| "epoch": 3.863954364438704, |
| "grad_norm": 0.6365866661071777, |
| "learning_rate": 0.0003686499299644434, |
| "loss": 3.4439, |
| "step": 35900 |
| }, |
| { |
| "epoch": 3.8693359164783123, |
| "grad_norm": 0.6254480481147766, |
| "learning_rate": 0.000368326688934382, |
| "loss": 3.4289, |
| "step": 35950 |
| }, |
| { |
| "epoch": 3.8747174685179204, |
| "grad_norm": 0.6164137125015259, |
| "learning_rate": 0.0003680034479043206, |
| "loss": 3.4485, |
| "step": 36000 |
| }, |
| { |
| "epoch": 3.8747174685179204, |
| "eval_accuracy": 0.37518298504469055, |
| "eval_loss": 3.4550580978393555, |
| "eval_runtime": 182.8899, |
| "eval_samples_per_second": 98.48, |
| "eval_steps_per_second": 6.157, |
| "step": 36000 |
| }, |
| { |
| "epoch": 3.8800990205575285, |
| "grad_norm": 0.599606990814209, |
| "learning_rate": 0.0003676802068742592, |
| "loss": 3.4553, |
| "step": 36050 |
| }, |
| { |
| "epoch": 3.885480572597137, |
| "grad_norm": 0.5952869653701782, |
| "learning_rate": 0.0003673569658441978, |
| "loss": 3.4557, |
| "step": 36100 |
| }, |
| { |
| "epoch": 3.890862124636745, |
| "grad_norm": 0.6147683262825012, |
| "learning_rate": 0.0003670337248141364, |
| "loss": 3.4397, |
| "step": 36150 |
| }, |
| { |
| "epoch": 3.8962436766763533, |
| "grad_norm": 0.6182876229286194, |
| "learning_rate": 0.00036671048378407494, |
| "loss": 3.4621, |
| "step": 36200 |
| }, |
| { |
| "epoch": 3.901625228715962, |
| "grad_norm": 0.6036136746406555, |
| "learning_rate": 0.00036638724275401353, |
| "loss": 3.459, |
| "step": 36250 |
| }, |
| { |
| "epoch": 3.90700678075557, |
| "grad_norm": 0.6490514278411865, |
| "learning_rate": 0.0003660640017239522, |
| "loss": 3.4717, |
| "step": 36300 |
| }, |
| { |
| "epoch": 3.912388332795178, |
| "grad_norm": 0.5808064341545105, |
| "learning_rate": 0.0003657407606938907, |
| "loss": 3.4619, |
| "step": 36350 |
| }, |
| { |
| "epoch": 3.9177698848347866, |
| "grad_norm": 0.6013696193695068, |
| "learning_rate": 0.0003654175196638293, |
| "loss": 3.4525, |
| "step": 36400 |
| }, |
| { |
| "epoch": 3.9231514368743947, |
| "grad_norm": 0.609664797782898, |
| "learning_rate": 0.00036509427863376786, |
| "loss": 3.4375, |
| "step": 36450 |
| }, |
| { |
| "epoch": 3.928532988914003, |
| "grad_norm": 0.6366161704063416, |
| "learning_rate": 0.00036477103760370645, |
| "loss": 3.4491, |
| "step": 36500 |
| }, |
| { |
| "epoch": 3.933914540953611, |
| "grad_norm": 0.5997065305709839, |
| "learning_rate": 0.00036444779657364505, |
| "loss": 3.4635, |
| "step": 36550 |
| }, |
| { |
| "epoch": 3.939296092993219, |
| "grad_norm": 0.6210324764251709, |
| "learning_rate": 0.00036412455554358364, |
| "loss": 3.4626, |
| "step": 36600 |
| }, |
| { |
| "epoch": 3.9446776450328276, |
| "grad_norm": 0.6512171030044556, |
| "learning_rate": 0.00036380131451352224, |
| "loss": 3.4469, |
| "step": 36650 |
| }, |
| { |
| "epoch": 3.9500591970724357, |
| "grad_norm": 0.6340328454971313, |
| "learning_rate": 0.00036347807348346083, |
| "loss": 3.4664, |
| "step": 36700 |
| }, |
| { |
| "epoch": 3.955440749112044, |
| "grad_norm": 0.5851263403892517, |
| "learning_rate": 0.00036315483245339937, |
| "loss": 3.4502, |
| "step": 36750 |
| }, |
| { |
| "epoch": 3.9608223011516523, |
| "grad_norm": 0.6642547845840454, |
| "learning_rate": 0.00036283159142333797, |
| "loss": 3.4394, |
| "step": 36800 |
| }, |
| { |
| "epoch": 3.9662038531912605, |
| "grad_norm": 0.6904765963554382, |
| "learning_rate": 0.0003625083503932765, |
| "loss": 3.4662, |
| "step": 36850 |
| }, |
| { |
| "epoch": 3.9715854052308686, |
| "grad_norm": 0.6355055570602417, |
| "learning_rate": 0.00036218510936321516, |
| "loss": 3.4568, |
| "step": 36900 |
| }, |
| { |
| "epoch": 3.9769669572704767, |
| "grad_norm": 0.585128664970398, |
| "learning_rate": 0.00036186186833315375, |
| "loss": 3.4772, |
| "step": 36950 |
| }, |
| { |
| "epoch": 3.9823485093100848, |
| "grad_norm": 0.5850505232810974, |
| "learning_rate": 0.0003615386273030923, |
| "loss": 3.4428, |
| "step": 37000 |
| }, |
| { |
| "epoch": 3.9823485093100848, |
| "eval_accuracy": 0.37551470231118816, |
| "eval_loss": 3.449396848678589, |
| "eval_runtime": 182.7917, |
| "eval_samples_per_second": 98.533, |
| "eval_steps_per_second": 6.16, |
| "step": 37000 |
| }, |
| { |
| "epoch": 3.9877300613496933, |
| "grad_norm": 0.6555930376052856, |
| "learning_rate": 0.0003612153862730309, |
| "loss": 3.4316, |
| "step": 37050 |
| }, |
| { |
| "epoch": 3.9931116133893014, |
| "grad_norm": 0.6331802010536194, |
| "learning_rate": 0.0003608921452429695, |
| "loss": 3.4507, |
| "step": 37100 |
| }, |
| { |
| "epoch": 3.9984931654289095, |
| "grad_norm": 0.5921234488487244, |
| "learning_rate": 0.000360568904212908, |
| "loss": 3.4469, |
| "step": 37150 |
| }, |
| { |
| "epoch": 4.003874717468518, |
| "grad_norm": 0.627491295337677, |
| "learning_rate": 0.00036024566318284667, |
| "loss": 3.384, |
| "step": 37200 |
| }, |
| { |
| "epoch": 4.009256269508126, |
| "grad_norm": 0.6413177847862244, |
| "learning_rate": 0.00035992242215278526, |
| "loss": 3.367, |
| "step": 37250 |
| }, |
| { |
| "epoch": 4.014637821547734, |
| "grad_norm": 0.6309844255447388, |
| "learning_rate": 0.0003595991811227238, |
| "loss": 3.363, |
| "step": 37300 |
| }, |
| { |
| "epoch": 4.020019373587343, |
| "grad_norm": 0.6667311191558838, |
| "learning_rate": 0.0003592759400926624, |
| "loss": 3.3572, |
| "step": 37350 |
| }, |
| { |
| "epoch": 4.0254009256269505, |
| "grad_norm": 0.6064625382423401, |
| "learning_rate": 0.00035895269906260094, |
| "loss": 3.3597, |
| "step": 37400 |
| }, |
| { |
| "epoch": 4.030782477666559, |
| "grad_norm": 0.6433827877044678, |
| "learning_rate": 0.0003586359228531408, |
| "loss": 3.3492, |
| "step": 37450 |
| }, |
| { |
| "epoch": 4.036164029706168, |
| "grad_norm": 0.66212397813797, |
| "learning_rate": 0.00035831268182307934, |
| "loss": 3.3711, |
| "step": 37500 |
| }, |
| { |
| "epoch": 4.041545581745775, |
| "grad_norm": 0.6347793936729431, |
| "learning_rate": 0.000357989440793018, |
| "loss": 3.3516, |
| "step": 37550 |
| }, |
| { |
| "epoch": 4.046927133785384, |
| "grad_norm": 0.6682549715042114, |
| "learning_rate": 0.0003576661997629566, |
| "loss": 3.3482, |
| "step": 37600 |
| }, |
| { |
| "epoch": 4.0523086858249915, |
| "grad_norm": 1.5289701223373413, |
| "learning_rate": 0.0003573429587328951, |
| "loss": 3.3792, |
| "step": 37650 |
| }, |
| { |
| "epoch": 4.0576902378646, |
| "grad_norm": 0.6569393873214722, |
| "learning_rate": 0.0003570197177028337, |
| "loss": 3.3742, |
| "step": 37700 |
| }, |
| { |
| "epoch": 4.063071789904209, |
| "grad_norm": 0.630212128162384, |
| "learning_rate": 0.00035669647667277226, |
| "loss": 3.368, |
| "step": 37750 |
| }, |
| { |
| "epoch": 4.068453341943816, |
| "grad_norm": 0.6595259308815002, |
| "learning_rate": 0.00035637323564271085, |
| "loss": 3.3954, |
| "step": 37800 |
| }, |
| { |
| "epoch": 4.073834893983425, |
| "grad_norm": 0.6565824151039124, |
| "learning_rate": 0.0003560499946126495, |
| "loss": 3.3667, |
| "step": 37850 |
| }, |
| { |
| "epoch": 4.079216446023033, |
| "grad_norm": 0.6169015765190125, |
| "learning_rate": 0.00035572675358258804, |
| "loss": 3.38, |
| "step": 37900 |
| }, |
| { |
| "epoch": 4.084597998062641, |
| "grad_norm": 0.6400800347328186, |
| "learning_rate": 0.00035540351255252664, |
| "loss": 3.3704, |
| "step": 37950 |
| }, |
| { |
| "epoch": 4.08997955010225, |
| "grad_norm": 0.6978241801261902, |
| "learning_rate": 0.00035508027152246523, |
| "loss": 3.3682, |
| "step": 38000 |
| }, |
| { |
| "epoch": 4.08997955010225, |
| "eval_accuracy": 0.3757263581366508, |
| "eval_loss": 3.452324151992798, |
| "eval_runtime": 182.3279, |
| "eval_samples_per_second": 98.784, |
| "eval_steps_per_second": 6.176, |
| "step": 38000 |
| }, |
| { |
| "epoch": 4.095361102141858, |
| "grad_norm": 0.6283411383628845, |
| "learning_rate": 0.0003547570304924038, |
| "loss": 3.3862, |
| "step": 38050 |
| }, |
| { |
| "epoch": 4.100742654181466, |
| "grad_norm": 0.6307471394538879, |
| "learning_rate": 0.0003544337894623424, |
| "loss": 3.3747, |
| "step": 38100 |
| }, |
| { |
| "epoch": 4.106124206221074, |
| "grad_norm": 0.597294270992279, |
| "learning_rate": 0.000354110548432281, |
| "loss": 3.3646, |
| "step": 38150 |
| }, |
| { |
| "epoch": 4.111505758260682, |
| "grad_norm": 0.5953186750411987, |
| "learning_rate": 0.00035378730740221956, |
| "loss": 3.3824, |
| "step": 38200 |
| }, |
| { |
| "epoch": 4.1168873103002905, |
| "grad_norm": 0.624042272567749, |
| "learning_rate": 0.00035346406637215815, |
| "loss": 3.3978, |
| "step": 38250 |
| }, |
| { |
| "epoch": 4.122268862339899, |
| "grad_norm": 0.6373803615570068, |
| "learning_rate": 0.0003531408253420967, |
| "loss": 3.3895, |
| "step": 38300 |
| }, |
| { |
| "epoch": 4.127650414379507, |
| "grad_norm": 0.6241115927696228, |
| "learning_rate": 0.0003528175843120353, |
| "loss": 3.382, |
| "step": 38350 |
| }, |
| { |
| "epoch": 4.133031966419115, |
| "grad_norm": 0.6354626417160034, |
| "learning_rate": 0.00035249434328197394, |
| "loss": 3.3788, |
| "step": 38400 |
| }, |
| { |
| "epoch": 4.138413518458724, |
| "grad_norm": 0.6405816674232483, |
| "learning_rate": 0.0003521711022519125, |
| "loss": 3.4024, |
| "step": 38450 |
| }, |
| { |
| "epoch": 4.1437950704983315, |
| "grad_norm": 0.6165269613265991, |
| "learning_rate": 0.00035184786122185107, |
| "loss": 3.3723, |
| "step": 38500 |
| }, |
| { |
| "epoch": 4.14917662253794, |
| "grad_norm": 0.6947121024131775, |
| "learning_rate": 0.00035152462019178967, |
| "loss": 3.3761, |
| "step": 38550 |
| }, |
| { |
| "epoch": 4.154558174577549, |
| "grad_norm": 0.6098374724388123, |
| "learning_rate": 0.0003512013791617282, |
| "loss": 3.37, |
| "step": 38600 |
| }, |
| { |
| "epoch": 4.159939726617156, |
| "grad_norm": 0.6297115087509155, |
| "learning_rate": 0.0003508781381316668, |
| "loss": 3.3762, |
| "step": 38650 |
| }, |
| { |
| "epoch": 4.165321278656765, |
| "grad_norm": 0.6331884860992432, |
| "learning_rate": 0.00035055489710160545, |
| "loss": 3.371, |
| "step": 38700 |
| }, |
| { |
| "epoch": 4.1707028306963725, |
| "grad_norm": 0.693463146686554, |
| "learning_rate": 0.000350231656071544, |
| "loss": 3.3778, |
| "step": 38750 |
| }, |
| { |
| "epoch": 4.176084382735981, |
| "grad_norm": 0.6266399025917053, |
| "learning_rate": 0.0003499084150414826, |
| "loss": 3.3779, |
| "step": 38800 |
| }, |
| { |
| "epoch": 4.18146593477559, |
| "grad_norm": 0.6141369342803955, |
| "learning_rate": 0.0003495851740114211, |
| "loss": 3.3825, |
| "step": 38850 |
| }, |
| { |
| "epoch": 4.186847486815197, |
| "grad_norm": 0.632418155670166, |
| "learning_rate": 0.0003492619329813597, |
| "loss": 3.3927, |
| "step": 38900 |
| }, |
| { |
| "epoch": 4.192229038854806, |
| "grad_norm": 0.6271191239356995, |
| "learning_rate": 0.0003489386919512983, |
| "loss": 3.3714, |
| "step": 38950 |
| }, |
| { |
| "epoch": 4.197610590894414, |
| "grad_norm": 0.7061901688575745, |
| "learning_rate": 0.0003486154509212369, |
| "loss": 3.3809, |
| "step": 39000 |
| }, |
| { |
| "epoch": 4.197610590894414, |
| "eval_accuracy": 0.37637001784406376, |
| "eval_loss": 3.448011875152588, |
| "eval_runtime": 182.5317, |
| "eval_samples_per_second": 98.673, |
| "eval_steps_per_second": 6.169, |
| "step": 39000 |
| }, |
| { |
| "epoch": 4.202992142934022, |
| "grad_norm": 0.6355361342430115, |
| "learning_rate": 0.0003482922098911755, |
| "loss": 3.3757, |
| "step": 39050 |
| }, |
| { |
| "epoch": 4.208373694973631, |
| "grad_norm": 0.6252569556236267, |
| "learning_rate": 0.0003479689688611141, |
| "loss": 3.3928, |
| "step": 39100 |
| }, |
| { |
| "epoch": 4.213755247013238, |
| "grad_norm": 0.6373631358146667, |
| "learning_rate": 0.00034764572783105264, |
| "loss": 3.3829, |
| "step": 39150 |
| }, |
| { |
| "epoch": 4.219136799052847, |
| "grad_norm": 0.6141077280044556, |
| "learning_rate": 0.00034732248680099123, |
| "loss": 3.3808, |
| "step": 39200 |
| }, |
| { |
| "epoch": 4.224518351092455, |
| "grad_norm": 0.6714136004447937, |
| "learning_rate": 0.00034700571059153104, |
| "loss": 3.3783, |
| "step": 39250 |
| }, |
| { |
| "epoch": 4.229899903132063, |
| "grad_norm": 0.6982507705688477, |
| "learning_rate": 0.00034668246956146963, |
| "loss": 3.3836, |
| "step": 39300 |
| }, |
| { |
| "epoch": 4.2352814551716715, |
| "grad_norm": 0.6456509828567505, |
| "learning_rate": 0.00034635922853140823, |
| "loss": 3.382, |
| "step": 39350 |
| }, |
| { |
| "epoch": 4.24066300721128, |
| "grad_norm": 0.6480652689933777, |
| "learning_rate": 0.0003460359875013468, |
| "loss": 3.3935, |
| "step": 39400 |
| }, |
| { |
| "epoch": 4.246044559250888, |
| "grad_norm": 0.6829937696456909, |
| "learning_rate": 0.0003457127464712854, |
| "loss": 3.3913, |
| "step": 39450 |
| }, |
| { |
| "epoch": 4.251426111290496, |
| "grad_norm": 0.6466607451438904, |
| "learning_rate": 0.00034538950544122396, |
| "loss": 3.3826, |
| "step": 39500 |
| }, |
| { |
| "epoch": 4.256807663330104, |
| "grad_norm": 0.665008544921875, |
| "learning_rate": 0.00034506626441116255, |
| "loss": 3.3962, |
| "step": 39550 |
| }, |
| { |
| "epoch": 4.2621892153697125, |
| "grad_norm": 0.633686363697052, |
| "learning_rate": 0.0003447430233811011, |
| "loss": 3.3794, |
| "step": 39600 |
| }, |
| { |
| "epoch": 4.267570767409321, |
| "grad_norm": 0.706251859664917, |
| "learning_rate": 0.00034441978235103974, |
| "loss": 3.3995, |
| "step": 39650 |
| }, |
| { |
| "epoch": 4.272952319448929, |
| "grad_norm": 0.6631447076797485, |
| "learning_rate": 0.00034409654132097834, |
| "loss": 3.3963, |
| "step": 39700 |
| }, |
| { |
| "epoch": 4.278333871488537, |
| "grad_norm": 0.6600978374481201, |
| "learning_rate": 0.0003437733002909169, |
| "loss": 3.3984, |
| "step": 39750 |
| }, |
| { |
| "epoch": 4.283715423528146, |
| "grad_norm": 0.624860942363739, |
| "learning_rate": 0.00034345005926085547, |
| "loss": 3.3892, |
| "step": 39800 |
| }, |
| { |
| "epoch": 4.2890969755677535, |
| "grad_norm": 0.6655292510986328, |
| "learning_rate": 0.00034312681823079407, |
| "loss": 3.3873, |
| "step": 39850 |
| }, |
| { |
| "epoch": 4.294478527607362, |
| "grad_norm": 0.6067180037498474, |
| "learning_rate": 0.00034280357720073266, |
| "loss": 3.4034, |
| "step": 39900 |
| }, |
| { |
| "epoch": 4.299860079646971, |
| "grad_norm": 0.6818814873695374, |
| "learning_rate": 0.00034248033617067126, |
| "loss": 3.3754, |
| "step": 39950 |
| }, |
| { |
| "epoch": 4.305241631686578, |
| "grad_norm": 0.6350970268249512, |
| "learning_rate": 0.00034215709514060985, |
| "loss": 3.3874, |
| "step": 40000 |
| }, |
| { |
| "epoch": 4.305241631686578, |
| "eval_accuracy": 0.3769179543573122, |
| "eval_loss": 3.442443609237671, |
| "eval_runtime": 182.8947, |
| "eval_samples_per_second": 98.477, |
| "eval_steps_per_second": 6.157, |
| "step": 40000 |
| }, |
| { |
| "epoch": 4.310623183726187, |
| "grad_norm": 0.6428865790367126, |
| "learning_rate": 0.0003418338541105484, |
| "loss": 3.3958, |
| "step": 40050 |
| }, |
| { |
| "epoch": 4.3160047357657945, |
| "grad_norm": 0.6549261212348938, |
| "learning_rate": 0.000341510613080487, |
| "loss": 3.3934, |
| "step": 40100 |
| }, |
| { |
| "epoch": 4.321386287805403, |
| "grad_norm": 0.6633482575416565, |
| "learning_rate": 0.0003411873720504255, |
| "loss": 3.4035, |
| "step": 40150 |
| }, |
| { |
| "epoch": 4.326767839845012, |
| "grad_norm": 0.6681509613990784, |
| "learning_rate": 0.0003408641310203642, |
| "loss": 3.4035, |
| "step": 40200 |
| }, |
| { |
| "epoch": 4.332149391884619, |
| "grad_norm": 0.6671531200408936, |
| "learning_rate": 0.00034054088999030277, |
| "loss": 3.3972, |
| "step": 40250 |
| }, |
| { |
| "epoch": 4.337530943924228, |
| "grad_norm": 0.6722056865692139, |
| "learning_rate": 0.0003402176489602413, |
| "loss": 3.3925, |
| "step": 40300 |
| }, |
| { |
| "epoch": 4.342912495963836, |
| "grad_norm": 0.634778618812561, |
| "learning_rate": 0.0003398944079301799, |
| "loss": 3.3951, |
| "step": 40350 |
| }, |
| { |
| "epoch": 4.348294048003444, |
| "grad_norm": 0.6151086091995239, |
| "learning_rate": 0.0003395711669001185, |
| "loss": 3.4087, |
| "step": 40400 |
| }, |
| { |
| "epoch": 4.3536756000430525, |
| "grad_norm": 0.6367116570472717, |
| "learning_rate": 0.00033924792587005704, |
| "loss": 3.3993, |
| "step": 40450 |
| }, |
| { |
| "epoch": 4.359057152082661, |
| "grad_norm": 0.6156135201454163, |
| "learning_rate": 0.0003389246848399957, |
| "loss": 3.3765, |
| "step": 40500 |
| }, |
| { |
| "epoch": 4.364438704122269, |
| "grad_norm": 0.6550938487052917, |
| "learning_rate": 0.0003386014438099343, |
| "loss": 3.4011, |
| "step": 40550 |
| }, |
| { |
| "epoch": 4.369820256161877, |
| "grad_norm": 0.6099169254302979, |
| "learning_rate": 0.0003382782027798728, |
| "loss": 3.3948, |
| "step": 40600 |
| }, |
| { |
| "epoch": 4.375201808201485, |
| "grad_norm": 0.6787609457969666, |
| "learning_rate": 0.0003379549617498114, |
| "loss": 3.4118, |
| "step": 40650 |
| }, |
| { |
| "epoch": 4.3805833602410935, |
| "grad_norm": 0.6305999755859375, |
| "learning_rate": 0.00033763172071974996, |
| "loss": 3.4038, |
| "step": 40700 |
| }, |
| { |
| "epoch": 4.385964912280702, |
| "grad_norm": 0.6593831181526184, |
| "learning_rate": 0.00033730847968968855, |
| "loss": 3.4024, |
| "step": 40750 |
| }, |
| { |
| "epoch": 4.39134646432031, |
| "grad_norm": 0.6971632838249207, |
| "learning_rate": 0.0003369852386596272, |
| "loss": 3.3823, |
| "step": 40800 |
| }, |
| { |
| "epoch": 4.396728016359918, |
| "grad_norm": 0.6232396960258484, |
| "learning_rate": 0.00033666199762956574, |
| "loss": 3.4003, |
| "step": 40850 |
| }, |
| { |
| "epoch": 4.402109568399527, |
| "grad_norm": 0.6262663006782532, |
| "learning_rate": 0.00033633875659950434, |
| "loss": 3.391, |
| "step": 40900 |
| }, |
| { |
| "epoch": 4.4074911204391345, |
| "grad_norm": 0.652194082736969, |
| "learning_rate": 0.0003360155155694429, |
| "loss": 3.373, |
| "step": 40950 |
| }, |
| { |
| "epoch": 4.412872672478743, |
| "grad_norm": 0.645897388458252, |
| "learning_rate": 0.0003356922745393815, |
| "loss": 3.4029, |
| "step": 41000 |
| }, |
| { |
| "epoch": 4.412872672478743, |
| "eval_accuracy": 0.37729009049807677, |
| "eval_loss": 3.4402434825897217, |
| "eval_runtime": 182.7308, |
| "eval_samples_per_second": 98.566, |
| "eval_steps_per_second": 6.162, |
| "step": 41000 |
| }, |
| { |
| "epoch": 4.418254224518351, |
| "grad_norm": 0.6175161004066467, |
| "learning_rate": 0.0003353690335093201, |
| "loss": 3.3898, |
| "step": 41050 |
| }, |
| { |
| "epoch": 4.423635776557959, |
| "grad_norm": 0.6909878849983215, |
| "learning_rate": 0.0003350457924792587, |
| "loss": 3.3979, |
| "step": 41100 |
| }, |
| { |
| "epoch": 4.429017328597568, |
| "grad_norm": 0.6703479290008545, |
| "learning_rate": 0.00033472255144919726, |
| "loss": 3.403, |
| "step": 41150 |
| }, |
| { |
| "epoch": 4.4343988806371755, |
| "grad_norm": 0.635631799697876, |
| "learning_rate": 0.00033439931041913585, |
| "loss": 3.3764, |
| "step": 41200 |
| }, |
| { |
| "epoch": 4.439780432676784, |
| "grad_norm": 0.6716395616531372, |
| "learning_rate": 0.0003340760693890744, |
| "loss": 3.3872, |
| "step": 41250 |
| }, |
| { |
| "epoch": 4.445161984716393, |
| "grad_norm": 0.6503021717071533, |
| "learning_rate": 0.000333752828359013, |
| "loss": 3.3977, |
| "step": 41300 |
| }, |
| { |
| "epoch": 4.450543536756, |
| "grad_norm": 0.6269751191139221, |
| "learning_rate": 0.00033342958732895164, |
| "loss": 3.3794, |
| "step": 41350 |
| }, |
| { |
| "epoch": 4.455925088795609, |
| "grad_norm": 0.6848242282867432, |
| "learning_rate": 0.0003331063462988902, |
| "loss": 3.3979, |
| "step": 41400 |
| }, |
| { |
| "epoch": 4.461306640835216, |
| "grad_norm": 0.6538987755775452, |
| "learning_rate": 0.00033278310526882877, |
| "loss": 3.3977, |
| "step": 41450 |
| }, |
| { |
| "epoch": 4.466688192874825, |
| "grad_norm": 0.687504768371582, |
| "learning_rate": 0.0003324598642387673, |
| "loss": 3.388, |
| "step": 41500 |
| }, |
| { |
| "epoch": 4.4720697449144335, |
| "grad_norm": 0.6829115748405457, |
| "learning_rate": 0.0003321366232087059, |
| "loss": 3.3855, |
| "step": 41550 |
| }, |
| { |
| "epoch": 4.477451296954041, |
| "grad_norm": 0.6400046944618225, |
| "learning_rate": 0.0003318133821786445, |
| "loss": 3.3888, |
| "step": 41600 |
| }, |
| { |
| "epoch": 4.48283284899365, |
| "grad_norm": 0.7195608615875244, |
| "learning_rate": 0.0003314901411485831, |
| "loss": 3.386, |
| "step": 41650 |
| }, |
| { |
| "epoch": 4.488214401033258, |
| "grad_norm": 0.6601772904396057, |
| "learning_rate": 0.0003311669001185217, |
| "loss": 3.4025, |
| "step": 41700 |
| }, |
| { |
| "epoch": 4.493595953072866, |
| "grad_norm": 0.6627285480499268, |
| "learning_rate": 0.0003308436590884603, |
| "loss": 3.3845, |
| "step": 41750 |
| }, |
| { |
| "epoch": 4.4989775051124745, |
| "grad_norm": 0.6553217172622681, |
| "learning_rate": 0.0003305204180583988, |
| "loss": 3.3975, |
| "step": 41800 |
| }, |
| { |
| "epoch": 4.504359057152083, |
| "grad_norm": 0.6015192270278931, |
| "learning_rate": 0.0003302036418489387, |
| "loss": 3.4143, |
| "step": 41850 |
| }, |
| { |
| "epoch": 4.509740609191691, |
| "grad_norm": 0.709814727306366, |
| "learning_rate": 0.0003298804008188772, |
| "loss": 3.3908, |
| "step": 41900 |
| }, |
| { |
| "epoch": 4.515122161231299, |
| "grad_norm": 0.6477102637290955, |
| "learning_rate": 0.0003295571597888158, |
| "loss": 3.3887, |
| "step": 41950 |
| }, |
| { |
| "epoch": 4.520503713270907, |
| "grad_norm": 0.6681662201881409, |
| "learning_rate": 0.00032923391875875447, |
| "loss": 3.4236, |
| "step": 42000 |
| }, |
| { |
| "epoch": 4.520503713270907, |
| "eval_accuracy": 0.3778097772604935, |
| "eval_loss": 3.4332406520843506, |
| "eval_runtime": 182.6918, |
| "eval_samples_per_second": 98.587, |
| "eval_steps_per_second": 6.163, |
| "step": 42000 |
| }, |
| { |
| "epoch": 4.5258852653105155, |
| "grad_norm": 0.6888872385025024, |
| "learning_rate": 0.000328910677728693, |
| "loss": 3.399, |
| "step": 42050 |
| }, |
| { |
| "epoch": 4.531266817350124, |
| "grad_norm": 0.6245088577270508, |
| "learning_rate": 0.0003285874366986316, |
| "loss": 3.3807, |
| "step": 42100 |
| }, |
| { |
| "epoch": 4.536648369389732, |
| "grad_norm": 0.6789868474006653, |
| "learning_rate": 0.00032826419566857015, |
| "loss": 3.385, |
| "step": 42150 |
| }, |
| { |
| "epoch": 4.54202992142934, |
| "grad_norm": 0.678841769695282, |
| "learning_rate": 0.00032794095463850874, |
| "loss": 3.4002, |
| "step": 42200 |
| }, |
| { |
| "epoch": 4.547411473468949, |
| "grad_norm": 0.6410040855407715, |
| "learning_rate": 0.0003276177136084473, |
| "loss": 3.3948, |
| "step": 42250 |
| }, |
| { |
| "epoch": 4.5527930255085565, |
| "grad_norm": 0.6948608756065369, |
| "learning_rate": 0.00032729447257838593, |
| "loss": 3.4055, |
| "step": 42300 |
| }, |
| { |
| "epoch": 4.558174577548165, |
| "grad_norm": 0.6666476726531982, |
| "learning_rate": 0.0003269712315483245, |
| "loss": 3.4067, |
| "step": 42350 |
| }, |
| { |
| "epoch": 4.563556129587774, |
| "grad_norm": 0.6703059673309326, |
| "learning_rate": 0.00032664799051826306, |
| "loss": 3.3935, |
| "step": 42400 |
| }, |
| { |
| "epoch": 4.568937681627381, |
| "grad_norm": 0.6622447371482849, |
| "learning_rate": 0.00032632474948820166, |
| "loss": 3.3907, |
| "step": 42450 |
| }, |
| { |
| "epoch": 4.57431923366699, |
| "grad_norm": 0.6864114999771118, |
| "learning_rate": 0.00032600150845814025, |
| "loss": 3.3855, |
| "step": 42500 |
| }, |
| { |
| "epoch": 4.579700785706597, |
| "grad_norm": 0.6508845686912537, |
| "learning_rate": 0.0003256782674280788, |
| "loss": 3.3981, |
| "step": 42550 |
| }, |
| { |
| "epoch": 4.585082337746206, |
| "grad_norm": 0.6998566389083862, |
| "learning_rate": 0.00032535502639801744, |
| "loss": 3.3939, |
| "step": 42600 |
| }, |
| { |
| "epoch": 4.5904638897858145, |
| "grad_norm": 0.6499090194702148, |
| "learning_rate": 0.00032503178536795604, |
| "loss": 3.396, |
| "step": 42650 |
| }, |
| { |
| "epoch": 4.595845441825422, |
| "grad_norm": 0.6362233757972717, |
| "learning_rate": 0.0003247085443378946, |
| "loss": 3.394, |
| "step": 42700 |
| }, |
| { |
| "epoch": 4.601226993865031, |
| "grad_norm": 0.6605574488639832, |
| "learning_rate": 0.0003243853033078332, |
| "loss": 3.3787, |
| "step": 42750 |
| }, |
| { |
| "epoch": 4.606608545904638, |
| "grad_norm": 0.6438785195350647, |
| "learning_rate": 0.0003240620622777717, |
| "loss": 3.4107, |
| "step": 42800 |
| }, |
| { |
| "epoch": 4.611990097944247, |
| "grad_norm": 0.6508049368858337, |
| "learning_rate": 0.00032373882124771036, |
| "loss": 3.3849, |
| "step": 42850 |
| }, |
| { |
| "epoch": 4.6173716499838555, |
| "grad_norm": 0.651975691318512, |
| "learning_rate": 0.00032341558021764896, |
| "loss": 3.3751, |
| "step": 42900 |
| }, |
| { |
| "epoch": 4.622753202023463, |
| "grad_norm": 0.6845095157623291, |
| "learning_rate": 0.0003230923391875875, |
| "loss": 3.384, |
| "step": 42950 |
| }, |
| { |
| "epoch": 4.628134754063072, |
| "grad_norm": 0.6449900269508362, |
| "learning_rate": 0.0003227690981575261, |
| "loss": 3.3834, |
| "step": 43000 |
| }, |
| { |
| "epoch": 4.628134754063072, |
| "eval_accuracy": 0.37862836811728773, |
| "eval_loss": 3.428248643875122, |
| "eval_runtime": 182.9864, |
| "eval_samples_per_second": 98.428, |
| "eval_steps_per_second": 6.153, |
| "step": 43000 |
| }, |
| { |
| "epoch": 4.63351630610268, |
| "grad_norm": 0.7843633890151978, |
| "learning_rate": 0.0003224458571274647, |
| "loss": 3.394, |
| "step": 43050 |
| }, |
| { |
| "epoch": 4.638897858142288, |
| "grad_norm": 0.6628377437591553, |
| "learning_rate": 0.00032212261609740323, |
| "loss": 3.4261, |
| "step": 43100 |
| }, |
| { |
| "epoch": 4.6442794101818965, |
| "grad_norm": 0.6455616354942322, |
| "learning_rate": 0.0003217993750673419, |
| "loss": 3.3975, |
| "step": 43150 |
| }, |
| { |
| "epoch": 4.649660962221505, |
| "grad_norm": 0.6171681880950928, |
| "learning_rate": 0.00032147613403728047, |
| "loss": 3.3786, |
| "step": 43200 |
| }, |
| { |
| "epoch": 4.655042514261113, |
| "grad_norm": 0.6919983625411987, |
| "learning_rate": 0.000321152893007219, |
| "loss": 3.3941, |
| "step": 43250 |
| }, |
| { |
| "epoch": 4.660424066300721, |
| "grad_norm": 0.6645509004592896, |
| "learning_rate": 0.0003208296519771576, |
| "loss": 3.3736, |
| "step": 43300 |
| }, |
| { |
| "epoch": 4.665805618340329, |
| "grad_norm": 0.6968538165092468, |
| "learning_rate": 0.00032050641094709615, |
| "loss": 3.3966, |
| "step": 43350 |
| }, |
| { |
| "epoch": 4.6711871703799375, |
| "grad_norm": 0.6048688888549805, |
| "learning_rate": 0.00032018316991703474, |
| "loss": 3.3867, |
| "step": 43400 |
| }, |
| { |
| "epoch": 4.676568722419546, |
| "grad_norm": 0.638982892036438, |
| "learning_rate": 0.0003198599288869734, |
| "loss": 3.4118, |
| "step": 43450 |
| }, |
| { |
| "epoch": 4.681950274459154, |
| "grad_norm": 0.7195716500282288, |
| "learning_rate": 0.00031953668785691193, |
| "loss": 3.4058, |
| "step": 43500 |
| }, |
| { |
| "epoch": 4.687331826498762, |
| "grad_norm": 0.6764750480651855, |
| "learning_rate": 0.0003192134468268505, |
| "loss": 3.4037, |
| "step": 43550 |
| }, |
| { |
| "epoch": 4.692713378538371, |
| "grad_norm": 0.6833808422088623, |
| "learning_rate": 0.0003188902057967891, |
| "loss": 3.3974, |
| "step": 43600 |
| }, |
| { |
| "epoch": 4.6980949305779784, |
| "grad_norm": 0.6663907766342163, |
| "learning_rate": 0.00031856696476672766, |
| "loss": 3.3963, |
| "step": 43650 |
| }, |
| { |
| "epoch": 4.703476482617587, |
| "grad_norm": 0.661689281463623, |
| "learning_rate": 0.0003182437237366663, |
| "loss": 3.3907, |
| "step": 43700 |
| }, |
| { |
| "epoch": 4.7088580346571955, |
| "grad_norm": 0.6300943493843079, |
| "learning_rate": 0.0003179204827066049, |
| "loss": 3.3981, |
| "step": 43750 |
| }, |
| { |
| "epoch": 4.714239586696803, |
| "grad_norm": 0.6751649975776672, |
| "learning_rate": 0.00031759724167654344, |
| "loss": 3.4063, |
| "step": 43800 |
| }, |
| { |
| "epoch": 4.719621138736412, |
| "grad_norm": 0.6561940908432007, |
| "learning_rate": 0.00031727400064648204, |
| "loss": 3.4089, |
| "step": 43850 |
| }, |
| { |
| "epoch": 4.725002690776019, |
| "grad_norm": 0.6460406184196472, |
| "learning_rate": 0.0003169507596164206, |
| "loss": 3.4121, |
| "step": 43900 |
| }, |
| { |
| "epoch": 4.730384242815628, |
| "grad_norm": 0.7084411382675171, |
| "learning_rate": 0.0003166275185863592, |
| "loss": 3.397, |
| "step": 43950 |
| }, |
| { |
| "epoch": 4.7357657948552365, |
| "grad_norm": 0.6859269142150879, |
| "learning_rate": 0.0003163042775562978, |
| "loss": 3.3957, |
| "step": 44000 |
| }, |
| { |
| "epoch": 4.7357657948552365, |
| "eval_accuracy": 0.3787118135351293, |
| "eval_loss": 3.4233696460723877, |
| "eval_runtime": 182.724, |
| "eval_samples_per_second": 98.569, |
| "eval_steps_per_second": 6.162, |
| "step": 44000 |
| }, |
| { |
| "epoch": 4.741147346894844, |
| "grad_norm": 0.6808167695999146, |
| "learning_rate": 0.00031598103652623636, |
| "loss": 3.3983, |
| "step": 44050 |
| }, |
| { |
| "epoch": 4.746528898934453, |
| "grad_norm": 0.6858932375907898, |
| "learning_rate": 0.00031565779549617496, |
| "loss": 3.3932, |
| "step": 44100 |
| }, |
| { |
| "epoch": 4.751910450974061, |
| "grad_norm": 0.6018813252449036, |
| "learning_rate": 0.00031533455446611355, |
| "loss": 3.3963, |
| "step": 44150 |
| }, |
| { |
| "epoch": 4.757292003013669, |
| "grad_norm": 0.6454479098320007, |
| "learning_rate": 0.0003150113134360521, |
| "loss": 3.4044, |
| "step": 44200 |
| }, |
| { |
| "epoch": 4.7626735550532775, |
| "grad_norm": 0.68412846326828, |
| "learning_rate": 0.0003146880724059907, |
| "loss": 3.4075, |
| "step": 44250 |
| }, |
| { |
| "epoch": 4.768055107092886, |
| "grad_norm": 0.6248152852058411, |
| "learning_rate": 0.00031436483137592934, |
| "loss": 3.4068, |
| "step": 44300 |
| }, |
| { |
| "epoch": 4.773436659132494, |
| "grad_norm": 0.7138214111328125, |
| "learning_rate": 0.0003140415903458679, |
| "loss": 3.4007, |
| "step": 44350 |
| }, |
| { |
| "epoch": 4.778818211172102, |
| "grad_norm": 0.6977150440216064, |
| "learning_rate": 0.00031371834931580647, |
| "loss": 3.3887, |
| "step": 44400 |
| }, |
| { |
| "epoch": 4.78419976321171, |
| "grad_norm": 0.6793955564498901, |
| "learning_rate": 0.000313395108285745, |
| "loss": 3.3965, |
| "step": 44450 |
| }, |
| { |
| "epoch": 4.7895813152513185, |
| "grad_norm": 0.7022804617881775, |
| "learning_rate": 0.0003130718672556836, |
| "loss": 3.3973, |
| "step": 44500 |
| }, |
| { |
| "epoch": 4.794962867290927, |
| "grad_norm": 0.6722605228424072, |
| "learning_rate": 0.0003127486262256222, |
| "loss": 3.3857, |
| "step": 44550 |
| }, |
| { |
| "epoch": 4.800344419330535, |
| "grad_norm": 0.6196901798248291, |
| "learning_rate": 0.0003124253851955608, |
| "loss": 3.3995, |
| "step": 44600 |
| }, |
| { |
| "epoch": 4.805725971370143, |
| "grad_norm": 0.6370094418525696, |
| "learning_rate": 0.0003121021441654994, |
| "loss": 3.3821, |
| "step": 44650 |
| }, |
| { |
| "epoch": 4.811107523409751, |
| "grad_norm": 0.6328933238983154, |
| "learning_rate": 0.0003117853679560392, |
| "loss": 3.4183, |
| "step": 44700 |
| }, |
| { |
| "epoch": 4.8164890754493594, |
| "grad_norm": 0.7133774757385254, |
| "learning_rate": 0.0003114621269259778, |
| "loss": 3.376, |
| "step": 44750 |
| }, |
| { |
| "epoch": 4.821870627488968, |
| "grad_norm": 0.6690739393234253, |
| "learning_rate": 0.00031113888589591633, |
| "loss": 3.3853, |
| "step": 44800 |
| }, |
| { |
| "epoch": 4.827252179528576, |
| "grad_norm": 0.6346887350082397, |
| "learning_rate": 0.00031081564486585493, |
| "loss": 3.406, |
| "step": 44850 |
| }, |
| { |
| "epoch": 4.832633731568184, |
| "grad_norm": 0.7371925115585327, |
| "learning_rate": 0.0003104924038357935, |
| "loss": 3.3955, |
| "step": 44900 |
| }, |
| { |
| "epoch": 4.838015283607793, |
| "grad_norm": 0.651321530342102, |
| "learning_rate": 0.0003101691628057321, |
| "loss": 3.3972, |
| "step": 44950 |
| }, |
| { |
| "epoch": 4.8433968356474, |
| "grad_norm": 0.6792730689048767, |
| "learning_rate": 0.0003098459217756707, |
| "loss": 3.3904, |
| "step": 45000 |
| }, |
| { |
| "epoch": 4.8433968356474, |
| "eval_accuracy": 0.37944913203183833, |
| "eval_loss": 3.41748309135437, |
| "eval_runtime": 182.8169, |
| "eval_samples_per_second": 98.519, |
| "eval_steps_per_second": 6.159, |
| "step": 45000 |
| }, |
| { |
| "epoch": 4.848778387687009, |
| "grad_norm": 0.66081702709198, |
| "learning_rate": 0.0003095226807456093, |
| "loss": 3.3904, |
| "step": 45050 |
| }, |
| { |
| "epoch": 4.8541599397266175, |
| "grad_norm": 0.7229673266410828, |
| "learning_rate": 0.00030919943971554785, |
| "loss": 3.4034, |
| "step": 45100 |
| }, |
| { |
| "epoch": 4.859541491766225, |
| "grad_norm": 0.6608504056930542, |
| "learning_rate": 0.00030887619868548644, |
| "loss": 3.3975, |
| "step": 45150 |
| }, |
| { |
| "epoch": 4.864923043805834, |
| "grad_norm": 0.6587163805961609, |
| "learning_rate": 0.000308552957655425, |
| "loss": 3.3832, |
| "step": 45200 |
| }, |
| { |
| "epoch": 4.870304595845441, |
| "grad_norm": 0.6220030784606934, |
| "learning_rate": 0.00030822971662536363, |
| "loss": 3.3956, |
| "step": 45250 |
| }, |
| { |
| "epoch": 4.87568614788505, |
| "grad_norm": 0.6646186113357544, |
| "learning_rate": 0.0003079064755953022, |
| "loss": 3.4069, |
| "step": 45300 |
| }, |
| { |
| "epoch": 4.8810676999246585, |
| "grad_norm": 0.6473865509033203, |
| "learning_rate": 0.00030758323456524077, |
| "loss": 3.3964, |
| "step": 45350 |
| }, |
| { |
| "epoch": 4.886449251964266, |
| "grad_norm": 0.6625189781188965, |
| "learning_rate": 0.00030725999353517936, |
| "loss": 3.3962, |
| "step": 45400 |
| }, |
| { |
| "epoch": 4.891830804003875, |
| "grad_norm": 0.656329333782196, |
| "learning_rate": 0.00030693675250511795, |
| "loss": 3.3861, |
| "step": 45450 |
| }, |
| { |
| "epoch": 4.897212356043483, |
| "grad_norm": 0.6948346495628357, |
| "learning_rate": 0.00030661351147505655, |
| "loss": 3.392, |
| "step": 45500 |
| }, |
| { |
| "epoch": 4.902593908083091, |
| "grad_norm": 0.6399689316749573, |
| "learning_rate": 0.00030629027044499514, |
| "loss": 3.3961, |
| "step": 45550 |
| }, |
| { |
| "epoch": 4.9079754601226995, |
| "grad_norm": 0.6699696779251099, |
| "learning_rate": 0.00030596702941493374, |
| "loss": 3.389, |
| "step": 45600 |
| }, |
| { |
| "epoch": 4.913357012162308, |
| "grad_norm": 0.6273096799850464, |
| "learning_rate": 0.0003056437883848723, |
| "loss": 3.3964, |
| "step": 45650 |
| }, |
| { |
| "epoch": 4.918738564201916, |
| "grad_norm": 0.6409399509429932, |
| "learning_rate": 0.0003053205473548109, |
| "loss": 3.3961, |
| "step": 45700 |
| }, |
| { |
| "epoch": 4.924120116241524, |
| "grad_norm": 0.6525577306747437, |
| "learning_rate": 0.0003049973063247494, |
| "loss": 3.3739, |
| "step": 45750 |
| }, |
| { |
| "epoch": 4.929501668281132, |
| "grad_norm": 0.6267456412315369, |
| "learning_rate": 0.00030467406529468806, |
| "loss": 3.3891, |
| "step": 45800 |
| }, |
| { |
| "epoch": 4.9348832203207404, |
| "grad_norm": 0.610181987285614, |
| "learning_rate": 0.00030435082426462666, |
| "loss": 3.3808, |
| "step": 45850 |
| }, |
| { |
| "epoch": 4.940264772360349, |
| "grad_norm": 0.6637799739837646, |
| "learning_rate": 0.0003040275832345652, |
| "loss": 3.392, |
| "step": 45900 |
| }, |
| { |
| "epoch": 4.945646324399957, |
| "grad_norm": 0.7256836891174316, |
| "learning_rate": 0.0003037043422045038, |
| "loss": 3.3918, |
| "step": 45950 |
| }, |
| { |
| "epoch": 4.951027876439565, |
| "grad_norm": 0.6999911069869995, |
| "learning_rate": 0.0003033811011744424, |
| "loss": 3.3921, |
| "step": 46000 |
| }, |
| { |
| "epoch": 4.951027876439565, |
| "eval_accuracy": 0.37981605283398784, |
| "eval_loss": 3.412885904312134, |
| "eval_runtime": 183.1736, |
| "eval_samples_per_second": 98.327, |
| "eval_steps_per_second": 6.147, |
| "step": 46000 |
| }, |
| { |
| "epoch": 4.956409428479174, |
| "grad_norm": 0.6925266981124878, |
| "learning_rate": 0.00030305786014438093, |
| "loss": 3.4043, |
| "step": 46050 |
| }, |
| { |
| "epoch": 4.961790980518781, |
| "grad_norm": 0.6724205613136292, |
| "learning_rate": 0.0003027346191143196, |
| "loss": 3.3755, |
| "step": 46100 |
| }, |
| { |
| "epoch": 4.96717253255839, |
| "grad_norm": 0.6727187633514404, |
| "learning_rate": 0.00030241137808425817, |
| "loss": 3.3878, |
| "step": 46150 |
| }, |
| { |
| "epoch": 4.9725540845979985, |
| "grad_norm": 0.6427590250968933, |
| "learning_rate": 0.0003020881370541967, |
| "loss": 3.4044, |
| "step": 46200 |
| }, |
| { |
| "epoch": 4.977935636637606, |
| "grad_norm": 0.6517530083656311, |
| "learning_rate": 0.0003017648960241353, |
| "loss": 3.3773, |
| "step": 46250 |
| }, |
| { |
| "epoch": 4.983317188677215, |
| "grad_norm": 0.6581140160560608, |
| "learning_rate": 0.00030144165499407385, |
| "loss": 3.3772, |
| "step": 46300 |
| }, |
| { |
| "epoch": 4.988698740716822, |
| "grad_norm": 0.6741767525672913, |
| "learning_rate": 0.00030111841396401244, |
| "loss": 3.3952, |
| "step": 46350 |
| }, |
| { |
| "epoch": 4.994080292756431, |
| "grad_norm": 0.6367133259773254, |
| "learning_rate": 0.0003007951729339511, |
| "loss": 3.3955, |
| "step": 46400 |
| }, |
| { |
| "epoch": 4.9994618447960395, |
| "grad_norm": 0.6892203092575073, |
| "learning_rate": 0.00030047193190388963, |
| "loss": 3.3779, |
| "step": 46450 |
| }, |
| { |
| "epoch": 5.004843396835647, |
| "grad_norm": 0.6889623403549194, |
| "learning_rate": 0.0003001486908738282, |
| "loss": 3.3198, |
| "step": 46500 |
| }, |
| { |
| "epoch": 5.010224948875256, |
| "grad_norm": 0.6872815489768982, |
| "learning_rate": 0.0002998254498437668, |
| "loss": 3.3037, |
| "step": 46550 |
| }, |
| { |
| "epoch": 5.015606500914864, |
| "grad_norm": 0.6558205485343933, |
| "learning_rate": 0.0002995022088137054, |
| "loss": 3.2964, |
| "step": 46600 |
| }, |
| { |
| "epoch": 5.020988052954472, |
| "grad_norm": 0.6722169518470764, |
| "learning_rate": 0.00029917896778364396, |
| "loss": 3.2927, |
| "step": 46650 |
| }, |
| { |
| "epoch": 5.0263696049940805, |
| "grad_norm": 0.6730004549026489, |
| "learning_rate": 0.00029885572675358255, |
| "loss": 3.3064, |
| "step": 46700 |
| }, |
| { |
| "epoch": 5.031751157033688, |
| "grad_norm": 0.6149821281433105, |
| "learning_rate": 0.00029853248572352114, |
| "loss": 3.2977, |
| "step": 46750 |
| }, |
| { |
| "epoch": 5.037132709073297, |
| "grad_norm": 0.6445924639701843, |
| "learning_rate": 0.00029820924469345974, |
| "loss": 3.3056, |
| "step": 46800 |
| }, |
| { |
| "epoch": 5.042514261112905, |
| "grad_norm": 0.6724662780761719, |
| "learning_rate": 0.0002978860036633983, |
| "loss": 3.3142, |
| "step": 46850 |
| }, |
| { |
| "epoch": 5.047895813152513, |
| "grad_norm": 0.6918281316757202, |
| "learning_rate": 0.00029756276263333693, |
| "loss": 3.3051, |
| "step": 46900 |
| }, |
| { |
| "epoch": 5.0532773651921215, |
| "grad_norm": 0.6730775833129883, |
| "learning_rate": 0.00029723952160327547, |
| "loss": 3.3048, |
| "step": 46950 |
| }, |
| { |
| "epoch": 5.05865891723173, |
| "grad_norm": 0.6861401200294495, |
| "learning_rate": 0.00029691628057321406, |
| "loss": 3.3313, |
| "step": 47000 |
| }, |
| { |
| "epoch": 5.05865891723173, |
| "eval_accuracy": 0.3799867465207444, |
| "eval_loss": 3.4170210361480713, |
| "eval_runtime": 182.9491, |
| "eval_samples_per_second": 98.448, |
| "eval_steps_per_second": 6.155, |
| "step": 47000 |
| }, |
| { |
| "epoch": 5.064040469271338, |
| "grad_norm": 0.6703794002532959, |
| "learning_rate": 0.00029659303954315266, |
| "loss": 3.3135, |
| "step": 47050 |
| }, |
| { |
| "epoch": 5.069422021310946, |
| "grad_norm": 0.6781489849090576, |
| "learning_rate": 0.00029626979851309125, |
| "loss": 3.3085, |
| "step": 47100 |
| }, |
| { |
| "epoch": 5.074803573350554, |
| "grad_norm": 0.6689127683639526, |
| "learning_rate": 0.00029594655748302985, |
| "loss": 3.3014, |
| "step": 47150 |
| }, |
| { |
| "epoch": 5.080185125390162, |
| "grad_norm": 0.6758719682693481, |
| "learning_rate": 0.0002956233164529684, |
| "loss": 3.3099, |
| "step": 47200 |
| }, |
| { |
| "epoch": 5.085566677429771, |
| "grad_norm": 0.6829871535301208, |
| "learning_rate": 0.000295300075422907, |
| "loss": 3.3137, |
| "step": 47250 |
| }, |
| { |
| "epoch": 5.090948229469379, |
| "grad_norm": 0.7148377299308777, |
| "learning_rate": 0.0002949768343928456, |
| "loss": 3.3026, |
| "step": 47300 |
| }, |
| { |
| "epoch": 5.096329781508987, |
| "grad_norm": 0.706376314163208, |
| "learning_rate": 0.00029465359336278417, |
| "loss": 3.3088, |
| "step": 47350 |
| }, |
| { |
| "epoch": 5.101711333548596, |
| "grad_norm": 0.658332109451294, |
| "learning_rate": 0.0002943303523327227, |
| "loss": 3.3188, |
| "step": 47400 |
| }, |
| { |
| "epoch": 5.107092885588203, |
| "grad_norm": 0.6532283425331116, |
| "learning_rate": 0.00029400711130266136, |
| "loss": 3.3087, |
| "step": 47450 |
| }, |
| { |
| "epoch": 5.112474437627812, |
| "grad_norm": 0.670990526676178, |
| "learning_rate": 0.0002936838702725999, |
| "loss": 3.319, |
| "step": 47500 |
| }, |
| { |
| "epoch": 5.1178559896674205, |
| "grad_norm": 0.6792743802070618, |
| "learning_rate": 0.0002933606292425385, |
| "loss": 3.3171, |
| "step": 47550 |
| }, |
| { |
| "epoch": 5.123237541707028, |
| "grad_norm": 0.6803917288780212, |
| "learning_rate": 0.0002930373882124771, |
| "loss": 3.3264, |
| "step": 47600 |
| }, |
| { |
| "epoch": 5.128619093746637, |
| "grad_norm": 0.6759734749794006, |
| "learning_rate": 0.0002927141471824157, |
| "loss": 3.3128, |
| "step": 47650 |
| }, |
| { |
| "epoch": 5.134000645786244, |
| "grad_norm": 0.6762935519218445, |
| "learning_rate": 0.0002923909061523542, |
| "loss": 3.3171, |
| "step": 47700 |
| }, |
| { |
| "epoch": 5.139382197825853, |
| "grad_norm": 0.7197624444961548, |
| "learning_rate": 0.0002920676651222928, |
| "loss": 3.3111, |
| "step": 47750 |
| }, |
| { |
| "epoch": 5.1447637498654615, |
| "grad_norm": 0.6673755049705505, |
| "learning_rate": 0.0002917444240922314, |
| "loss": 3.3256, |
| "step": 47800 |
| }, |
| { |
| "epoch": 5.150145301905069, |
| "grad_norm": 0.6834902167320251, |
| "learning_rate": 0.00029142118306216996, |
| "loss": 3.3293, |
| "step": 47850 |
| }, |
| { |
| "epoch": 5.155526853944678, |
| "grad_norm": 0.6759807467460632, |
| "learning_rate": 0.0002910979420321086, |
| "loss": 3.3295, |
| "step": 47900 |
| }, |
| { |
| "epoch": 5.160908405984286, |
| "grad_norm": 0.7257971167564392, |
| "learning_rate": 0.00029077470100204715, |
| "loss": 3.3301, |
| "step": 47950 |
| }, |
| { |
| "epoch": 5.166289958023894, |
| "grad_norm": 0.7746021747589111, |
| "learning_rate": 0.00029045145997198574, |
| "loss": 3.333, |
| "step": 48000 |
| }, |
| { |
| "epoch": 5.166289958023894, |
| "eval_accuracy": 0.3801953600653482, |
| "eval_loss": 3.416738986968994, |
| "eval_runtime": 183.2917, |
| "eval_samples_per_second": 98.264, |
| "eval_steps_per_second": 6.143, |
| "step": 48000 |
| }, |
| { |
| "epoch": 5.1716715100635025, |
| "grad_norm": 0.6825886964797974, |
| "learning_rate": 0.00029012821894192433, |
| "loss": 3.3222, |
| "step": 48050 |
| }, |
| { |
| "epoch": 5.17705306210311, |
| "grad_norm": 0.6440089344978333, |
| "learning_rate": 0.00028980497791186293, |
| "loss": 3.3423, |
| "step": 48100 |
| }, |
| { |
| "epoch": 5.182434614142719, |
| "grad_norm": 0.6887645125389099, |
| "learning_rate": 0.0002894817368818015, |
| "loss": 3.3232, |
| "step": 48150 |
| }, |
| { |
| "epoch": 5.187816166182327, |
| "grad_norm": 0.646507978439331, |
| "learning_rate": 0.00028915849585174006, |
| "loss": 3.3021, |
| "step": 48200 |
| }, |
| { |
| "epoch": 5.193197718221935, |
| "grad_norm": 0.726085364818573, |
| "learning_rate": 0.00028883525482167866, |
| "loss": 3.3173, |
| "step": 48250 |
| }, |
| { |
| "epoch": 5.198579270261543, |
| "grad_norm": 0.6602137684822083, |
| "learning_rate": 0.00028851201379161725, |
| "loss": 3.3115, |
| "step": 48300 |
| }, |
| { |
| "epoch": 5.203960822301152, |
| "grad_norm": 0.65272456407547, |
| "learning_rate": 0.00028818877276155585, |
| "loss": 3.34, |
| "step": 48350 |
| }, |
| { |
| "epoch": 5.20934237434076, |
| "grad_norm": 0.6690257787704468, |
| "learning_rate": 0.0002878655317314944, |
| "loss": 3.3203, |
| "step": 48400 |
| }, |
| { |
| "epoch": 5.214723926380368, |
| "grad_norm": 0.6628714203834534, |
| "learning_rate": 0.00028754229070143304, |
| "loss": 3.3076, |
| "step": 48450 |
| }, |
| { |
| "epoch": 5.220105478419977, |
| "grad_norm": 0.6842119097709656, |
| "learning_rate": 0.0002872190496713716, |
| "loss": 3.331, |
| "step": 48500 |
| }, |
| { |
| "epoch": 5.225487030459584, |
| "grad_norm": 0.696015477180481, |
| "learning_rate": 0.0002868958086413102, |
| "loss": 3.3258, |
| "step": 48550 |
| }, |
| { |
| "epoch": 5.230868582499193, |
| "grad_norm": 0.6456036567687988, |
| "learning_rate": 0.00028657256761124877, |
| "loss": 3.3165, |
| "step": 48600 |
| }, |
| { |
| "epoch": 5.236250134538801, |
| "grad_norm": 0.6485376954078674, |
| "learning_rate": 0.00028624932658118736, |
| "loss": 3.3451, |
| "step": 48650 |
| }, |
| { |
| "epoch": 5.241631686578409, |
| "grad_norm": 0.659721314907074, |
| "learning_rate": 0.0002859260855511259, |
| "loss": 3.3305, |
| "step": 48700 |
| }, |
| { |
| "epoch": 5.247013238618018, |
| "grad_norm": 0.6913601160049438, |
| "learning_rate": 0.00028560930934166576, |
| "loss": 3.3227, |
| "step": 48750 |
| }, |
| { |
| "epoch": 5.252394790657625, |
| "grad_norm": 0.6780300736427307, |
| "learning_rate": 0.00028528606831160436, |
| "loss": 3.331, |
| "step": 48800 |
| }, |
| { |
| "epoch": 5.257776342697234, |
| "grad_norm": 0.6792374849319458, |
| "learning_rate": 0.0002849628272815429, |
| "loss": 3.3381, |
| "step": 48850 |
| }, |
| { |
| "epoch": 5.2631578947368425, |
| "grad_norm": 0.6867091655731201, |
| "learning_rate": 0.0002846395862514815, |
| "loss": 3.3138, |
| "step": 48900 |
| }, |
| { |
| "epoch": 5.26853944677645, |
| "grad_norm": 0.6632397770881653, |
| "learning_rate": 0.0002843163452214201, |
| "loss": 3.3316, |
| "step": 48950 |
| }, |
| { |
| "epoch": 5.273920998816059, |
| "grad_norm": 0.6680681705474854, |
| "learning_rate": 0.0002839931041913587, |
| "loss": 3.3374, |
| "step": 49000 |
| }, |
| { |
| "epoch": 5.273920998816059, |
| "eval_accuracy": 0.380897692332181, |
| "eval_loss": 3.412996768951416, |
| "eval_runtime": 182.5799, |
| "eval_samples_per_second": 98.647, |
| "eval_steps_per_second": 6.167, |
| "step": 49000 |
| }, |
| { |
| "epoch": 5.279302550855666, |
| "grad_norm": 0.7043352127075195, |
| "learning_rate": 0.0002836698631612972, |
| "loss": 3.3241, |
| "step": 49050 |
| }, |
| { |
| "epoch": 5.284684102895275, |
| "grad_norm": 0.6589981913566589, |
| "learning_rate": 0.00028334662213123587, |
| "loss": 3.3239, |
| "step": 49100 |
| }, |
| { |
| "epoch": 5.2900656549348835, |
| "grad_norm": 0.6708270907402039, |
| "learning_rate": 0.0002830233811011744, |
| "loss": 3.3458, |
| "step": 49150 |
| }, |
| { |
| "epoch": 5.295447206974491, |
| "grad_norm": 0.7209701538085938, |
| "learning_rate": 0.000282700140071113, |
| "loss": 3.3305, |
| "step": 49200 |
| }, |
| { |
| "epoch": 5.3008287590141, |
| "grad_norm": 0.6668233871459961, |
| "learning_rate": 0.0002823768990410516, |
| "loss": 3.3244, |
| "step": 49250 |
| }, |
| { |
| "epoch": 5.306210311053708, |
| "grad_norm": 0.7002584934234619, |
| "learning_rate": 0.00028205365801099014, |
| "loss": 3.3261, |
| "step": 49300 |
| }, |
| { |
| "epoch": 5.311591863093316, |
| "grad_norm": 0.6543057560920715, |
| "learning_rate": 0.00028173041698092874, |
| "loss": 3.3427, |
| "step": 49350 |
| }, |
| { |
| "epoch": 5.316973415132924, |
| "grad_norm": 0.6252505779266357, |
| "learning_rate": 0.00028140717595086733, |
| "loss": 3.3378, |
| "step": 49400 |
| }, |
| { |
| "epoch": 5.322354967172533, |
| "grad_norm": 0.7060573101043701, |
| "learning_rate": 0.0002810839349208059, |
| "loss": 3.3324, |
| "step": 49450 |
| }, |
| { |
| "epoch": 5.327736519212141, |
| "grad_norm": 0.7204796075820923, |
| "learning_rate": 0.00028076069389074447, |
| "loss": 3.339, |
| "step": 49500 |
| }, |
| { |
| "epoch": 5.333118071251749, |
| "grad_norm": 0.7042475342750549, |
| "learning_rate": 0.0002804374528606831, |
| "loss": 3.3392, |
| "step": 49550 |
| }, |
| { |
| "epoch": 5.338499623291357, |
| "grad_norm": 0.6284716129302979, |
| "learning_rate": 0.00028011421183062166, |
| "loss": 3.3334, |
| "step": 49600 |
| }, |
| { |
| "epoch": 5.343881175330965, |
| "grad_norm": 0.7170406579971313, |
| "learning_rate": 0.00027979097080056025, |
| "loss": 3.3344, |
| "step": 49650 |
| }, |
| { |
| "epoch": 5.349262727370574, |
| "grad_norm": 0.6857984066009521, |
| "learning_rate": 0.00027946772977049885, |
| "loss": 3.3497, |
| "step": 49700 |
| }, |
| { |
| "epoch": 5.354644279410182, |
| "grad_norm": 0.6786603331565857, |
| "learning_rate": 0.00027914448874043744, |
| "loss": 3.3371, |
| "step": 49750 |
| }, |
| { |
| "epoch": 5.36002583144979, |
| "grad_norm": 0.6698256134986877, |
| "learning_rate": 0.00027882124771037603, |
| "loss": 3.3305, |
| "step": 49800 |
| }, |
| { |
| "epoch": 5.365407383489399, |
| "grad_norm": 0.6916489601135254, |
| "learning_rate": 0.0002784980066803146, |
| "loss": 3.3315, |
| "step": 49850 |
| }, |
| { |
| "epoch": 5.370788935529006, |
| "grad_norm": 0.6628431081771851, |
| "learning_rate": 0.00027817476565025317, |
| "loss": 3.3218, |
| "step": 49900 |
| }, |
| { |
| "epoch": 5.376170487568615, |
| "grad_norm": 0.6686885356903076, |
| "learning_rate": 0.00027785152462019176, |
| "loss": 3.344, |
| "step": 49950 |
| }, |
| { |
| "epoch": 5.3815520396082235, |
| "grad_norm": 0.6686023473739624, |
| "learning_rate": 0.00027752828359013036, |
| "loss": 3.3217, |
| "step": 50000 |
| }, |
| { |
| "epoch": 5.3815520396082235, |
| "eval_accuracy": 0.3814086868635725, |
| "eval_loss": 3.4041335582733154, |
| "eval_runtime": 182.7349, |
| "eval_samples_per_second": 98.564, |
| "eval_steps_per_second": 6.162, |
| "step": 50000 |
| }, |
| { |
| "epoch": 5.386933591647831, |
| "grad_norm": 0.7179530262947083, |
| "learning_rate": 0.0002772050425600689, |
| "loss": 3.3474, |
| "step": 50050 |
| }, |
| { |
| "epoch": 5.39231514368744, |
| "grad_norm": 0.7137858867645264, |
| "learning_rate": 0.00027688180153000755, |
| "loss": 3.342, |
| "step": 50100 |
| }, |
| { |
| "epoch": 5.397696695727047, |
| "grad_norm": 0.6810168027877808, |
| "learning_rate": 0.0002765585604999461, |
| "loss": 3.3531, |
| "step": 50150 |
| }, |
| { |
| "epoch": 5.403078247766656, |
| "grad_norm": 0.6813143491744995, |
| "learning_rate": 0.0002762353194698847, |
| "loss": 3.3448, |
| "step": 50200 |
| }, |
| { |
| "epoch": 5.4084597998062645, |
| "grad_norm": 0.7286673188209534, |
| "learning_rate": 0.0002759120784398233, |
| "loss": 3.351, |
| "step": 50250 |
| }, |
| { |
| "epoch": 5.413841351845872, |
| "grad_norm": 0.670843243598938, |
| "learning_rate": 0.00027558883740976187, |
| "loss": 3.3436, |
| "step": 50300 |
| }, |
| { |
| "epoch": 5.419222903885481, |
| "grad_norm": 0.7313069105148315, |
| "learning_rate": 0.0002752655963797004, |
| "loss": 3.3302, |
| "step": 50350 |
| }, |
| { |
| "epoch": 5.424604455925088, |
| "grad_norm": 0.7491310834884644, |
| "learning_rate": 0.000274942355349639, |
| "loss": 3.3428, |
| "step": 50400 |
| }, |
| { |
| "epoch": 5.429986007964697, |
| "grad_norm": 0.6367394924163818, |
| "learning_rate": 0.0002746191143195776, |
| "loss": 3.3479, |
| "step": 50450 |
| }, |
| { |
| "epoch": 5.435367560004305, |
| "grad_norm": 0.6688310503959656, |
| "learning_rate": 0.0002742958732895162, |
| "loss": 3.3284, |
| "step": 50500 |
| }, |
| { |
| "epoch": 5.440749112043913, |
| "grad_norm": 0.7176879644393921, |
| "learning_rate": 0.0002739726322594548, |
| "loss": 3.3527, |
| "step": 50550 |
| }, |
| { |
| "epoch": 5.446130664083522, |
| "grad_norm": 0.6491169333457947, |
| "learning_rate": 0.00027364939122939333, |
| "loss": 3.3275, |
| "step": 50600 |
| }, |
| { |
| "epoch": 5.45151221612313, |
| "grad_norm": 0.75172358751297, |
| "learning_rate": 0.0002733261501993319, |
| "loss": 3.3364, |
| "step": 50650 |
| }, |
| { |
| "epoch": 5.456893768162738, |
| "grad_norm": 0.6788306832313538, |
| "learning_rate": 0.0002730029091692705, |
| "loss": 3.3293, |
| "step": 50700 |
| }, |
| { |
| "epoch": 5.462275320202346, |
| "grad_norm": 0.6975774765014648, |
| "learning_rate": 0.0002726861329598104, |
| "loss": 3.3368, |
| "step": 50750 |
| }, |
| { |
| "epoch": 5.467656872241955, |
| "grad_norm": 0.6561357378959656, |
| "learning_rate": 0.0002723628919297489, |
| "loss": 3.3413, |
| "step": 50800 |
| }, |
| { |
| "epoch": 5.473038424281563, |
| "grad_norm": 0.6256112456321716, |
| "learning_rate": 0.0002720396508996875, |
| "loss": 3.3281, |
| "step": 50850 |
| }, |
| { |
| "epoch": 5.478419976321171, |
| "grad_norm": 0.721541166305542, |
| "learning_rate": 0.0002717164098696261, |
| "loss": 3.3276, |
| "step": 50900 |
| }, |
| { |
| "epoch": 5.483801528360779, |
| "grad_norm": 0.6738684773445129, |
| "learning_rate": 0.00027139316883956465, |
| "loss": 3.35, |
| "step": 50950 |
| }, |
| { |
| "epoch": 5.489183080400387, |
| "grad_norm": 0.6639395952224731, |
| "learning_rate": 0.00027106992780950325, |
| "loss": 3.3285, |
| "step": 51000 |
| }, |
| { |
| "epoch": 5.489183080400387, |
| "eval_accuracy": 0.381844710902372, |
| "eval_loss": 3.4026851654052734, |
| "eval_runtime": 182.4881, |
| "eval_samples_per_second": 98.697, |
| "eval_steps_per_second": 6.17, |
| "step": 51000 |
| }, |
| { |
| "epoch": 5.494564632439996, |
| "grad_norm": 0.6820451617240906, |
| "learning_rate": 0.00027074668677944184, |
| "loss": 3.3397, |
| "step": 51050 |
| }, |
| { |
| "epoch": 5.499946184479604, |
| "grad_norm": 0.6895923018455505, |
| "learning_rate": 0.00027042344574938044, |
| "loss": 3.3435, |
| "step": 51100 |
| }, |
| { |
| "epoch": 5.505327736519212, |
| "grad_norm": 0.6487290859222412, |
| "learning_rate": 0.000270100204719319, |
| "loss": 3.3394, |
| "step": 51150 |
| }, |
| { |
| "epoch": 5.510709288558821, |
| "grad_norm": 0.682824969291687, |
| "learning_rate": 0.0002697769636892576, |
| "loss": 3.3384, |
| "step": 51200 |
| }, |
| { |
| "epoch": 5.516090840598428, |
| "grad_norm": 0.6406248807907104, |
| "learning_rate": 0.00026945372265919617, |
| "loss": 3.3383, |
| "step": 51250 |
| }, |
| { |
| "epoch": 5.521472392638037, |
| "grad_norm": 0.6504181027412415, |
| "learning_rate": 0.00026913048162913476, |
| "loss": 3.3614, |
| "step": 51300 |
| }, |
| { |
| "epoch": 5.5268539446776455, |
| "grad_norm": 0.7287770509719849, |
| "learning_rate": 0.00026880724059907336, |
| "loss": 3.3374, |
| "step": 51350 |
| }, |
| { |
| "epoch": 5.532235496717253, |
| "grad_norm": 0.7049496173858643, |
| "learning_rate": 0.00026848399956901195, |
| "loss": 3.3374, |
| "step": 51400 |
| }, |
| { |
| "epoch": 5.537617048756862, |
| "grad_norm": 0.659416913986206, |
| "learning_rate": 0.0002681607585389505, |
| "loss": 3.3432, |
| "step": 51450 |
| }, |
| { |
| "epoch": 5.542998600796469, |
| "grad_norm": 0.6534185409545898, |
| "learning_rate": 0.0002678375175088891, |
| "loss": 3.3324, |
| "step": 51500 |
| }, |
| { |
| "epoch": 5.548380152836078, |
| "grad_norm": 0.6897971630096436, |
| "learning_rate": 0.0002675142764788277, |
| "loss": 3.3326, |
| "step": 51550 |
| }, |
| { |
| "epoch": 5.553761704875686, |
| "grad_norm": 0.6762552261352539, |
| "learning_rate": 0.0002671910354487663, |
| "loss": 3.3378, |
| "step": 51600 |
| }, |
| { |
| "epoch": 5.559143256915294, |
| "grad_norm": 0.6849751472473145, |
| "learning_rate": 0.00026686779441870487, |
| "loss": 3.3355, |
| "step": 51650 |
| }, |
| { |
| "epoch": 5.564524808954903, |
| "grad_norm": 0.6968287229537964, |
| "learning_rate": 0.0002665445533886434, |
| "loss": 3.344, |
| "step": 51700 |
| }, |
| { |
| "epoch": 5.569906360994511, |
| "grad_norm": 0.6949653029441833, |
| "learning_rate": 0.00026622131235858206, |
| "loss": 3.3416, |
| "step": 51750 |
| }, |
| { |
| "epoch": 5.575287913034119, |
| "grad_norm": 0.677852213382721, |
| "learning_rate": 0.0002658980713285206, |
| "loss": 3.3566, |
| "step": 51800 |
| }, |
| { |
| "epoch": 5.580669465073727, |
| "grad_norm": 0.6766265034675598, |
| "learning_rate": 0.0002655748302984592, |
| "loss": 3.3509, |
| "step": 51850 |
| }, |
| { |
| "epoch": 5.586051017113336, |
| "grad_norm": 0.6820150017738342, |
| "learning_rate": 0.0002652515892683978, |
| "loss": 3.3529, |
| "step": 51900 |
| }, |
| { |
| "epoch": 5.591432569152944, |
| "grad_norm": 0.7299242615699768, |
| "learning_rate": 0.0002649283482383364, |
| "loss": 3.3359, |
| "step": 51950 |
| }, |
| { |
| "epoch": 5.596814121192552, |
| "grad_norm": 0.67701256275177, |
| "learning_rate": 0.0002646051072082749, |
| "loss": 3.3538, |
| "step": 52000 |
| }, |
| { |
| "epoch": 5.596814121192552, |
| "eval_accuracy": 0.38225780918184266, |
| "eval_loss": 3.3965156078338623, |
| "eval_runtime": 182.6924, |
| "eval_samples_per_second": 98.586, |
| "eval_steps_per_second": 6.163, |
| "step": 52000 |
| }, |
| { |
| "epoch": 5.60219567323216, |
| "grad_norm": 0.6956893801689148, |
| "learning_rate": 0.0002642818661782135, |
| "loss": 3.3411, |
| "step": 52050 |
| }, |
| { |
| "epoch": 5.607577225271768, |
| "grad_norm": 0.6988450288772583, |
| "learning_rate": 0.0002639586251481521, |
| "loss": 3.3266, |
| "step": 52100 |
| }, |
| { |
| "epoch": 5.612958777311377, |
| "grad_norm": 0.6658537983894348, |
| "learning_rate": 0.0002636353841180907, |
| "loss": 3.3534, |
| "step": 52150 |
| }, |
| { |
| "epoch": 5.618340329350985, |
| "grad_norm": 0.7193296551704407, |
| "learning_rate": 0.0002633121430880293, |
| "loss": 3.3355, |
| "step": 52200 |
| }, |
| { |
| "epoch": 5.623721881390593, |
| "grad_norm": 0.76201993227005, |
| "learning_rate": 0.00026298890205796784, |
| "loss": 3.3333, |
| "step": 52250 |
| }, |
| { |
| "epoch": 5.629103433430201, |
| "grad_norm": 0.6642473340034485, |
| "learning_rate": 0.00026266566102790644, |
| "loss": 3.3183, |
| "step": 52300 |
| }, |
| { |
| "epoch": 5.634484985469809, |
| "grad_norm": 0.7340889573097229, |
| "learning_rate": 0.00026234241999784503, |
| "loss": 3.3426, |
| "step": 52350 |
| }, |
| { |
| "epoch": 5.639866537509418, |
| "grad_norm": 0.6511938571929932, |
| "learning_rate": 0.0002620191789677836, |
| "loss": 3.3428, |
| "step": 52400 |
| }, |
| { |
| "epoch": 5.645248089549026, |
| "grad_norm": 0.7516026496887207, |
| "learning_rate": 0.00026169593793772217, |
| "loss": 3.3299, |
| "step": 52450 |
| }, |
| { |
| "epoch": 5.650629641588634, |
| "grad_norm": 0.6919920444488525, |
| "learning_rate": 0.0002613726969076608, |
| "loss": 3.3476, |
| "step": 52500 |
| }, |
| { |
| "epoch": 5.656011193628243, |
| "grad_norm": 0.6615656018257141, |
| "learning_rate": 0.00026104945587759936, |
| "loss": 3.3536, |
| "step": 52550 |
| }, |
| { |
| "epoch": 5.66139274566785, |
| "grad_norm": 0.6720345616340637, |
| "learning_rate": 0.00026072621484753795, |
| "loss": 3.3274, |
| "step": 52600 |
| }, |
| { |
| "epoch": 5.666774297707459, |
| "grad_norm": 0.7008869647979736, |
| "learning_rate": 0.00026040297381747655, |
| "loss": 3.3585, |
| "step": 52650 |
| }, |
| { |
| "epoch": 5.672155849747067, |
| "grad_norm": 0.6563118696212769, |
| "learning_rate": 0.00026007973278741514, |
| "loss": 3.3435, |
| "step": 52700 |
| }, |
| { |
| "epoch": 5.677537401786675, |
| "grad_norm": 0.7016202211380005, |
| "learning_rate": 0.00025975649175735373, |
| "loss": 3.3271, |
| "step": 52750 |
| }, |
| { |
| "epoch": 5.682918953826284, |
| "grad_norm": 0.6965526938438416, |
| "learning_rate": 0.0002594397155478935, |
| "loss": 3.3241, |
| "step": 52800 |
| }, |
| { |
| "epoch": 5.688300505865891, |
| "grad_norm": 0.6786898374557495, |
| "learning_rate": 0.00025911647451783214, |
| "loss": 3.3389, |
| "step": 52850 |
| }, |
| { |
| "epoch": 5.6936820579055, |
| "grad_norm": 0.7544721364974976, |
| "learning_rate": 0.0002587932334877707, |
| "loss": 3.3585, |
| "step": 52900 |
| }, |
| { |
| "epoch": 5.699063609945108, |
| "grad_norm": 0.6955776810646057, |
| "learning_rate": 0.00025846999245770927, |
| "loss": 3.3371, |
| "step": 52950 |
| }, |
| { |
| "epoch": 5.704445161984716, |
| "grad_norm": 0.7224853038787842, |
| "learning_rate": 0.00025814675142764787, |
| "loss": 3.3236, |
| "step": 53000 |
| }, |
| { |
| "epoch": 5.704445161984716, |
| "eval_accuracy": 0.3824792437672086, |
| "eval_loss": 3.3930840492248535, |
| "eval_runtime": 182.5431, |
| "eval_samples_per_second": 98.667, |
| "eval_steps_per_second": 6.168, |
| "step": 53000 |
| }, |
| { |
| "epoch": 5.709826714024325, |
| "grad_norm": 0.6928134560585022, |
| "learning_rate": 0.00025782351039758646, |
| "loss": 3.3293, |
| "step": 53050 |
| }, |
| { |
| "epoch": 5.715208266063933, |
| "grad_norm": 0.7381559610366821, |
| "learning_rate": 0.000257500269367525, |
| "loss": 3.3342, |
| "step": 53100 |
| }, |
| { |
| "epoch": 5.720589818103541, |
| "grad_norm": 0.6981896162033081, |
| "learning_rate": 0.0002571770283374636, |
| "loss": 3.3432, |
| "step": 53150 |
| }, |
| { |
| "epoch": 5.725971370143149, |
| "grad_norm": 0.7084531188011169, |
| "learning_rate": 0.0002568537873074022, |
| "loss": 3.3424, |
| "step": 53200 |
| }, |
| { |
| "epoch": 5.731352922182758, |
| "grad_norm": 0.6834819316864014, |
| "learning_rate": 0.0002565305462773408, |
| "loss": 3.3341, |
| "step": 53250 |
| }, |
| { |
| "epoch": 5.736734474222366, |
| "grad_norm": 0.6777629256248474, |
| "learning_rate": 0.0002562073052472794, |
| "loss": 3.3627, |
| "step": 53300 |
| }, |
| { |
| "epoch": 5.742116026261974, |
| "grad_norm": 0.6871793270111084, |
| "learning_rate": 0.0002558840642172179, |
| "loss": 3.3307, |
| "step": 53350 |
| }, |
| { |
| "epoch": 5.747497578301582, |
| "grad_norm": 0.6921470165252686, |
| "learning_rate": 0.00025556082318715657, |
| "loss": 3.337, |
| "step": 53400 |
| }, |
| { |
| "epoch": 5.75287913034119, |
| "grad_norm": 0.6944039463996887, |
| "learning_rate": 0.0002552375821570951, |
| "loss": 3.357, |
| "step": 53450 |
| }, |
| { |
| "epoch": 5.758260682380799, |
| "grad_norm": 0.7080736756324768, |
| "learning_rate": 0.0002549143411270337, |
| "loss": 3.3487, |
| "step": 53500 |
| }, |
| { |
| "epoch": 5.763642234420407, |
| "grad_norm": 0.6794440746307373, |
| "learning_rate": 0.0002545911000969723, |
| "loss": 3.344, |
| "step": 53550 |
| }, |
| { |
| "epoch": 5.769023786460015, |
| "grad_norm": 0.7343289256095886, |
| "learning_rate": 0.0002542678590669109, |
| "loss": 3.3341, |
| "step": 53600 |
| }, |
| { |
| "epoch": 5.774405338499624, |
| "grad_norm": 0.7471538186073303, |
| "learning_rate": 0.00025394461803684943, |
| "loss": 3.3386, |
| "step": 53650 |
| }, |
| { |
| "epoch": 5.779786890539231, |
| "grad_norm": 0.7091389894485474, |
| "learning_rate": 0.00025362137700678803, |
| "loss": 3.3487, |
| "step": 53700 |
| }, |
| { |
| "epoch": 5.78516844257884, |
| "grad_norm": 0.7032609581947327, |
| "learning_rate": 0.0002532981359767266, |
| "loss": 3.3359, |
| "step": 53750 |
| }, |
| { |
| "epoch": 5.790549994618448, |
| "grad_norm": 0.7148309350013733, |
| "learning_rate": 0.0002529748949466652, |
| "loss": 3.3482, |
| "step": 53800 |
| }, |
| { |
| "epoch": 5.795931546658056, |
| "grad_norm": 0.6615278124809265, |
| "learning_rate": 0.0002526516539166038, |
| "loss": 3.3418, |
| "step": 53850 |
| }, |
| { |
| "epoch": 5.801313098697665, |
| "grad_norm": 0.6663516163825989, |
| "learning_rate": 0.00025232841288654235, |
| "loss": 3.3545, |
| "step": 53900 |
| }, |
| { |
| "epoch": 5.806694650737272, |
| "grad_norm": 0.680908739566803, |
| "learning_rate": 0.00025200517185648095, |
| "loss": 3.3334, |
| "step": 53950 |
| }, |
| { |
| "epoch": 5.812076202776881, |
| "grad_norm": 0.7598308324813843, |
| "learning_rate": 0.00025168193082641954, |
| "loss": 3.3326, |
| "step": 54000 |
| }, |
| { |
| "epoch": 5.812076202776881, |
| "eval_accuracy": 0.3831473503743798, |
| "eval_loss": 3.387981414794922, |
| "eval_runtime": 182.8436, |
| "eval_samples_per_second": 98.505, |
| "eval_steps_per_second": 6.158, |
| "step": 54000 |
| }, |
| { |
| "epoch": 5.817457754816489, |
| "grad_norm": 0.6939805746078491, |
| "learning_rate": 0.00025135868979635814, |
| "loss": 3.3218, |
| "step": 54050 |
| }, |
| { |
| "epoch": 5.822839306856097, |
| "grad_norm": 0.6918478608131409, |
| "learning_rate": 0.0002510354487662967, |
| "loss": 3.3544, |
| "step": 54100 |
| }, |
| { |
| "epoch": 5.828220858895706, |
| "grad_norm": 0.703644335269928, |
| "learning_rate": 0.0002507122077362353, |
| "loss": 3.3398, |
| "step": 54150 |
| }, |
| { |
| "epoch": 5.833602410935313, |
| "grad_norm": 0.6816033720970154, |
| "learning_rate": 0.00025038896670617387, |
| "loss": 3.3412, |
| "step": 54200 |
| }, |
| { |
| "epoch": 5.838983962974922, |
| "grad_norm": 0.7181455492973328, |
| "learning_rate": 0.00025006572567611246, |
| "loss": 3.3289, |
| "step": 54250 |
| }, |
| { |
| "epoch": 5.84436551501453, |
| "grad_norm": 0.6734956502914429, |
| "learning_rate": 0.00024974248464605106, |
| "loss": 3.3377, |
| "step": 54300 |
| }, |
| { |
| "epoch": 5.849747067054138, |
| "grad_norm": 0.703682005405426, |
| "learning_rate": 0.00024941924361598965, |
| "loss": 3.3415, |
| "step": 54350 |
| }, |
| { |
| "epoch": 5.855128619093747, |
| "grad_norm": 0.7012028694152832, |
| "learning_rate": 0.00024909600258592825, |
| "loss": 3.3428, |
| "step": 54400 |
| }, |
| { |
| "epoch": 5.860510171133355, |
| "grad_norm": 0.7386256456375122, |
| "learning_rate": 0.0002487727615558668, |
| "loss": 3.3364, |
| "step": 54450 |
| }, |
| { |
| "epoch": 5.865891723172963, |
| "grad_norm": 0.685370683670044, |
| "learning_rate": 0.0002484495205258054, |
| "loss": 3.3489, |
| "step": 54500 |
| }, |
| { |
| "epoch": 5.871273275212571, |
| "grad_norm": 0.7215905785560608, |
| "learning_rate": 0.0002481327443163452, |
| "loss": 3.3417, |
| "step": 54550 |
| }, |
| { |
| "epoch": 5.87665482725218, |
| "grad_norm": 0.6396303176879883, |
| "learning_rate": 0.0002478095032862838, |
| "loss": 3.3425, |
| "step": 54600 |
| }, |
| { |
| "epoch": 5.882036379291788, |
| "grad_norm": 0.7235729098320007, |
| "learning_rate": 0.0002474862622562224, |
| "loss": 3.3433, |
| "step": 54650 |
| }, |
| { |
| "epoch": 5.887417931331396, |
| "grad_norm": 0.6854497790336609, |
| "learning_rate": 0.00024716302122616097, |
| "loss": 3.3425, |
| "step": 54700 |
| }, |
| { |
| "epoch": 5.892799483371004, |
| "grad_norm": 0.7037459015846252, |
| "learning_rate": 0.0002468397801960995, |
| "loss": 3.343, |
| "step": 54750 |
| }, |
| { |
| "epoch": 5.898181035410612, |
| "grad_norm": 0.6938320398330688, |
| "learning_rate": 0.0002465165391660381, |
| "loss": 3.3392, |
| "step": 54800 |
| }, |
| { |
| "epoch": 5.903562587450221, |
| "grad_norm": 0.7277181148529053, |
| "learning_rate": 0.0002461932981359767, |
| "loss": 3.34, |
| "step": 54850 |
| }, |
| { |
| "epoch": 5.9089441394898286, |
| "grad_norm": 0.7088878750801086, |
| "learning_rate": 0.0002458700571059153, |
| "loss": 3.3342, |
| "step": 54900 |
| }, |
| { |
| "epoch": 5.914325691529437, |
| "grad_norm": 0.7254401445388794, |
| "learning_rate": 0.0002455468160758539, |
| "loss": 3.334, |
| "step": 54950 |
| }, |
| { |
| "epoch": 5.919707243569046, |
| "grad_norm": 0.6835173964500427, |
| "learning_rate": 0.00024522357504579243, |
| "loss": 3.3383, |
| "step": 55000 |
| }, |
| { |
| "epoch": 5.919707243569046, |
| "eval_accuracy": 0.38354132574559513, |
| "eval_loss": 3.384719133377075, |
| "eval_runtime": 183.0018, |
| "eval_samples_per_second": 98.42, |
| "eval_steps_per_second": 6.153, |
| "step": 55000 |
| }, |
| { |
| "epoch": 5.925088795608653, |
| "grad_norm": 0.6644113063812256, |
| "learning_rate": 0.000244900334015731, |
| "loss": 3.3335, |
| "step": 55050 |
| }, |
| { |
| "epoch": 5.930470347648262, |
| "grad_norm": 0.6974128484725952, |
| "learning_rate": 0.0002445770929856696, |
| "loss": 3.3146, |
| "step": 55100 |
| }, |
| { |
| "epoch": 5.93585189968787, |
| "grad_norm": 0.6761989593505859, |
| "learning_rate": 0.0002442538519556082, |
| "loss": 3.3408, |
| "step": 55150 |
| }, |
| { |
| "epoch": 5.941233451727478, |
| "grad_norm": 0.7247626185417175, |
| "learning_rate": 0.0002439306109255468, |
| "loss": 3.3309, |
| "step": 55200 |
| }, |
| { |
| "epoch": 5.946615003767087, |
| "grad_norm": 0.7041187882423401, |
| "learning_rate": 0.00024360736989548538, |
| "loss": 3.3517, |
| "step": 55250 |
| }, |
| { |
| "epoch": 5.951996555806694, |
| "grad_norm": 0.6591287851333618, |
| "learning_rate": 0.00024328412886542394, |
| "loss": 3.3237, |
| "step": 55300 |
| }, |
| { |
| "epoch": 5.957378107846303, |
| "grad_norm": 0.6712716817855835, |
| "learning_rate": 0.00024296088783536257, |
| "loss": 3.3449, |
| "step": 55350 |
| }, |
| { |
| "epoch": 5.962759659885911, |
| "grad_norm": 0.7153301239013672, |
| "learning_rate": 0.00024263764680530113, |
| "loss": 3.3493, |
| "step": 55400 |
| }, |
| { |
| "epoch": 5.968141211925519, |
| "grad_norm": 0.6891111731529236, |
| "learning_rate": 0.0002423144057752397, |
| "loss": 3.3462, |
| "step": 55450 |
| }, |
| { |
| "epoch": 5.973522763965128, |
| "grad_norm": 0.7062400579452515, |
| "learning_rate": 0.00024199116474517832, |
| "loss": 3.3469, |
| "step": 55500 |
| }, |
| { |
| "epoch": 5.978904316004736, |
| "grad_norm": 0.763090193271637, |
| "learning_rate": 0.0002416679237151169, |
| "loss": 3.3347, |
| "step": 55550 |
| }, |
| { |
| "epoch": 5.984285868044344, |
| "grad_norm": 0.7440812587738037, |
| "learning_rate": 0.00024134468268505546, |
| "loss": 3.3311, |
| "step": 55600 |
| }, |
| { |
| "epoch": 5.989667420083952, |
| "grad_norm": 0.7073535323143005, |
| "learning_rate": 0.00024102144165499405, |
| "loss": 3.3357, |
| "step": 55650 |
| }, |
| { |
| "epoch": 5.995048972123561, |
| "grad_norm": 0.7137259244918823, |
| "learning_rate": 0.00024069820062493265, |
| "loss": 3.3291, |
| "step": 55700 |
| }, |
| { |
| "epoch": 6.000430524163169, |
| "grad_norm": 0.7363245487213135, |
| "learning_rate": 0.00024037495959487121, |
| "loss": 3.3288, |
| "step": 55750 |
| }, |
| { |
| "epoch": 6.005812076202777, |
| "grad_norm": 0.6792331337928772, |
| "learning_rate": 0.0002400517185648098, |
| "loss": 3.2392, |
| "step": 55800 |
| }, |
| { |
| "epoch": 6.011193628242385, |
| "grad_norm": 0.7233385443687439, |
| "learning_rate": 0.00023972847753474838, |
| "loss": 3.2546, |
| "step": 55850 |
| }, |
| { |
| "epoch": 6.016575180281993, |
| "grad_norm": 0.721964418888092, |
| "learning_rate": 0.00023940523650468697, |
| "loss": 3.2373, |
| "step": 55900 |
| }, |
| { |
| "epoch": 6.021956732321602, |
| "grad_norm": 0.7094835638999939, |
| "learning_rate": 0.00023908199547462557, |
| "loss": 3.2454, |
| "step": 55950 |
| }, |
| { |
| "epoch": 6.0273382843612096, |
| "grad_norm": 0.9055956602096558, |
| "learning_rate": 0.00023875875444456413, |
| "loss": 3.2422, |
| "step": 56000 |
| }, |
| { |
| "epoch": 6.0273382843612096, |
| "eval_accuracy": 0.3834130066850862, |
| "eval_loss": 3.3882665634155273, |
| "eval_runtime": 182.8427, |
| "eval_samples_per_second": 98.505, |
| "eval_steps_per_second": 6.158, |
| "step": 56000 |
| }, |
| { |
| "epoch": 6.032719836400818, |
| "grad_norm": 0.703880250453949, |
| "learning_rate": 0.0002384355134145027, |
| "loss": 3.2557, |
| "step": 56050 |
| }, |
| { |
| "epoch": 6.038101388440427, |
| "grad_norm": 0.7022672295570374, |
| "learning_rate": 0.00023811227238444132, |
| "loss": 3.2543, |
| "step": 56100 |
| }, |
| { |
| "epoch": 6.043482940480034, |
| "grad_norm": 0.6952441334724426, |
| "learning_rate": 0.0002377890313543799, |
| "loss": 3.2414, |
| "step": 56150 |
| }, |
| { |
| "epoch": 6.048864492519643, |
| "grad_norm": 0.6822983622550964, |
| "learning_rate": 0.00023746579032431849, |
| "loss": 3.2709, |
| "step": 56200 |
| }, |
| { |
| "epoch": 6.0542460445592505, |
| "grad_norm": 0.7127947807312012, |
| "learning_rate": 0.00023714254929425708, |
| "loss": 3.2604, |
| "step": 56250 |
| }, |
| { |
| "epoch": 6.059627596598859, |
| "grad_norm": 0.7314467430114746, |
| "learning_rate": 0.00023681930826419565, |
| "loss": 3.2558, |
| "step": 56300 |
| }, |
| { |
| "epoch": 6.065009148638468, |
| "grad_norm": 0.7139166593551636, |
| "learning_rate": 0.00023649606723413424, |
| "loss": 3.2292, |
| "step": 56350 |
| }, |
| { |
| "epoch": 6.070390700678075, |
| "grad_norm": 0.7124274969100952, |
| "learning_rate": 0.0002361728262040728, |
| "loss": 3.2714, |
| "step": 56400 |
| }, |
| { |
| "epoch": 6.075772252717684, |
| "grad_norm": 0.718776285648346, |
| "learning_rate": 0.00023584958517401138, |
| "loss": 3.2716, |
| "step": 56450 |
| }, |
| { |
| "epoch": 6.081153804757292, |
| "grad_norm": 0.6894996166229248, |
| "learning_rate": 0.00023552634414395, |
| "loss": 3.2668, |
| "step": 56500 |
| }, |
| { |
| "epoch": 6.0865353567969, |
| "grad_norm": 0.729155421257019, |
| "learning_rate": 0.00023520310311388857, |
| "loss": 3.2687, |
| "step": 56550 |
| }, |
| { |
| "epoch": 6.091916908836509, |
| "grad_norm": 0.7677587866783142, |
| "learning_rate": 0.00023487986208382713, |
| "loss": 3.2677, |
| "step": 56600 |
| }, |
| { |
| "epoch": 6.097298460876116, |
| "grad_norm": 0.7513812184333801, |
| "learning_rate": 0.00023455662105376576, |
| "loss": 3.2547, |
| "step": 56650 |
| }, |
| { |
| "epoch": 6.102680012915725, |
| "grad_norm": 0.691321611404419, |
| "learning_rate": 0.00023423338002370432, |
| "loss": 3.2662, |
| "step": 56700 |
| }, |
| { |
| "epoch": 6.108061564955333, |
| "grad_norm": 0.70525062084198, |
| "learning_rate": 0.0002339101389936429, |
| "loss": 3.2635, |
| "step": 56750 |
| }, |
| { |
| "epoch": 6.113443116994941, |
| "grad_norm": 0.6724135279655457, |
| "learning_rate": 0.00023358689796358149, |
| "loss": 3.271, |
| "step": 56800 |
| }, |
| { |
| "epoch": 6.11882466903455, |
| "grad_norm": 0.7082234025001526, |
| "learning_rate": 0.00023326365693352008, |
| "loss": 3.256, |
| "step": 56850 |
| }, |
| { |
| "epoch": 6.124206221074158, |
| "grad_norm": 0.7638316750526428, |
| "learning_rate": 0.00023294041590345865, |
| "loss": 3.2694, |
| "step": 56900 |
| }, |
| { |
| "epoch": 6.129587773113766, |
| "grad_norm": 0.7289136052131653, |
| "learning_rate": 0.00023261717487339724, |
| "loss": 3.2692, |
| "step": 56950 |
| }, |
| { |
| "epoch": 6.134969325153374, |
| "grad_norm": 0.7192388772964478, |
| "learning_rate": 0.0002322939338433358, |
| "loss": 3.2646, |
| "step": 57000 |
| }, |
| { |
| "epoch": 6.134969325153374, |
| "eval_accuracy": 0.38360651747828384, |
| "eval_loss": 3.388136863708496, |
| "eval_runtime": 182.7928, |
| "eval_samples_per_second": 98.532, |
| "eval_steps_per_second": 6.16, |
| "step": 57000 |
| }, |
| { |
| "epoch": 6.140350877192983, |
| "grad_norm": 0.7573813796043396, |
| "learning_rate": 0.0002319706928132744, |
| "loss": 3.2741, |
| "step": 57050 |
| }, |
| { |
| "epoch": 6.1457324292325906, |
| "grad_norm": 0.732204020023346, |
| "learning_rate": 0.000231647451783213, |
| "loss": 3.2918, |
| "step": 57100 |
| }, |
| { |
| "epoch": 6.151113981272199, |
| "grad_norm": 0.7195457816123962, |
| "learning_rate": 0.00023132421075315157, |
| "loss": 3.2779, |
| "step": 57150 |
| }, |
| { |
| "epoch": 6.156495533311807, |
| "grad_norm": 0.7368000745773315, |
| "learning_rate": 0.0002310009697230902, |
| "loss": 3.2607, |
| "step": 57200 |
| }, |
| { |
| "epoch": 6.161877085351415, |
| "grad_norm": 0.711700975894928, |
| "learning_rate": 0.00023067772869302876, |
| "loss": 3.2661, |
| "step": 57250 |
| }, |
| { |
| "epoch": 6.167258637391024, |
| "grad_norm": 0.6981373429298401, |
| "learning_rate": 0.00023035448766296732, |
| "loss": 3.2747, |
| "step": 57300 |
| }, |
| { |
| "epoch": 6.1726401894306315, |
| "grad_norm": 0.707578182220459, |
| "learning_rate": 0.00023003124663290592, |
| "loss": 3.2662, |
| "step": 57350 |
| }, |
| { |
| "epoch": 6.17802174147024, |
| "grad_norm": 0.7146908044815063, |
| "learning_rate": 0.0002297080056028445, |
| "loss": 3.2686, |
| "step": 57400 |
| }, |
| { |
| "epoch": 6.183403293509849, |
| "grad_norm": 0.7029908895492554, |
| "learning_rate": 0.00022938476457278308, |
| "loss": 3.263, |
| "step": 57450 |
| }, |
| { |
| "epoch": 6.188784845549456, |
| "grad_norm": 0.8150429725646973, |
| "learning_rate": 0.00022906152354272168, |
| "loss": 3.2707, |
| "step": 57500 |
| }, |
| { |
| "epoch": 6.194166397589065, |
| "grad_norm": 0.7352506518363953, |
| "learning_rate": 0.00022873828251266024, |
| "loss": 3.2762, |
| "step": 57550 |
| }, |
| { |
| "epoch": 6.1995479496286725, |
| "grad_norm": 0.6864310503005981, |
| "learning_rate": 0.00022841504148259884, |
| "loss": 3.27, |
| "step": 57600 |
| }, |
| { |
| "epoch": 6.204929501668281, |
| "grad_norm": 0.737712562084198, |
| "learning_rate": 0.00022809180045253743, |
| "loss": 3.2762, |
| "step": 57650 |
| }, |
| { |
| "epoch": 6.21031105370789, |
| "grad_norm": 0.7097147107124329, |
| "learning_rate": 0.000227768559422476, |
| "loss": 3.2642, |
| "step": 57700 |
| }, |
| { |
| "epoch": 6.215692605747497, |
| "grad_norm": 0.6888893842697144, |
| "learning_rate": 0.00022744531839241457, |
| "loss": 3.282, |
| "step": 57750 |
| }, |
| { |
| "epoch": 6.221074157787106, |
| "grad_norm": 0.739660382270813, |
| "learning_rate": 0.0002271220773623532, |
| "loss": 3.2819, |
| "step": 57800 |
| }, |
| { |
| "epoch": 6.226455709826714, |
| "grad_norm": 0.7385263442993164, |
| "learning_rate": 0.00022679883633229176, |
| "loss": 3.277, |
| "step": 57850 |
| }, |
| { |
| "epoch": 6.231837261866322, |
| "grad_norm": 0.751966655254364, |
| "learning_rate": 0.00022647559530223032, |
| "loss": 3.2734, |
| "step": 57900 |
| }, |
| { |
| "epoch": 6.237218813905931, |
| "grad_norm": 0.7751827239990234, |
| "learning_rate": 0.00022615881909277016, |
| "loss": 3.2823, |
| "step": 57950 |
| }, |
| { |
| "epoch": 6.242600365945538, |
| "grad_norm": 0.7112472057342529, |
| "learning_rate": 0.00022583557806270875, |
| "loss": 3.2708, |
| "step": 58000 |
| }, |
| { |
| "epoch": 6.242600365945538, |
| "eval_accuracy": 0.38413565704194036, |
| "eval_loss": 3.384307384490967, |
| "eval_runtime": 183.0119, |
| "eval_samples_per_second": 98.414, |
| "eval_steps_per_second": 6.153, |
| "step": 58000 |
| }, |
| { |
| "epoch": 6.247981917985147, |
| "grad_norm": 0.7275688648223877, |
| "learning_rate": 0.00022551233703264732, |
| "loss": 3.2724, |
| "step": 58050 |
| }, |
| { |
| "epoch": 6.253363470024755, |
| "grad_norm": 0.7110071778297424, |
| "learning_rate": 0.0002251890960025859, |
| "loss": 3.2628, |
| "step": 58100 |
| }, |
| { |
| "epoch": 6.258745022064363, |
| "grad_norm": 0.7214199900627136, |
| "learning_rate": 0.0002248658549725245, |
| "loss": 3.2907, |
| "step": 58150 |
| }, |
| { |
| "epoch": 6.264126574103972, |
| "grad_norm": 0.6868590116500854, |
| "learning_rate": 0.00022454261394246308, |
| "loss": 3.2706, |
| "step": 58200 |
| }, |
| { |
| "epoch": 6.26950812614358, |
| "grad_norm": 0.7192474007606506, |
| "learning_rate": 0.00022421937291240164, |
| "loss": 3.2852, |
| "step": 58250 |
| }, |
| { |
| "epoch": 6.274889678183188, |
| "grad_norm": 0.6799070239067078, |
| "learning_rate": 0.00022389613188234027, |
| "loss": 3.2645, |
| "step": 58300 |
| }, |
| { |
| "epoch": 6.280271230222796, |
| "grad_norm": 0.7178609371185303, |
| "learning_rate": 0.00022357289085227883, |
| "loss": 3.2952, |
| "step": 58350 |
| }, |
| { |
| "epoch": 6.285652782262405, |
| "grad_norm": 0.7094721794128418, |
| "learning_rate": 0.0002232496498222174, |
| "loss": 3.2788, |
| "step": 58400 |
| }, |
| { |
| "epoch": 6.2910343343020125, |
| "grad_norm": 0.7339088320732117, |
| "learning_rate": 0.000222926408792156, |
| "loss": 3.2805, |
| "step": 58450 |
| }, |
| { |
| "epoch": 6.296415886341621, |
| "grad_norm": 0.7479764223098755, |
| "learning_rate": 0.0002226031677620946, |
| "loss": 3.2833, |
| "step": 58500 |
| }, |
| { |
| "epoch": 6.301797438381229, |
| "grad_norm": 0.7498093843460083, |
| "learning_rate": 0.00022227992673203316, |
| "loss": 3.2691, |
| "step": 58550 |
| }, |
| { |
| "epoch": 6.307178990420837, |
| "grad_norm": 0.6673323512077332, |
| "learning_rate": 0.00022195668570197175, |
| "loss": 3.2946, |
| "step": 58600 |
| }, |
| { |
| "epoch": 6.312560542460446, |
| "grad_norm": 0.7104136347770691, |
| "learning_rate": 0.00022163344467191032, |
| "loss": 3.2845, |
| "step": 58650 |
| }, |
| { |
| "epoch": 6.3179420945000535, |
| "grad_norm": 0.6776658296585083, |
| "learning_rate": 0.00022131020364184891, |
| "loss": 3.2848, |
| "step": 58700 |
| }, |
| { |
| "epoch": 6.323323646539662, |
| "grad_norm": 0.8017224073410034, |
| "learning_rate": 0.0002209869626117875, |
| "loss": 3.2985, |
| "step": 58750 |
| }, |
| { |
| "epoch": 6.328705198579271, |
| "grad_norm": 0.7754116654396057, |
| "learning_rate": 0.00022066372158172608, |
| "loss": 3.2727, |
| "step": 58800 |
| }, |
| { |
| "epoch": 6.334086750618878, |
| "grad_norm": 0.6976533532142639, |
| "learning_rate": 0.00022034048055166464, |
| "loss": 3.2717, |
| "step": 58850 |
| }, |
| { |
| "epoch": 6.339468302658487, |
| "grad_norm": 0.7677433490753174, |
| "learning_rate": 0.00022001723952160327, |
| "loss": 3.27, |
| "step": 58900 |
| }, |
| { |
| "epoch": 6.344849854698095, |
| "grad_norm": 0.7851221561431885, |
| "learning_rate": 0.00021969399849154183, |
| "loss": 3.2634, |
| "step": 58950 |
| }, |
| { |
| "epoch": 6.350231406737703, |
| "grad_norm": 0.7342744469642639, |
| "learning_rate": 0.00021937075746148043, |
| "loss": 3.2901, |
| "step": 59000 |
| }, |
| { |
| "epoch": 6.350231406737703, |
| "eval_accuracy": 0.3843829510146061, |
| "eval_loss": 3.38128662109375, |
| "eval_runtime": 183.0034, |
| "eval_samples_per_second": 98.419, |
| "eval_steps_per_second": 6.153, |
| "step": 59000 |
| }, |
| { |
| "epoch": 6.355612958777312, |
| "grad_norm": 0.7511043548583984, |
| "learning_rate": 0.00021904751643141902, |
| "loss": 3.2897, |
| "step": 59050 |
| }, |
| { |
| "epoch": 6.360994510816919, |
| "grad_norm": 0.7121841907501221, |
| "learning_rate": 0.0002187242754013576, |
| "loss": 3.2602, |
| "step": 59100 |
| }, |
| { |
| "epoch": 6.366376062856528, |
| "grad_norm": 0.7471939325332642, |
| "learning_rate": 0.00021840103437129619, |
| "loss": 3.2703, |
| "step": 59150 |
| }, |
| { |
| "epoch": 6.371757614896136, |
| "grad_norm": 0.7568403482437134, |
| "learning_rate": 0.00021807779334123475, |
| "loss": 3.2817, |
| "step": 59200 |
| }, |
| { |
| "epoch": 6.377139166935744, |
| "grad_norm": 0.7506998181343079, |
| "learning_rate": 0.00021775455231117335, |
| "loss": 3.2724, |
| "step": 59250 |
| }, |
| { |
| "epoch": 6.382520718975353, |
| "grad_norm": 0.7650529146194458, |
| "learning_rate": 0.00021743131128111194, |
| "loss": 3.2874, |
| "step": 59300 |
| }, |
| { |
| "epoch": 6.387902271014961, |
| "grad_norm": 0.712727963924408, |
| "learning_rate": 0.0002171080702510505, |
| "loss": 3.2699, |
| "step": 59350 |
| }, |
| { |
| "epoch": 6.393283823054569, |
| "grad_norm": 0.7634322643280029, |
| "learning_rate": 0.00021678482922098908, |
| "loss": 3.2916, |
| "step": 59400 |
| }, |
| { |
| "epoch": 6.398665375094177, |
| "grad_norm": 0.7388198375701904, |
| "learning_rate": 0.0002164615881909277, |
| "loss": 3.2963, |
| "step": 59450 |
| }, |
| { |
| "epoch": 6.404046927133785, |
| "grad_norm": 0.7384008169174194, |
| "learning_rate": 0.00021613834716086627, |
| "loss": 3.2789, |
| "step": 59500 |
| }, |
| { |
| "epoch": 6.4094284791733935, |
| "grad_norm": 0.7477920651435852, |
| "learning_rate": 0.00021581510613080483, |
| "loss": 3.2972, |
| "step": 59550 |
| }, |
| { |
| "epoch": 6.414810031213002, |
| "grad_norm": 0.7716207504272461, |
| "learning_rate": 0.00021549186510074346, |
| "loss": 3.2811, |
| "step": 59600 |
| }, |
| { |
| "epoch": 6.42019158325261, |
| "grad_norm": 0.7266528010368347, |
| "learning_rate": 0.00021516862407068202, |
| "loss": 3.2955, |
| "step": 59650 |
| }, |
| { |
| "epoch": 6.425573135292218, |
| "grad_norm": 0.7093421220779419, |
| "learning_rate": 0.0002148453830406206, |
| "loss": 3.2774, |
| "step": 59700 |
| }, |
| { |
| "epoch": 6.430954687331827, |
| "grad_norm": 0.6861085891723633, |
| "learning_rate": 0.00021452214201055919, |
| "loss": 3.2935, |
| "step": 59750 |
| }, |
| { |
| "epoch": 6.4363362393714345, |
| "grad_norm": 0.6963001489639282, |
| "learning_rate": 0.00021419890098049778, |
| "loss": 3.2823, |
| "step": 59800 |
| }, |
| { |
| "epoch": 6.441717791411043, |
| "grad_norm": 0.7377711534500122, |
| "learning_rate": 0.00021387565995043638, |
| "loss": 3.2938, |
| "step": 59850 |
| }, |
| { |
| "epoch": 6.447099343450651, |
| "grad_norm": 0.7165238857269287, |
| "learning_rate": 0.00021355241892037494, |
| "loss": 3.296, |
| "step": 59900 |
| }, |
| { |
| "epoch": 6.452480895490259, |
| "grad_norm": 0.7209922671318054, |
| "learning_rate": 0.0002132291778903135, |
| "loss": 3.2764, |
| "step": 59950 |
| }, |
| { |
| "epoch": 6.457862447529868, |
| "grad_norm": 0.6959686875343323, |
| "learning_rate": 0.00021290593686025213, |
| "loss": 3.3013, |
| "step": 60000 |
| }, |
| { |
| "epoch": 6.457862447529868, |
| "eval_accuracy": 0.38479148587278855, |
| "eval_loss": 3.3754682540893555, |
| "eval_runtime": 182.7058, |
| "eval_samples_per_second": 98.579, |
| "eval_steps_per_second": 6.163, |
| "step": 60000 |
| }, |
| { |
| "epoch": 6.4632439995694755, |
| "grad_norm": 0.8473927974700928, |
| "learning_rate": 0.0002125826958301907, |
| "loss": 3.2835, |
| "step": 60050 |
| }, |
| { |
| "epoch": 6.468625551609084, |
| "grad_norm": 0.7251399755477905, |
| "learning_rate": 0.00021225945480012927, |
| "loss": 3.3025, |
| "step": 60100 |
| }, |
| { |
| "epoch": 6.474007103648693, |
| "grad_norm": 0.767999529838562, |
| "learning_rate": 0.00021193621377006786, |
| "loss": 3.2906, |
| "step": 60150 |
| }, |
| { |
| "epoch": 6.4793886556883, |
| "grad_norm": 0.7236109972000122, |
| "learning_rate": 0.00021161297274000646, |
| "loss": 3.2944, |
| "step": 60200 |
| }, |
| { |
| "epoch": 6.484770207727909, |
| "grad_norm": 0.7092922925949097, |
| "learning_rate": 0.00021128973170994502, |
| "loss": 3.2888, |
| "step": 60250 |
| }, |
| { |
| "epoch": 6.490151759767517, |
| "grad_norm": 0.7969743609428406, |
| "learning_rate": 0.00021096649067988362, |
| "loss": 3.2951, |
| "step": 60300 |
| }, |
| { |
| "epoch": 6.495533311807125, |
| "grad_norm": 0.724816620349884, |
| "learning_rate": 0.00021064324964982219, |
| "loss": 3.2937, |
| "step": 60350 |
| }, |
| { |
| "epoch": 6.500914863846734, |
| "grad_norm": 0.792242169380188, |
| "learning_rate": 0.00021032000861976078, |
| "loss": 3.2896, |
| "step": 60400 |
| }, |
| { |
| "epoch": 6.506296415886341, |
| "grad_norm": 0.7577232718467712, |
| "learning_rate": 0.00020999676758969938, |
| "loss": 3.2881, |
| "step": 60450 |
| }, |
| { |
| "epoch": 6.51167796792595, |
| "grad_norm": 0.7519006729125977, |
| "learning_rate": 0.00020967352655963794, |
| "loss": 3.2775, |
| "step": 60500 |
| }, |
| { |
| "epoch": 6.517059519965558, |
| "grad_norm": 0.7319377660751343, |
| "learning_rate": 0.0002093502855295765, |
| "loss": 3.2871, |
| "step": 60550 |
| }, |
| { |
| "epoch": 6.522441072005166, |
| "grad_norm": 0.7363983988761902, |
| "learning_rate": 0.00020902704449951513, |
| "loss": 3.2811, |
| "step": 60600 |
| }, |
| { |
| "epoch": 6.5278226240447745, |
| "grad_norm": 0.7572370171546936, |
| "learning_rate": 0.0002087038034694537, |
| "loss": 3.2906, |
| "step": 60650 |
| }, |
| { |
| "epoch": 6.533204176084383, |
| "grad_norm": 0.7479851841926575, |
| "learning_rate": 0.00020838056243939227, |
| "loss": 3.2969, |
| "step": 60700 |
| }, |
| { |
| "epoch": 6.538585728123991, |
| "grad_norm": 0.754562497138977, |
| "learning_rate": 0.0002080573214093309, |
| "loss": 3.2942, |
| "step": 60750 |
| }, |
| { |
| "epoch": 6.543967280163599, |
| "grad_norm": 0.7065774202346802, |
| "learning_rate": 0.00020773408037926946, |
| "loss": 3.2863, |
| "step": 60800 |
| }, |
| { |
| "epoch": 6.549348832203208, |
| "grad_norm": 0.7242980599403381, |
| "learning_rate": 0.00020741083934920805, |
| "loss": 3.2771, |
| "step": 60850 |
| }, |
| { |
| "epoch": 6.5547303842428155, |
| "grad_norm": 0.7278899550437927, |
| "learning_rate": 0.00020708759831914662, |
| "loss": 3.2715, |
| "step": 60900 |
| }, |
| { |
| "epoch": 6.560111936282424, |
| "grad_norm": 0.738157331943512, |
| "learning_rate": 0.00020676435728908521, |
| "loss": 3.2806, |
| "step": 60950 |
| }, |
| { |
| "epoch": 6.565493488322032, |
| "grad_norm": 0.7449216842651367, |
| "learning_rate": 0.0002064411162590238, |
| "loss": 3.2954, |
| "step": 61000 |
| }, |
| { |
| "epoch": 6.565493488322032, |
| "eval_accuracy": 0.3854739346611513, |
| "eval_loss": 3.3736610412597656, |
| "eval_runtime": 182.6023, |
| "eval_samples_per_second": 98.635, |
| "eval_steps_per_second": 6.166, |
| "step": 61000 |
| }, |
| { |
| "epoch": 6.57087504036164, |
| "grad_norm": 0.7469400763511658, |
| "learning_rate": 0.00020611787522896238, |
| "loss": 3.2986, |
| "step": 61050 |
| }, |
| { |
| "epoch": 6.576256592401249, |
| "grad_norm": 0.73382169008255, |
| "learning_rate": 0.00020579463419890094, |
| "loss": 3.2728, |
| "step": 61100 |
| }, |
| { |
| "epoch": 6.5816381444408565, |
| "grad_norm": 0.7447705864906311, |
| "learning_rate": 0.00020547139316883957, |
| "loss": 3.298, |
| "step": 61150 |
| }, |
| { |
| "epoch": 6.587019696480465, |
| "grad_norm": 0.7730774283409119, |
| "learning_rate": 0.00020514815213877813, |
| "loss": 3.2885, |
| "step": 61200 |
| }, |
| { |
| "epoch": 6.592401248520073, |
| "grad_norm": 0.764479398727417, |
| "learning_rate": 0.0002048249111087167, |
| "loss": 3.2927, |
| "step": 61250 |
| }, |
| { |
| "epoch": 6.597782800559681, |
| "grad_norm": 0.7595162987709045, |
| "learning_rate": 0.00020450167007865532, |
| "loss": 3.2942, |
| "step": 61300 |
| }, |
| { |
| "epoch": 6.60316435259929, |
| "grad_norm": 0.7465935945510864, |
| "learning_rate": 0.0002041784290485939, |
| "loss": 3.2903, |
| "step": 61350 |
| }, |
| { |
| "epoch": 6.608545904638898, |
| "grad_norm": 0.7768982648849487, |
| "learning_rate": 0.00020385518801853246, |
| "loss": 3.2867, |
| "step": 61400 |
| }, |
| { |
| "epoch": 6.613927456678506, |
| "grad_norm": 0.7525243759155273, |
| "learning_rate": 0.00020353194698847105, |
| "loss": 3.2923, |
| "step": 61450 |
| }, |
| { |
| "epoch": 6.619309008718115, |
| "grad_norm": 0.7230768799781799, |
| "learning_rate": 0.00020320870595840965, |
| "loss": 3.287, |
| "step": 61500 |
| }, |
| { |
| "epoch": 6.624690560757722, |
| "grad_norm": 0.7975799441337585, |
| "learning_rate": 0.00020288546492834821, |
| "loss": 3.3008, |
| "step": 61550 |
| }, |
| { |
| "epoch": 6.630072112797331, |
| "grad_norm": 0.7720596194267273, |
| "learning_rate": 0.0002025622238982868, |
| "loss": 3.2733, |
| "step": 61600 |
| }, |
| { |
| "epoch": 6.635453664836939, |
| "grad_norm": 0.7760968804359436, |
| "learning_rate": 0.00020223898286822538, |
| "loss": 3.2823, |
| "step": 61650 |
| }, |
| { |
| "epoch": 6.640835216876547, |
| "grad_norm": 0.7323305010795593, |
| "learning_rate": 0.00020191574183816397, |
| "loss": 3.2987, |
| "step": 61700 |
| }, |
| { |
| "epoch": 6.6462167689161555, |
| "grad_norm": 0.7435678839683533, |
| "learning_rate": 0.00020159250080810257, |
| "loss": 3.2919, |
| "step": 61750 |
| }, |
| { |
| "epoch": 6.651598320955763, |
| "grad_norm": 0.779816210269928, |
| "learning_rate": 0.00020126925977804113, |
| "loss": 3.3058, |
| "step": 61800 |
| }, |
| { |
| "epoch": 6.656979872995372, |
| "grad_norm": 0.7927149534225464, |
| "learning_rate": 0.00020094601874797976, |
| "loss": 3.2841, |
| "step": 61850 |
| }, |
| { |
| "epoch": 6.66236142503498, |
| "grad_norm": 0.7235358953475952, |
| "learning_rate": 0.00020062277771791832, |
| "loss": 3.3052, |
| "step": 61900 |
| }, |
| { |
| "epoch": 6.667742977074588, |
| "grad_norm": 0.7375259399414062, |
| "learning_rate": 0.00020030600150845813, |
| "loss": 3.2933, |
| "step": 61950 |
| }, |
| { |
| "epoch": 6.6731245291141965, |
| "grad_norm": 0.7493704557418823, |
| "learning_rate": 0.0001999827604783967, |
| "loss": 3.2993, |
| "step": 62000 |
| }, |
| { |
| "epoch": 6.6731245291141965, |
| "eval_accuracy": 0.3856817876355404, |
| "eval_loss": 3.367237091064453, |
| "eval_runtime": 182.8132, |
| "eval_samples_per_second": 98.521, |
| "eval_steps_per_second": 6.159, |
| "step": 62000 |
| }, |
| { |
| "epoch": 6.678506081153805, |
| "grad_norm": 0.7092652916908264, |
| "learning_rate": 0.0001996595194483353, |
| "loss": 3.2846, |
| "step": 62050 |
| }, |
| { |
| "epoch": 6.683887633193413, |
| "grad_norm": 0.712834894657135, |
| "learning_rate": 0.00019933627841827389, |
| "loss": 3.2926, |
| "step": 62100 |
| }, |
| { |
| "epoch": 6.689269185233021, |
| "grad_norm": 0.7052910923957825, |
| "learning_rate": 0.00019901303738821245, |
| "loss": 3.2788, |
| "step": 62150 |
| }, |
| { |
| "epoch": 6.69465073727263, |
| "grad_norm": 0.727817177772522, |
| "learning_rate": 0.00019868979635815102, |
| "loss": 3.2872, |
| "step": 62200 |
| }, |
| { |
| "epoch": 6.7000322893122375, |
| "grad_norm": 0.7619044780731201, |
| "learning_rate": 0.00019836655532808964, |
| "loss": 3.2951, |
| "step": 62250 |
| }, |
| { |
| "epoch": 6.705413841351846, |
| "grad_norm": 0.742910623550415, |
| "learning_rate": 0.0001980433142980282, |
| "loss": 3.2846, |
| "step": 62300 |
| }, |
| { |
| "epoch": 6.710795393391454, |
| "grad_norm": 0.8113905191421509, |
| "learning_rate": 0.00019772007326796678, |
| "loss": 3.2751, |
| "step": 62350 |
| }, |
| { |
| "epoch": 6.716176945431062, |
| "grad_norm": 0.8383527994155884, |
| "learning_rate": 0.0001973968322379054, |
| "loss": 3.2939, |
| "step": 62400 |
| }, |
| { |
| "epoch": 6.721558497470671, |
| "grad_norm": 0.7623720169067383, |
| "learning_rate": 0.00019707359120784397, |
| "loss": 3.3112, |
| "step": 62450 |
| }, |
| { |
| "epoch": 6.7269400495102785, |
| "grad_norm": 0.7660823464393616, |
| "learning_rate": 0.00019675035017778253, |
| "loss": 3.2797, |
| "step": 62500 |
| }, |
| { |
| "epoch": 6.732321601549887, |
| "grad_norm": 0.7665824890136719, |
| "learning_rate": 0.00019642710914772113, |
| "loss": 3.2917, |
| "step": 62550 |
| }, |
| { |
| "epoch": 6.737703153589496, |
| "grad_norm": 0.754959762096405, |
| "learning_rate": 0.00019610386811765972, |
| "loss": 3.3053, |
| "step": 62600 |
| }, |
| { |
| "epoch": 6.743084705629103, |
| "grad_norm": 0.7177004218101501, |
| "learning_rate": 0.00019578062708759832, |
| "loss": 3.285, |
| "step": 62650 |
| }, |
| { |
| "epoch": 6.748466257668712, |
| "grad_norm": 0.741148054599762, |
| "learning_rate": 0.00019545738605753689, |
| "loss": 3.284, |
| "step": 62700 |
| }, |
| { |
| "epoch": 6.75384780970832, |
| "grad_norm": 0.7509447932243347, |
| "learning_rate": 0.00019513414502747545, |
| "loss": 3.2928, |
| "step": 62750 |
| }, |
| { |
| "epoch": 6.759229361747928, |
| "grad_norm": 0.7940971255302429, |
| "learning_rate": 0.00019481090399741408, |
| "loss": 3.2915, |
| "step": 62800 |
| }, |
| { |
| "epoch": 6.7646109137875365, |
| "grad_norm": 0.7626696228981018, |
| "learning_rate": 0.00019448766296735264, |
| "loss": 3.2999, |
| "step": 62850 |
| }, |
| { |
| "epoch": 6.769992465827144, |
| "grad_norm": 0.7458931803703308, |
| "learning_rate": 0.0001941644219372912, |
| "loss": 3.2895, |
| "step": 62900 |
| }, |
| { |
| "epoch": 6.775374017866753, |
| "grad_norm": 0.7777038812637329, |
| "learning_rate": 0.00019384118090722983, |
| "loss": 3.2885, |
| "step": 62950 |
| }, |
| { |
| "epoch": 6.780755569906361, |
| "grad_norm": 0.7893502116203308, |
| "learning_rate": 0.0001935179398771684, |
| "loss": 3.2894, |
| "step": 63000 |
| }, |
| { |
| "epoch": 6.780755569906361, |
| "eval_accuracy": 0.3861680093085102, |
| "eval_loss": 3.364060163497925, |
| "eval_runtime": 182.8147, |
| "eval_samples_per_second": 98.521, |
| "eval_steps_per_second": 6.159, |
| "step": 63000 |
| }, |
| { |
| "epoch": 6.786137121945969, |
| "grad_norm": 0.732779324054718, |
| "learning_rate": 0.00019319469884710697, |
| "loss": 3.289, |
| "step": 63050 |
| }, |
| { |
| "epoch": 6.7915186739855775, |
| "grad_norm": 0.7249733209609985, |
| "learning_rate": 0.00019287145781704556, |
| "loss": 3.2795, |
| "step": 63100 |
| }, |
| { |
| "epoch": 6.796900226025185, |
| "grad_norm": 0.7652485966682434, |
| "learning_rate": 0.00019254821678698416, |
| "loss": 3.2788, |
| "step": 63150 |
| }, |
| { |
| "epoch": 6.802281778064794, |
| "grad_norm": 0.7800796031951904, |
| "learning_rate": 0.00019222497575692272, |
| "loss": 3.282, |
| "step": 63200 |
| }, |
| { |
| "epoch": 6.807663330104402, |
| "grad_norm": 0.7617146968841553, |
| "learning_rate": 0.00019190173472686132, |
| "loss": 3.2922, |
| "step": 63250 |
| }, |
| { |
| "epoch": 6.813044882144011, |
| "grad_norm": 0.753646969795227, |
| "learning_rate": 0.0001915784936967999, |
| "loss": 3.2944, |
| "step": 63300 |
| }, |
| { |
| "epoch": 6.8184264341836185, |
| "grad_norm": 0.7656055688858032, |
| "learning_rate": 0.00019125525266673845, |
| "loss": 3.2996, |
| "step": 63350 |
| }, |
| { |
| "epoch": 6.823807986223227, |
| "grad_norm": 0.7957121133804321, |
| "learning_rate": 0.00019093201163667708, |
| "loss": 3.2804, |
| "step": 63400 |
| }, |
| { |
| "epoch": 6.829189538262835, |
| "grad_norm": 0.7702323794364929, |
| "learning_rate": 0.00019060877060661564, |
| "loss": 3.2949, |
| "step": 63450 |
| }, |
| { |
| "epoch": 6.834571090302443, |
| "grad_norm": 0.8194876313209534, |
| "learning_rate": 0.0001902855295765542, |
| "loss": 3.2796, |
| "step": 63500 |
| }, |
| { |
| "epoch": 6.839952642342052, |
| "grad_norm": 0.75143963098526, |
| "learning_rate": 0.00018996228854649283, |
| "loss": 3.3038, |
| "step": 63550 |
| }, |
| { |
| "epoch": 6.8453341943816595, |
| "grad_norm": 0.7525258660316467, |
| "learning_rate": 0.0001896390475164314, |
| "loss": 3.2956, |
| "step": 63600 |
| }, |
| { |
| "epoch": 6.850715746421268, |
| "grad_norm": 0.7692186236381531, |
| "learning_rate": 0.00018931580648637, |
| "loss": 3.2958, |
| "step": 63650 |
| }, |
| { |
| "epoch": 6.856097298460876, |
| "grad_norm": 0.7735649943351746, |
| "learning_rate": 0.00018899256545630856, |
| "loss": 3.2939, |
| "step": 63700 |
| }, |
| { |
| "epoch": 6.861478850500484, |
| "grad_norm": 0.7386011481285095, |
| "learning_rate": 0.00018866932442624716, |
| "loss": 3.3071, |
| "step": 63750 |
| }, |
| { |
| "epoch": 6.866860402540093, |
| "grad_norm": 0.7658208608627319, |
| "learning_rate": 0.00018834608339618575, |
| "loss": 3.2894, |
| "step": 63800 |
| }, |
| { |
| "epoch": 6.8722419545797, |
| "grad_norm": 0.7646046280860901, |
| "learning_rate": 0.00018802284236612432, |
| "loss": 3.3067, |
| "step": 63850 |
| }, |
| { |
| "epoch": 6.877623506619309, |
| "grad_norm": 0.732725203037262, |
| "learning_rate": 0.0001876996013360629, |
| "loss": 3.2885, |
| "step": 63900 |
| }, |
| { |
| "epoch": 6.8830050586589175, |
| "grad_norm": 0.6860552430152893, |
| "learning_rate": 0.00018738282512660272, |
| "loss": 3.3071, |
| "step": 63950 |
| }, |
| { |
| "epoch": 6.888386610698525, |
| "grad_norm": 0.7384079694747925, |
| "learning_rate": 0.0001870595840965413, |
| "loss": 3.294, |
| "step": 64000 |
| }, |
| { |
| "epoch": 6.888386610698525, |
| "eval_accuracy": 0.38654601270521677, |
| "eval_loss": 3.361266851425171, |
| "eval_runtime": 182.8473, |
| "eval_samples_per_second": 98.503, |
| "eval_steps_per_second": 6.158, |
| "step": 64000 |
| }, |
| { |
| "epoch": 6.893768162738134, |
| "grad_norm": 0.7515199184417725, |
| "learning_rate": 0.0001867363430664799, |
| "loss": 3.2797, |
| "step": 64050 |
| }, |
| { |
| "epoch": 6.899149714777742, |
| "grad_norm": 0.7795135378837585, |
| "learning_rate": 0.00018641310203641848, |
| "loss": 3.2799, |
| "step": 64100 |
| }, |
| { |
| "epoch": 6.90453126681735, |
| "grad_norm": 0.7016361355781555, |
| "learning_rate": 0.00018608986100635705, |
| "loss": 3.2991, |
| "step": 64150 |
| }, |
| { |
| "epoch": 6.9099128188569585, |
| "grad_norm": 0.7909216284751892, |
| "learning_rate": 0.00018576661997629564, |
| "loss": 3.2868, |
| "step": 64200 |
| }, |
| { |
| "epoch": 6.915294370896566, |
| "grad_norm": 0.7275353074073792, |
| "learning_rate": 0.00018544337894623423, |
| "loss": 3.2848, |
| "step": 64250 |
| }, |
| { |
| "epoch": 6.920675922936175, |
| "grad_norm": 0.7585761547088623, |
| "learning_rate": 0.0001851201379161728, |
| "loss": 3.2827, |
| "step": 64300 |
| }, |
| { |
| "epoch": 6.926057474975783, |
| "grad_norm": 0.7263973355293274, |
| "learning_rate": 0.0001847968968861114, |
| "loss": 3.296, |
| "step": 64350 |
| }, |
| { |
| "epoch": 6.931439027015391, |
| "grad_norm": 0.751746654510498, |
| "learning_rate": 0.00018447365585604996, |
| "loss": 3.2963, |
| "step": 64400 |
| }, |
| { |
| "epoch": 6.9368205790549995, |
| "grad_norm": 0.748279333114624, |
| "learning_rate": 0.00018415041482598859, |
| "loss": 3.2842, |
| "step": 64450 |
| }, |
| { |
| "epoch": 6.942202131094608, |
| "grad_norm": 0.7945259213447571, |
| "learning_rate": 0.00018382717379592715, |
| "loss": 3.3071, |
| "step": 64500 |
| }, |
| { |
| "epoch": 6.947583683134216, |
| "grad_norm": 0.74016934633255, |
| "learning_rate": 0.00018350393276586572, |
| "loss": 3.2925, |
| "step": 64550 |
| }, |
| { |
| "epoch": 6.952965235173824, |
| "grad_norm": 0.7379736304283142, |
| "learning_rate": 0.00018318069173580434, |
| "loss": 3.2959, |
| "step": 64600 |
| }, |
| { |
| "epoch": 6.958346787213433, |
| "grad_norm": 0.7810225486755371, |
| "learning_rate": 0.0001828574507057429, |
| "loss": 3.3001, |
| "step": 64650 |
| }, |
| { |
| "epoch": 6.9637283392530405, |
| "grad_norm": 0.7810648083686829, |
| "learning_rate": 0.00018253420967568148, |
| "loss": 3.2961, |
| "step": 64700 |
| }, |
| { |
| "epoch": 6.969109891292649, |
| "grad_norm": 0.740482747554779, |
| "learning_rate": 0.00018221096864562007, |
| "loss": 3.3029, |
| "step": 64750 |
| }, |
| { |
| "epoch": 6.974491443332257, |
| "grad_norm": 0.7730101943016052, |
| "learning_rate": 0.00018188772761555867, |
| "loss": 3.2891, |
| "step": 64800 |
| }, |
| { |
| "epoch": 6.979872995371865, |
| "grad_norm": 0.8048725128173828, |
| "learning_rate": 0.00018156448658549723, |
| "loss": 3.2735, |
| "step": 64850 |
| }, |
| { |
| "epoch": 6.985254547411474, |
| "grad_norm": 0.76474529504776, |
| "learning_rate": 0.00018124124555543583, |
| "loss": 3.2922, |
| "step": 64900 |
| }, |
| { |
| "epoch": 6.990636099451081, |
| "grad_norm": 0.7955503463745117, |
| "learning_rate": 0.0001809180045253744, |
| "loss": 3.2788, |
| "step": 64950 |
| }, |
| { |
| "epoch": 6.99601765149069, |
| "grad_norm": 0.7674031257629395, |
| "learning_rate": 0.00018059476349531296, |
| "loss": 3.2997, |
| "step": 65000 |
| }, |
| { |
| "epoch": 6.99601765149069, |
| "eval_accuracy": 0.3869485716545694, |
| "eval_loss": 3.357651710510254, |
| "eval_runtime": 182.7956, |
| "eval_samples_per_second": 98.531, |
| "eval_steps_per_second": 6.16, |
| "step": 65000 |
| }, |
| { |
| "epoch": 7.0013992035302985, |
| "grad_norm": 0.7507967948913574, |
| "learning_rate": 0.00018027152246525159, |
| "loss": 3.2669, |
| "step": 65050 |
| }, |
| { |
| "epoch": 7.006780755569906, |
| "grad_norm": 0.7749801278114319, |
| "learning_rate": 0.00017994828143519015, |
| "loss": 3.2086, |
| "step": 65100 |
| }, |
| { |
| "epoch": 7.012162307609515, |
| "grad_norm": 0.751576840877533, |
| "learning_rate": 0.00017962504040512872, |
| "loss": 3.1952, |
| "step": 65150 |
| }, |
| { |
| "epoch": 7.017543859649122, |
| "grad_norm": 0.719531774520874, |
| "learning_rate": 0.00017930179937506734, |
| "loss": 3.2096, |
| "step": 65200 |
| }, |
| { |
| "epoch": 7.022925411688731, |
| "grad_norm": 0.7776346802711487, |
| "learning_rate": 0.0001789785583450059, |
| "loss": 3.1961, |
| "step": 65250 |
| }, |
| { |
| "epoch": 7.0283069637283395, |
| "grad_norm": 0.7524562478065491, |
| "learning_rate": 0.00017865531731494448, |
| "loss": 3.212, |
| "step": 65300 |
| }, |
| { |
| "epoch": 7.033688515767947, |
| "grad_norm": 0.7441275119781494, |
| "learning_rate": 0.00017833207628488307, |
| "loss": 3.2052, |
| "step": 65350 |
| }, |
| { |
| "epoch": 7.039070067807556, |
| "grad_norm": 0.7578763365745544, |
| "learning_rate": 0.00017800883525482167, |
| "loss": 3.2185, |
| "step": 65400 |
| }, |
| { |
| "epoch": 7.044451619847164, |
| "grad_norm": 0.7912497520446777, |
| "learning_rate": 0.00017768559422476026, |
| "loss": 3.2265, |
| "step": 65450 |
| }, |
| { |
| "epoch": 7.049833171886772, |
| "grad_norm": 0.7631890177726746, |
| "learning_rate": 0.00017736235319469883, |
| "loss": 3.2046, |
| "step": 65500 |
| }, |
| { |
| "epoch": 7.0552147239263805, |
| "grad_norm": 0.7851689457893372, |
| "learning_rate": 0.0001770391121646374, |
| "loss": 3.2074, |
| "step": 65550 |
| }, |
| { |
| "epoch": 7.060596275965988, |
| "grad_norm": 0.734584629535675, |
| "learning_rate": 0.00017671587113457602, |
| "loss": 3.2278, |
| "step": 65600 |
| }, |
| { |
| "epoch": 7.065977828005597, |
| "grad_norm": 0.7264954447746277, |
| "learning_rate": 0.0001763926301045146, |
| "loss": 3.2161, |
| "step": 65650 |
| }, |
| { |
| "epoch": 7.071359380045205, |
| "grad_norm": 0.8186044692993164, |
| "learning_rate": 0.00017606938907445315, |
| "loss": 3.1974, |
| "step": 65700 |
| }, |
| { |
| "epoch": 7.076740932084813, |
| "grad_norm": 0.7647484540939331, |
| "learning_rate": 0.00017574614804439178, |
| "loss": 3.1959, |
| "step": 65750 |
| }, |
| { |
| "epoch": 7.0821224841244215, |
| "grad_norm": 0.7596768736839294, |
| "learning_rate": 0.00017542290701433034, |
| "loss": 3.216, |
| "step": 65800 |
| }, |
| { |
| "epoch": 7.08750403616403, |
| "grad_norm": 0.7448734641075134, |
| "learning_rate": 0.0001750996659842689, |
| "loss": 3.2196, |
| "step": 65850 |
| }, |
| { |
| "epoch": 7.092885588203638, |
| "grad_norm": 0.7379791736602783, |
| "learning_rate": 0.0001747764249542075, |
| "loss": 3.2247, |
| "step": 65900 |
| }, |
| { |
| "epoch": 7.098267140243246, |
| "grad_norm": 0.7869905829429626, |
| "learning_rate": 0.0001744531839241461, |
| "loss": 3.2226, |
| "step": 65950 |
| }, |
| { |
| "epoch": 7.103648692282855, |
| "grad_norm": 0.7377243638038635, |
| "learning_rate": 0.0001741364077146859, |
| "loss": 3.238, |
| "step": 66000 |
| }, |
| { |
| "epoch": 7.103648692282855, |
| "eval_accuracy": 0.3870309305435328, |
| "eval_loss": 3.3637986183166504, |
| "eval_runtime": 182.81, |
| "eval_samples_per_second": 98.523, |
| "eval_steps_per_second": 6.159, |
| "step": 66000 |
| }, |
| { |
| "epoch": 7.109030244322462, |
| "grad_norm": 0.7684816122055054, |
| "learning_rate": 0.00017381316668462447, |
| "loss": 3.2179, |
| "step": 66050 |
| }, |
| { |
| "epoch": 7.114411796362071, |
| "grad_norm": 0.7641874551773071, |
| "learning_rate": 0.00017348992565456304, |
| "loss": 3.2054, |
| "step": 66100 |
| }, |
| { |
| "epoch": 7.119793348401679, |
| "grad_norm": 0.7933626174926758, |
| "learning_rate": 0.00017316668462450166, |
| "loss": 3.2072, |
| "step": 66150 |
| }, |
| { |
| "epoch": 7.125174900441287, |
| "grad_norm": 0.7817862629890442, |
| "learning_rate": 0.00017284344359444023, |
| "loss": 3.2125, |
| "step": 66200 |
| }, |
| { |
| "epoch": 7.130556452480896, |
| "grad_norm": 0.8065374493598938, |
| "learning_rate": 0.00017252020256437885, |
| "loss": 3.236, |
| "step": 66250 |
| }, |
| { |
| "epoch": 7.135938004520503, |
| "grad_norm": 0.7458828091621399, |
| "learning_rate": 0.00017219696153431742, |
| "loss": 3.219, |
| "step": 66300 |
| }, |
| { |
| "epoch": 7.141319556560112, |
| "grad_norm": 0.7864915132522583, |
| "learning_rate": 0.000171873720504256, |
| "loss": 3.2115, |
| "step": 66350 |
| }, |
| { |
| "epoch": 7.1467011085997205, |
| "grad_norm": 0.7758970260620117, |
| "learning_rate": 0.0001715569442947958, |
| "loss": 3.2207, |
| "step": 66400 |
| }, |
| { |
| "epoch": 7.152082660639328, |
| "grad_norm": 0.7451347708702087, |
| "learning_rate": 0.0001712337032647344, |
| "loss": 3.2267, |
| "step": 66450 |
| }, |
| { |
| "epoch": 7.157464212678937, |
| "grad_norm": 0.7944716215133667, |
| "learning_rate": 0.00017091046223467298, |
| "loss": 3.2292, |
| "step": 66500 |
| }, |
| { |
| "epoch": 7.162845764718545, |
| "grad_norm": 0.7795634269714355, |
| "learning_rate": 0.00017058722120461155, |
| "loss": 3.2236, |
| "step": 66550 |
| }, |
| { |
| "epoch": 7.168227316758153, |
| "grad_norm": 0.8057452440261841, |
| "learning_rate": 0.00017026398017455012, |
| "loss": 3.2259, |
| "step": 66600 |
| }, |
| { |
| "epoch": 7.1736088687977615, |
| "grad_norm": 0.7921217679977417, |
| "learning_rate": 0.00016994073914448874, |
| "loss": 3.2398, |
| "step": 66650 |
| }, |
| { |
| "epoch": 7.178990420837369, |
| "grad_norm": 0.7860040664672852, |
| "learning_rate": 0.0001696174981144273, |
| "loss": 3.2137, |
| "step": 66700 |
| }, |
| { |
| "epoch": 7.184371972876978, |
| "grad_norm": 0.7447194457054138, |
| "learning_rate": 0.00016929425708436588, |
| "loss": 3.2191, |
| "step": 66750 |
| }, |
| { |
| "epoch": 7.189753524916586, |
| "grad_norm": 0.757375180721283, |
| "learning_rate": 0.0001689710160543045, |
| "loss": 3.2208, |
| "step": 66800 |
| }, |
| { |
| "epoch": 7.195135076956194, |
| "grad_norm": 0.7810249924659729, |
| "learning_rate": 0.00016864777502424307, |
| "loss": 3.2391, |
| "step": 66850 |
| }, |
| { |
| "epoch": 7.2005166289958025, |
| "grad_norm": 0.8149625062942505, |
| "learning_rate": 0.00016832453399418163, |
| "loss": 3.2079, |
| "step": 66900 |
| }, |
| { |
| "epoch": 7.205898181035411, |
| "grad_norm": 0.7833424210548401, |
| "learning_rate": 0.00016800129296412023, |
| "loss": 3.2331, |
| "step": 66950 |
| }, |
| { |
| "epoch": 7.211279733075019, |
| "grad_norm": 0.8489099740982056, |
| "learning_rate": 0.00016767805193405882, |
| "loss": 3.2202, |
| "step": 67000 |
| }, |
| { |
| "epoch": 7.211279733075019, |
| "eval_accuracy": 0.38730658291991815, |
| "eval_loss": 3.3612101078033447, |
| "eval_runtime": 182.5912, |
| "eval_samples_per_second": 98.641, |
| "eval_steps_per_second": 6.167, |
| "step": 67000 |
| }, |
| { |
| "epoch": 7.216661285114627, |
| "grad_norm": 0.7586920857429504, |
| "learning_rate": 0.0001673548109039974, |
| "loss": 3.2246, |
| "step": 67050 |
| }, |
| { |
| "epoch": 7.222042837154235, |
| "grad_norm": 0.8181234002113342, |
| "learning_rate": 0.00016703156987393598, |
| "loss": 3.2262, |
| "step": 67100 |
| }, |
| { |
| "epoch": 7.2274243891938434, |
| "grad_norm": 0.7846294641494751, |
| "learning_rate": 0.00016670832884387455, |
| "loss": 3.2376, |
| "step": 67150 |
| }, |
| { |
| "epoch": 7.232805941233452, |
| "grad_norm": 0.7629982233047485, |
| "learning_rate": 0.00016638508781381317, |
| "loss": 3.2267, |
| "step": 67200 |
| }, |
| { |
| "epoch": 7.23818749327306, |
| "grad_norm": 0.8482091426849365, |
| "learning_rate": 0.00016606184678375174, |
| "loss": 3.237, |
| "step": 67250 |
| }, |
| { |
| "epoch": 7.243569045312668, |
| "grad_norm": 0.8140213489532471, |
| "learning_rate": 0.0001657386057536903, |
| "loss": 3.2355, |
| "step": 67300 |
| }, |
| { |
| "epoch": 7.248950597352277, |
| "grad_norm": 0.7859810590744019, |
| "learning_rate": 0.00016541536472362893, |
| "loss": 3.2088, |
| "step": 67350 |
| }, |
| { |
| "epoch": 7.254332149391884, |
| "grad_norm": 0.7926174998283386, |
| "learning_rate": 0.0001650921236935675, |
| "loss": 3.2284, |
| "step": 67400 |
| }, |
| { |
| "epoch": 7.259713701431493, |
| "grad_norm": 0.8039096593856812, |
| "learning_rate": 0.00016476888266350607, |
| "loss": 3.2463, |
| "step": 67450 |
| }, |
| { |
| "epoch": 7.265095253471101, |
| "grad_norm": 0.7596452832221985, |
| "learning_rate": 0.00016444564163344466, |
| "loss": 3.2319, |
| "step": 67500 |
| }, |
| { |
| "epoch": 7.270476805510709, |
| "grad_norm": 0.7452762126922607, |
| "learning_rate": 0.00016412240060338326, |
| "loss": 3.2248, |
| "step": 67550 |
| }, |
| { |
| "epoch": 7.275858357550318, |
| "grad_norm": 0.7983496189117432, |
| "learning_rate": 0.00016379915957332182, |
| "loss": 3.2319, |
| "step": 67600 |
| }, |
| { |
| "epoch": 7.281239909589925, |
| "grad_norm": 0.7758620977401733, |
| "learning_rate": 0.00016347591854326042, |
| "loss": 3.2352, |
| "step": 67650 |
| }, |
| { |
| "epoch": 7.286621461629534, |
| "grad_norm": 0.8212805986404419, |
| "learning_rate": 0.00016315267751319898, |
| "loss": 3.2403, |
| "step": 67700 |
| }, |
| { |
| "epoch": 7.2920030136691425, |
| "grad_norm": 0.8642499446868896, |
| "learning_rate": 0.00016282943648313755, |
| "loss": 3.2181, |
| "step": 67750 |
| }, |
| { |
| "epoch": 7.29738456570875, |
| "grad_norm": 0.7983525991439819, |
| "learning_rate": 0.00016250619545307617, |
| "loss": 3.2292, |
| "step": 67800 |
| }, |
| { |
| "epoch": 7.302766117748359, |
| "grad_norm": 0.7927777171134949, |
| "learning_rate": 0.00016218295442301474, |
| "loss": 3.2408, |
| "step": 67850 |
| }, |
| { |
| "epoch": 7.308147669787967, |
| "grad_norm": 0.7210763692855835, |
| "learning_rate": 0.0001618597133929533, |
| "loss": 3.2259, |
| "step": 67900 |
| }, |
| { |
| "epoch": 7.313529221827575, |
| "grad_norm": 0.8408624529838562, |
| "learning_rate": 0.00016153647236289193, |
| "loss": 3.2333, |
| "step": 67950 |
| }, |
| { |
| "epoch": 7.3189107738671835, |
| "grad_norm": 0.7780632376670837, |
| "learning_rate": 0.0001612132313328305, |
| "loss": 3.2489, |
| "step": 68000 |
| }, |
| { |
| "epoch": 7.3189107738671835, |
| "eval_accuracy": 0.38737057947084086, |
| "eval_loss": 3.356868028640747, |
| "eval_runtime": 182.8416, |
| "eval_samples_per_second": 98.506, |
| "eval_steps_per_second": 6.158, |
| "step": 68000 |
| }, |
| { |
| "epoch": 7.324292325906791, |
| "grad_norm": 0.7512384653091431, |
| "learning_rate": 0.0001608899903027691, |
| "loss": 3.2333, |
| "step": 68050 |
| }, |
| { |
| "epoch": 7.3296738779464, |
| "grad_norm": 0.8020547032356262, |
| "learning_rate": 0.00016056674927270766, |
| "loss": 3.2387, |
| "step": 68100 |
| }, |
| { |
| "epoch": 7.335055429986008, |
| "grad_norm": 0.7911236882209778, |
| "learning_rate": 0.00016024350824264626, |
| "loss": 3.238, |
| "step": 68150 |
| }, |
| { |
| "epoch": 7.340436982025616, |
| "grad_norm": 0.8028789758682251, |
| "learning_rate": 0.00015992026721258485, |
| "loss": 3.2412, |
| "step": 68200 |
| }, |
| { |
| "epoch": 7.3458185340652244, |
| "grad_norm": 0.8485779166221619, |
| "learning_rate": 0.00015959702618252342, |
| "loss": 3.2367, |
| "step": 68250 |
| }, |
| { |
| "epoch": 7.351200086104833, |
| "grad_norm": 0.7784638404846191, |
| "learning_rate": 0.00015927378515246199, |
| "loss": 3.2297, |
| "step": 68300 |
| }, |
| { |
| "epoch": 7.356581638144441, |
| "grad_norm": 0.7919899821281433, |
| "learning_rate": 0.0001589505441224006, |
| "loss": 3.2273, |
| "step": 68350 |
| }, |
| { |
| "epoch": 7.361963190184049, |
| "grad_norm": 0.7827087044715881, |
| "learning_rate": 0.00015862730309233917, |
| "loss": 3.2288, |
| "step": 68400 |
| }, |
| { |
| "epoch": 7.367344742223658, |
| "grad_norm": 0.7712171077728271, |
| "learning_rate": 0.00015830406206227774, |
| "loss": 3.2382, |
| "step": 68450 |
| }, |
| { |
| "epoch": 7.372726294263265, |
| "grad_norm": 0.7548463940620422, |
| "learning_rate": 0.00015798082103221636, |
| "loss": 3.2307, |
| "step": 68500 |
| }, |
| { |
| "epoch": 7.378107846302874, |
| "grad_norm": 0.7684069275856018, |
| "learning_rate": 0.00015765758000215493, |
| "loss": 3.2362, |
| "step": 68550 |
| }, |
| { |
| "epoch": 7.383489398342482, |
| "grad_norm": 0.7532280087471008, |
| "learning_rate": 0.0001573343389720935, |
| "loss": 3.2386, |
| "step": 68600 |
| }, |
| { |
| "epoch": 7.38887095038209, |
| "grad_norm": 0.7906218767166138, |
| "learning_rate": 0.0001570110979420321, |
| "loss": 3.2553, |
| "step": 68650 |
| }, |
| { |
| "epoch": 7.394252502421699, |
| "grad_norm": 0.7797097563743591, |
| "learning_rate": 0.0001566878569119707, |
| "loss": 3.2411, |
| "step": 68700 |
| }, |
| { |
| "epoch": 7.399634054461306, |
| "grad_norm": 0.8284671902656555, |
| "learning_rate": 0.00015636461588190926, |
| "loss": 3.2456, |
| "step": 68750 |
| }, |
| { |
| "epoch": 7.405015606500915, |
| "grad_norm": 0.7855088114738464, |
| "learning_rate": 0.00015604137485184785, |
| "loss": 3.2456, |
| "step": 68800 |
| }, |
| { |
| "epoch": 7.4103971585405235, |
| "grad_norm": 0.8041961193084717, |
| "learning_rate": 0.00015571813382178642, |
| "loss": 3.2236, |
| "step": 68850 |
| }, |
| { |
| "epoch": 7.415778710580131, |
| "grad_norm": 0.8667538166046143, |
| "learning_rate": 0.000155394892791725, |
| "loss": 3.2492, |
| "step": 68900 |
| }, |
| { |
| "epoch": 7.42116026261974, |
| "grad_norm": 0.7783315777778625, |
| "learning_rate": 0.0001550716517616636, |
| "loss": 3.2429, |
| "step": 68950 |
| }, |
| { |
| "epoch": 7.426541814659347, |
| "grad_norm": 0.7982115149497986, |
| "learning_rate": 0.00015474841073160217, |
| "loss": 3.2278, |
| "step": 69000 |
| }, |
| { |
| "epoch": 7.426541814659347, |
| "eval_accuracy": 0.3878293119631936, |
| "eval_loss": 3.3534276485443115, |
| "eval_runtime": 182.9528, |
| "eval_samples_per_second": 98.446, |
| "eval_steps_per_second": 6.155, |
| "step": 69000 |
| }, |
| { |
| "epoch": 7.431923366698956, |
| "grad_norm": 0.8143750429153442, |
| "learning_rate": 0.0001544251697015408, |
| "loss": 3.2492, |
| "step": 69050 |
| }, |
| { |
| "epoch": 7.4373049187385645, |
| "grad_norm": 0.8255060315132141, |
| "learning_rate": 0.00015410192867147936, |
| "loss": 3.2362, |
| "step": 69100 |
| }, |
| { |
| "epoch": 7.442686470778172, |
| "grad_norm": 0.8084985613822937, |
| "learning_rate": 0.00015377868764141793, |
| "loss": 3.2162, |
| "step": 69150 |
| }, |
| { |
| "epoch": 7.448068022817781, |
| "grad_norm": 0.8223511576652527, |
| "learning_rate": 0.00015345544661135653, |
| "loss": 3.2509, |
| "step": 69200 |
| }, |
| { |
| "epoch": 7.453449574857389, |
| "grad_norm": 0.7960752844810486, |
| "learning_rate": 0.00015313220558129512, |
| "loss": 3.2334, |
| "step": 69250 |
| }, |
| { |
| "epoch": 7.458831126896997, |
| "grad_norm": 0.8062809705734253, |
| "learning_rate": 0.0001528089645512337, |
| "loss": 3.2391, |
| "step": 69300 |
| }, |
| { |
| "epoch": 7.4642126789366054, |
| "grad_norm": 0.8449177742004395, |
| "learning_rate": 0.00015248572352117228, |
| "loss": 3.2271, |
| "step": 69350 |
| }, |
| { |
| "epoch": 7.469594230976213, |
| "grad_norm": 0.8337926864624023, |
| "learning_rate": 0.00015216248249111085, |
| "loss": 3.2184, |
| "step": 69400 |
| }, |
| { |
| "epoch": 7.474975783015822, |
| "grad_norm": 0.7521905899047852, |
| "learning_rate": 0.00015183924146104945, |
| "loss": 3.2245, |
| "step": 69450 |
| }, |
| { |
| "epoch": 7.48035733505543, |
| "grad_norm": 0.8167929649353027, |
| "learning_rate": 0.00015151600043098804, |
| "loss": 3.2562, |
| "step": 69500 |
| }, |
| { |
| "epoch": 7.485738887095038, |
| "grad_norm": 0.7685818076133728, |
| "learning_rate": 0.0001511927594009266, |
| "loss": 3.2436, |
| "step": 69550 |
| }, |
| { |
| "epoch": 7.491120439134646, |
| "grad_norm": 0.7431442141532898, |
| "learning_rate": 0.00015086951837086518, |
| "loss": 3.2319, |
| "step": 69600 |
| }, |
| { |
| "epoch": 7.496501991174255, |
| "grad_norm": 0.783828854560852, |
| "learning_rate": 0.0001505462773408038, |
| "loss": 3.2341, |
| "step": 69650 |
| }, |
| { |
| "epoch": 7.501883543213863, |
| "grad_norm": 0.7887234687805176, |
| "learning_rate": 0.00015022303631074236, |
| "loss": 3.2257, |
| "step": 69700 |
| }, |
| { |
| "epoch": 7.507265095253471, |
| "grad_norm": 0.8269175887107849, |
| "learning_rate": 0.00014989979528068096, |
| "loss": 3.2514, |
| "step": 69750 |
| }, |
| { |
| "epoch": 7.51264664729308, |
| "grad_norm": 0.8568757772445679, |
| "learning_rate": 0.00014957655425061953, |
| "loss": 3.2363, |
| "step": 69800 |
| }, |
| { |
| "epoch": 7.518028199332687, |
| "grad_norm": 0.7879490852355957, |
| "learning_rate": 0.00014925331322055812, |
| "loss": 3.2664, |
| "step": 69850 |
| }, |
| { |
| "epoch": 7.523409751372296, |
| "grad_norm": 0.8363874554634094, |
| "learning_rate": 0.0001489300721904967, |
| "loss": 3.2386, |
| "step": 69900 |
| }, |
| { |
| "epoch": 7.528791303411904, |
| "grad_norm": 0.7977311015129089, |
| "learning_rate": 0.00014860683116043528, |
| "loss": 3.2544, |
| "step": 69950 |
| }, |
| { |
| "epoch": 7.534172855451512, |
| "grad_norm": 0.7749576568603516, |
| "learning_rate": 0.00014828359013037385, |
| "loss": 3.2375, |
| "step": 70000 |
| }, |
| { |
| "epoch": 7.534172855451512, |
| "eval_accuracy": 0.38801554301290764, |
| "eval_loss": 3.349801540374756, |
| "eval_runtime": 182.841, |
| "eval_samples_per_second": 98.506, |
| "eval_steps_per_second": 6.158, |
| "step": 70000 |
| }, |
| { |
| "epoch": 7.539554407491121, |
| "grad_norm": 0.7948490381240845, |
| "learning_rate": 0.00014796034910031245, |
| "loss": 3.2395, |
| "step": 70050 |
| }, |
| { |
| "epoch": 7.544935959530728, |
| "grad_norm": 0.7794623374938965, |
| "learning_rate": 0.00014763710807025104, |
| "loss": 3.2374, |
| "step": 70100 |
| }, |
| { |
| "epoch": 7.550317511570337, |
| "grad_norm": 0.7769473791122437, |
| "learning_rate": 0.0001473138670401896, |
| "loss": 3.2361, |
| "step": 70150 |
| }, |
| { |
| "epoch": 7.5556990636099455, |
| "grad_norm": 0.8074938654899597, |
| "learning_rate": 0.0001469906260101282, |
| "loss": 3.2296, |
| "step": 70200 |
| }, |
| { |
| "epoch": 7.561080615649553, |
| "grad_norm": 0.7983949184417725, |
| "learning_rate": 0.0001466673849800668, |
| "loss": 3.2314, |
| "step": 70250 |
| }, |
| { |
| "epoch": 7.566462167689162, |
| "grad_norm": 0.8049135208129883, |
| "learning_rate": 0.0001463441439500054, |
| "loss": 3.2324, |
| "step": 70300 |
| }, |
| { |
| "epoch": 7.57184371972877, |
| "grad_norm": 0.8315684795379639, |
| "learning_rate": 0.00014602090291994396, |
| "loss": 3.256, |
| "step": 70350 |
| }, |
| { |
| "epoch": 7.577225271768378, |
| "grad_norm": 0.8284087181091309, |
| "learning_rate": 0.00014569766188988255, |
| "loss": 3.2303, |
| "step": 70400 |
| }, |
| { |
| "epoch": 7.5826068238079865, |
| "grad_norm": 0.8085214495658875, |
| "learning_rate": 0.00014538088568042236, |
| "loss": 3.2405, |
| "step": 70450 |
| }, |
| { |
| "epoch": 7.587988375847594, |
| "grad_norm": 0.8195266723632812, |
| "learning_rate": 0.00014505764465036093, |
| "loss": 3.2479, |
| "step": 70500 |
| }, |
| { |
| "epoch": 7.593369927887203, |
| "grad_norm": 0.7887190580368042, |
| "learning_rate": 0.00014473440362029952, |
| "loss": 3.229, |
| "step": 70550 |
| }, |
| { |
| "epoch": 7.598751479926811, |
| "grad_norm": 0.7637575268745422, |
| "learning_rate": 0.00014441116259023812, |
| "loss": 3.2514, |
| "step": 70600 |
| }, |
| { |
| "epoch": 7.604133031966419, |
| "grad_norm": 0.8185248970985413, |
| "learning_rate": 0.00014408792156017669, |
| "loss": 3.2379, |
| "step": 70650 |
| }, |
| { |
| "epoch": 7.609514584006027, |
| "grad_norm": 0.7940756678581238, |
| "learning_rate": 0.00014376468053011528, |
| "loss": 3.2618, |
| "step": 70700 |
| }, |
| { |
| "epoch": 7.614896136045635, |
| "grad_norm": 0.7927145957946777, |
| "learning_rate": 0.00014344143950005387, |
| "loss": 3.2455, |
| "step": 70750 |
| }, |
| { |
| "epoch": 7.620277688085244, |
| "grad_norm": 0.7812114953994751, |
| "learning_rate": 0.00014311819846999244, |
| "loss": 3.2428, |
| "step": 70800 |
| }, |
| { |
| "epoch": 7.625659240124852, |
| "grad_norm": 0.7778552174568176, |
| "learning_rate": 0.00014279495743993104, |
| "loss": 3.2367, |
| "step": 70850 |
| }, |
| { |
| "epoch": 7.63104079216446, |
| "grad_norm": 0.8149660229682922, |
| "learning_rate": 0.00014247171640986963, |
| "loss": 3.244, |
| "step": 70900 |
| }, |
| { |
| "epoch": 7.636422344204068, |
| "grad_norm": 0.8159542679786682, |
| "learning_rate": 0.0001421484753798082, |
| "loss": 3.234, |
| "step": 70950 |
| }, |
| { |
| "epoch": 7.641803896243677, |
| "grad_norm": 0.7451802492141724, |
| "learning_rate": 0.0001418252343497468, |
| "loss": 3.247, |
| "step": 71000 |
| }, |
| { |
| "epoch": 7.641803896243677, |
| "eval_accuracy": 0.38847731778611916, |
| "eval_loss": 3.345766305923462, |
| "eval_runtime": 182.6769, |
| "eval_samples_per_second": 98.595, |
| "eval_steps_per_second": 6.164, |
| "step": 71000 |
| }, |
| { |
| "epoch": 7.647185448283285, |
| "grad_norm": 0.8018648624420166, |
| "learning_rate": 0.00014150199331968536, |
| "loss": 3.2532, |
| "step": 71050 |
| }, |
| { |
| "epoch": 7.652567000322893, |
| "grad_norm": 0.8257297277450562, |
| "learning_rate": 0.00014118521711022517, |
| "loss": 3.2431, |
| "step": 71100 |
| }, |
| { |
| "epoch": 7.657948552362502, |
| "grad_norm": 0.8156144618988037, |
| "learning_rate": 0.00014086197608016376, |
| "loss": 3.2349, |
| "step": 71150 |
| }, |
| { |
| "epoch": 7.663330104402109, |
| "grad_norm": 0.792155385017395, |
| "learning_rate": 0.00014053873505010236, |
| "loss": 3.2245, |
| "step": 71200 |
| }, |
| { |
| "epoch": 7.668711656441718, |
| "grad_norm": 0.8484143018722534, |
| "learning_rate": 0.00014021549402004092, |
| "loss": 3.2349, |
| "step": 71250 |
| }, |
| { |
| "epoch": 7.674093208481326, |
| "grad_norm": 0.8315868377685547, |
| "learning_rate": 0.00013989225298997952, |
| "loss": 3.2703, |
| "step": 71300 |
| }, |
| { |
| "epoch": 7.679474760520934, |
| "grad_norm": 0.7925692200660706, |
| "learning_rate": 0.00013956901195991811, |
| "loss": 3.2345, |
| "step": 71350 |
| }, |
| { |
| "epoch": 7.684856312560543, |
| "grad_norm": 0.8163354396820068, |
| "learning_rate": 0.00013924577092985668, |
| "loss": 3.2444, |
| "step": 71400 |
| }, |
| { |
| "epoch": 7.69023786460015, |
| "grad_norm": 0.82204669713974, |
| "learning_rate": 0.00013892252989979528, |
| "loss": 3.2424, |
| "step": 71450 |
| }, |
| { |
| "epoch": 7.695619416639759, |
| "grad_norm": 0.839684247970581, |
| "learning_rate": 0.00013859928886973384, |
| "loss": 3.2359, |
| "step": 71500 |
| }, |
| { |
| "epoch": 7.7010009686793675, |
| "grad_norm": 0.8018124103546143, |
| "learning_rate": 0.00013827604783967244, |
| "loss": 3.2321, |
| "step": 71550 |
| }, |
| { |
| "epoch": 7.706382520718975, |
| "grad_norm": 0.7850207090377808, |
| "learning_rate": 0.000137952806809611, |
| "loss": 3.2435, |
| "step": 71600 |
| }, |
| { |
| "epoch": 7.711764072758584, |
| "grad_norm": 0.7935765981674194, |
| "learning_rate": 0.0001376295657795496, |
| "loss": 3.2332, |
| "step": 71650 |
| }, |
| { |
| "epoch": 7.717145624798192, |
| "grad_norm": 0.8368144631385803, |
| "learning_rate": 0.0001373063247494882, |
| "loss": 3.2416, |
| "step": 71700 |
| }, |
| { |
| "epoch": 7.7225271768378, |
| "grad_norm": 0.8086535930633545, |
| "learning_rate": 0.00013698308371942676, |
| "loss": 3.2458, |
| "step": 71750 |
| }, |
| { |
| "epoch": 7.727908728877408, |
| "grad_norm": 0.8280001282691956, |
| "learning_rate": 0.00013665984268936536, |
| "loss": 3.2307, |
| "step": 71800 |
| }, |
| { |
| "epoch": 7.733290280917016, |
| "grad_norm": 0.8143231272697449, |
| "learning_rate": 0.00013633660165930395, |
| "loss": 3.2272, |
| "step": 71850 |
| }, |
| { |
| "epoch": 7.738671832956625, |
| "grad_norm": 0.784056544303894, |
| "learning_rate": 0.00013601336062924255, |
| "loss": 3.2341, |
| "step": 71900 |
| }, |
| { |
| "epoch": 7.744053384996233, |
| "grad_norm": 0.8832378387451172, |
| "learning_rate": 0.00013569011959918111, |
| "loss": 3.2553, |
| "step": 71950 |
| }, |
| { |
| "epoch": 7.749434937035841, |
| "grad_norm": 0.806605875492096, |
| "learning_rate": 0.0001353668785691197, |
| "loss": 3.2471, |
| "step": 72000 |
| }, |
| { |
| "epoch": 7.749434937035841, |
| "eval_accuracy": 0.3890513309924431, |
| "eval_loss": 3.342980146408081, |
| "eval_runtime": 182.5155, |
| "eval_samples_per_second": 98.682, |
| "eval_steps_per_second": 6.169, |
| "step": 72000 |
| }, |
| { |
| "epoch": 7.754816489075449, |
| "grad_norm": 0.8063587546348572, |
| "learning_rate": 0.00013504363753905828, |
| "loss": 3.2495, |
| "step": 72050 |
| }, |
| { |
| "epoch": 7.760198041115058, |
| "grad_norm": 0.8236513137817383, |
| "learning_rate": 0.00013472039650899687, |
| "loss": 3.2403, |
| "step": 72100 |
| }, |
| { |
| "epoch": 7.765579593154666, |
| "grad_norm": 0.801658034324646, |
| "learning_rate": 0.00013439715547893544, |
| "loss": 3.2375, |
| "step": 72150 |
| }, |
| { |
| "epoch": 7.770961145194274, |
| "grad_norm": 0.7991564273834229, |
| "learning_rate": 0.00013407391444887403, |
| "loss": 3.2362, |
| "step": 72200 |
| }, |
| { |
| "epoch": 7.776342697233883, |
| "grad_norm": 0.780725359916687, |
| "learning_rate": 0.0001337506734188126, |
| "loss": 3.2467, |
| "step": 72250 |
| }, |
| { |
| "epoch": 7.78172424927349, |
| "grad_norm": 0.7829933762550354, |
| "learning_rate": 0.0001334274323887512, |
| "loss": 3.2352, |
| "step": 72300 |
| }, |
| { |
| "epoch": 7.787105801313099, |
| "grad_norm": 0.8279498815536499, |
| "learning_rate": 0.0001331041913586898, |
| "loss": 3.2495, |
| "step": 72350 |
| }, |
| { |
| "epoch": 7.792487353352707, |
| "grad_norm": 0.8042828440666199, |
| "learning_rate": 0.00013278095032862838, |
| "loss": 3.2352, |
| "step": 72400 |
| }, |
| { |
| "epoch": 7.797868905392315, |
| "grad_norm": 0.8500546813011169, |
| "learning_rate": 0.00013245770929856695, |
| "loss": 3.248, |
| "step": 72450 |
| }, |
| { |
| "epoch": 7.803250457431924, |
| "grad_norm": 0.8876932859420776, |
| "learning_rate": 0.00013213446826850555, |
| "loss": 3.2425, |
| "step": 72500 |
| }, |
| { |
| "epoch": 7.808632009471531, |
| "grad_norm": 0.7689300775527954, |
| "learning_rate": 0.00013181122723844411, |
| "loss": 3.2308, |
| "step": 72550 |
| }, |
| { |
| "epoch": 7.81401356151114, |
| "grad_norm": 0.7971476912498474, |
| "learning_rate": 0.0001314879862083827, |
| "loss": 3.2349, |
| "step": 72600 |
| }, |
| { |
| "epoch": 7.819395113550748, |
| "grad_norm": 0.817987859249115, |
| "learning_rate": 0.00013116474517832128, |
| "loss": 3.2602, |
| "step": 72650 |
| }, |
| { |
| "epoch": 7.824776665590356, |
| "grad_norm": 0.819493293762207, |
| "learning_rate": 0.00013084150414825987, |
| "loss": 3.2283, |
| "step": 72700 |
| }, |
| { |
| "epoch": 7.830158217629965, |
| "grad_norm": 0.8192011117935181, |
| "learning_rate": 0.00013051826311819844, |
| "loss": 3.2344, |
| "step": 72750 |
| }, |
| { |
| "epoch": 7.835539769669572, |
| "grad_norm": 0.7826513051986694, |
| "learning_rate": 0.00013019502208813703, |
| "loss": 3.2499, |
| "step": 72800 |
| }, |
| { |
| "epoch": 7.840921321709181, |
| "grad_norm": 0.8338637351989746, |
| "learning_rate": 0.00012987178105807563, |
| "loss": 3.2288, |
| "step": 72850 |
| }, |
| { |
| "epoch": 7.846302873748789, |
| "grad_norm": 0.7698312997817993, |
| "learning_rate": 0.00012954854002801422, |
| "loss": 3.2517, |
| "step": 72900 |
| }, |
| { |
| "epoch": 7.851684425788397, |
| "grad_norm": 0.8017268180847168, |
| "learning_rate": 0.0001292252989979528, |
| "loss": 3.2544, |
| "step": 72950 |
| }, |
| { |
| "epoch": 7.857065977828006, |
| "grad_norm": 0.7961682677268982, |
| "learning_rate": 0.00012890205796789139, |
| "loss": 3.2406, |
| "step": 73000 |
| }, |
| { |
| "epoch": 7.857065977828006, |
| "eval_accuracy": 0.38943215936423287, |
| "eval_loss": 3.3387157917022705, |
| "eval_runtime": 182.7982, |
| "eval_samples_per_second": 98.529, |
| "eval_steps_per_second": 6.16, |
| "step": 73000 |
| }, |
| { |
| "epoch": 7.862447529867614, |
| "grad_norm": 0.8738141655921936, |
| "learning_rate": 0.00012857881693782998, |
| "loss": 3.2274, |
| "step": 73050 |
| }, |
| { |
| "epoch": 7.867829081907222, |
| "grad_norm": 0.7610816955566406, |
| "learning_rate": 0.00012825557590776855, |
| "loss": 3.2318, |
| "step": 73100 |
| }, |
| { |
| "epoch": 7.87321063394683, |
| "grad_norm": 0.8317003846168518, |
| "learning_rate": 0.00012793233487770714, |
| "loss": 3.2434, |
| "step": 73150 |
| }, |
| { |
| "epoch": 7.878592185986438, |
| "grad_norm": 0.8011530041694641, |
| "learning_rate": 0.0001276090938476457, |
| "loss": 3.2545, |
| "step": 73200 |
| }, |
| { |
| "epoch": 7.883973738026047, |
| "grad_norm": 0.8046674132347107, |
| "learning_rate": 0.0001272858528175843, |
| "loss": 3.2302, |
| "step": 73250 |
| }, |
| { |
| "epoch": 7.889355290065655, |
| "grad_norm": 0.8095680475234985, |
| "learning_rate": 0.00012696261178752287, |
| "loss": 3.2464, |
| "step": 73300 |
| }, |
| { |
| "epoch": 7.894736842105263, |
| "grad_norm": 0.7865450978279114, |
| "learning_rate": 0.00012663937075746147, |
| "loss": 3.2599, |
| "step": 73350 |
| }, |
| { |
| "epoch": 7.900118394144871, |
| "grad_norm": 0.7817733287811279, |
| "learning_rate": 0.00012631612972740006, |
| "loss": 3.2339, |
| "step": 73400 |
| }, |
| { |
| "epoch": 7.90549994618448, |
| "grad_norm": 0.8104103207588196, |
| "learning_rate": 0.00012599288869733863, |
| "loss": 3.2297, |
| "step": 73450 |
| }, |
| { |
| "epoch": 7.910881498224088, |
| "grad_norm": 0.8111141324043274, |
| "learning_rate": 0.00012566964766727722, |
| "loss": 3.2322, |
| "step": 73500 |
| }, |
| { |
| "epoch": 7.916263050263696, |
| "grad_norm": 0.8497397899627686, |
| "learning_rate": 0.00012534640663721582, |
| "loss": 3.2276, |
| "step": 73550 |
| }, |
| { |
| "epoch": 7.921644602303305, |
| "grad_norm": 0.8326553106307983, |
| "learning_rate": 0.00012502316560715439, |
| "loss": 3.2472, |
| "step": 73600 |
| }, |
| { |
| "epoch": 7.927026154342912, |
| "grad_norm": 0.8586411476135254, |
| "learning_rate": 0.00012469992457709298, |
| "loss": 3.2429, |
| "step": 73650 |
| }, |
| { |
| "epoch": 7.932407706382521, |
| "grad_norm": 0.7935066819190979, |
| "learning_rate": 0.00012437668354703158, |
| "loss": 3.2422, |
| "step": 73700 |
| }, |
| { |
| "epoch": 7.937789258422129, |
| "grad_norm": 0.7909044623374939, |
| "learning_rate": 0.00012405344251697014, |
| "loss": 3.2395, |
| "step": 73750 |
| }, |
| { |
| "epoch": 7.943170810461737, |
| "grad_norm": 0.8109498023986816, |
| "learning_rate": 0.00012373020148690874, |
| "loss": 3.2482, |
| "step": 73800 |
| }, |
| { |
| "epoch": 7.948552362501346, |
| "grad_norm": 0.8576630353927612, |
| "learning_rate": 0.0001234069604568473, |
| "loss": 3.239, |
| "step": 73850 |
| }, |
| { |
| "epoch": 7.953933914540953, |
| "grad_norm": 0.7980335354804993, |
| "learning_rate": 0.0001230837194267859, |
| "loss": 3.2356, |
| "step": 73900 |
| }, |
| { |
| "epoch": 7.959315466580562, |
| "grad_norm": 0.7563722729682922, |
| "learning_rate": 0.00012276047839672447, |
| "loss": 3.2249, |
| "step": 73950 |
| }, |
| { |
| "epoch": 7.96469701862017, |
| "grad_norm": 0.7781243324279785, |
| "learning_rate": 0.00012243723736666306, |
| "loss": 3.2495, |
| "step": 74000 |
| }, |
| { |
| "epoch": 7.96469701862017, |
| "eval_accuracy": 0.38969031862568004, |
| "eval_loss": 3.334991216659546, |
| "eval_runtime": 182.9415, |
| "eval_samples_per_second": 98.452, |
| "eval_steps_per_second": 6.155, |
| "step": 74000 |
| }, |
| { |
| "epoch": 7.970078570659778, |
| "grad_norm": 0.8000725507736206, |
| "learning_rate": 0.00012211399633660166, |
| "loss": 3.2204, |
| "step": 74050 |
| }, |
| { |
| "epoch": 7.975460122699387, |
| "grad_norm": 0.8195854425430298, |
| "learning_rate": 0.00012179075530654022, |
| "loss": 3.2446, |
| "step": 74100 |
| }, |
| { |
| "epoch": 7.980841674738995, |
| "grad_norm": 0.8186859488487244, |
| "learning_rate": 0.00012146751427647882, |
| "loss": 3.2262, |
| "step": 74150 |
| }, |
| { |
| "epoch": 7.986223226778603, |
| "grad_norm": 0.8260078430175781, |
| "learning_rate": 0.0001211442732464174, |
| "loss": 3.2422, |
| "step": 74200 |
| }, |
| { |
| "epoch": 7.991604778818211, |
| "grad_norm": 0.7887690663337708, |
| "learning_rate": 0.00012082103221635598, |
| "loss": 3.2354, |
| "step": 74250 |
| }, |
| { |
| "epoch": 7.996986330857819, |
| "grad_norm": 0.8294403553009033, |
| "learning_rate": 0.00012049779118629456, |
| "loss": 3.252, |
| "step": 74300 |
| }, |
| { |
| "epoch": 8.002367882897428, |
| "grad_norm": 0.830507755279541, |
| "learning_rate": 0.00012017455015623316, |
| "loss": 3.2075, |
| "step": 74350 |
| }, |
| { |
| "epoch": 8.007749434937036, |
| "grad_norm": 0.8426108360290527, |
| "learning_rate": 0.00011985130912617175, |
| "loss": 3.1655, |
| "step": 74400 |
| }, |
| { |
| "epoch": 8.013130986976645, |
| "grad_norm": 0.8193655610084534, |
| "learning_rate": 0.00011952806809611032, |
| "loss": 3.1579, |
| "step": 74450 |
| }, |
| { |
| "epoch": 8.018512539016251, |
| "grad_norm": 0.8287805318832397, |
| "learning_rate": 0.00011920482706604891, |
| "loss": 3.1689, |
| "step": 74500 |
| }, |
| { |
| "epoch": 8.02389409105586, |
| "grad_norm": 0.8014912009239197, |
| "learning_rate": 0.0001188815860359875, |
| "loss": 3.1657, |
| "step": 74550 |
| }, |
| { |
| "epoch": 8.029275643095469, |
| "grad_norm": 0.7986035943031311, |
| "learning_rate": 0.00011855834500592608, |
| "loss": 3.161, |
| "step": 74600 |
| }, |
| { |
| "epoch": 8.034657195135077, |
| "grad_norm": 0.7975103855133057, |
| "learning_rate": 0.00011823510397586466, |
| "loss": 3.1607, |
| "step": 74650 |
| }, |
| { |
| "epoch": 8.040038747174686, |
| "grad_norm": 0.8071293234825134, |
| "learning_rate": 0.00011791186294580325, |
| "loss": 3.1557, |
| "step": 74700 |
| }, |
| { |
| "epoch": 8.045420299214294, |
| "grad_norm": 0.8412976264953613, |
| "learning_rate": 0.00011758862191574182, |
| "loss": 3.1663, |
| "step": 74750 |
| }, |
| { |
| "epoch": 8.050801851253901, |
| "grad_norm": 0.8266987204551697, |
| "learning_rate": 0.00011726538088568041, |
| "loss": 3.1748, |
| "step": 74800 |
| }, |
| { |
| "epoch": 8.05618340329351, |
| "grad_norm": 0.8250101804733276, |
| "learning_rate": 0.000116942139855619, |
| "loss": 3.1774, |
| "step": 74850 |
| }, |
| { |
| "epoch": 8.061564955333118, |
| "grad_norm": 0.808516800403595, |
| "learning_rate": 0.00011661889882555759, |
| "loss": 3.1703, |
| "step": 74900 |
| }, |
| { |
| "epoch": 8.066946507372727, |
| "grad_norm": 0.8228300213813782, |
| "learning_rate": 0.00011629565779549616, |
| "loss": 3.1582, |
| "step": 74950 |
| }, |
| { |
| "epoch": 8.072328059412335, |
| "grad_norm": 0.8521013855934143, |
| "learning_rate": 0.00011597241676543475, |
| "loss": 3.191, |
| "step": 75000 |
| }, |
| { |
| "epoch": 8.072328059412335, |
| "eval_accuracy": 0.3899119705168216, |
| "eval_loss": 3.341465711593628, |
| "eval_runtime": 182.5475, |
| "eval_samples_per_second": 98.665, |
| "eval_steps_per_second": 6.168, |
| "step": 75000 |
| }, |
| { |
| "epoch": 8.077709611451942, |
| "grad_norm": 0.834549069404602, |
| "learning_rate": 0.00011564917573537335, |
| "loss": 3.1519, |
| "step": 75050 |
| }, |
| { |
| "epoch": 8.08309116349155, |
| "grad_norm": 0.8692988157272339, |
| "learning_rate": 0.00011533239952591314, |
| "loss": 3.1654, |
| "step": 75100 |
| }, |
| { |
| "epoch": 8.088472715531159, |
| "grad_norm": 0.8068045973777771, |
| "learning_rate": 0.00011500915849585173, |
| "loss": 3.1802, |
| "step": 75150 |
| }, |
| { |
| "epoch": 8.093854267570768, |
| "grad_norm": 0.8539791107177734, |
| "learning_rate": 0.00011468591746579033, |
| "loss": 3.1812, |
| "step": 75200 |
| }, |
| { |
| "epoch": 8.099235819610376, |
| "grad_norm": 0.8729422092437744, |
| "learning_rate": 0.0001143626764357289, |
| "loss": 3.1766, |
| "step": 75250 |
| }, |
| { |
| "epoch": 8.104617371649983, |
| "grad_norm": 0.8850834369659424, |
| "learning_rate": 0.00011403943540566749, |
| "loss": 3.1687, |
| "step": 75300 |
| }, |
| { |
| "epoch": 8.109998923689592, |
| "grad_norm": 0.8430905938148499, |
| "learning_rate": 0.00011371619437560607, |
| "loss": 3.1833, |
| "step": 75350 |
| }, |
| { |
| "epoch": 8.1153804757292, |
| "grad_norm": 0.7951804399490356, |
| "learning_rate": 0.00011339295334554464, |
| "loss": 3.1759, |
| "step": 75400 |
| }, |
| { |
| "epoch": 8.120762027768809, |
| "grad_norm": 0.8502519130706787, |
| "learning_rate": 0.00011306971231548323, |
| "loss": 3.1873, |
| "step": 75450 |
| }, |
| { |
| "epoch": 8.126143579808417, |
| "grad_norm": 1.6292310953140259, |
| "learning_rate": 0.00011274647128542183, |
| "loss": 3.1808, |
| "step": 75500 |
| }, |
| { |
| "epoch": 8.131525131848026, |
| "grad_norm": 0.8119076490402222, |
| "learning_rate": 0.0001124232302553604, |
| "loss": 3.1815, |
| "step": 75550 |
| }, |
| { |
| "epoch": 8.136906683887632, |
| "grad_norm": 0.8140049576759338, |
| "learning_rate": 0.00011209998922529899, |
| "loss": 3.1838, |
| "step": 75600 |
| }, |
| { |
| "epoch": 8.142288235927241, |
| "grad_norm": 0.8442477583885193, |
| "learning_rate": 0.00011177674819523757, |
| "loss": 3.1647, |
| "step": 75650 |
| }, |
| { |
| "epoch": 8.14766978796685, |
| "grad_norm": 0.8645812273025513, |
| "learning_rate": 0.00011145350716517617, |
| "loss": 3.1821, |
| "step": 75700 |
| }, |
| { |
| "epoch": 8.153051340006458, |
| "grad_norm": 0.778192400932312, |
| "learning_rate": 0.00011113026613511473, |
| "loss": 3.19, |
| "step": 75750 |
| }, |
| { |
| "epoch": 8.158432892046067, |
| "grad_norm": 0.8154362440109253, |
| "learning_rate": 0.00011080702510505333, |
| "loss": 3.1737, |
| "step": 75800 |
| }, |
| { |
| "epoch": 8.163814444085673, |
| "grad_norm": 0.8348283767700195, |
| "learning_rate": 0.00011048378407499191, |
| "loss": 3.1763, |
| "step": 75850 |
| }, |
| { |
| "epoch": 8.169195996125282, |
| "grad_norm": 0.8607150316238403, |
| "learning_rate": 0.00011016054304493049, |
| "loss": 3.1867, |
| "step": 75900 |
| }, |
| { |
| "epoch": 8.17457754816489, |
| "grad_norm": 0.8294938206672668, |
| "learning_rate": 0.00010983730201486907, |
| "loss": 3.179, |
| "step": 75950 |
| }, |
| { |
| "epoch": 8.1799591002045, |
| "grad_norm": 0.8422661423683167, |
| "learning_rate": 0.00010951406098480767, |
| "loss": 3.1735, |
| "step": 76000 |
| }, |
| { |
| "epoch": 8.1799591002045, |
| "eval_accuracy": 0.38991914160741736, |
| "eval_loss": 3.339107036590576, |
| "eval_runtime": 182.7233, |
| "eval_samples_per_second": 98.57, |
| "eval_steps_per_second": 6.162, |
| "step": 76000 |
| }, |
| { |
| "epoch": 8.185340652244108, |
| "grad_norm": 0.8388949036598206, |
| "learning_rate": 0.00010919081995474623, |
| "loss": 3.1938, |
| "step": 76050 |
| }, |
| { |
| "epoch": 8.190722204283716, |
| "grad_norm": 0.786255419254303, |
| "learning_rate": 0.00010886757892468483, |
| "loss": 3.1737, |
| "step": 76100 |
| }, |
| { |
| "epoch": 8.196103756323323, |
| "grad_norm": 0.8635239005088806, |
| "learning_rate": 0.00010854433789462342, |
| "loss": 3.1797, |
| "step": 76150 |
| }, |
| { |
| "epoch": 8.201485308362932, |
| "grad_norm": 0.8113918304443359, |
| "learning_rate": 0.000108221096864562, |
| "loss": 3.1891, |
| "step": 76200 |
| }, |
| { |
| "epoch": 8.20686686040254, |
| "grad_norm": 0.823483407497406, |
| "learning_rate": 0.00010789785583450059, |
| "loss": 3.1757, |
| "step": 76250 |
| }, |
| { |
| "epoch": 8.212248412442149, |
| "grad_norm": 0.7900793552398682, |
| "learning_rate": 0.00010757461480443917, |
| "loss": 3.1828, |
| "step": 76300 |
| }, |
| { |
| "epoch": 8.217629964481757, |
| "grad_norm": 0.8567901253700256, |
| "learning_rate": 0.00010725137377437776, |
| "loss": 3.1645, |
| "step": 76350 |
| }, |
| { |
| "epoch": 8.223011516521364, |
| "grad_norm": 0.8328141570091248, |
| "learning_rate": 0.00010692813274431633, |
| "loss": 3.1711, |
| "step": 76400 |
| }, |
| { |
| "epoch": 8.228393068560973, |
| "grad_norm": 0.8192028403282166, |
| "learning_rate": 0.00010660489171425492, |
| "loss": 3.1744, |
| "step": 76450 |
| }, |
| { |
| "epoch": 8.233774620600581, |
| "grad_norm": 0.8761533498764038, |
| "learning_rate": 0.0001062816506841935, |
| "loss": 3.1959, |
| "step": 76500 |
| }, |
| { |
| "epoch": 8.23915617264019, |
| "grad_norm": 0.8476336002349854, |
| "learning_rate": 0.00010595840965413209, |
| "loss": 3.1788, |
| "step": 76550 |
| }, |
| { |
| "epoch": 8.244537724679798, |
| "grad_norm": 0.8509985208511353, |
| "learning_rate": 0.00010563516862407067, |
| "loss": 3.1825, |
| "step": 76600 |
| }, |
| { |
| "epoch": 8.249919276719407, |
| "grad_norm": 0.8411309123039246, |
| "learning_rate": 0.00010531192759400926, |
| "loss": 3.1826, |
| "step": 76650 |
| }, |
| { |
| "epoch": 8.255300828759013, |
| "grad_norm": 0.8168168067932129, |
| "learning_rate": 0.00010498868656394784, |
| "loss": 3.185, |
| "step": 76700 |
| }, |
| { |
| "epoch": 8.260682380798622, |
| "grad_norm": 0.8766189813613892, |
| "learning_rate": 0.00010466544553388642, |
| "loss": 3.1826, |
| "step": 76750 |
| }, |
| { |
| "epoch": 8.26606393283823, |
| "grad_norm": 0.7768109440803528, |
| "learning_rate": 0.000104342204503825, |
| "loss": 3.193, |
| "step": 76800 |
| }, |
| { |
| "epoch": 8.27144548487784, |
| "grad_norm": 0.8481165170669556, |
| "learning_rate": 0.0001040189634737636, |
| "loss": 3.1845, |
| "step": 76850 |
| }, |
| { |
| "epoch": 8.276827036917448, |
| "grad_norm": 0.83498215675354, |
| "learning_rate": 0.00010369572244370217, |
| "loss": 3.1856, |
| "step": 76900 |
| }, |
| { |
| "epoch": 8.282208588957054, |
| "grad_norm": 0.8582146167755127, |
| "learning_rate": 0.00010337248141364076, |
| "loss": 3.1902, |
| "step": 76950 |
| }, |
| { |
| "epoch": 8.287590140996663, |
| "grad_norm": 0.8045734167098999, |
| "learning_rate": 0.00010304924038357936, |
| "loss": 3.1844, |
| "step": 77000 |
| }, |
| { |
| "epoch": 8.287590140996663, |
| "eval_accuracy": 0.3902144601564971, |
| "eval_loss": 3.3361928462982178, |
| "eval_runtime": 182.7919, |
| "eval_samples_per_second": 98.533, |
| "eval_steps_per_second": 6.16, |
| "step": 77000 |
| }, |
| { |
| "epoch": 8.292971693036272, |
| "grad_norm": 0.854202926158905, |
| "learning_rate": 0.00010272599935351792, |
| "loss": 3.1828, |
| "step": 77050 |
| }, |
| { |
| "epoch": 8.29835324507588, |
| "grad_norm": 0.8577749729156494, |
| "learning_rate": 0.00010240922314405774, |
| "loss": 3.1808, |
| "step": 77100 |
| }, |
| { |
| "epoch": 8.303734797115489, |
| "grad_norm": 0.8859609365463257, |
| "learning_rate": 0.00010208598211399634, |
| "loss": 3.1855, |
| "step": 77150 |
| }, |
| { |
| "epoch": 8.309116349155097, |
| "grad_norm": 0.8407235145568848, |
| "learning_rate": 0.0001017627410839349, |
| "loss": 3.1775, |
| "step": 77200 |
| }, |
| { |
| "epoch": 8.314497901194704, |
| "grad_norm": 0.8498949408531189, |
| "learning_rate": 0.0001014395000538735, |
| "loss": 3.1787, |
| "step": 77250 |
| }, |
| { |
| "epoch": 8.319879453234313, |
| "grad_norm": 0.8562375903129578, |
| "learning_rate": 0.00010111625902381208, |
| "loss": 3.1898, |
| "step": 77300 |
| }, |
| { |
| "epoch": 8.325261005273921, |
| "grad_norm": 0.8841971755027771, |
| "learning_rate": 0.00010079301799375066, |
| "loss": 3.2001, |
| "step": 77350 |
| }, |
| { |
| "epoch": 8.33064255731353, |
| "grad_norm": 0.8151336312294006, |
| "learning_rate": 0.00010046977696368924, |
| "loss": 3.1902, |
| "step": 77400 |
| }, |
| { |
| "epoch": 8.336024109353138, |
| "grad_norm": 0.8526255488395691, |
| "learning_rate": 0.00010014653593362784, |
| "loss": 3.1803, |
| "step": 77450 |
| }, |
| { |
| "epoch": 8.341405661392745, |
| "grad_norm": 0.8695437908172607, |
| "learning_rate": 9.982329490356642e-05, |
| "loss": 3.1975, |
| "step": 77500 |
| }, |
| { |
| "epoch": 8.346787213432354, |
| "grad_norm": 0.9299212694168091, |
| "learning_rate": 9.9500053873505e-05, |
| "loss": 3.1894, |
| "step": 77550 |
| }, |
| { |
| "epoch": 8.352168765471962, |
| "grad_norm": 0.8620630502700806, |
| "learning_rate": 9.917681284344358e-05, |
| "loss": 3.1769, |
| "step": 77600 |
| }, |
| { |
| "epoch": 8.35755031751157, |
| "grad_norm": 0.8638424277305603, |
| "learning_rate": 9.885357181338218e-05, |
| "loss": 3.1993, |
| "step": 77650 |
| }, |
| { |
| "epoch": 8.36293186955118, |
| "grad_norm": 0.832884669303894, |
| "learning_rate": 9.853033078332074e-05, |
| "loss": 3.1877, |
| "step": 77700 |
| }, |
| { |
| "epoch": 8.368313421590786, |
| "grad_norm": 0.857495129108429, |
| "learning_rate": 9.820708975325934e-05, |
| "loss": 3.1844, |
| "step": 77750 |
| }, |
| { |
| "epoch": 8.373694973630395, |
| "grad_norm": 0.8742907047271729, |
| "learning_rate": 9.788384872319793e-05, |
| "loss": 3.1762, |
| "step": 77800 |
| }, |
| { |
| "epoch": 8.379076525670003, |
| "grad_norm": 0.8619995713233948, |
| "learning_rate": 9.75606076931365e-05, |
| "loss": 3.1828, |
| "step": 77850 |
| }, |
| { |
| "epoch": 8.384458077709612, |
| "grad_norm": 0.8461150527000427, |
| "learning_rate": 9.723736666307508e-05, |
| "loss": 3.2053, |
| "step": 77900 |
| }, |
| { |
| "epoch": 8.38983962974922, |
| "grad_norm": 0.809444785118103, |
| "learning_rate": 9.691412563301368e-05, |
| "loss": 3.1613, |
| "step": 77950 |
| }, |
| { |
| "epoch": 8.395221181788829, |
| "grad_norm": 0.8414546251296997, |
| "learning_rate": 9.659088460295227e-05, |
| "loss": 3.1819, |
| "step": 78000 |
| }, |
| { |
| "epoch": 8.395221181788829, |
| "eval_accuracy": 0.3904907644502093, |
| "eval_loss": 3.3329615592956543, |
| "eval_runtime": 182.3978, |
| "eval_samples_per_second": 98.746, |
| "eval_steps_per_second": 6.173, |
| "step": 78000 |
| }, |
| { |
| "epoch": 8.400602733828435, |
| "grad_norm": 0.8904616832733154, |
| "learning_rate": 9.626764357289084e-05, |
| "loss": 3.1935, |
| "step": 78050 |
| }, |
| { |
| "epoch": 8.405984285868044, |
| "grad_norm": 0.8500921726226807, |
| "learning_rate": 9.594440254282943e-05, |
| "loss": 3.201, |
| "step": 78100 |
| }, |
| { |
| "epoch": 8.411365837907653, |
| "grad_norm": 0.8491564989089966, |
| "learning_rate": 9.562116151276802e-05, |
| "loss": 3.18, |
| "step": 78150 |
| }, |
| { |
| "epoch": 8.416747389947261, |
| "grad_norm": 0.8127497434616089, |
| "learning_rate": 9.52979204827066e-05, |
| "loss": 3.1789, |
| "step": 78200 |
| }, |
| { |
| "epoch": 8.42212894198687, |
| "grad_norm": 0.8613414764404297, |
| "learning_rate": 9.497467945264518e-05, |
| "loss": 3.1947, |
| "step": 78250 |
| }, |
| { |
| "epoch": 8.427510494026476, |
| "grad_norm": 0.8296908140182495, |
| "learning_rate": 9.465143842258377e-05, |
| "loss": 3.174, |
| "step": 78300 |
| }, |
| { |
| "epoch": 8.432892046066085, |
| "grad_norm": 0.8717524409294128, |
| "learning_rate": 9.432819739252234e-05, |
| "loss": 3.1924, |
| "step": 78350 |
| }, |
| { |
| "epoch": 8.438273598105694, |
| "grad_norm": 0.8354899287223816, |
| "learning_rate": 9.400495636246093e-05, |
| "loss": 3.1854, |
| "step": 78400 |
| }, |
| { |
| "epoch": 8.443655150145302, |
| "grad_norm": 0.8695220351219177, |
| "learning_rate": 9.368171533239952e-05, |
| "loss": 3.1859, |
| "step": 78450 |
| }, |
| { |
| "epoch": 8.44903670218491, |
| "grad_norm": 0.8374029994010925, |
| "learning_rate": 9.335847430233811e-05, |
| "loss": 3.1781, |
| "step": 78500 |
| }, |
| { |
| "epoch": 8.45441825422452, |
| "grad_norm": 0.8391903638839722, |
| "learning_rate": 9.303523327227668e-05, |
| "loss": 3.1786, |
| "step": 78550 |
| }, |
| { |
| "epoch": 8.459799806264126, |
| "grad_norm": 0.8173428177833557, |
| "learning_rate": 9.271199224221527e-05, |
| "loss": 3.1855, |
| "step": 78600 |
| }, |
| { |
| "epoch": 8.465181358303735, |
| "grad_norm": 0.8497678637504578, |
| "learning_rate": 9.238875121215387e-05, |
| "loss": 3.2016, |
| "step": 78650 |
| }, |
| { |
| "epoch": 8.470562910343343, |
| "grad_norm": 0.8355598449707031, |
| "learning_rate": 9.206551018209243e-05, |
| "loss": 3.1891, |
| "step": 78700 |
| }, |
| { |
| "epoch": 8.475944462382952, |
| "grad_norm": 0.9021919965744019, |
| "learning_rate": 9.174226915203103e-05, |
| "loss": 3.1931, |
| "step": 78750 |
| }, |
| { |
| "epoch": 8.48132601442256, |
| "grad_norm": 0.8534740805625916, |
| "learning_rate": 9.141902812196961e-05, |
| "loss": 3.1779, |
| "step": 78800 |
| }, |
| { |
| "epoch": 8.486707566462167, |
| "grad_norm": 0.8480976819992065, |
| "learning_rate": 9.109578709190818e-05, |
| "loss": 3.1862, |
| "step": 78850 |
| }, |
| { |
| "epoch": 8.492089118501776, |
| "grad_norm": 0.8286227583885193, |
| "learning_rate": 9.077254606184677e-05, |
| "loss": 3.214, |
| "step": 78900 |
| }, |
| { |
| "epoch": 8.497470670541384, |
| "grad_norm": 0.8237711787223816, |
| "learning_rate": 9.044930503178537e-05, |
| "loss": 3.1884, |
| "step": 78950 |
| }, |
| { |
| "epoch": 8.502852222580993, |
| "grad_norm": 0.8402367234230042, |
| "learning_rate": 9.012606400172395e-05, |
| "loss": 3.1854, |
| "step": 79000 |
| }, |
| { |
| "epoch": 8.502852222580993, |
| "eval_accuracy": 0.39106488630942104, |
| "eval_loss": 3.32970929145813, |
| "eval_runtime": 182.9601, |
| "eval_samples_per_second": 98.442, |
| "eval_steps_per_second": 6.154, |
| "step": 79000 |
| }, |
| { |
| "epoch": 8.508233774620601, |
| "grad_norm": 0.8203577995300293, |
| "learning_rate": 8.980282297166253e-05, |
| "loss": 3.1807, |
| "step": 79050 |
| }, |
| { |
| "epoch": 8.513615326660208, |
| "grad_norm": 0.8231856822967529, |
| "learning_rate": 8.947958194160111e-05, |
| "loss": 3.1975, |
| "step": 79100 |
| }, |
| { |
| "epoch": 8.518996878699816, |
| "grad_norm": 0.8475301861763, |
| "learning_rate": 8.916280573214092e-05, |
| "loss": 3.2022, |
| "step": 79150 |
| }, |
| { |
| "epoch": 8.524378430739425, |
| "grad_norm": 0.9068256616592407, |
| "learning_rate": 8.883956470207951e-05, |
| "loss": 3.1841, |
| "step": 79200 |
| }, |
| { |
| "epoch": 8.529759982779034, |
| "grad_norm": 0.8666347861289978, |
| "learning_rate": 8.851632367201809e-05, |
| "loss": 3.1918, |
| "step": 79250 |
| }, |
| { |
| "epoch": 8.535141534818642, |
| "grad_norm": 0.8289416432380676, |
| "learning_rate": 8.819308264195669e-05, |
| "loss": 3.1962, |
| "step": 79300 |
| }, |
| { |
| "epoch": 8.54052308685825, |
| "grad_norm": 0.8145918846130371, |
| "learning_rate": 8.786984161189526e-05, |
| "loss": 3.1855, |
| "step": 79350 |
| }, |
| { |
| "epoch": 8.545904638897857, |
| "grad_norm": 0.8508062958717346, |
| "learning_rate": 8.754660058183385e-05, |
| "loss": 3.1653, |
| "step": 79400 |
| }, |
| { |
| "epoch": 8.551286190937466, |
| "grad_norm": 0.8114455342292786, |
| "learning_rate": 8.722335955177243e-05, |
| "loss": 3.1935, |
| "step": 79450 |
| }, |
| { |
| "epoch": 8.556667742977075, |
| "grad_norm": 0.8585551977157593, |
| "learning_rate": 8.690658334231224e-05, |
| "loss": 3.2131, |
| "step": 79500 |
| }, |
| { |
| "epoch": 8.562049295016683, |
| "grad_norm": 0.8328741192817688, |
| "learning_rate": 8.658334231225083e-05, |
| "loss": 3.1715, |
| "step": 79550 |
| }, |
| { |
| "epoch": 8.567430847056292, |
| "grad_norm": 0.8506522178649902, |
| "learning_rate": 8.626010128218943e-05, |
| "loss": 3.2035, |
| "step": 79600 |
| }, |
| { |
| "epoch": 8.572812399095898, |
| "grad_norm": 0.8559414148330688, |
| "learning_rate": 8.5936860252128e-05, |
| "loss": 3.198, |
| "step": 79650 |
| }, |
| { |
| "epoch": 8.578193951135507, |
| "grad_norm": 0.8611463904380798, |
| "learning_rate": 8.561361922206658e-05, |
| "loss": 3.1907, |
| "step": 79700 |
| }, |
| { |
| "epoch": 8.583575503175116, |
| "grad_norm": 0.7964751720428467, |
| "learning_rate": 8.529037819200517e-05, |
| "loss": 3.1686, |
| "step": 79750 |
| }, |
| { |
| "epoch": 8.588957055214724, |
| "grad_norm": 0.8190116882324219, |
| "learning_rate": 8.496713716194374e-05, |
| "loss": 3.1866, |
| "step": 79800 |
| }, |
| { |
| "epoch": 8.594338607254333, |
| "grad_norm": 0.9009842872619629, |
| "learning_rate": 8.464389613188233e-05, |
| "loss": 3.1879, |
| "step": 79850 |
| }, |
| { |
| "epoch": 8.599720159293941, |
| "grad_norm": 0.7971447706222534, |
| "learning_rate": 8.432065510182093e-05, |
| "loss": 3.2059, |
| "step": 79900 |
| }, |
| { |
| "epoch": 8.605101711333548, |
| "grad_norm": 0.8355123996734619, |
| "learning_rate": 8.39974140717595e-05, |
| "loss": 3.1917, |
| "step": 79950 |
| }, |
| { |
| "epoch": 8.610483263373157, |
| "grad_norm": 0.9042732119560242, |
| "learning_rate": 8.367417304169809e-05, |
| "loss": 3.1847, |
| "step": 80000 |
| }, |
| { |
| "epoch": 8.610483263373157, |
| "eval_accuracy": 0.3913339108596497, |
| "eval_loss": 3.327451705932617, |
| "eval_runtime": 182.9461, |
| "eval_samples_per_second": 98.45, |
| "eval_steps_per_second": 6.155, |
| "step": 80000 |
| }, |
| { |
| "epoch": 8.615864815412765, |
| "grad_norm": 0.8184645175933838, |
| "learning_rate": 8.335093201163667e-05, |
| "loss": 3.177, |
| "step": 80050 |
| }, |
| { |
| "epoch": 8.621246367452374, |
| "grad_norm": 0.8244253396987915, |
| "learning_rate": 8.302769098157526e-05, |
| "loss": 3.1778, |
| "step": 80100 |
| }, |
| { |
| "epoch": 8.626627919491982, |
| "grad_norm": 0.934569239616394, |
| "learning_rate": 8.270444995151383e-05, |
| "loss": 3.2007, |
| "step": 80150 |
| }, |
| { |
| "epoch": 8.632009471531589, |
| "grad_norm": 0.798776388168335, |
| "learning_rate": 8.238120892145243e-05, |
| "loss": 3.1926, |
| "step": 80200 |
| }, |
| { |
| "epoch": 8.637391023571197, |
| "grad_norm": 0.8425439596176147, |
| "learning_rate": 8.205796789139101e-05, |
| "loss": 3.2076, |
| "step": 80250 |
| }, |
| { |
| "epoch": 8.642772575610806, |
| "grad_norm": 0.8503188490867615, |
| "learning_rate": 8.173472686132959e-05, |
| "loss": 3.1863, |
| "step": 80300 |
| }, |
| { |
| "epoch": 8.648154127650415, |
| "grad_norm": 0.8668044209480286, |
| "learning_rate": 8.141148583126817e-05, |
| "loss": 3.1889, |
| "step": 80350 |
| }, |
| { |
| "epoch": 8.653535679690023, |
| "grad_norm": 0.8459951877593994, |
| "learning_rate": 8.108824480120676e-05, |
| "loss": 3.1957, |
| "step": 80400 |
| }, |
| { |
| "epoch": 8.658917231729632, |
| "grad_norm": 0.911152720451355, |
| "learning_rate": 8.076500377114533e-05, |
| "loss": 3.1835, |
| "step": 80450 |
| }, |
| { |
| "epoch": 8.664298783769238, |
| "grad_norm": 0.8265844583511353, |
| "learning_rate": 8.044176274108393e-05, |
| "loss": 3.2005, |
| "step": 80500 |
| }, |
| { |
| "epoch": 8.669680335808847, |
| "grad_norm": 0.8671901226043701, |
| "learning_rate": 8.011852171102252e-05, |
| "loss": 3.2096, |
| "step": 80550 |
| }, |
| { |
| "epoch": 8.675061887848456, |
| "grad_norm": 0.8355448842048645, |
| "learning_rate": 7.97952806809611e-05, |
| "loss": 3.187, |
| "step": 80600 |
| }, |
| { |
| "epoch": 8.680443439888064, |
| "grad_norm": 0.8255912065505981, |
| "learning_rate": 7.947203965089967e-05, |
| "loss": 3.1844, |
| "step": 80650 |
| }, |
| { |
| "epoch": 8.685824991927673, |
| "grad_norm": 0.8755044341087341, |
| "learning_rate": 7.914879862083827e-05, |
| "loss": 3.1869, |
| "step": 80700 |
| }, |
| { |
| "epoch": 8.69120654396728, |
| "grad_norm": 0.8891885876655579, |
| "learning_rate": 7.882555759077686e-05, |
| "loss": 3.1692, |
| "step": 80750 |
| }, |
| { |
| "epoch": 8.696588096006888, |
| "grad_norm": 0.8529818058013916, |
| "learning_rate": 7.850231656071543e-05, |
| "loss": 3.188, |
| "step": 80800 |
| }, |
| { |
| "epoch": 8.701969648046497, |
| "grad_norm": 0.8659524321556091, |
| "learning_rate": 7.817907553065402e-05, |
| "loss": 3.1952, |
| "step": 80850 |
| }, |
| { |
| "epoch": 8.707351200086105, |
| "grad_norm": 0.8727506995201111, |
| "learning_rate": 7.78558345005926e-05, |
| "loss": 3.1946, |
| "step": 80900 |
| }, |
| { |
| "epoch": 8.712732752125714, |
| "grad_norm": 0.8414073586463928, |
| "learning_rate": 7.753259347053118e-05, |
| "loss": 3.1998, |
| "step": 80950 |
| }, |
| { |
| "epoch": 8.718114304165322, |
| "grad_norm": 0.8435977101325989, |
| "learning_rate": 7.720935244046977e-05, |
| "loss": 3.1935, |
| "step": 81000 |
| }, |
| { |
| "epoch": 8.718114304165322, |
| "eval_accuracy": 0.3915950037490679, |
| "eval_loss": 3.323737382888794, |
| "eval_runtime": 183.0449, |
| "eval_samples_per_second": 98.397, |
| "eval_steps_per_second": 6.151, |
| "step": 81000 |
| }, |
| { |
| "epoch": 8.723495856204929, |
| "grad_norm": 0.8313448429107666, |
| "learning_rate": 7.688611141040836e-05, |
| "loss": 3.1835, |
| "step": 81050 |
| }, |
| { |
| "epoch": 8.728877408244538, |
| "grad_norm": 0.8675030469894409, |
| "learning_rate": 7.656287038034694e-05, |
| "loss": 3.1836, |
| "step": 81100 |
| }, |
| { |
| "epoch": 8.734258960284146, |
| "grad_norm": 0.8448699116706848, |
| "learning_rate": 7.623962935028552e-05, |
| "loss": 3.2062, |
| "step": 81150 |
| }, |
| { |
| "epoch": 8.739640512323755, |
| "grad_norm": 0.8596765995025635, |
| "learning_rate": 7.59163883202241e-05, |
| "loss": 3.1863, |
| "step": 81200 |
| }, |
| { |
| "epoch": 8.745022064363363, |
| "grad_norm": 0.8727709054946899, |
| "learning_rate": 7.55931472901627e-05, |
| "loss": 3.1742, |
| "step": 81250 |
| }, |
| { |
| "epoch": 8.75040361640297, |
| "grad_norm": 0.8483537435531616, |
| "learning_rate": 7.526990626010127e-05, |
| "loss": 3.1818, |
| "step": 81300 |
| }, |
| { |
| "epoch": 8.755785168442578, |
| "grad_norm": 0.8239384293556213, |
| "learning_rate": 7.494666523003986e-05, |
| "loss": 3.1747, |
| "step": 81350 |
| }, |
| { |
| "epoch": 8.761166720482187, |
| "grad_norm": 0.8851037621498108, |
| "learning_rate": 7.462342419997844e-05, |
| "loss": 3.1878, |
| "step": 81400 |
| }, |
| { |
| "epoch": 8.766548272521796, |
| "grad_norm": 0.870302677154541, |
| "learning_rate": 7.430018316991704e-05, |
| "loss": 3.1834, |
| "step": 81450 |
| }, |
| { |
| "epoch": 8.771929824561404, |
| "grad_norm": 0.8676862120628357, |
| "learning_rate": 7.397694213985562e-05, |
| "loss": 3.1805, |
| "step": 81500 |
| }, |
| { |
| "epoch": 8.777311376601011, |
| "grad_norm": 0.8838236331939697, |
| "learning_rate": 7.36537011097942e-05, |
| "loss": 3.1878, |
| "step": 81550 |
| }, |
| { |
| "epoch": 8.78269292864062, |
| "grad_norm": 0.9052426218986511, |
| "learning_rate": 7.333046007973278e-05, |
| "loss": 3.176, |
| "step": 81600 |
| }, |
| { |
| "epoch": 8.788074480680228, |
| "grad_norm": 0.827214241027832, |
| "learning_rate": 7.300721904967136e-05, |
| "loss": 3.1943, |
| "step": 81650 |
| }, |
| { |
| "epoch": 8.793456032719837, |
| "grad_norm": 0.8450335264205933, |
| "learning_rate": 7.268397801960996e-05, |
| "loss": 3.1906, |
| "step": 81700 |
| }, |
| { |
| "epoch": 8.798837584759445, |
| "grad_norm": 0.8375703692436218, |
| "learning_rate": 7.236073698954854e-05, |
| "loss": 3.1844, |
| "step": 81750 |
| }, |
| { |
| "epoch": 8.804219136799054, |
| "grad_norm": 0.816857635974884, |
| "learning_rate": 7.203749595948712e-05, |
| "loss": 3.1712, |
| "step": 81800 |
| }, |
| { |
| "epoch": 8.80960068883866, |
| "grad_norm": 0.8428307771682739, |
| "learning_rate": 7.17142549294257e-05, |
| "loss": 3.1939, |
| "step": 81850 |
| }, |
| { |
| "epoch": 8.814982240878269, |
| "grad_norm": 0.8175262212753296, |
| "learning_rate": 7.139101389936428e-05, |
| "loss": 3.1919, |
| "step": 81900 |
| }, |
| { |
| "epoch": 8.820363792917878, |
| "grad_norm": 0.8650425672531128, |
| "learning_rate": 7.106777286930287e-05, |
| "loss": 3.2, |
| "step": 81950 |
| }, |
| { |
| "epoch": 8.825745344957486, |
| "grad_norm": 0.866899311542511, |
| "learning_rate": 7.074453183924146e-05, |
| "loss": 3.1881, |
| "step": 82000 |
| }, |
| { |
| "epoch": 8.825745344957486, |
| "eval_accuracy": 0.39193965070921544, |
| "eval_loss": 3.3191041946411133, |
| "eval_runtime": 182.564, |
| "eval_samples_per_second": 98.656, |
| "eval_steps_per_second": 6.168, |
| "step": 82000 |
| }, |
| { |
| "epoch": 8.831126896997095, |
| "grad_norm": 0.8628833889961243, |
| "learning_rate": 7.042129080918004e-05, |
| "loss": 3.1915, |
| "step": 82050 |
| }, |
| { |
| "epoch": 8.836508449036701, |
| "grad_norm": 0.9049327969551086, |
| "learning_rate": 7.009804977911862e-05, |
| "loss": 3.1922, |
| "step": 82100 |
| }, |
| { |
| "epoch": 8.84189000107631, |
| "grad_norm": 0.8244563937187195, |
| "learning_rate": 6.97748087490572e-05, |
| "loss": 3.1827, |
| "step": 82150 |
| }, |
| { |
| "epoch": 8.847271553115919, |
| "grad_norm": 0.8381883502006531, |
| "learning_rate": 6.945156771899579e-05, |
| "loss": 3.2009, |
| "step": 82200 |
| }, |
| { |
| "epoch": 8.852653105155527, |
| "grad_norm": 0.8337753415107727, |
| "learning_rate": 6.912832668893437e-05, |
| "loss": 3.1808, |
| "step": 82250 |
| }, |
| { |
| "epoch": 8.858034657195136, |
| "grad_norm": 0.8778477907180786, |
| "learning_rate": 6.880508565887297e-05, |
| "loss": 3.218, |
| "step": 82300 |
| }, |
| { |
| "epoch": 8.863416209234742, |
| "grad_norm": 0.8658258318901062, |
| "learning_rate": 6.848184462881155e-05, |
| "loss": 3.198, |
| "step": 82350 |
| }, |
| { |
| "epoch": 8.868797761274351, |
| "grad_norm": 0.861770749092102, |
| "learning_rate": 6.815860359875013e-05, |
| "loss": 3.1882, |
| "step": 82400 |
| }, |
| { |
| "epoch": 8.87417931331396, |
| "grad_norm": 1.062533974647522, |
| "learning_rate": 6.783536256868871e-05, |
| "loss": 3.2048, |
| "step": 82450 |
| }, |
| { |
| "epoch": 8.879560865353568, |
| "grad_norm": 0.878668487071991, |
| "learning_rate": 6.75121215386273e-05, |
| "loss": 3.1679, |
| "step": 82500 |
| }, |
| { |
| "epoch": 8.884942417393177, |
| "grad_norm": 0.8749240636825562, |
| "learning_rate": 6.718888050856589e-05, |
| "loss": 3.1839, |
| "step": 82550 |
| }, |
| { |
| "epoch": 8.890323969432785, |
| "grad_norm": 0.8733687996864319, |
| "learning_rate": 6.686563947850447e-05, |
| "loss": 3.1964, |
| "step": 82600 |
| }, |
| { |
| "epoch": 8.895705521472392, |
| "grad_norm": 0.8685417175292969, |
| "learning_rate": 6.654239844844305e-05, |
| "loss": 3.1968, |
| "step": 82650 |
| }, |
| { |
| "epoch": 8.901087073512, |
| "grad_norm": 0.8061568140983582, |
| "learning_rate": 6.621915741838163e-05, |
| "loss": 3.1726, |
| "step": 82700 |
| }, |
| { |
| "epoch": 8.906468625551609, |
| "grad_norm": 0.8258588314056396, |
| "learning_rate": 6.589591638832021e-05, |
| "loss": 3.1938, |
| "step": 82750 |
| }, |
| { |
| "epoch": 8.911850177591218, |
| "grad_norm": 0.8688393235206604, |
| "learning_rate": 6.557267535825881e-05, |
| "loss": 3.1881, |
| "step": 82800 |
| }, |
| { |
| "epoch": 8.917231729630826, |
| "grad_norm": 0.870529055595398, |
| "learning_rate": 6.524943432819739e-05, |
| "loss": 3.1997, |
| "step": 82850 |
| }, |
| { |
| "epoch": 8.922613281670433, |
| "grad_norm": 0.8257046341896057, |
| "learning_rate": 6.492619329813597e-05, |
| "loss": 3.1816, |
| "step": 82900 |
| }, |
| { |
| "epoch": 8.927994833710041, |
| "grad_norm": 0.858521580696106, |
| "learning_rate": 6.460295226807455e-05, |
| "loss": 3.1988, |
| "step": 82950 |
| }, |
| { |
| "epoch": 8.93337638574965, |
| "grad_norm": 0.8709513545036316, |
| "learning_rate": 6.427971123801313e-05, |
| "loss": 3.1831, |
| "step": 83000 |
| }, |
| { |
| "epoch": 8.93337638574965, |
| "eval_accuracy": 0.3923701334507364, |
| "eval_loss": 3.3174479007720947, |
| "eval_runtime": 182.9075, |
| "eval_samples_per_second": 98.471, |
| "eval_steps_per_second": 6.156, |
| "step": 83000 |
| }, |
| { |
| "epoch": 8.938757937789259, |
| "grad_norm": 0.8560492992401123, |
| "learning_rate": 6.395647020795173e-05, |
| "loss": 3.1856, |
| "step": 83050 |
| }, |
| { |
| "epoch": 8.944139489828867, |
| "grad_norm": 0.8757853507995605, |
| "learning_rate": 6.363322917789031e-05, |
| "loss": 3.1807, |
| "step": 83100 |
| }, |
| { |
| "epoch": 8.949521041868476, |
| "grad_norm": 0.852837860584259, |
| "learning_rate": 6.330998814782889e-05, |
| "loss": 3.1919, |
| "step": 83150 |
| }, |
| { |
| "epoch": 8.954902593908082, |
| "grad_norm": 0.8725079298019409, |
| "learning_rate": 6.298674711776748e-05, |
| "loss": 3.1904, |
| "step": 83200 |
| }, |
| { |
| "epoch": 8.960284145947691, |
| "grad_norm": 0.8584096431732178, |
| "learning_rate": 6.266350608770606e-05, |
| "loss": 3.1921, |
| "step": 83250 |
| }, |
| { |
| "epoch": 8.9656656979873, |
| "grad_norm": 0.8362786769866943, |
| "learning_rate": 6.234026505764465e-05, |
| "loss": 3.1988, |
| "step": 83300 |
| }, |
| { |
| "epoch": 8.971047250026908, |
| "grad_norm": 0.8816782832145691, |
| "learning_rate": 6.201702402758323e-05, |
| "loss": 3.1755, |
| "step": 83350 |
| }, |
| { |
| "epoch": 8.976428802066517, |
| "grad_norm": 0.8913627862930298, |
| "learning_rate": 6.169378299752181e-05, |
| "loss": 3.1949, |
| "step": 83400 |
| }, |
| { |
| "epoch": 8.981810354106123, |
| "grad_norm": 0.8645463585853577, |
| "learning_rate": 6.13705419674604e-05, |
| "loss": 3.1884, |
| "step": 83450 |
| }, |
| { |
| "epoch": 8.987191906145732, |
| "grad_norm": 0.8139950633049011, |
| "learning_rate": 6.104730093739898e-05, |
| "loss": 3.1769, |
| "step": 83500 |
| }, |
| { |
| "epoch": 8.99257345818534, |
| "grad_norm": 0.9117967486381531, |
| "learning_rate": 6.073052472793879e-05, |
| "loss": 3.1918, |
| "step": 83550 |
| }, |
| { |
| "epoch": 8.997955010224949, |
| "grad_norm": 0.8582161664962769, |
| "learning_rate": 6.0407283697877384e-05, |
| "loss": 3.1887, |
| "step": 83600 |
| }, |
| { |
| "epoch": 9.003336562264558, |
| "grad_norm": 0.8494151830673218, |
| "learning_rate": 6.0084042667815966e-05, |
| "loss": 3.1343, |
| "step": 83650 |
| }, |
| { |
| "epoch": 9.008718114304166, |
| "grad_norm": 0.8479065299034119, |
| "learning_rate": 5.976080163775455e-05, |
| "loss": 3.1253, |
| "step": 83700 |
| }, |
| { |
| "epoch": 9.014099666343773, |
| "grad_norm": 0.8485672473907471, |
| "learning_rate": 5.9437560607693135e-05, |
| "loss": 3.1436, |
| "step": 83750 |
| }, |
| { |
| "epoch": 9.019481218383381, |
| "grad_norm": 0.8757447004318237, |
| "learning_rate": 5.9114319577631716e-05, |
| "loss": 3.1274, |
| "step": 83800 |
| }, |
| { |
| "epoch": 9.02486277042299, |
| "grad_norm": 0.8620525598526001, |
| "learning_rate": 5.8791078547570304e-05, |
| "loss": 3.1237, |
| "step": 83850 |
| }, |
| { |
| "epoch": 9.030244322462599, |
| "grad_norm": 0.8854354619979858, |
| "learning_rate": 5.8467837517508885e-05, |
| "loss": 3.1347, |
| "step": 83900 |
| }, |
| { |
| "epoch": 9.035625874502207, |
| "grad_norm": 0.8926529884338379, |
| "learning_rate": 5.8144596487447466e-05, |
| "loss": 3.1083, |
| "step": 83950 |
| }, |
| { |
| "epoch": 9.041007426541814, |
| "grad_norm": 0.8925355076789856, |
| "learning_rate": 5.7821355457386054e-05, |
| "loss": 3.126, |
| "step": 84000 |
| }, |
| { |
| "epoch": 9.041007426541814, |
| "eval_accuracy": 0.3923292799649182, |
| "eval_loss": 3.3203017711639404, |
| "eval_runtime": 182.8076, |
| "eval_samples_per_second": 98.524, |
| "eval_steps_per_second": 6.159, |
| "step": 84000 |
| }, |
| { |
| "epoch": 9.046388978581422, |
| "grad_norm": 0.8431273698806763, |
| "learning_rate": 5.7498114427324635e-05, |
| "loss": 3.1304, |
| "step": 84050 |
| }, |
| { |
| "epoch": 9.051770530621031, |
| "grad_norm": 0.8672769665718079, |
| "learning_rate": 5.717487339726322e-05, |
| "loss": 3.1198, |
| "step": 84100 |
| }, |
| { |
| "epoch": 9.05715208266064, |
| "grad_norm": 0.8500916957855225, |
| "learning_rate": 5.6851632367201804e-05, |
| "loss": 3.1346, |
| "step": 84150 |
| }, |
| { |
| "epoch": 9.062533634700248, |
| "grad_norm": 0.8439621329307556, |
| "learning_rate": 5.6528391337140385e-05, |
| "loss": 3.1345, |
| "step": 84200 |
| }, |
| { |
| "epoch": 9.067915186739857, |
| "grad_norm": 0.8591272234916687, |
| "learning_rate": 5.620515030707897e-05, |
| "loss": 3.1324, |
| "step": 84250 |
| }, |
| { |
| "epoch": 9.073296738779463, |
| "grad_norm": 0.8671176433563232, |
| "learning_rate": 5.5881909277017554e-05, |
| "loss": 3.1263, |
| "step": 84300 |
| }, |
| { |
| "epoch": 9.078678290819072, |
| "grad_norm": 0.8660874962806702, |
| "learning_rate": 5.555866824695614e-05, |
| "loss": 3.1316, |
| "step": 84350 |
| }, |
| { |
| "epoch": 9.08405984285868, |
| "grad_norm": 0.8906688690185547, |
| "learning_rate": 5.523542721689472e-05, |
| "loss": 3.1293, |
| "step": 84400 |
| }, |
| { |
| "epoch": 9.089441394898289, |
| "grad_norm": 0.8381503820419312, |
| "learning_rate": 5.4912186186833304e-05, |
| "loss": 3.1144, |
| "step": 84450 |
| }, |
| { |
| "epoch": 9.094822946937898, |
| "grad_norm": 0.8566581606864929, |
| "learning_rate": 5.45889451567719e-05, |
| "loss": 3.1277, |
| "step": 84500 |
| }, |
| { |
| "epoch": 9.100204498977504, |
| "grad_norm": 0.8410578966140747, |
| "learning_rate": 5.426570412671048e-05, |
| "loss": 3.1333, |
| "step": 84550 |
| }, |
| { |
| "epoch": 9.105586051017113, |
| "grad_norm": 0.8373113870620728, |
| "learning_rate": 5.394246309664907e-05, |
| "loss": 3.1343, |
| "step": 84600 |
| }, |
| { |
| "epoch": 9.110967603056721, |
| "grad_norm": 0.8358331322669983, |
| "learning_rate": 5.361922206658765e-05, |
| "loss": 3.1296, |
| "step": 84650 |
| }, |
| { |
| "epoch": 9.11634915509633, |
| "grad_norm": 0.8469173908233643, |
| "learning_rate": 5.329598103652623e-05, |
| "loss": 3.1223, |
| "step": 84700 |
| }, |
| { |
| "epoch": 9.121730707135939, |
| "grad_norm": 0.8513491153717041, |
| "learning_rate": 5.297274000646482e-05, |
| "loss": 3.1239, |
| "step": 84750 |
| }, |
| { |
| "epoch": 9.127112259175545, |
| "grad_norm": 0.8536253571510315, |
| "learning_rate": 5.26494989764034e-05, |
| "loss": 3.1249, |
| "step": 84800 |
| }, |
| { |
| "epoch": 9.132493811215154, |
| "grad_norm": 0.9170660376548767, |
| "learning_rate": 5.232625794634199e-05, |
| "loss": 3.1293, |
| "step": 84850 |
| }, |
| { |
| "epoch": 9.137875363254762, |
| "grad_norm": 0.8490689992904663, |
| "learning_rate": 5.200301691628057e-05, |
| "loss": 3.1382, |
| "step": 84900 |
| }, |
| { |
| "epoch": 9.143256915294371, |
| "grad_norm": 0.8707136511802673, |
| "learning_rate": 5.167977588621915e-05, |
| "loss": 3.1361, |
| "step": 84950 |
| }, |
| { |
| "epoch": 9.14863846733398, |
| "grad_norm": 0.8578998446464539, |
| "learning_rate": 5.135653485615774e-05, |
| "loss": 3.1376, |
| "step": 85000 |
| }, |
| { |
| "epoch": 9.14863846733398, |
| "eval_accuracy": 0.39240283796996855, |
| "eval_loss": 3.3188514709472656, |
| "eval_runtime": 182.6043, |
| "eval_samples_per_second": 98.634, |
| "eval_steps_per_second": 6.166, |
| "step": 85000 |
| }, |
| { |
| "epoch": 9.154020019373588, |
| "grad_norm": 0.8587227463722229, |
| "learning_rate": 5.103329382609632e-05, |
| "loss": 3.1398, |
| "step": 85050 |
| }, |
| { |
| "epoch": 9.159401571413195, |
| "grad_norm": 0.8671402931213379, |
| "learning_rate": 5.0710052796034906e-05, |
| "loss": 3.1134, |
| "step": 85100 |
| }, |
| { |
| "epoch": 9.164783123452803, |
| "grad_norm": 0.8717479109764099, |
| "learning_rate": 5.038681176597349e-05, |
| "loss": 3.116, |
| "step": 85150 |
| }, |
| { |
| "epoch": 9.170164675492412, |
| "grad_norm": 0.8798679709434509, |
| "learning_rate": 5.006357073591207e-05, |
| "loss": 3.1406, |
| "step": 85200 |
| }, |
| { |
| "epoch": 9.17554622753202, |
| "grad_norm": 0.9084823727607727, |
| "learning_rate": 4.974032970585066e-05, |
| "loss": 3.144, |
| "step": 85250 |
| }, |
| { |
| "epoch": 9.180927779571629, |
| "grad_norm": 0.8945959210395813, |
| "learning_rate": 4.9417088675789244e-05, |
| "loss": 3.1382, |
| "step": 85300 |
| }, |
| { |
| "epoch": 9.186309331611236, |
| "grad_norm": 0.8824502229690552, |
| "learning_rate": 4.909384764572783e-05, |
| "loss": 3.1076, |
| "step": 85350 |
| }, |
| { |
| "epoch": 9.191690883650844, |
| "grad_norm": 0.8436898589134216, |
| "learning_rate": 4.877060661566641e-05, |
| "loss": 3.1517, |
| "step": 85400 |
| }, |
| { |
| "epoch": 9.197072435690453, |
| "grad_norm": 0.8343037366867065, |
| "learning_rate": 4.8447365585604994e-05, |
| "loss": 3.1237, |
| "step": 85450 |
| }, |
| { |
| "epoch": 9.202453987730062, |
| "grad_norm": 0.9071884751319885, |
| "learning_rate": 4.812412455554358e-05, |
| "loss": 3.1321, |
| "step": 85500 |
| }, |
| { |
| "epoch": 9.20783553976967, |
| "grad_norm": 0.8292504549026489, |
| "learning_rate": 4.780088352548216e-05, |
| "loss": 3.1412, |
| "step": 85550 |
| }, |
| { |
| "epoch": 9.213217091809279, |
| "grad_norm": 0.8744844794273376, |
| "learning_rate": 4.7484107316021976e-05, |
| "loss": 3.1387, |
| "step": 85600 |
| }, |
| { |
| "epoch": 9.218598643848885, |
| "grad_norm": 0.8166503310203552, |
| "learning_rate": 4.7160866285960564e-05, |
| "loss": 3.1446, |
| "step": 85650 |
| }, |
| { |
| "epoch": 9.223980195888494, |
| "grad_norm": 0.8761563897132874, |
| "learning_rate": 4.6837625255899145e-05, |
| "loss": 3.1335, |
| "step": 85700 |
| }, |
| { |
| "epoch": 9.229361747928102, |
| "grad_norm": 0.9411799907684326, |
| "learning_rate": 4.6514384225837726e-05, |
| "loss": 3.1534, |
| "step": 85750 |
| }, |
| { |
| "epoch": 9.234743299967711, |
| "grad_norm": 0.9132449626922607, |
| "learning_rate": 4.6191143195776314e-05, |
| "loss": 3.127, |
| "step": 85800 |
| }, |
| { |
| "epoch": 9.24012485200732, |
| "grad_norm": 0.8338202834129333, |
| "learning_rate": 4.5867902165714895e-05, |
| "loss": 3.129, |
| "step": 85850 |
| }, |
| { |
| "epoch": 9.245506404046926, |
| "grad_norm": 0.8570502400398254, |
| "learning_rate": 4.554466113565348e-05, |
| "loss": 3.1349, |
| "step": 85900 |
| }, |
| { |
| "epoch": 9.250887956086535, |
| "grad_norm": 0.8650547862052917, |
| "learning_rate": 4.5221420105592064e-05, |
| "loss": 3.1229, |
| "step": 85950 |
| }, |
| { |
| "epoch": 9.256269508126143, |
| "grad_norm": 0.8735968470573425, |
| "learning_rate": 4.4898179075530645e-05, |
| "loss": 3.1377, |
| "step": 86000 |
| }, |
| { |
| "epoch": 9.256269508126143, |
| "eval_accuracy": 0.3926530655706053, |
| "eval_loss": 3.316767692565918, |
| "eval_runtime": 182.8243, |
| "eval_samples_per_second": 98.515, |
| "eval_steps_per_second": 6.159, |
| "step": 86000 |
| }, |
| { |
| "epoch": 9.261651060165752, |
| "grad_norm": 0.8486570715904236, |
| "learning_rate": 4.457493804546923e-05, |
| "loss": 3.1238, |
| "step": 86050 |
| }, |
| { |
| "epoch": 9.26703261220536, |
| "grad_norm": 0.8732266426086426, |
| "learning_rate": 4.4251697015407814e-05, |
| "loss": 3.1398, |
| "step": 86100 |
| }, |
| { |
| "epoch": 9.272414164244967, |
| "grad_norm": 0.8636999726295471, |
| "learning_rate": 4.392845598534641e-05, |
| "loss": 3.1303, |
| "step": 86150 |
| }, |
| { |
| "epoch": 9.277795716284576, |
| "grad_norm": 0.8534356355667114, |
| "learning_rate": 4.360521495528499e-05, |
| "loss": 3.156, |
| "step": 86200 |
| }, |
| { |
| "epoch": 9.283177268324184, |
| "grad_norm": 0.8626139163970947, |
| "learning_rate": 4.3281973925223564e-05, |
| "loss": 3.1201, |
| "step": 86250 |
| }, |
| { |
| "epoch": 9.288558820363793, |
| "grad_norm": 0.8573952317237854, |
| "learning_rate": 4.295873289516216e-05, |
| "loss": 3.1281, |
| "step": 86300 |
| }, |
| { |
| "epoch": 9.293940372403402, |
| "grad_norm": 0.8308889269828796, |
| "learning_rate": 4.263549186510074e-05, |
| "loss": 3.1453, |
| "step": 86350 |
| }, |
| { |
| "epoch": 9.29932192444301, |
| "grad_norm": 0.813922643661499, |
| "learning_rate": 4.231225083503933e-05, |
| "loss": 3.1252, |
| "step": 86400 |
| }, |
| { |
| "epoch": 9.304703476482617, |
| "grad_norm": 0.8177182078361511, |
| "learning_rate": 4.198900980497791e-05, |
| "loss": 3.1361, |
| "step": 86450 |
| }, |
| { |
| "epoch": 9.310085028522225, |
| "grad_norm": 0.8975428938865662, |
| "learning_rate": 4.167223359551772e-05, |
| "loss": 3.1266, |
| "step": 86500 |
| }, |
| { |
| "epoch": 9.315466580561834, |
| "grad_norm": 0.8596468567848206, |
| "learning_rate": 4.1348992565456303e-05, |
| "loss": 3.1277, |
| "step": 86550 |
| }, |
| { |
| "epoch": 9.320848132601443, |
| "grad_norm": 0.8630208969116211, |
| "learning_rate": 4.102575153539489e-05, |
| "loss": 3.1461, |
| "step": 86600 |
| }, |
| { |
| "epoch": 9.326229684641051, |
| "grad_norm": 0.9110468029975891, |
| "learning_rate": 4.070251050533347e-05, |
| "loss": 3.1253, |
| "step": 86650 |
| }, |
| { |
| "epoch": 9.331611236680658, |
| "grad_norm": 0.8691954612731934, |
| "learning_rate": 4.037926947527206e-05, |
| "loss": 3.1465, |
| "step": 86700 |
| }, |
| { |
| "epoch": 9.336992788720266, |
| "grad_norm": 0.8495018482208252, |
| "learning_rate": 4.005602844521064e-05, |
| "loss": 3.129, |
| "step": 86750 |
| }, |
| { |
| "epoch": 9.342374340759875, |
| "grad_norm": 0.8317940831184387, |
| "learning_rate": 3.973278741514922e-05, |
| "loss": 3.1439, |
| "step": 86800 |
| }, |
| { |
| "epoch": 9.347755892799483, |
| "grad_norm": 0.8267626166343689, |
| "learning_rate": 3.940954638508781e-05, |
| "loss": 3.1323, |
| "step": 86850 |
| }, |
| { |
| "epoch": 9.353137444839092, |
| "grad_norm": 0.8663368225097656, |
| "learning_rate": 3.908630535502639e-05, |
| "loss": 3.1397, |
| "step": 86900 |
| }, |
| { |
| "epoch": 9.3585189968787, |
| "grad_norm": 0.8476428389549255, |
| "learning_rate": 3.876306432496498e-05, |
| "loss": 3.132, |
| "step": 86950 |
| }, |
| { |
| "epoch": 9.363900548918307, |
| "grad_norm": 0.8747643232345581, |
| "learning_rate": 3.843982329490356e-05, |
| "loss": 3.1389, |
| "step": 87000 |
| }, |
| { |
| "epoch": 9.363900548918307, |
| "eval_accuracy": 0.3930124893234956, |
| "eval_loss": 3.3138458728790283, |
| "eval_runtime": 182.6258, |
| "eval_samples_per_second": 98.622, |
| "eval_steps_per_second": 6.166, |
| "step": 87000 |
| }, |
| { |
| "epoch": 9.369282100957916, |
| "grad_norm": 0.922262966632843, |
| "learning_rate": 3.811658226484214e-05, |
| "loss": 3.1331, |
| "step": 87050 |
| }, |
| { |
| "epoch": 9.374663652997524, |
| "grad_norm": 0.8760948181152344, |
| "learning_rate": 3.7793341234780736e-05, |
| "loss": 3.1383, |
| "step": 87100 |
| }, |
| { |
| "epoch": 9.380045205037133, |
| "grad_norm": 0.8842629790306091, |
| "learning_rate": 3.747010020471931e-05, |
| "loss": 3.1441, |
| "step": 87150 |
| }, |
| { |
| "epoch": 9.385426757076742, |
| "grad_norm": 0.885867178440094, |
| "learning_rate": 3.71468591746579e-05, |
| "loss": 3.1412, |
| "step": 87200 |
| }, |
| { |
| "epoch": 9.390808309116348, |
| "grad_norm": 0.814979076385498, |
| "learning_rate": 3.6823618144596486e-05, |
| "loss": 3.1407, |
| "step": 87250 |
| }, |
| { |
| "epoch": 9.396189861155957, |
| "grad_norm": 0.8274505734443665, |
| "learning_rate": 3.650037711453507e-05, |
| "loss": 3.1296, |
| "step": 87300 |
| }, |
| { |
| "epoch": 9.401571413195565, |
| "grad_norm": 0.8518266081809998, |
| "learning_rate": 3.6177136084473655e-05, |
| "loss": 3.115, |
| "step": 87350 |
| }, |
| { |
| "epoch": 9.406952965235174, |
| "grad_norm": 0.8622663021087646, |
| "learning_rate": 3.5853895054412236e-05, |
| "loss": 3.1405, |
| "step": 87400 |
| }, |
| { |
| "epoch": 9.412334517274783, |
| "grad_norm": 0.8241939544677734, |
| "learning_rate": 3.553065402435082e-05, |
| "loss": 3.1449, |
| "step": 87450 |
| }, |
| { |
| "epoch": 9.417716069314391, |
| "grad_norm": 0.8843784928321838, |
| "learning_rate": 3.5207412994289405e-05, |
| "loss": 3.1569, |
| "step": 87500 |
| }, |
| { |
| "epoch": 9.423097621353998, |
| "grad_norm": 0.906825840473175, |
| "learning_rate": 3.488417196422799e-05, |
| "loss": 3.1392, |
| "step": 87550 |
| }, |
| { |
| "epoch": 9.428479173393606, |
| "grad_norm": 0.8589437007904053, |
| "learning_rate": 3.4560930934166574e-05, |
| "loss": 3.1291, |
| "step": 87600 |
| }, |
| { |
| "epoch": 9.433860725433215, |
| "grad_norm": 0.8674544095993042, |
| "learning_rate": 3.4237689904105156e-05, |
| "loss": 3.1476, |
| "step": 87650 |
| }, |
| { |
| "epoch": 9.439242277472824, |
| "grad_norm": 0.8766100406646729, |
| "learning_rate": 3.3914448874043743e-05, |
| "loss": 3.1407, |
| "step": 87700 |
| }, |
| { |
| "epoch": 9.444623829512432, |
| "grad_norm": 0.8393718600273132, |
| "learning_rate": 3.3591207843982325e-05, |
| "loss": 3.1521, |
| "step": 87750 |
| }, |
| { |
| "epoch": 9.450005381552039, |
| "grad_norm": 0.8990697860717773, |
| "learning_rate": 3.326796681392091e-05, |
| "loss": 3.1242, |
| "step": 87800 |
| }, |
| { |
| "epoch": 9.455386933591647, |
| "grad_norm": 0.8910731077194214, |
| "learning_rate": 3.2944725783859494e-05, |
| "loss": 3.1614, |
| "step": 87850 |
| }, |
| { |
| "epoch": 9.460768485631256, |
| "grad_norm": 0.8416497111320496, |
| "learning_rate": 3.2621484753798075e-05, |
| "loss": 3.1444, |
| "step": 87900 |
| }, |
| { |
| "epoch": 9.466150037670864, |
| "grad_norm": 0.8401027321815491, |
| "learning_rate": 3.229824372373666e-05, |
| "loss": 3.1381, |
| "step": 87950 |
| }, |
| { |
| "epoch": 9.471531589710473, |
| "grad_norm": 0.8736851215362549, |
| "learning_rate": 3.197500269367525e-05, |
| "loss": 3.1323, |
| "step": 88000 |
| }, |
| { |
| "epoch": 9.471531589710473, |
| "eval_accuracy": 0.39324837474294083, |
| "eval_loss": 3.312713384628296, |
| "eval_runtime": 182.6668, |
| "eval_samples_per_second": 98.6, |
| "eval_steps_per_second": 6.164, |
| "step": 88000 |
| }, |
| { |
| "epoch": 9.476913141750082, |
| "grad_norm": 0.8908987045288086, |
| "learning_rate": 3.165176166361383e-05, |
| "loss": 3.1391, |
| "step": 88050 |
| }, |
| { |
| "epoch": 9.482294693789688, |
| "grad_norm": 0.8490933179855347, |
| "learning_rate": 3.132852063355242e-05, |
| "loss": 3.1512, |
| "step": 88100 |
| }, |
| { |
| "epoch": 9.487676245829297, |
| "grad_norm": 0.9195178151130676, |
| "learning_rate": 3.1005279603491e-05, |
| "loss": 3.1404, |
| "step": 88150 |
| }, |
| { |
| "epoch": 9.493057797868905, |
| "grad_norm": 0.8848748803138733, |
| "learning_rate": 3.068203857342958e-05, |
| "loss": 3.149, |
| "step": 88200 |
| }, |
| { |
| "epoch": 9.498439349908514, |
| "grad_norm": 0.86981201171875, |
| "learning_rate": 3.035879754336817e-05, |
| "loss": 3.1165, |
| "step": 88250 |
| }, |
| { |
| "epoch": 9.503820901948123, |
| "grad_norm": 0.9147974848747253, |
| "learning_rate": 3.0035556513306754e-05, |
| "loss": 3.1424, |
| "step": 88300 |
| }, |
| { |
| "epoch": 9.50920245398773, |
| "grad_norm": 0.8942081332206726, |
| "learning_rate": 2.971231548324534e-05, |
| "loss": 3.1351, |
| "step": 88350 |
| }, |
| { |
| "epoch": 9.514584006027338, |
| "grad_norm": 0.8392196893692017, |
| "learning_rate": 2.938907445318392e-05, |
| "loss": 3.1381, |
| "step": 88400 |
| }, |
| { |
| "epoch": 9.519965558066946, |
| "grad_norm": 0.8911840915679932, |
| "learning_rate": 2.9065833423122504e-05, |
| "loss": 3.1417, |
| "step": 88450 |
| }, |
| { |
| "epoch": 9.525347110106555, |
| "grad_norm": 0.8524636626243591, |
| "learning_rate": 2.874259239306109e-05, |
| "loss": 3.1324, |
| "step": 88500 |
| }, |
| { |
| "epoch": 9.530728662146164, |
| "grad_norm": 0.8815006017684937, |
| "learning_rate": 2.8419351362999673e-05, |
| "loss": 3.1403, |
| "step": 88550 |
| }, |
| { |
| "epoch": 9.536110214185772, |
| "grad_norm": 0.8821752667427063, |
| "learning_rate": 2.809611033293826e-05, |
| "loss": 3.1315, |
| "step": 88600 |
| }, |
| { |
| "epoch": 9.541491766225379, |
| "grad_norm": 0.9411100745201111, |
| "learning_rate": 2.7772869302876845e-05, |
| "loss": 3.1516, |
| "step": 88650 |
| }, |
| { |
| "epoch": 9.546873318264987, |
| "grad_norm": 0.851620078086853, |
| "learning_rate": 2.7449628272815427e-05, |
| "loss": 3.1355, |
| "step": 88700 |
| }, |
| { |
| "epoch": 9.552254870304596, |
| "grad_norm": 0.8492972254753113, |
| "learning_rate": 2.712638724275401e-05, |
| "loss": 3.1426, |
| "step": 88750 |
| }, |
| { |
| "epoch": 9.557636422344205, |
| "grad_norm": 0.8770946860313416, |
| "learning_rate": 2.6803146212692596e-05, |
| "loss": 3.1521, |
| "step": 88800 |
| }, |
| { |
| "epoch": 9.563017974383813, |
| "grad_norm": 0.8662365078926086, |
| "learning_rate": 2.647990518263118e-05, |
| "loss": 3.1437, |
| "step": 88850 |
| }, |
| { |
| "epoch": 9.56839952642342, |
| "grad_norm": 0.8437976837158203, |
| "learning_rate": 2.6156664152569765e-05, |
| "loss": 3.1468, |
| "step": 88900 |
| }, |
| { |
| "epoch": 9.573781078463028, |
| "grad_norm": 0.8942129015922546, |
| "learning_rate": 2.5833423122508346e-05, |
| "loss": 3.1457, |
| "step": 88950 |
| }, |
| { |
| "epoch": 9.579162630502637, |
| "grad_norm": 0.9466322064399719, |
| "learning_rate": 2.5510182092446934e-05, |
| "loss": 3.1385, |
| "step": 89000 |
| }, |
| { |
| "epoch": 9.579162630502637, |
| "eval_accuracy": 0.39364148089105366, |
| "eval_loss": 3.309281587600708, |
| "eval_runtime": 182.7896, |
| "eval_samples_per_second": 98.534, |
| "eval_steps_per_second": 6.16, |
| "step": 89000 |
| }, |
| { |
| "epoch": 9.584544182542245, |
| "grad_norm": 0.9119086265563965, |
| "learning_rate": 2.5186941062385518e-05, |
| "loss": 3.1391, |
| "step": 89050 |
| }, |
| { |
| "epoch": 9.589925734581854, |
| "grad_norm": 0.9008535146713257, |
| "learning_rate": 2.4863700032324103e-05, |
| "loss": 3.1422, |
| "step": 89100 |
| }, |
| { |
| "epoch": 9.59530728662146, |
| "grad_norm": 0.8503806591033936, |
| "learning_rate": 2.4540459002262687e-05, |
| "loss": 3.151, |
| "step": 89150 |
| }, |
| { |
| "epoch": 9.60068883866107, |
| "grad_norm": 0.9381389617919922, |
| "learning_rate": 2.4217217972201268e-05, |
| "loss": 3.1439, |
| "step": 89200 |
| }, |
| { |
| "epoch": 9.606070390700678, |
| "grad_norm": 0.8869844079017639, |
| "learning_rate": 2.3893976942139853e-05, |
| "loss": 3.1347, |
| "step": 89250 |
| }, |
| { |
| "epoch": 9.611451942740286, |
| "grad_norm": 0.8782930374145508, |
| "learning_rate": 2.3570735912078437e-05, |
| "loss": 3.1562, |
| "step": 89300 |
| }, |
| { |
| "epoch": 9.616833494779895, |
| "grad_norm": 0.9496018290519714, |
| "learning_rate": 2.324749488201702e-05, |
| "loss": 3.1202, |
| "step": 89350 |
| }, |
| { |
| "epoch": 9.622215046819504, |
| "grad_norm": 0.8809753060340881, |
| "learning_rate": 2.292425385195561e-05, |
| "loss": 3.136, |
| "step": 89400 |
| }, |
| { |
| "epoch": 9.62759659885911, |
| "grad_norm": 0.8729392290115356, |
| "learning_rate": 2.260101282189419e-05, |
| "loss": 3.1371, |
| "step": 89450 |
| }, |
| { |
| "epoch": 9.632978150898719, |
| "grad_norm": 0.881260871887207, |
| "learning_rate": 2.2277771791832775e-05, |
| "loss": 3.1288, |
| "step": 89500 |
| }, |
| { |
| "epoch": 9.638359702938327, |
| "grad_norm": 0.8699634671211243, |
| "learning_rate": 2.195453076177136e-05, |
| "loss": 3.1467, |
| "step": 89550 |
| }, |
| { |
| "epoch": 9.643741254977936, |
| "grad_norm": 0.8865028619766235, |
| "learning_rate": 2.1631289731709944e-05, |
| "loss": 3.133, |
| "step": 89600 |
| }, |
| { |
| "epoch": 9.649122807017545, |
| "grad_norm": 0.8839409351348877, |
| "learning_rate": 2.130804870164853e-05, |
| "loss": 3.1223, |
| "step": 89650 |
| }, |
| { |
| "epoch": 9.654504359057151, |
| "grad_norm": 0.9245901703834534, |
| "learning_rate": 2.098480767158711e-05, |
| "loss": 3.1552, |
| "step": 89700 |
| }, |
| { |
| "epoch": 9.65988591109676, |
| "grad_norm": 0.8770548105239868, |
| "learning_rate": 2.0661566641525694e-05, |
| "loss": 3.131, |
| "step": 89750 |
| }, |
| { |
| "epoch": 9.665267463136368, |
| "grad_norm": 0.8477882146835327, |
| "learning_rate": 2.0338325611464282e-05, |
| "loss": 3.1385, |
| "step": 89800 |
| }, |
| { |
| "epoch": 9.670649015175977, |
| "grad_norm": 0.8711922764778137, |
| "learning_rate": 2.0015084581402867e-05, |
| "loss": 3.146, |
| "step": 89850 |
| }, |
| { |
| "epoch": 9.676030567215586, |
| "grad_norm": 0.8607136607170105, |
| "learning_rate": 1.969184355134145e-05, |
| "loss": 3.1414, |
| "step": 89900 |
| }, |
| { |
| "epoch": 9.681412119255192, |
| "grad_norm": 0.8953176736831665, |
| "learning_rate": 1.9368602521280032e-05, |
| "loss": 3.1304, |
| "step": 89950 |
| }, |
| { |
| "epoch": 9.6867936712948, |
| "grad_norm": 0.8567274212837219, |
| "learning_rate": 1.9045361491218617e-05, |
| "loss": 3.1358, |
| "step": 90000 |
| }, |
| { |
| "epoch": 9.6867936712948, |
| "eval_accuracy": 0.3937655624889378, |
| "eval_loss": 3.3076210021972656, |
| "eval_runtime": 182.9314, |
| "eval_samples_per_second": 98.458, |
| "eval_steps_per_second": 6.155, |
| "step": 90000 |
| }, |
| { |
| "epoch": 9.69217522333441, |
| "grad_norm": 0.8794586062431335, |
| "learning_rate": 1.87221204611572e-05, |
| "loss": 3.1459, |
| "step": 90050 |
| }, |
| { |
| "epoch": 9.697556775374018, |
| "grad_norm": 0.8762345314025879, |
| "learning_rate": 1.8398879431095786e-05, |
| "loss": 3.1327, |
| "step": 90100 |
| }, |
| { |
| "epoch": 9.702938327413626, |
| "grad_norm": 0.9184940457344055, |
| "learning_rate": 1.807563840103437e-05, |
| "loss": 3.1297, |
| "step": 90150 |
| }, |
| { |
| "epoch": 9.708319879453235, |
| "grad_norm": 0.8900254964828491, |
| "learning_rate": 1.7752397370972955e-05, |
| "loss": 3.1323, |
| "step": 90200 |
| }, |
| { |
| "epoch": 9.713701431492842, |
| "grad_norm": 0.8552128076553345, |
| "learning_rate": 1.742915634091154e-05, |
| "loss": 3.1414, |
| "step": 90250 |
| }, |
| { |
| "epoch": 9.71908298353245, |
| "grad_norm": 0.84034264087677, |
| "learning_rate": 1.7105915310850124e-05, |
| "loss": 3.1346, |
| "step": 90300 |
| }, |
| { |
| "epoch": 9.724464535572059, |
| "grad_norm": 0.8617538213729858, |
| "learning_rate": 1.6782674280788708e-05, |
| "loss": 3.1243, |
| "step": 90350 |
| }, |
| { |
| "epoch": 9.729846087611667, |
| "grad_norm": 0.9199374318122864, |
| "learning_rate": 1.645943325072729e-05, |
| "loss": 3.1422, |
| "step": 90400 |
| }, |
| { |
| "epoch": 9.735227639651276, |
| "grad_norm": 0.8754702210426331, |
| "learning_rate": 1.6136192220665877e-05, |
| "loss": 3.1472, |
| "step": 90450 |
| }, |
| { |
| "epoch": 9.740609191690883, |
| "grad_norm": 0.9364842772483826, |
| "learning_rate": 1.5819416011205687e-05, |
| "loss": 3.1309, |
| "step": 90500 |
| }, |
| { |
| "epoch": 9.745990743730491, |
| "grad_norm": 0.8289980292320251, |
| "learning_rate": 1.549617498114427e-05, |
| "loss": 3.1472, |
| "step": 90550 |
| }, |
| { |
| "epoch": 9.7513722957701, |
| "grad_norm": 0.8661265969276428, |
| "learning_rate": 1.5172933951082856e-05, |
| "loss": 3.1371, |
| "step": 90600 |
| }, |
| { |
| "epoch": 9.756753847809708, |
| "grad_norm": 0.8728821873664856, |
| "learning_rate": 1.4849692921021439e-05, |
| "loss": 3.1438, |
| "step": 90650 |
| }, |
| { |
| "epoch": 9.762135399849317, |
| "grad_norm": 0.8967341780662537, |
| "learning_rate": 1.4526451890960025e-05, |
| "loss": 3.1279, |
| "step": 90700 |
| }, |
| { |
| "epoch": 9.767516951888926, |
| "grad_norm": 0.8679084181785583, |
| "learning_rate": 1.420321086089861e-05, |
| "loss": 3.1178, |
| "step": 90750 |
| }, |
| { |
| "epoch": 9.772898503928532, |
| "grad_norm": 0.8503843545913696, |
| "learning_rate": 1.3879969830837192e-05, |
| "loss": 3.1446, |
| "step": 90800 |
| }, |
| { |
| "epoch": 9.77828005596814, |
| "grad_norm": 0.918580949306488, |
| "learning_rate": 1.3556728800775778e-05, |
| "loss": 3.13, |
| "step": 90850 |
| }, |
| { |
| "epoch": 9.78366160800775, |
| "grad_norm": 0.8387547135353088, |
| "learning_rate": 1.3233487770714361e-05, |
| "loss": 3.1473, |
| "step": 90900 |
| }, |
| { |
| "epoch": 9.789043160047358, |
| "grad_norm": 0.8897818326950073, |
| "learning_rate": 1.2910246740652946e-05, |
| "loss": 3.1323, |
| "step": 90950 |
| }, |
| { |
| "epoch": 9.794424712086967, |
| "grad_norm": 0.8836583495140076, |
| "learning_rate": 1.258700571059153e-05, |
| "loss": 3.1303, |
| "step": 91000 |
| }, |
| { |
| "epoch": 9.794424712086967, |
| "eval_accuracy": 0.3940082930403153, |
| "eval_loss": 3.3052875995635986, |
| "eval_runtime": 182.9499, |
| "eval_samples_per_second": 98.448, |
| "eval_steps_per_second": 6.155, |
| "step": 91000 |
| }, |
| { |
| "epoch": 9.799806264126573, |
| "grad_norm": 0.8632960319519043, |
| "learning_rate": 1.2263764680530113e-05, |
| "loss": 3.1429, |
| "step": 91050 |
| }, |
| { |
| "epoch": 9.805187816166182, |
| "grad_norm": 0.8691814541816711, |
| "learning_rate": 1.19405236504687e-05, |
| "loss": 3.1271, |
| "step": 91100 |
| }, |
| { |
| "epoch": 9.81056936820579, |
| "grad_norm": 0.8277129530906677, |
| "learning_rate": 1.1617282620407282e-05, |
| "loss": 3.1307, |
| "step": 91150 |
| }, |
| { |
| "epoch": 9.815950920245399, |
| "grad_norm": 0.8648361563682556, |
| "learning_rate": 1.1294041590345867e-05, |
| "loss": 3.1471, |
| "step": 91200 |
| }, |
| { |
| "epoch": 9.821332472285007, |
| "grad_norm": 0.8396468758583069, |
| "learning_rate": 1.0970800560284453e-05, |
| "loss": 3.1444, |
| "step": 91250 |
| }, |
| { |
| "epoch": 9.826714024324616, |
| "grad_norm": 0.8713970184326172, |
| "learning_rate": 1.0647559530223035e-05, |
| "loss": 3.1379, |
| "step": 91300 |
| }, |
| { |
| "epoch": 9.832095576364223, |
| "grad_norm": 0.8426269292831421, |
| "learning_rate": 1.032431850016162e-05, |
| "loss": 3.1476, |
| "step": 91350 |
| }, |
| { |
| "epoch": 9.837477128403831, |
| "grad_norm": 0.9147149324417114, |
| "learning_rate": 1.0001077470100203e-05, |
| "loss": 3.1378, |
| "step": 91400 |
| }, |
| { |
| "epoch": 9.84285868044344, |
| "grad_norm": 0.8848015069961548, |
| "learning_rate": 9.677836440038787e-06, |
| "loss": 3.1497, |
| "step": 91450 |
| }, |
| { |
| "epoch": 9.848240232483048, |
| "grad_norm": 0.9187755584716797, |
| "learning_rate": 9.354595409977372e-06, |
| "loss": 3.1406, |
| "step": 91500 |
| }, |
| { |
| "epoch": 9.853621784522657, |
| "grad_norm": 0.8338111639022827, |
| "learning_rate": 9.031354379915956e-06, |
| "loss": 3.1277, |
| "step": 91550 |
| }, |
| { |
| "epoch": 9.859003336562264, |
| "grad_norm": 0.880181074142456, |
| "learning_rate": 8.70811334985454e-06, |
| "loss": 3.1482, |
| "step": 91600 |
| }, |
| { |
| "epoch": 9.864384888601872, |
| "grad_norm": 0.8883482217788696, |
| "learning_rate": 8.384872319793125e-06, |
| "loss": 3.1256, |
| "step": 91650 |
| }, |
| { |
| "epoch": 9.869766440641481, |
| "grad_norm": 0.8639237880706787, |
| "learning_rate": 8.06163128973171e-06, |
| "loss": 3.1349, |
| "step": 91700 |
| }, |
| { |
| "epoch": 9.87514799268109, |
| "grad_norm": 0.8801106810569763, |
| "learning_rate": 7.738390259670293e-06, |
| "loss": 3.1469, |
| "step": 91750 |
| }, |
| { |
| "epoch": 9.880529544720698, |
| "grad_norm": 0.9389435648918152, |
| "learning_rate": 7.415149229608878e-06, |
| "loss": 3.1366, |
| "step": 91800 |
| }, |
| { |
| "epoch": 9.885911096760307, |
| "grad_norm": 0.8664288520812988, |
| "learning_rate": 7.091908199547462e-06, |
| "loss": 3.1287, |
| "step": 91850 |
| }, |
| { |
| "epoch": 9.891292648799913, |
| "grad_norm": 0.8594654202461243, |
| "learning_rate": 6.768667169486046e-06, |
| "loss": 3.1367, |
| "step": 91900 |
| }, |
| { |
| "epoch": 9.896674200839522, |
| "grad_norm": 0.852776288986206, |
| "learning_rate": 6.4454261394246305e-06, |
| "loss": 3.15, |
| "step": 91950 |
| }, |
| { |
| "epoch": 9.90205575287913, |
| "grad_norm": 0.871873676776886, |
| "learning_rate": 6.122185109363214e-06, |
| "loss": 3.1412, |
| "step": 92000 |
| }, |
| { |
| "epoch": 9.90205575287913, |
| "eval_accuracy": 0.3942072364779036, |
| "eval_loss": 3.303832769393921, |
| "eval_runtime": 183.128, |
| "eval_samples_per_second": 98.352, |
| "eval_steps_per_second": 6.149, |
| "step": 92000 |
| }, |
| { |
| "epoch": 9.907437304918739, |
| "grad_norm": 0.8564249277114868, |
| "learning_rate": 5.7989440793017995e-06, |
| "loss": 3.1272, |
| "step": 92050 |
| }, |
| { |
| "epoch": 9.912818856958348, |
| "grad_norm": 0.8567243814468384, |
| "learning_rate": 5.475703049240383e-06, |
| "loss": 3.1432, |
| "step": 92100 |
| }, |
| { |
| "epoch": 9.918200408997954, |
| "grad_norm": 0.8726806044578552, |
| "learning_rate": 5.152462019178968e-06, |
| "loss": 3.1318, |
| "step": 92150 |
| }, |
| { |
| "epoch": 9.923581961037563, |
| "grad_norm": 0.881909191608429, |
| "learning_rate": 4.829220989117551e-06, |
| "loss": 3.1186, |
| "step": 92200 |
| }, |
| { |
| "epoch": 9.928963513077171, |
| "grad_norm": 0.8314427137374878, |
| "learning_rate": 4.505979959056136e-06, |
| "loss": 3.1237, |
| "step": 92250 |
| }, |
| { |
| "epoch": 9.93434506511678, |
| "grad_norm": 0.8411848545074463, |
| "learning_rate": 4.18273892899472e-06, |
| "loss": 3.1282, |
| "step": 92300 |
| }, |
| { |
| "epoch": 9.939726617156388, |
| "grad_norm": 0.9116420745849609, |
| "learning_rate": 3.859497898933305e-06, |
| "loss": 3.1365, |
| "step": 92350 |
| }, |
| { |
| "epoch": 9.945108169195997, |
| "grad_norm": 0.8482275009155273, |
| "learning_rate": 3.5362568688718884e-06, |
| "loss": 3.1371, |
| "step": 92400 |
| }, |
| { |
| "epoch": 9.950489721235604, |
| "grad_norm": 0.8526927828788757, |
| "learning_rate": 3.2194806594117013e-06, |
| "loss": 3.1521, |
| "step": 92450 |
| }, |
| { |
| "epoch": 9.955871273275212, |
| "grad_norm": 0.8895639777183533, |
| "learning_rate": 2.8962396293502854e-06, |
| "loss": 3.1432, |
| "step": 92500 |
| }, |
| { |
| "epoch": 9.961252825314821, |
| "grad_norm": 0.8692976832389832, |
| "learning_rate": 2.5729985992888694e-06, |
| "loss": 3.148, |
| "step": 92550 |
| }, |
| { |
| "epoch": 9.96663437735443, |
| "grad_norm": 0.8431194424629211, |
| "learning_rate": 2.249757569227454e-06, |
| "loss": 3.1493, |
| "step": 92600 |
| }, |
| { |
| "epoch": 9.972015929394038, |
| "grad_norm": 0.8579978346824646, |
| "learning_rate": 1.926516539166038e-06, |
| "loss": 3.1416, |
| "step": 92650 |
| }, |
| { |
| "epoch": 9.977397481433645, |
| "grad_norm": 0.8592678308486938, |
| "learning_rate": 1.603275509104622e-06, |
| "loss": 3.1405, |
| "step": 92700 |
| }, |
| { |
| "epoch": 9.982779033473253, |
| "grad_norm": 0.8317117691040039, |
| "learning_rate": 1.2800344790432064e-06, |
| "loss": 3.1257, |
| "step": 92750 |
| }, |
| { |
| "epoch": 9.988160585512862, |
| "grad_norm": 0.8965446352958679, |
| "learning_rate": 9.567934489817906e-07, |
| "loss": 3.1337, |
| "step": 92800 |
| }, |
| { |
| "epoch": 9.99354213755247, |
| "grad_norm": 0.8839283585548401, |
| "learning_rate": 6.335524189203748e-07, |
| "loss": 3.1165, |
| "step": 92850 |
| }, |
| { |
| "epoch": 9.998923689592079, |
| "grad_norm": 0.8802606463432312, |
| "learning_rate": 3.103113888589591e-07, |
| "loss": 3.1291, |
| "step": 92900 |
| }, |
| { |
| "epoch": 10.0, |
| "step": 92910, |
| "total_flos": 7.7681859821568e+17, |
| "train_loss": 3.460850506426251, |
| "train_runtime": 80822.0047, |
| "train_samples_per_second": 36.784, |
| "train_steps_per_second": 1.15 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 92910, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 10000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 7.7681859821568e+17, |
| "train_batch_size": 32, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|