{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.5728105382319644,
  "eval_steps": 250,
  "global_step": 25000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005145621076463929,
      "grad_norm": 10.373374938964844,
      "learning_rate": 9.600000000000001e-06,
      "loss": 4.8488,
      "step": 50
    },
    {
      "epoch": 0.010291242152927859,
      "grad_norm": 11.32767391204834,
      "learning_rate": 1.9600000000000002e-05,
      "loss": 2.2402,
      "step": 100
    },
    {
      "epoch": 0.015436863229391787,
      "grad_norm": 7.465405464172363,
      "learning_rate": 1.9990110229731125e-05,
      "loss": 1.8204,
      "step": 150
    },
    {
      "epoch": 0.020582484305855717,
      "grad_norm": 11.595544815063477,
      "learning_rate": 1.9979808385701043e-05,
      "loss": 1.7765,
      "step": 200
    },
    {
      "epoch": 0.025728105382319646,
      "grad_norm": 15.59911060333252,
      "learning_rate": 1.996950654167096e-05,
      "loss": 1.7482,
      "step": 250
    },
    {
      "epoch": 0.025728105382319646,
      "eval_accuracy": 0.9227184653282166,
      "eval_loss": 1.1625392436981201,
      "eval_runtime": 418.704,
      "eval_samples_per_second": 82.514,
      "eval_steps_per_second": 2.579,
      "step": 250
    },
    {
      "epoch": 0.030873726458783574,
      "grad_norm": 7.875217914581299,
      "learning_rate": 1.995920469764088e-05,
      "loss": 1.7821,
      "step": 300
    },
    {
      "epoch": 0.036019347535247506,
      "grad_norm": 11.855839729309082,
      "learning_rate": 1.99489028536108e-05,
      "loss": 1.6761,
      "step": 350
    },
    {
      "epoch": 0.041164968611711435,
      "grad_norm": 7.26309061050415,
      "learning_rate": 1.9938601009580717e-05,
      "loss": 1.4887,
      "step": 400
    },
    {
      "epoch": 0.04631058968817536,
      "grad_norm": 8.920960426330566,
      "learning_rate": 1.9928299165550635e-05,
      "loss": 1.6001,
      "step": 450
    },
    {
      "epoch": 0.05145621076463929,
      "grad_norm": 7.441282749176025,
      "learning_rate": 1.9917997321520554e-05,
      "loss": 1.7426,
      "step": 500
    },
    {
      "epoch": 0.05145621076463929,
      "eval_accuracy": 0.9316622614860535,
      "eval_loss": 1.1087791919708252,
      "eval_runtime": 417.9867,
      "eval_samples_per_second": 82.656,
      "eval_steps_per_second": 2.584,
      "step": 500
    },
    {
      "epoch": 0.05660183184110322,
      "grad_norm": 5.284599781036377,
      "learning_rate": 1.9907695477490472e-05,
      "loss": 1.5562,
      "step": 550
    },
    {
      "epoch": 0.06174745291756715,
      "grad_norm": 9.429953575134277,
      "learning_rate": 1.989739363346039e-05,
      "loss": 1.6811,
      "step": 600
    },
    {
      "epoch": 0.06689307399403108,
      "grad_norm": 13.159565925598145,
      "learning_rate": 1.988709178943031e-05,
      "loss": 1.5994,
      "step": 650
    },
    {
      "epoch": 0.07203869507049501,
      "grad_norm": 6.322610378265381,
      "learning_rate": 1.9876789945400228e-05,
      "loss": 1.5981,
      "step": 700
    },
    {
      "epoch": 0.07718431614695893,
      "grad_norm": 13.995536804199219,
      "learning_rate": 1.9866488101370146e-05,
      "loss": 1.5713,
      "step": 750
    },
    {
      "epoch": 0.07718431614695893,
      "eval_accuracy": 0.9368722438812256,
      "eval_loss": 1.0817046165466309,
      "eval_runtime": 418.366,
      "eval_samples_per_second": 82.581,
      "eval_steps_per_second": 2.581,
      "step": 750
    },
    {
      "epoch": 0.08232993722342287,
      "grad_norm": 10.23471450805664,
      "learning_rate": 1.9856186257340068e-05,
      "loss": 1.6516,
      "step": 800
    },
    {
      "epoch": 0.08747555829988679,
      "grad_norm": 9.448832511901855,
      "learning_rate": 1.9845884413309983e-05,
      "loss": 1.5768,
      "step": 850
    },
    {
      "epoch": 0.09262117937635073,
      "grad_norm": 9.08662223815918,
      "learning_rate": 1.9835582569279905e-05,
      "loss": 1.5902,
      "step": 900
    },
    {
      "epoch": 0.09776680045281465,
      "grad_norm": 9.396324157714844,
      "learning_rate": 1.982528072524982e-05,
      "loss": 1.4613,
      "step": 950
    },
    {
      "epoch": 0.10291242152927858,
      "grad_norm": 6.668619632720947,
      "learning_rate": 1.9814978881219738e-05,
      "loss": 1.6295,
      "step": 1000
    },
    {
      "epoch": 0.10291242152927858,
      "eval_accuracy": 0.9374222159385681,
      "eval_loss": 1.0676764249801636,
      "eval_runtime": 418.3645,
      "eval_samples_per_second": 82.581,
      "eval_steps_per_second": 2.581,
      "step": 1000
    },
    {
      "epoch": 0.10805804260574252,
      "grad_norm": 12.718502044677734,
      "learning_rate": 1.980467703718966e-05,
      "loss": 1.5301,
      "step": 1050
    },
    {
      "epoch": 0.11320366368220644,
      "grad_norm": 6.540752410888672,
      "learning_rate": 1.9794375193159575e-05,
      "loss": 1.6072,
      "step": 1100
    },
    {
      "epoch": 0.11834928475867038,
      "grad_norm": 11.122970581054688,
      "learning_rate": 1.9784073349129497e-05,
      "loss": 1.4644,
      "step": 1150
    },
    {
      "epoch": 0.1234949058351343,
      "grad_norm": 9.239429473876953,
      "learning_rate": 1.9773771505099415e-05,
      "loss": 1.6331,
      "step": 1200
    },
    {
      "epoch": 0.12864052691159822,
      "grad_norm": 7.708181858062744,
      "learning_rate": 1.9763469661069334e-05,
      "loss": 1.5464,
      "step": 1250
    },
    {
      "epoch": 0.12864052691159822,
      "eval_accuracy": 0.9407797455787659,
      "eval_loss": 1.0546813011169434,
      "eval_runtime": 418.2896,
      "eval_samples_per_second": 82.596,
      "eval_steps_per_second": 2.582,
      "step": 1250
    },
    {
      "epoch": 0.13378614798806215,
      "grad_norm": 8.665009498596191,
      "learning_rate": 1.9753167817039252e-05,
      "loss": 1.5406,
      "step": 1300
    },
    {
      "epoch": 0.1389317690645261,
      "grad_norm": 8.5620756149292,
      "learning_rate": 1.974286597300917e-05,
      "loss": 1.5471,
      "step": 1350
    },
    {
      "epoch": 0.14407739014099002,
      "grad_norm": 10.859761238098145,
      "learning_rate": 1.973256412897909e-05,
      "loss": 1.6685,
      "step": 1400
    },
    {
      "epoch": 0.14922301121745396,
      "grad_norm": 6.381153106689453,
      "learning_rate": 1.9722262284949007e-05,
      "loss": 1.5644,
      "step": 1450
    },
    {
      "epoch": 0.15436863229391787,
      "grad_norm": 3.7411134243011475,
      "learning_rate": 1.9711960440918926e-05,
      "loss": 1.6587,
      "step": 1500
    },
    {
      "epoch": 0.15436863229391787,
      "eval_accuracy": 0.9419954419136047,
      "eval_loss": 1.0589897632598877,
      "eval_runtime": 418.2875,
      "eval_samples_per_second": 82.596,
      "eval_steps_per_second": 2.582,
      "step": 1500
    },
    {
      "epoch": 0.1595142533703818,
      "grad_norm": 7.128472328186035,
      "learning_rate": 1.9701658596888844e-05,
      "loss": 1.5793,
      "step": 1550
    },
    {
      "epoch": 0.16465987444684574,
      "grad_norm": 7.605306625366211,
      "learning_rate": 1.9691356752858763e-05,
      "loss": 1.4877,
      "step": 1600
    },
    {
      "epoch": 0.16980549552330967,
      "grad_norm": 10.431309700012207,
      "learning_rate": 1.9681054908828684e-05,
      "loss": 1.5781,
      "step": 1650
    },
    {
      "epoch": 0.17495111659977358,
      "grad_norm": 7.39127254486084,
      "learning_rate": 1.96707530647986e-05,
      "loss": 1.5081,
      "step": 1700
    },
    {
      "epoch": 0.18009673767623752,
      "grad_norm": 8.452966690063477,
      "learning_rate": 1.9660451220768518e-05,
      "loss": 1.5434,
      "step": 1750
    },
    {
      "epoch": 0.18009673767623752,
      "eval_accuracy": 0.9395930171012878,
      "eval_loss": 1.0563884973526,
      "eval_runtime": 418.2001,
      "eval_samples_per_second": 82.614,
      "eval_steps_per_second": 2.582,
      "step": 1750
    },
    {
      "epoch": 0.18524235875270145,
      "grad_norm": 11.400819778442383,
      "learning_rate": 1.965014937673844e-05,
      "loss": 1.4617,
      "step": 1800
    },
    {
      "epoch": 0.1903879798291654,
      "grad_norm": 2.2198734283447266,
      "learning_rate": 1.9639847532708355e-05,
      "loss": 1.4531,
      "step": 1850
    },
    {
      "epoch": 0.1955336009056293,
      "grad_norm": 9.084458351135254,
      "learning_rate": 1.9629545688678276e-05,
      "loss": 1.5713,
      "step": 1900
    },
    {
      "epoch": 0.20067922198209323,
      "grad_norm": 9.126426696777344,
      "learning_rate": 1.961924384464819e-05,
      "loss": 1.5166,
      "step": 1950
    },
    {
      "epoch": 0.20582484305855717,
      "grad_norm": 18.784996032714844,
      "learning_rate": 1.9608942000618113e-05,
      "loss": 1.4771,
      "step": 2000
    },
    {
      "epoch": 0.20582484305855717,
      "eval_accuracy": 0.943095326423645,
      "eval_loss": 1.0343589782714844,
      "eval_runtime": 418.12,
      "eval_samples_per_second": 82.629,
      "eval_steps_per_second": 2.583,
      "step": 2000
    },
    {
      "epoch": 0.2109704641350211,
      "grad_norm": 4.355062961578369,
      "learning_rate": 1.9598640156588032e-05,
      "loss": 1.4706,
      "step": 2050
    },
    {
      "epoch": 0.21611608521148504,
      "grad_norm": 7.846619606018066,
      "learning_rate": 1.958833831255795e-05,
      "loss": 1.5276,
      "step": 2100
    },
    {
      "epoch": 0.22126170628794894,
      "grad_norm": 11.4408597946167,
      "learning_rate": 1.957803646852787e-05,
      "loss": 1.4002,
      "step": 2150
    },
    {
      "epoch": 0.22640732736441288,
      "grad_norm": 10.485089302062988,
      "learning_rate": 1.9567734624497787e-05,
      "loss": 1.5605,
      "step": 2200
    },
    {
      "epoch": 0.23155294844087682,
      "grad_norm": 5.763485431671143,
      "learning_rate": 1.9557432780467705e-05,
      "loss": 1.4871,
      "step": 2250
    },
    {
      "epoch": 0.23155294844087682,
      "eval_accuracy": 0.9440793991088867,
      "eval_loss": 1.035501480102539,
      "eval_runtime": 418.1856,
      "eval_samples_per_second": 82.616,
      "eval_steps_per_second": 2.583,
      "step": 2250
    },
    {
      "epoch": 0.23669856951734075,
      "grad_norm": 6.086212635040283,
      "learning_rate": 1.9547130936437624e-05,
      "loss": 1.56,
      "step": 2300
    },
    {
      "epoch": 0.24184419059380466,
      "grad_norm": 10.038729667663574,
      "learning_rate": 1.9536829092407542e-05,
      "loss": 1.4322,
      "step": 2350
    },
    {
      "epoch": 0.2469898116702686,
      "grad_norm": 8.869370460510254,
      "learning_rate": 1.952652724837746e-05,
      "loss": 1.4682,
      "step": 2400
    },
    {
      "epoch": 0.25213543274673256,
      "grad_norm": 9.509527206420898,
      "learning_rate": 1.951622540434738e-05,
      "loss": 1.4375,
      "step": 2450
    },
    {
      "epoch": 0.25728105382319644,
      "grad_norm": 14.525392532348633,
      "learning_rate": 1.95061295971979e-05,
      "loss": 1.4499,
      "step": 2500
    },
    {
      "epoch": 0.25728105382319644,
      "eval_accuracy": 0.9433557987213135,
      "eval_loss": 1.030641794204712,
      "eval_runtime": 418.9422,
      "eval_samples_per_second": 82.467,
      "eval_steps_per_second": 2.578,
      "step": 2500
    },
    {
      "epoch": 0.2624266748996604,
      "grad_norm": 10.151646614074707,
      "learning_rate": 1.949582775316782e-05,
      "loss": 1.5088,
      "step": 2550
    },
    {
      "epoch": 0.2675722959761243,
      "grad_norm": 9.49378776550293,
      "learning_rate": 1.948552590913774e-05,
      "loss": 1.5577,
      "step": 2600
    },
    {
      "epoch": 0.27271791705258824,
      "grad_norm": 11.339292526245117,
      "learning_rate": 1.9475224065107657e-05,
      "loss": 1.4221,
      "step": 2650
    },
    {
      "epoch": 0.2778635381290522,
      "grad_norm": 6.203045845031738,
      "learning_rate": 1.9464922221077572e-05,
      "loss": 1.5105,
      "step": 2700
    },
    {
      "epoch": 0.2830091592055161,
      "grad_norm": 8.609308242797852,
      "learning_rate": 1.9454620377047494e-05,
      "loss": 1.4681,
      "step": 2750
    },
    {
      "epoch": 0.2830091592055161,
      "eval_accuracy": 0.9453240633010864,
      "eval_loss": 1.0219130516052246,
      "eval_runtime": 418.4116,
      "eval_samples_per_second": 82.572,
      "eval_steps_per_second": 2.581,
      "step": 2750
    },
    {
      "epoch": 0.28815478028198005,
      "grad_norm": 5.019013404846191,
      "learning_rate": 1.9444318533017412e-05,
      "loss": 1.4354,
      "step": 2800
    },
    {
      "epoch": 0.293300401358444,
      "grad_norm": 11.37190055847168,
      "learning_rate": 1.943401668898733e-05,
      "loss": 1.4982,
      "step": 2850
    },
    {
      "epoch": 0.2984460224349079,
      "grad_norm": 4.953094005584717,
      "learning_rate": 1.942371484495725e-05,
      "loss": 1.5374,
      "step": 2900
    },
    {
      "epoch": 0.3035916435113718,
      "grad_norm": 12.19895076751709,
      "learning_rate": 1.9413413000927167e-05,
      "loss": 1.4769,
      "step": 2950
    },
    {
      "epoch": 0.30873726458783574,
      "grad_norm": 6.096263408660889,
      "learning_rate": 1.9403111156897086e-05,
      "loss": 1.5767,
      "step": 3000
    },
    {
      "epoch": 0.30873726458783574,
      "eval_accuracy": 0.9450345635414124,
      "eval_loss": 1.0167551040649414,
      "eval_runtime": 419.6576,
      "eval_samples_per_second": 82.327,
      "eval_steps_per_second": 2.574,
      "step": 3000
    },
    {
      "epoch": 0.31388288566429967,
      "grad_norm": 5.119435787200928,
      "learning_rate": 1.9392809312867004e-05,
      "loss": 1.3712,
      "step": 3050
    },
    {
      "epoch": 0.3190285067407636,
      "grad_norm": 3.375337600708008,
      "learning_rate": 1.9382507468836923e-05,
      "loss": 1.4979,
      "step": 3100
    },
    {
      "epoch": 0.32417412781722754,
      "grad_norm": 12.87149715423584,
      "learning_rate": 1.937220562480684e-05,
      "loss": 1.4633,
      "step": 3150
    },
    {
      "epoch": 0.3293197488936915,
      "grad_norm": 8.743680000305176,
      "learning_rate": 1.936190378077676e-05,
      "loss": 1.5025,
      "step": 3200
    },
    {
      "epoch": 0.3344653699701554,
      "grad_norm": 7.334400653839111,
      "learning_rate": 1.935160193674668e-05,
      "loss": 1.5206,
      "step": 3250
    },
    {
      "epoch": 0.3344653699701554,
      "eval_accuracy": 0.9457292556762695,
      "eval_loss": 1.016142725944519,
      "eval_runtime": 419.0299,
      "eval_samples_per_second": 82.45,
      "eval_steps_per_second": 2.577,
      "step": 3250
    },
    {
      "epoch": 0.33961099104661935,
      "grad_norm": 15.349682807922363,
      "learning_rate": 1.9341300092716596e-05,
      "loss": 1.5119,
      "step": 3300
    },
    {
      "epoch": 0.3447566121230833,
      "grad_norm": 10.006840705871582,
      "learning_rate": 1.9330998248686518e-05,
      "loss": 1.6285,
      "step": 3350
    },
    {
      "epoch": 0.34990223319954716,
      "grad_norm": 10.248564720153809,
      "learning_rate": 1.9320696404656437e-05,
      "loss": 1.4421,
      "step": 3400
    },
    {
      "epoch": 0.3550478542760111,
      "grad_norm": 10.41620922088623,
      "learning_rate": 1.931039456062635e-05,
      "loss": 1.4866,
      "step": 3450
    },
    {
      "epoch": 0.36019347535247503,
      "grad_norm": 11.682571411132812,
      "learning_rate": 1.9300092716596273e-05,
      "loss": 1.4651,
      "step": 3500
    },
    {
      "epoch": 0.36019347535247503,
      "eval_accuracy": 0.9464818239212036,
      "eval_loss": 1.0084654092788696,
      "eval_runtime": 418.3089,
      "eval_samples_per_second": 82.592,
      "eval_steps_per_second": 2.582,
      "step": 3500
    },
    {
      "epoch": 0.36533909642893897,
      "grad_norm": 4.340071678161621,
      "learning_rate": 1.9289790872566192e-05,
      "loss": 1.3777,
      "step": 3550
    },
    {
      "epoch": 0.3704847175054029,
      "grad_norm": 5.201279163360596,
      "learning_rate": 1.927948902853611e-05,
      "loss": 1.5256,
      "step": 3600
    },
    {
      "epoch": 0.37563033858186684,
      "grad_norm": 13.030069351196289,
      "learning_rate": 1.926918718450603e-05,
      "loss": 1.358,
      "step": 3650
    },
    {
      "epoch": 0.3807759596583308,
      "grad_norm": 6.489394664764404,
      "learning_rate": 1.9259091377356548e-05,
      "loss": 1.4384,
      "step": 3700
    },
    {
      "epoch": 0.3859215807347947,
      "grad_norm": 9.361295700073242,
      "learning_rate": 1.9248789533326466e-05,
      "loss": 1.4847,
      "step": 3750
    },
    {
      "epoch": 0.3859215807347947,
      "eval_accuracy": 0.9460765719413757,
      "eval_loss": 1.0093164443969727,
      "eval_runtime": 418.9794,
      "eval_samples_per_second": 82.46,
      "eval_steps_per_second": 2.578,
      "step": 3750
    },
    {
      "epoch": 0.3910672018112586,
      "grad_norm": 5.094785213470459,
      "learning_rate": 1.9238487689296385e-05,
      "loss": 1.327,
      "step": 3800
    },
    {
      "epoch": 0.3962128228877225,
      "grad_norm": 2.1664323806762695,
      "learning_rate": 1.9228185845266307e-05,
      "loss": 1.4463,
      "step": 3850
    },
    {
      "epoch": 0.40135844396418646,
      "grad_norm": 2.0584053993225098,
      "learning_rate": 1.921788400123622e-05,
      "loss": 1.3179,
      "step": 3900
    },
    {
      "epoch": 0.4065040650406504,
      "grad_norm": 7.324461936950684,
      "learning_rate": 1.920758215720614e-05,
      "loss": 1.4312,
      "step": 3950
    },
    {
      "epoch": 0.41164968611711433,
      "grad_norm": 13.046509742736816,
      "learning_rate": 1.9197280313176062e-05,
      "loss": 1.4179,
      "step": 4000
    },
    {
      "epoch": 0.41164968611711433,
      "eval_accuracy": 0.9460186958312988,
      "eval_loss": 1.0144544839859009,
      "eval_runtime": 418.3617,
      "eval_samples_per_second": 82.582,
      "eval_steps_per_second": 2.581,
      "step": 4000
    },
    {
      "epoch": 0.41679530719357827,
      "grad_norm": 4.131565093994141,
      "learning_rate": 1.9186978469145977e-05,
      "loss": 1.4828,
      "step": 4050
    },
    {
      "epoch": 0.4219409282700422,
      "grad_norm": 14.746273040771484,
      "learning_rate": 1.91766766251159e-05,
      "loss": 1.4568,
      "step": 4100
    },
    {
      "epoch": 0.42708654934650614,
      "grad_norm": 9.327777862548828,
      "learning_rate": 1.9166374781085817e-05,
      "loss": 1.4921,
      "step": 4150
    },
    {
      "epoch": 0.4322321704229701,
      "grad_norm": 11.792791366577148,
      "learning_rate": 1.9156072937055735e-05,
      "loss": 1.4485,
      "step": 4200
    },
    {
      "epoch": 0.43737779149943395,
      "grad_norm": 9.524967193603516,
      "learning_rate": 1.9145771093025654e-05,
      "loss": 1.4908,
      "step": 4250
    },
    {
      "epoch": 0.43737779149943395,
      "eval_accuracy": 0.9477843046188354,
      "eval_loss": 1.0120937824249268,
      "eval_runtime": 418.3734,
      "eval_samples_per_second": 82.579,
      "eval_steps_per_second": 2.581,
      "step": 4250
    },
    {
      "epoch": 0.4425234125758979,
      "grad_norm": 6.7903361320495605,
      "learning_rate": 1.9135469248995572e-05,
      "loss": 1.295,
      "step": 4300
    },
    {
      "epoch": 0.4476690336523618,
      "grad_norm": 7.337329387664795,
      "learning_rate": 1.912516740496549e-05,
      "loss": 1.4687,
      "step": 4350
    },
    {
      "epoch": 0.45281465472882576,
      "grad_norm": 8.49622631072998,
      "learning_rate": 1.911486556093541e-05,
      "loss": 1.3846,
      "step": 4400
    },
    {
      "epoch": 0.4579602758052897,
      "grad_norm": 4.385276794433594,
      "learning_rate": 1.9104563716905328e-05,
      "loss": 1.4704,
      "step": 4450
    },
    {
      "epoch": 0.46310589688175363,
      "grad_norm": 9.520790100097656,
      "learning_rate": 1.9094261872875246e-05,
      "loss": 1.3646,
      "step": 4500
    },
    {
      "epoch": 0.46310589688175363,
      "eval_accuracy": 0.9479579925537109,
      "eval_loss": 1.0055809020996094,
      "eval_runtime": 419.9183,
      "eval_samples_per_second": 82.276,
      "eval_steps_per_second": 2.572,
      "step": 4500
    },
    {
      "epoch": 0.46825151795821757,
      "grad_norm": 11.194172859191895,
      "learning_rate": 1.9083960028845164e-05,
      "loss": 1.4779,
      "step": 4550
    },
    {
      "epoch": 0.4733971390346815,
      "grad_norm": 8.098811149597168,
      "learning_rate": 1.9073658184815083e-05,
      "loss": 1.4581,
      "step": 4600
    },
    {
      "epoch": 0.47854276011114544,
      "grad_norm": 3.986377477645874,
      "learning_rate": 1.9063356340785e-05,
      "loss": 1.3786,
      "step": 4650
    },
    {
      "epoch": 0.4836883811876093,
      "grad_norm": 7.204378128051758,
      "learning_rate": 1.9053054496754923e-05,
      "loss": 1.56,
      "step": 4700
    },
    {
      "epoch": 0.48883400226407325,
      "grad_norm": 6.332393169403076,
      "learning_rate": 1.9042752652724838e-05,
      "loss": 1.4334,
      "step": 4750
    },
    {
      "epoch": 0.48883400226407325,
      "eval_accuracy": 0.947523832321167,
      "eval_loss": 1.0032474994659424,
      "eval_runtime": 418.1822,
      "eval_samples_per_second": 82.617,
      "eval_steps_per_second": 2.583,
      "step": 4750
    },
    {
      "epoch": 0.4939796233405372,
      "grad_norm": 5.785167694091797,
      "learning_rate": 1.9032450808694756e-05,
      "loss": 1.3877,
      "step": 4800
    },
    {
      "epoch": 0.4991252444170011,
      "grad_norm": 2.13838529586792,
      "learning_rate": 1.9022148964664678e-05,
      "loss": 1.3485,
      "step": 4850
    },
    {
      "epoch": 0.5042708654934651,
      "grad_norm": 5.9960618019104,
      "learning_rate": 1.9011847120634593e-05,
      "loss": 1.4509,
      "step": 4900
    },
    {
      "epoch": 0.509416486569929,
      "grad_norm": 3.8960604667663574,
      "learning_rate": 1.9001545276604515e-05,
      "loss": 1.3693,
      "step": 4950
    },
    {
      "epoch": 0.5145621076463929,
      "grad_norm": 5.9818115234375,
      "learning_rate": 1.8991243432574434e-05,
      "loss": 1.5226,
      "step": 5000
    },
    {
      "epoch": 0.5145621076463929,
      "eval_accuracy": 0.9477264285087585,
      "eval_loss": 0.9975742101669312,
      "eval_runtime": 418.0863,
      "eval_samples_per_second": 82.636,
      "eval_steps_per_second": 2.583,
      "step": 5000
    },
    {
      "epoch": 0.5197077287228569,
      "grad_norm": 9.811531066894531,
      "learning_rate": 1.8980941588544352e-05,
      "loss": 1.4423,
      "step": 5050
    },
    {
      "epoch": 0.5248533497993207,
      "grad_norm": 4.812816619873047,
      "learning_rate": 1.897063974451427e-05,
      "loss": 1.4191,
      "step": 5100
    },
    {
      "epoch": 0.5299989708757847,
      "grad_norm": 7.36176872253418,
      "learning_rate": 1.896033790048419e-05,
      "loss": 1.5109,
      "step": 5150
    },
    {
      "epoch": 0.5351445919522486,
      "grad_norm": 12.9472017288208,
      "learning_rate": 1.8950036056454107e-05,
      "loss": 1.4509,
      "step": 5200
    },
    {
      "epoch": 0.5402902130287126,
      "grad_norm": 6.96859073638916,
      "learning_rate": 1.8939734212424026e-05,
      "loss": 1.4351,
      "step": 5250
    },
    {
      "epoch": 0.5402902130287126,
      "eval_accuracy": 0.9485947489738464,
      "eval_loss": 1.000069499015808,
      "eval_runtime": 418.3411,
      "eval_samples_per_second": 82.586,
      "eval_steps_per_second": 2.582,
      "step": 5250
    },
    {
      "epoch": 0.5454358341051765,
      "grad_norm": 11.015822410583496,
      "learning_rate": 1.8929432368393944e-05,
      "loss": 1.3868,
      "step": 5300
    },
    {
      "epoch": 0.5505814551816405,
      "grad_norm": 7.5497050285339355,
      "learning_rate": 1.8919130524363863e-05,
      "loss": 1.4339,
      "step": 5350
    },
    {
      "epoch": 0.5557270762581044,
      "grad_norm": 11.352765083312988,
      "learning_rate": 1.890882868033378e-05,
      "loss": 1.365,
      "step": 5400
    },
    {
      "epoch": 0.5608726973345682,
      "grad_norm": 10.072178840637207,
      "learning_rate": 1.88985268363037e-05,
      "loss": 1.44,
      "step": 5450
    },
    {
      "epoch": 0.5660183184110322,
      "grad_norm": 5.5806803703308105,
      "learning_rate": 1.8888224992273618e-05,
      "loss": 1.2895,
      "step": 5500
    },
    {
      "epoch": 0.5660183184110322,
      "eval_accuracy": 0.9490578770637512,
      "eval_loss": 1.0065183639526367,
      "eval_runtime": 418.105,
      "eval_samples_per_second": 82.632,
      "eval_steps_per_second": 2.583,
      "step": 5500
    },
    {
      "epoch": 0.5711639394874961,
      "grad_norm": 12.096445083618164,
      "learning_rate": 1.8877923148243536e-05,
      "loss": 1.4253,
      "step": 5550
    },
    {
      "epoch": 0.5763095605639601,
      "grad_norm": 6.126964569091797,
      "learning_rate": 1.8867621304213455e-05,
      "loss": 1.4438,
      "step": 5600
    },
    {
      "epoch": 0.581455181640424,
      "grad_norm": 10.5121488571167,
      "learning_rate": 1.8857319460183373e-05,
      "loss": 1.3543,
      "step": 5650
    },
    {
      "epoch": 0.586600802716888,
      "grad_norm": 4.3215227127075195,
      "learning_rate": 1.8847017616153295e-05,
      "loss": 1.5587,
      "step": 5700
    },
    {
      "epoch": 0.5917464237933519,
      "grad_norm": 6.327254295349121,
      "learning_rate": 1.883671577212321e-05,
      "loss": 1.342,
      "step": 5750
    },
    {
      "epoch": 0.5917464237933519,
      "eval_accuracy": 0.9487684369087219,
      "eval_loss": 0.9927480816841125,
      "eval_runtime": 418.6843,
      "eval_samples_per_second": 82.518,
      "eval_steps_per_second": 2.58,
      "step": 5750
    },
    {
      "epoch": 0.5968920448698158,
      "grad_norm": 9.713254928588867,
      "learning_rate": 1.882641392809313e-05,
      "loss": 1.4503,
      "step": 5800
    },
    {
      "epoch": 0.6020376659462797,
      "grad_norm": 5.628683090209961,
      "learning_rate": 1.881611208406305e-05,
      "loss": 1.4045,
      "step": 5850
    },
    {
      "epoch": 0.6071832870227436,
      "grad_norm": 11.369056701660156,
      "learning_rate": 1.8805810240032965e-05,
      "loss": 1.4092,
      "step": 5900
    },
    {
      "epoch": 0.6123289080992076,
      "grad_norm": 2.5842366218566895,
      "learning_rate": 1.8795508396002887e-05,
      "loss": 1.3318,
      "step": 5950
    },
    {
      "epoch": 0.6174745291756715,
      "grad_norm": 11.178747177124023,
      "learning_rate": 1.8785206551972805e-05,
      "loss": 1.416,
      "step": 6000
    },
    {
      "epoch": 0.6174745291756715,
      "eval_accuracy": 0.9503603577613831,
      "eval_loss": 0.9909718632698059,
      "eval_runtime": 418.5904,
      "eval_samples_per_second": 82.537,
      "eval_steps_per_second": 2.58,
      "step": 6000
    },
    {
      "epoch": 0.6226201502521355,
      "grad_norm": 7.2673115730285645,
      "learning_rate": 1.8774904707942724e-05,
      "loss": 1.5132,
      "step": 6050
    },
    {
      "epoch": 0.6277657713285993,
      "grad_norm": 4.217124938964844,
      "learning_rate": 1.8764602863912642e-05,
      "loss": 1.3275,
      "step": 6100
    },
    {
      "epoch": 0.6329113924050633,
      "grad_norm": 2.112212896347046,
      "learning_rate": 1.875430101988256e-05,
      "loss": 1.4595,
      "step": 6150
    },
    {
      "epoch": 0.6380570134815272,
      "grad_norm": 5.421743392944336,
      "learning_rate": 1.874399917585248e-05,
      "loss": 1.5112,
      "step": 6200
    },
    {
      "epoch": 0.6432026345579912,
      "grad_norm": 9.458545684814453,
      "learning_rate": 1.8733697331822397e-05,
      "loss": 1.4435,
      "step": 6250
    },
    {
      "epoch": 0.6432026345579912,
      "eval_accuracy": 0.9514892101287842,
      "eval_loss": 0.9927791357040405,
      "eval_runtime": 418.0438,
      "eval_samples_per_second": 82.644,
      "eval_steps_per_second": 2.583,
      "step": 6250
    },
    {
      "epoch": 0.6483482556344551,
      "grad_norm": 8.746692657470703,
      "learning_rate": 1.8723395487792316e-05,
      "loss": 1.4268,
      "step": 6300
    },
    {
      "epoch": 0.653493876710919,
      "grad_norm": 6.339073657989502,
      "learning_rate": 1.8713093643762234e-05,
      "loss": 1.5071,
      "step": 6350
    },
    {
      "epoch": 0.658639497787383,
      "grad_norm": 10.726541519165039,
      "learning_rate": 1.8702791799732153e-05,
      "loss": 1.3817,
      "step": 6400
    },
    {
      "epoch": 0.6637851188638468,
      "grad_norm": 6.412696361541748,
      "learning_rate": 1.869248995570207e-05,
      "loss": 1.5101,
      "step": 6450
    },
    {
      "epoch": 0.6689307399403108,
      "grad_norm": 8.011473655700684,
      "learning_rate": 1.868218811167199e-05,
      "loss": 1.4014,
      "step": 6500
    },
    {
      "epoch": 0.6689307399403108,
      "eval_accuracy": 0.9489710330963135,
      "eval_loss": 0.9953876733779907,
      "eval_runtime": 418.8528,
      "eval_samples_per_second": 82.485,
      "eval_steps_per_second": 2.578,
      "step": 6500
    },
    {
      "epoch": 0.6740763610167747,
      "grad_norm": 2.333108901977539,
      "learning_rate": 1.867188626764191e-05,
      "loss": 1.2797,
      "step": 6550
    },
    {
      "epoch": 0.6792219820932387,
      "grad_norm": 13.239009857177734,
      "learning_rate": 1.8661584423611826e-05,
      "loss": 1.3829,
      "step": 6600
    },
    {
      "epoch": 0.6843676031697026,
      "grad_norm": 7.554291248321533,
      "learning_rate": 1.8651282579581745e-05,
      "loss": 1.4907,
      "step": 6650
    },
    {
      "epoch": 0.6895132242461666,
      "grad_norm": 8.046769142150879,
      "learning_rate": 1.8640980735551667e-05,
      "loss": 1.4098,
      "step": 6700
    },
    {
      "epoch": 0.6946588453226304,
      "grad_norm": 3.5291695594787598,
      "learning_rate": 1.863067889152158e-05,
      "loss": 1.482,
      "step": 6750
    },
    {
      "epoch": 0.6946588453226304,
      "eval_accuracy": 0.9492025971412659,
      "eval_loss": 0.9936777949333191,
      "eval_runtime": 418.0424,
      "eval_samples_per_second": 82.645,
      "eval_steps_per_second": 2.583,
      "step": 6750
    },
    {
      "epoch": 0.6998044663990943,
      "grad_norm": 5.854330539703369,
      "learning_rate": 1.8620377047491503e-05,
      "loss": 1.3779,
      "step": 6800
    },
    {
      "epoch": 0.7049500874755583,
      "grad_norm": 9.476693153381348,
      "learning_rate": 1.8610075203461422e-05,
      "loss": 1.3791,
      "step": 6850
    },
    {
      "epoch": 0.7100957085520222,
      "grad_norm": 9.646202087402344,
      "learning_rate": 1.859977335943134e-05,
      "loss": 1.5183,
      "step": 6900
    },
    {
      "epoch": 0.7152413296284862,
      "grad_norm": 3.42673397064209,
      "learning_rate": 1.858947151540126e-05,
      "loss": 1.4022,
      "step": 6950
    },
    {
      "epoch": 0.7203869507049501,
      "grad_norm": 9.239468574523926,
      "learning_rate": 1.8579169671371177e-05,
      "loss": 1.544,
      "step": 7000
    },
    {
      "epoch": 0.7203869507049501,
      "eval_accuracy": 0.9508234858512878,
      "eval_loss": 0.9934782385826111,
      "eval_runtime": 418.095,
      "eval_samples_per_second": 82.634,
      "eval_steps_per_second": 2.583,
      "step": 7000
    },
    {
      "epoch": 0.7255325717814141,
      "grad_norm": 5.876420021057129,
      "learning_rate": 1.8568867827341096e-05,
      "loss": 1.4566,
      "step": 7050
    },
    {
      "epoch": 0.7306781928578779,
      "grad_norm": 2.191608190536499,
      "learning_rate": 1.8558565983311014e-05,
      "loss": 1.4641,
      "step": 7100
    },
    {
      "epoch": 0.7358238139343419,
      "grad_norm": 10.467001914978027,
      "learning_rate": 1.8548264139280932e-05,
      "loss": 1.4208,
      "step": 7150
    },
    {
      "epoch": 0.7409694350108058,
      "grad_norm": 9.560342788696289,
      "learning_rate": 1.853796229525085e-05,
      "loss": 1.3391,
      "step": 7200
    },
    {
      "epoch": 0.7461150560872697,
      "grad_norm": 10.074899673461914,
      "learning_rate": 1.852766045122077e-05,
      "loss": 1.5002,
      "step": 7250
    },
    {
      "epoch": 0.7461150560872697,
      "eval_accuracy": 0.9496946334838867,
      "eval_loss": 0.9860528707504272,
      "eval_runtime": 418.2736,
      "eval_samples_per_second": 82.599,
      "eval_steps_per_second": 2.582,
      "step": 7250
    },
    {
      "epoch": 0.7512606771637337,
      "grad_norm": 5.140987873077393,
      "learning_rate": 1.851735860719069e-05,
      "loss": 1.2985,
      "step": 7300
    },
    {
      "epoch": 0.7564062982401976,
      "grad_norm": 4.276757717132568,
      "learning_rate": 1.8507056763160606e-05,
      "loss": 1.5496,
      "step": 7350
    },
    {
      "epoch": 0.7615519193166616,
      "grad_norm": 8.268556594848633,
      "learning_rate": 1.8496754919130528e-05,
      "loss": 1.5046,
      "step": 7400
    },
    {
      "epoch": 0.7666975403931254,
      "grad_norm": 7.343358516693115,
      "learning_rate": 1.8486453075100443e-05,
      "loss": 1.3687,
      "step": 7450
    },
    {
      "epoch": 0.7718431614695894,
      "grad_norm": 5.345001220703125,
      "learning_rate": 1.847615123107036e-05,
      "loss": 1.3841,
      "step": 7500
    },
    {
      "epoch": 0.7718431614695894,
      "eval_accuracy": 0.9501287937164307,
      "eval_loss": 0.9868325591087341,
      "eval_runtime": 418.192,
      "eval_samples_per_second": 82.615,
      "eval_steps_per_second": 2.583,
      "step": 7500
    },
    {
      "epoch": 0.7769887825460533,
      "grad_norm": 11.624256134033203,
      "learning_rate": 1.8465849387040283e-05,
      "loss": 1.3996,
      "step": 7550
    },
    {
      "epoch": 0.7821344036225172,
      "grad_norm": 6.849825859069824,
      "learning_rate": 1.8455547543010198e-05,
      "loss": 1.5112,
      "step": 7600
    },
    {
      "epoch": 0.7872800246989812,
      "grad_norm": 9.704992294311523,
      "learning_rate": 1.844524569898012e-05,
      "loss": 1.4335,
      "step": 7650
    },
    {
      "epoch": 0.792425645775445,
      "grad_norm": 5.669846534729004,
      "learning_rate": 1.843494385495004e-05,
      "loss": 1.3867,
      "step": 7700
    },
    {
      "epoch": 0.797571266851909,
      "grad_norm": 6.519596099853516,
      "learning_rate": 1.8424642010919957e-05,
      "loss": 1.3865,
      "step": 7750
    },
    {
      "epoch": 0.797571266851909,
      "eval_accuracy": 0.9511418342590332,
      "eval_loss": 0.986303448677063,
      "eval_runtime": 418.5501,
      "eval_samples_per_second": 82.544,
      "eval_steps_per_second": 2.58,
      "step": 7750
    },
    {
      "epoch": 0.8027168879283729,
      "grad_norm": 7.500890731811523,
      "learning_rate": 1.8414340166889875e-05,
      "loss": 1.4039,
      "step": 7800
    },
    {
      "epoch": 0.8078625090048369,
      "grad_norm": 8.141263961791992,
      "learning_rate": 1.8404038322859794e-05,
      "loss": 1.379,
      "step": 7850
    },
    {
      "epoch": 0.8130081300813008,
      "grad_norm": 8.75843620300293,
      "learning_rate": 1.8393736478829712e-05,
      "loss": 1.3459,
      "step": 7900
    },
    {
      "epoch": 0.8181537511577648,
      "grad_norm": 9.22071647644043,
      "learning_rate": 1.838343463479963e-05,
      "loss": 1.3996,
      "step": 7950
    },
    {
      "epoch": 0.8232993722342287,
      "grad_norm": 5.6009345054626465,
      "learning_rate": 1.837313279076955e-05,
      "loss": 1.4151,
      "step": 8000
    },
    {
      "epoch": 0.8232993722342287,
      "eval_accuracy": 0.9510839581489563,
      "eval_loss": 0.9821743369102478,
      "eval_runtime": 418.1692,
      "eval_samples_per_second": 82.62,
      "eval_steps_per_second": 2.583,
      "step": 8000
    },
    {
      "epoch": 0.8284449933106925,
      "grad_norm": 6.377861976623535,
      "learning_rate": 1.8362830946739467e-05,
      "loss": 1.3745,
      "step": 8050
    },
    {
      "epoch": 0.8335906143871565,
      "grad_norm": 6.7617902755737305,
      "learning_rate": 1.8352529102709386e-05,
      "loss": 1.4404,
      "step": 8100
    },
    {
      "epoch": 0.8387362354636204,
      "grad_norm": 10.52645492553711,
      "learning_rate": 1.8342227258679308e-05,
      "loss": 1.4776,
      "step": 8150
    },
    {
      "epoch": 0.8438818565400844,
      "grad_norm": 7.829946517944336,
      "learning_rate": 1.8331925414649223e-05,
      "loss": 1.398,
      "step": 8200
    },
    {
      "epoch": 0.8490274776165483,
      "grad_norm": 6.536490440368652,
      "learning_rate": 1.832162357061914e-05,
      "loss": 1.4482,
      "step": 8250
    },
    {
      "epoch": 0.8490274776165483,
      "eval_accuracy": 0.9505919218063354,
      "eval_loss": 0.9802690744400024,
      "eval_runtime": 417.9193,
      "eval_samples_per_second": 82.669,
      "eval_steps_per_second": 2.584,
      "step": 8250
    },
    {
      "epoch": 0.8541730986930123,
      "grad_norm": 8.002507209777832,
      "learning_rate": 1.8311321726589063e-05,
      "loss": 1.4551,
      "step": 8300
    },
    {
      "epoch": 0.8593187197694762,
      "grad_norm": 10.97170352935791,
      "learning_rate": 1.8301019882558978e-05,
      "loss": 1.46,
      "step": 8350
    },
    {
      "epoch": 0.8644643408459401,
      "grad_norm": 9.144811630249023,
      "learning_rate": 1.82907180385289e-05,
      "loss": 1.5179,
      "step": 8400
    },
    {
      "epoch": 0.869609961922404,
      "grad_norm": 11.398577690124512,
      "learning_rate": 1.8280416194498818e-05,
      "loss": 1.4067,
      "step": 8450
    },
    {
      "epoch": 0.8747555829988679,
      "grad_norm": 8.858057022094727,
      "learning_rate": 1.8270320387349337e-05,
      "loss": 1.4393,
      "step": 8500
    },
    {
      "epoch": 0.8747555829988679,
      "eval_accuracy": 0.9503893256187439,
      "eval_loss": 0.9808804392814636,
      "eval_runtime": 418.2126,
      "eval_samples_per_second": 82.611,
      "eval_steps_per_second": 2.582,
      "step": 8500
    },
    {
      "epoch": 0.8799012040753319,
      "grad_norm": 9.175712585449219,
      "learning_rate": 1.8260018543319256e-05,
      "loss": 1.4995,
      "step": 8550
    },
    {
      "epoch": 0.8850468251517958,
      "grad_norm": 9.636043548583984,
      "learning_rate": 1.8249716699289174e-05,
      "loss": 1.4077,
      "step": 8600
    },
    {
      "epoch": 0.8901924462282598,
      "grad_norm": 8.578084945678711,
      "learning_rate": 1.8239414855259093e-05,
      "loss": 1.4088,
      "step": 8650
    },
    {
      "epoch": 0.8953380673047237,
      "grad_norm": 7.253017425537109,
      "learning_rate": 1.822911301122901e-05,
      "loss": 1.3464,
      "step": 8700
    },
    {
      "epoch": 0.9004836883811876,
      "grad_norm": 8.55578899383545,
      "learning_rate": 1.821881116719893e-05,
      "loss": 1.3455,
      "step": 8750
    },
    {
      "epoch": 0.9004836883811876,
      "eval_accuracy": 0.9506208300590515,
      "eval_loss": 0.9797450304031372,
      "eval_runtime": 418.1418,
      "eval_samples_per_second": 82.625,
      "eval_steps_per_second": 2.583,
      "step": 8750
    },
    {
      "epoch": 0.9056293094576515,
      "grad_norm": 9.603639602661133,
      "learning_rate": 1.8208509323168848e-05,
      "loss": 1.5172,
      "step": 8800
    },
    {
      "epoch": 0.9107749305341155,
      "grad_norm": 5.811156272888184,
      "learning_rate": 1.8198207479138766e-05,
      "loss": 1.3922,
      "step": 8850
    },
    {
      "epoch": 0.9159205516105794,
      "grad_norm": 7.18412971496582,
      "learning_rate": 1.8187905635108688e-05,
      "loss": 1.3645,
      "step": 8900
    },
    {
      "epoch": 0.9210661726870433,
      "grad_norm": 10.653360366821289,
      "learning_rate": 1.8177603791078603e-05,
      "loss": 1.3627,
      "step": 8950
    },
    {
      "epoch": 0.9262117937635073,
      "grad_norm": 9.01271915435791,
      "learning_rate": 1.8167301947048525e-05,
      "loss": 1.3896,
      "step": 9000
    },
    {
      "epoch": 0.9262117937635073,
      "eval_accuracy": 0.9506497979164124,
      "eval_loss": 0.9806250929832458,
      "eval_runtime": 417.4847,
      "eval_samples_per_second": 82.755,
      "eval_steps_per_second": 2.587,
      "step": 9000
    },
    {
      "epoch": 0.9313574148399711,
      "grad_norm": 6.072149276733398,
      "learning_rate": 1.8157000103018443e-05,
      "loss": 1.433,
      "step": 9050
    },
    {
      "epoch": 0.9365030359164351,
      "grad_norm": 5.18344783782959,
      "learning_rate": 1.814669825898836e-05,
      "loss": 1.4678,
      "step": 9100
    },
    {
      "epoch": 0.941648656992899,
      "grad_norm": 12.650690078735352,
      "learning_rate": 1.813639641495828e-05,
      "loss": 1.3206,
      "step": 9150
    },
    {
      "epoch": 0.946794278069363,
      "grad_norm": 4.13425350189209,
      "learning_rate": 1.8126094570928195e-05,
      "loss": 1.4589,
      "step": 9200
    },
    {
      "epoch": 0.9519398991458269,
      "grad_norm": 9.408120155334473,
      "learning_rate": 1.8115792726898117e-05,
      "loss": 1.3494,
      "step": 9250
    },
    {
      "epoch": 0.9519398991458269,
      "eval_accuracy": 0.9509103298187256,
      "eval_loss": 0.9760673642158508,
      "eval_runtime": 418.3905,
      "eval_samples_per_second": 82.576,
      "eval_steps_per_second": 2.581,
      "step": 9250
    },
    {
      "epoch": 0.9570855202222909,
      "grad_norm": 8.437677383422852,
      "learning_rate": 1.8105490882868035e-05,
      "loss": 1.3768,
      "step": 9300
    },
    {
      "epoch": 0.9622311412987548,
      "grad_norm": 5.862843990325928,
      "learning_rate": 1.8095189038837954e-05,
      "loss": 1.4449,
      "step": 9350
    },
    {
      "epoch": 0.9673767623752186,
      "grad_norm": 5.639468193054199,
      "learning_rate": 1.8084887194807872e-05,
      "loss": 1.4187,
      "step": 9400
    },
    {
      "epoch": 0.9725223834516826,
      "grad_norm": 5.7434401512146,
      "learning_rate": 1.807458535077779e-05,
      "loss": 1.3046,
      "step": 9450
    },
    {
      "epoch": 0.9776680045281465,
      "grad_norm": 8.578060150146484,
      "learning_rate": 1.806428350674771e-05,
      "loss": 1.3586,
      "step": 9500
    },
    {
      "epoch": 0.9776680045281465,
      "eval_accuracy": 0.9511997699737549,
      "eval_loss": 0.9817301034927368,
      "eval_runtime": 417.7965,
      "eval_samples_per_second": 82.693,
      "eval_steps_per_second": 2.585,
      "step": 9500
    },
    {
      "epoch": 0.9828136256046105,
      "grad_norm": 6.870723247528076,
      "learning_rate": 1.8053981662717628e-05,
      "loss": 1.4631,
      "step": 9550
    },
    {
      "epoch": 0.9879592466810744,
      "grad_norm": 8.596879005432129,
      "learning_rate": 1.8043679818687546e-05,
      "loss": 1.3113,
      "step": 9600
    },
    {
      "epoch": 0.9931048677575384,
      "grad_norm": 5.606679439544678,
      "learning_rate": 1.8033377974657464e-05,
      "loss": 1.2972,
      "step": 9650
    },
    {
      "epoch": 0.9982504888340022,
      "grad_norm": 1.0621393918991089,
      "learning_rate": 1.8023076130627383e-05,
      "loss": 1.3793,
      "step": 9700
    },
    {
      "epoch": 1.0033961099104662,
      "grad_norm": 1.193249225616455,
      "learning_rate": 1.8012774286597305e-05,
      "loss": 1.1729,
      "step": 9750
    },
    {
      "epoch": 1.0033961099104662,
      "eval_accuracy": 0.9509392380714417,
      "eval_loss": 0.9846755266189575,
      "eval_runtime": 417.935,
      "eval_samples_per_second": 82.666,
      "eval_steps_per_second": 2.584,
      "step": 9750
    },
    {
      "epoch": 1.0085417309869302,
      "grad_norm": 10.64986515045166,
      "learning_rate": 1.800247244256722e-05,
      "loss": 1.2009,
      "step": 9800
    },
    {
      "epoch": 1.013687352063394,
      "grad_norm": 9.815643310546875,
      "learning_rate": 1.799217059853714e-05,
      "loss": 1.2576,
      "step": 9850
    },
    {
      "epoch": 1.018832973139858,
      "grad_norm": 9.344294548034668,
      "learning_rate": 1.798186875450706e-05,
      "loss": 1.3483,
      "step": 9900
    },
    {
      "epoch": 1.023978594216322,
      "grad_norm": 2.3761701583862305,
      "learning_rate": 1.7971566910476975e-05,
      "loss": 1.2609,
      "step": 9950
    },
    {
      "epoch": 1.0291242152927857,
      "grad_norm": 9.36589527130127,
      "learning_rate": 1.7961265066446897e-05,
      "loss": 1.3099,
      "step": 10000
    },
    {
      "epoch": 1.0291242152927857,
      "eval_accuracy": 0.9513155221939087,
      "eval_loss": 0.9894696474075317,
      "eval_runtime": 417.5493,
      "eval_samples_per_second": 82.742,
      "eval_steps_per_second": 2.587,
      "step": 10000
    },
    {
      "epoch": 1.0342698363692497,
      "grad_norm": 14.563089370727539,
      "learning_rate": 1.7950963222416815e-05,
      "loss": 1.2224,
      "step": 10050
    },
    {
      "epoch": 1.0394154574457137,
      "grad_norm": 11.867334365844727,
      "learning_rate": 1.7940661378386734e-05,
      "loss": 1.3552,
      "step": 10100
    },
    {
      "epoch": 1.0445610785221777,
      "grad_norm": 1.510968565940857,
      "learning_rate": 1.7930359534356652e-05,
      "loss": 1.3508,
      "step": 10150
    },
    {
      "epoch": 1.0497066995986415,
      "grad_norm": 0.8010023832321167,
      "learning_rate": 1.792005769032657e-05,
      "loss": 1.3242,
      "step": 10200
    },
    {
      "epoch": 1.0548523206751055,
      "grad_norm": 5.283142566680908,
      "learning_rate": 1.790975584629649e-05,
      "loss": 1.2287,
      "step": 10250
    },
    {
      "epoch": 1.0548523206751055,
      "eval_accuracy": 0.951170802116394,
      "eval_loss": 0.9977254867553711,
      "eval_runtime": 417.1052,
      "eval_samples_per_second": 82.83,
      "eval_steps_per_second": 2.589,
      "step": 10250
    },
    {
      "epoch": 1.0599979417515695,
      "grad_norm": 6.146693706512451,
      "learning_rate": 1.7899454002266407e-05,
      "loss": 1.2863,
      "step": 10300
    },
    {
      "epoch": 1.0651435628280332,
      "grad_norm": 9.038339614868164,
      "learning_rate": 1.7889152158236326e-05,
      "loss": 1.2377,
      "step": 10350
    },
    {
      "epoch": 1.0702891839044972,
      "grad_norm": 8.985528945922852,
      "learning_rate": 1.7878850314206244e-05,
      "loss": 1.3058,
      "step": 10400
    },
    {
      "epoch": 1.0754348049809612,
      "grad_norm": 6.91862154006958,
      "learning_rate": 1.7868548470176162e-05,
      "loss": 1.3013,
      "step": 10450
    },
    {
      "epoch": 1.0805804260574252,
      "grad_norm": 4.811442852020264,
      "learning_rate": 1.785824662614608e-05,
      "loss": 1.3233,
      "step": 10500
    },
    {
      "epoch": 1.0805804260574252,
      "eval_accuracy": 0.9488263130187988,
      "eval_loss": 0.9947823286056519,
      "eval_runtime": 418.0737,
      "eval_samples_per_second": 82.639,
      "eval_steps_per_second": 2.583,
      "step": 10500
    },
    {
      "epoch": 1.085726047133889,
      "grad_norm": 3.9576923847198486,
      "learning_rate": 1.7847944782116e-05,
      "loss": 1.334,
      "step": 10550
    },
    {
      "epoch": 1.090871668210353,
      "grad_norm": 11.280867576599121,
      "learning_rate": 1.783764293808592e-05,
      "loss": 1.246,
      "step": 10600
    },
    {
      "epoch": 1.096017289286817,
      "grad_norm": 10.32507038116455,
      "learning_rate": 1.7827341094055836e-05,
      "loss": 1.2298,
      "step": 10650
    },
    {
      "epoch": 1.1011629103632807,
      "grad_norm": 9.05435848236084,
      "learning_rate": 1.7817039250025755e-05,
      "loss": 1.2016,
      "step": 10700
    },
    {
      "epoch": 1.1063085314397447,
      "grad_norm": 10.334163665771484,
      "learning_rate": 1.7806737405995676e-05,
      "loss": 1.3035,
      "step": 10750
    },
    {
      "epoch": 1.1063085314397447,
      "eval_accuracy": 0.9506497979164124,
      "eval_loss": 0.9946981072425842,
      "eval_runtime": 417.8974,
      "eval_samples_per_second": 82.673,
      "eval_steps_per_second": 2.584,
      "step": 10750
    },
    {
      "epoch": 1.1114541525162087,
      "grad_norm": 11.852724075317383,
      "learning_rate": 1.779643556196559e-05,
      "loss": 1.2457,
      "step": 10800
    },
    {
      "epoch": 1.1165997735926727,
      "grad_norm": 10.447225570678711,
      "learning_rate": 1.7786133717935513e-05,
      "loss": 1.2882,
      "step": 10850
    },
    {
      "epoch": 1.1217453946691365,
      "grad_norm": 3.3465206623077393,
      "learning_rate": 1.777583187390543e-05,
      "loss": 1.2365,
      "step": 10900
    },
    {
      "epoch": 1.1268910157456005,
      "grad_norm": 6.849998950958252,
      "learning_rate": 1.776553002987535e-05,
      "loss": 1.19,
      "step": 10950
    },
    {
      "epoch": 1.1320366368220645,
      "grad_norm": 11.492406845092773,
      "learning_rate": 1.775522818584527e-05,
      "loss": 1.2377,
      "step": 11000
    },
    {
      "epoch": 1.1320366368220645,
      "eval_accuracy": 0.9511129260063171,
      "eval_loss": 0.9914972186088562,
      "eval_runtime": 417.922,
      "eval_samples_per_second": 82.669,
      "eval_steps_per_second": 2.584,
      "step": 11000
    },
    {
      "epoch": 1.1371822578985284,
      "grad_norm": 7.080196857452393,
      "learning_rate": 1.7744926341815187e-05,
      "loss": 1.3028,
      "step": 11050
    },
    {
      "epoch": 1.1423278789749922,
      "grad_norm": 3.5371875762939453,
      "learning_rate": 1.7734624497785105e-05,
      "loss": 1.319,
      "step": 11100
    },
    {
      "epoch": 1.1474735000514562,
      "grad_norm": 5.618402004241943,
      "learning_rate": 1.7724322653755024e-05,
      "loss": 1.3315,
      "step": 11150
    },
    {
      "epoch": 1.1526191211279202,
      "grad_norm": 6.200303554534912,
      "learning_rate": 1.7714020809724942e-05,
      "loss": 1.2161,
      "step": 11200
    },
    {
      "epoch": 1.157764742204384,
      "grad_norm": 8.898612976074219,
      "learning_rate": 1.770371896569486e-05,
      "loss": 1.3555,
      "step": 11250
    },
    {
      "epoch": 1.157764742204384,
      "eval_accuracy": 0.9510550498962402,
      "eval_loss": 0.990160346031189,
      "eval_runtime": 417.6517,
      "eval_samples_per_second": 82.722,
      "eval_steps_per_second": 2.586,
      "step": 11250
    },
    {
      "epoch": 1.162910363280848,
      "grad_norm": 4.882264137268066,
      "learning_rate": 1.769341712166478e-05,
      "loss": 1.1874,
      "step": 11300
    },
    {
      "epoch": 1.168055984357312,
      "grad_norm": 3.1759836673736572,
      "learning_rate": 1.7683115277634697e-05,
      "loss": 1.2373,
      "step": 11350
    },
    {
      "epoch": 1.173201605433776,
      "grad_norm": 13.944663047790527,
      "learning_rate": 1.7672813433604616e-05,
      "loss": 1.2474,
      "step": 11400
    },
    {
      "epoch": 1.1783472265102397,
      "grad_norm": 6.393034934997559,
      "learning_rate": 1.7662511589574534e-05,
      "loss": 1.2838,
      "step": 11450
    },
    {
      "epoch": 1.1834928475867037,
      "grad_norm": 9.834447860717773,
      "learning_rate": 1.7652209745544453e-05,
      "loss": 1.2242,
      "step": 11500
    },
    {
      "epoch": 1.1834928475867037,
      "eval_accuracy": 0.9518075585365295,
      "eval_loss": 0.992717444896698,
      "eval_runtime": 417.5743,
      "eval_samples_per_second": 82.737,
      "eval_steps_per_second": 2.586,
      "step": 11500
    },
    {
      "epoch": 1.1886384686631677,
      "grad_norm": 7.190279006958008,
      "learning_rate": 1.764190790151437e-05,
      "loss": 1.3123,
      "step": 11550
    },
    {
      "epoch": 1.1937840897396317,
      "grad_norm": 3.1001381874084473,
      "learning_rate": 1.7631606057484293e-05,
      "loss": 1.2874,
      "step": 11600
    },
    {
      "epoch": 1.1989297108160955,
      "grad_norm": 10.424577713012695,
      "learning_rate": 1.7621304213454208e-05,
      "loss": 1.2568,
      "step": 11650
    },
    {
      "epoch": 1.2040753318925594,
      "grad_norm": 2.9702112674713135,
      "learning_rate": 1.761100236942413e-05,
      "loss": 1.2526,
      "step": 11700
    },
    {
      "epoch": 1.2092209529690234,
      "grad_norm": 4.956679821014404,
      "learning_rate": 1.7600700525394048e-05,
      "loss": 1.347,
      "step": 11750
    },
    {
      "epoch": 1.2092209529690234,
      "eval_accuracy": 0.9508523941040039,
      "eval_loss": 0.9882821440696716,
      "eval_runtime": 417.5587,
      "eval_samples_per_second": 82.74,
      "eval_steps_per_second": 2.586,
      "step": 11750
    },
    {
      "epoch": 1.2143665740454872,
      "grad_norm": 7.329675674438477,
      "learning_rate": 1.7590398681363963e-05,
      "loss": 1.3098,
      "step": 11800
    },
    {
      "epoch": 1.2195121951219512,
      "grad_norm": 2.8485448360443115,
      "learning_rate": 1.7580096837333885e-05,
      "loss": 1.2541,
      "step": 11850
    },
    {
      "epoch": 1.2246578161984152,
      "grad_norm": 13.313427925109863,
      "learning_rate": 1.7569794993303803e-05,
      "loss": 1.2791,
      "step": 11900
    },
    {
      "epoch": 1.2298034372748792,
      "grad_norm": 10.920377731323242,
      "learning_rate": 1.7559493149273722e-05,
      "loss": 1.2333,
      "step": 11950
    },
    {
      "epoch": 1.234949058351343,
      "grad_norm": 3.033597946166992,
      "learning_rate": 1.754919130524364e-05,
      "loss": 1.3827,
      "step": 12000
    },
    {
      "epoch": 1.234949058351343,
      "eval_accuracy": 0.9507366418838501,
      "eval_loss": 0.9942870140075684,
      "eval_runtime": 417.7023,
      "eval_samples_per_second": 82.712,
      "eval_steps_per_second": 2.586,
      "step": 12000
    },
    {
      "epoch": 1.240094679427807,
      "grad_norm": 1.0334879159927368,
      "learning_rate": 1.753888946121356e-05,
      "loss": 1.2732,
      "step": 12050
    },
    {
      "epoch": 1.245240300504271,
      "grad_norm": 7.173407077789307,
      "learning_rate": 1.7528587617183477e-05,
      "loss": 1.2993,
      "step": 12100
    },
    {
      "epoch": 1.250385921580735,
      "grad_norm": 15.351693153381348,
      "learning_rate": 1.7518285773153396e-05,
      "loss": 1.2947,
      "step": 12150
    },
    {
      "epoch": 1.2555315426571987,
      "grad_norm": 13.320657730102539,
      "learning_rate": 1.7507983929123314e-05,
      "loss": 1.3001,
      "step": 12200
    },
    {
      "epoch": 1.2606771637336627,
      "grad_norm": 4.0671186447143555,
      "learning_rate": 1.7497682085093232e-05,
      "loss": 1.2957,
      "step": 12250
    },
    {
      "epoch": 1.2606771637336627,
      "eval_accuracy": 0.9514023661613464,
      "eval_loss": 0.9864968657493591,
      "eval_runtime": 417.8412,
      "eval_samples_per_second": 82.685,
      "eval_steps_per_second": 2.585,
      "step": 12250
    },
    {
      "epoch": 1.2658227848101267,
      "grad_norm": 7.0425519943237305,
      "learning_rate": 1.748738024106315e-05,
      "loss": 1.1393,
      "step": 12300
    },
    {
      "epoch": 1.2709684058865904,
      "grad_norm": 4.306710243225098,
      "learning_rate": 1.747707839703307e-05,
      "loss": 1.2996,
      "step": 12350
    },
    {
      "epoch": 1.2761140269630544,
      "grad_norm": 10.586379051208496,
      "learning_rate": 1.7466776553002988e-05,
      "loss": 1.3218,
      "step": 12400
    },
    {
      "epoch": 1.2812596480395184,
      "grad_norm": 6.002781867980957,
      "learning_rate": 1.745647470897291e-05,
      "loss": 1.2138,
      "step": 12450
    },
    {
      "epoch": 1.2864052691159822,
      "grad_norm": 7.406036853790283,
      "learning_rate": 1.7446172864942825e-05,
      "loss": 1.1731,
      "step": 12500
    },
    {
      "epoch": 1.2864052691159822,
      "eval_accuracy": 0.9509682059288025,
      "eval_loss": 0.9963937997817993,
      "eval_runtime": 417.622,
      "eval_samples_per_second": 82.728,
      "eval_steps_per_second": 2.586,
      "step": 12500
    },
    {
      "epoch": 1.2915508901924462,
      "grad_norm": 11.54760456085205,
      "learning_rate": 1.7436077057793347e-05,
      "loss": 1.3326,
      "step": 12550
    },
    {
      "epoch": 1.2966965112689102,
      "grad_norm": 3.4204094409942627,
      "learning_rate": 1.7425775213763265e-05,
      "loss": 1.3575,
      "step": 12600
    },
    {
      "epoch": 1.3018421323453742,
      "grad_norm": 9.140461921691895,
      "learning_rate": 1.7415473369733184e-05,
      "loss": 1.2948,
      "step": 12650
    },
    {
      "epoch": 1.3069877534218381,
      "grad_norm": 6.069116592407227,
      "learning_rate": 1.7405171525703102e-05,
      "loss": 1.2921,
      "step": 12700
    },
    {
      "epoch": 1.312133374498302,
      "grad_norm": 13.66699504852295,
      "learning_rate": 1.739486968167302e-05,
      "loss": 1.3052,
      "step": 12750
    },
    {
      "epoch": 1.312133374498302,
      "eval_accuracy": 0.9509103298187256,
      "eval_loss": 0.9840078949928284,
      "eval_runtime": 418.1556,
      "eval_samples_per_second": 82.622,
      "eval_steps_per_second": 2.583,
      "step": 12750
    },
    {
      "epoch": 1.317278995574766,
      "grad_norm": 6.949051380157471,
      "learning_rate": 1.738456783764294e-05,
      "loss": 1.3662,
      "step": 12800
    },
    {
      "epoch": 1.32242461665123,
      "grad_norm": 9.286051750183105,
      "learning_rate": 1.7374265993612858e-05,
      "loss": 1.3673,
      "step": 12850
    },
    {
      "epoch": 1.3275702377276937,
      "grad_norm": 9.19774341583252,
      "learning_rate": 1.7363964149582776e-05,
      "loss": 1.3006,
      "step": 12900
    },
    {
      "epoch": 1.3327158588041577,
      "grad_norm": 5.003039360046387,
      "learning_rate": 1.7353662305552694e-05,
      "loss": 1.4217,
      "step": 12950
    },
    {
      "epoch": 1.3378614798806217,
      "grad_norm": 4.849103927612305,
      "learning_rate": 1.7343360461522613e-05,
      "loss": 1.1608,
      "step": 13000
    },
    {
      "epoch": 1.3378614798806217,
      "eval_accuracy": 0.9520102143287659,
      "eval_loss": 0.98476642370224,
      "eval_runtime": 417.8875,
      "eval_samples_per_second": 82.675,
      "eval_steps_per_second": 2.584,
      "step": 13000
    },
    {
      "epoch": 1.3430071009570854,
      "grad_norm": 3.944049596786499,
      "learning_rate": 1.7333058617492535e-05,
      "loss": 1.2066,
      "step": 13050
| }, | |
| { | |
| "epoch": 1.3481527220335494, | |
| "grad_norm": 8.767118453979492, | |
| "learning_rate": 1.732275677346245e-05, | |
| "loss": 1.408, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 1.3532983431100134, | |
| "grad_norm": 8.175588607788086, | |
| "learning_rate": 1.7312454929432368e-05, | |
| "loss": 1.3574, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 1.3584439641864772, | |
| "grad_norm": 5.246455192565918, | |
| "learning_rate": 1.730215308540229e-05, | |
| "loss": 1.3171, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 1.3635895852629412, | |
| "grad_norm": 8.986821174621582, | |
| "learning_rate": 1.7291851241372205e-05, | |
| "loss": 1.3188, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 1.3635895852629412, | |
| "eval_accuracy": 0.9502446055412292, | |
| "eval_loss": 0.9888262152671814, | |
| "eval_runtime": 418.0556, | |
| "eval_samples_per_second": 82.642, | |
| "eval_steps_per_second": 2.583, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 1.3687352063394052, | |
| "grad_norm": 3.4874706268310547, | |
| "learning_rate": 1.7281549397342127e-05, | |
| "loss": 1.299, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 1.3738808274158691, | |
| "grad_norm": 5.339372158050537, | |
| "learning_rate": 1.7271247553312045e-05, | |
| "loss": 1.3015, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 1.3790264484923331, | |
| "grad_norm": 0.7593218684196472, | |
| "learning_rate": 1.7260945709281964e-05, | |
| "loss": 1.3159, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 1.384172069568797, | |
| "grad_norm": 6.2086896896362305, | |
| "learning_rate": 1.7250643865251882e-05, | |
| "loss": 1.2139, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 1.389317690645261, | |
| "grad_norm": 8.667464256286621, | |
| "learning_rate": 1.72403420212218e-05, | |
| "loss": 1.2855, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 1.389317690645261, | |
| "eval_accuracy": 0.9513733983039856, | |
| "eval_loss": 0.9957149624824524, | |
| "eval_runtime": 417.8702, | |
| "eval_samples_per_second": 82.679, | |
| "eval_steps_per_second": 2.585, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 1.3944633117217249, | |
| "grad_norm": 11.632777214050293, | |
| "learning_rate": 1.723004017719172e-05, | |
| "loss": 1.2705, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 1.3996089327981887, | |
| "grad_norm": 24.493167877197266, | |
| "learning_rate": 1.7219738333161637e-05, | |
| "loss": 1.3099, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 1.4047545538746526, | |
| "grad_norm": 6.20335054397583, | |
| "learning_rate": 1.7209436489131556e-05, | |
| "loss": 1.3144, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 1.4099001749511166, | |
| "grad_norm": 9.66215991973877, | |
| "learning_rate": 1.7199134645101474e-05, | |
| "loss": 1.2948, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 1.4150457960275804, | |
| "grad_norm": 3.034616470336914, | |
| "learning_rate": 1.7188832801071393e-05, | |
| "loss": 1.3313, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 1.4150457960275804, | |
| "eval_accuracy": 0.9511997699737549, | |
| "eval_loss": 0.9909895658493042, | |
| "eval_runtime": 417.5787, | |
| "eval_samples_per_second": 82.737, | |
| "eval_steps_per_second": 2.586, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 1.4201914171040444, | |
| "grad_norm": 5.056656360626221, | |
| "learning_rate": 1.7178530957041314e-05, | |
| "loss": 1.3473, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 1.4253370381805084, | |
| "grad_norm": 2.1632890701293945, | |
| "learning_rate": 1.716822911301123e-05, | |
| "loss": 1.2037, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 1.4304826592569724, | |
| "grad_norm": 8.617193222045898, | |
| "learning_rate": 1.7157927268981148e-05, | |
| "loss": 1.3059, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 1.4356282803334364, | |
| "grad_norm": 4.062990188598633, | |
| "learning_rate": 1.714762542495107e-05, | |
| "loss": 1.3763, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 1.4407739014099001, | |
| "grad_norm": 8.483048439025879, | |
| "learning_rate": 1.7137323580920985e-05, | |
| "loss": 1.2606, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 1.4407739014099001, | |
| "eval_accuracy": 0.9522996544837952, | |
| "eval_loss": 0.9875785708427429, | |
| "eval_runtime": 417.781, | |
| "eval_samples_per_second": 82.696, | |
| "eval_steps_per_second": 2.585, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 1.4459195224863641, | |
| "grad_norm": 8.226116180419922, | |
| "learning_rate": 1.7127021736890906e-05, | |
| "loss": 1.2394, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 1.4510651435628281, | |
| "grad_norm": 0.5191435813903809, | |
| "learning_rate": 1.711671989286082e-05, | |
| "loss": 1.219, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 1.4562107646392919, | |
| "grad_norm": 8.271252632141113, | |
| "learning_rate": 1.7106418048830743e-05, | |
| "loss": 1.3501, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 1.4613563857157559, | |
| "grad_norm": 6.9849066734313965, | |
| "learning_rate": 1.7096116204800662e-05, | |
| "loss": 1.2664, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 1.4665020067922199, | |
| "grad_norm": 3.286569595336914, | |
| "learning_rate": 1.7085814360770577e-05, | |
| "loss": 1.2704, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 1.4665020067922199, | |
| "eval_accuracy": 0.9512576460838318, | |
| "eval_loss": 0.994490385055542, | |
| "eval_runtime": 418.1263, | |
| "eval_samples_per_second": 82.628, | |
| "eval_steps_per_second": 2.583, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 1.4716476278686836, | |
| "grad_norm": 6.6526618003845215, | |
| "learning_rate": 1.70755125167405e-05, | |
| "loss": 1.2332, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 1.4767932489451476, | |
| "grad_norm": 7.4313578605651855, | |
| "learning_rate": 1.7065210672710417e-05, | |
| "loss": 1.2286, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 1.4819388700216116, | |
| "grad_norm": 6.093780517578125, | |
| "learning_rate": 1.7054908828680335e-05, | |
| "loss": 1.2123, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 1.4870844910980756, | |
| "grad_norm": 6.429713726043701, | |
| "learning_rate": 1.7044606984650254e-05, | |
| "loss": 1.2437, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 1.4922301121745396, | |
| "grad_norm": 8.225885391235352, | |
| "learning_rate": 1.7034305140620172e-05, | |
| "loss": 1.2292, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 1.4922301121745396, | |
| "eval_accuracy": 0.9502446055412292, | |
| "eval_loss": 0.9886476993560791, | |
| "eval_runtime": 417.1781, | |
| "eval_samples_per_second": 82.816, | |
| "eval_steps_per_second": 2.589, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 1.4973757332510034, | |
| "grad_norm": 6.095223903656006, | |
| "learning_rate": 1.7024209333470695e-05, | |
| "loss": 1.3007, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 1.5025213543274674, | |
| "grad_norm": 12.490996360778809, | |
| "learning_rate": 1.701390748944061e-05, | |
| "loss": 1.308, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 1.5076669754039314, | |
| "grad_norm": 9.118165016174316, | |
| "learning_rate": 1.700360564541053e-05, | |
| "loss": 1.174, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 1.5128125964803951, | |
| "grad_norm": 6.5648722648620605, | |
| "learning_rate": 1.6993303801380447e-05, | |
| "loss": 1.2648, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 1.5179582175568591, | |
| "grad_norm": 8.813359260559082, | |
| "learning_rate": 1.698300195735037e-05, | |
| "loss": 1.2533, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 1.5179582175568591, | |
| "eval_accuracy": 0.9517496824264526, | |
| "eval_loss": 0.9885143041610718, | |
| "eval_runtime": 417.9702, | |
| "eval_samples_per_second": 82.659, | |
| "eval_steps_per_second": 2.584, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 1.523103838633323, | |
| "grad_norm": 1.7033747434616089, | |
| "learning_rate": 1.6972700113320287e-05, | |
| "loss": 1.2576, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 1.5282494597097869, | |
| "grad_norm": 5.316808700561523, | |
| "learning_rate": 1.6962398269290202e-05, | |
| "loss": 1.3659, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 1.5333950807862509, | |
| "grad_norm": 3.3904647827148438, | |
| "learning_rate": 1.6952096425260124e-05, | |
| "loss": 1.298, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 1.5385407018627149, | |
| "grad_norm": 0.8259275555610657, | |
| "learning_rate": 1.6941794581230042e-05, | |
| "loss": 1.2723, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 1.5436863229391786, | |
| "grad_norm": 7.67642068862915, | |
| "learning_rate": 1.693149273719996e-05, | |
| "loss": 1.3099, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 1.5436863229391786, | |
| "eval_accuracy": 0.9517786502838135, | |
| "eval_loss": 0.9875179529190063, | |
| "eval_runtime": 416.9745, | |
| "eval_samples_per_second": 82.856, | |
| "eval_steps_per_second": 2.59, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 1.5488319440156428, | |
| "grad_norm": 2.3492562770843506, | |
| "learning_rate": 1.692119089316988e-05, | |
| "loss": 1.2984, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 1.5539775650921066, | |
| "grad_norm": 5.415560722351074, | |
| "learning_rate": 1.6910889049139797e-05, | |
| "loss": 1.2128, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 1.5591231861685706, | |
| "grad_norm": 12.2908935546875, | |
| "learning_rate": 1.6900587205109716e-05, | |
| "loss": 1.2689, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 1.5642688072450346, | |
| "grad_norm": 8.375056266784668, | |
| "learning_rate": 1.6890285361079634e-05, | |
| "loss": 1.2516, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 1.5694144283214984, | |
| "grad_norm": 9.067890167236328, | |
| "learning_rate": 1.6879983517049553e-05, | |
| "loss": 1.3028, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 1.5694144283214984, | |
| "eval_accuracy": 0.9523285627365112, | |
| "eval_loss": 0.9856404066085815, | |
| "eval_runtime": 417.1329, | |
| "eval_samples_per_second": 82.825, | |
| "eval_steps_per_second": 2.589, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 1.5745600493979623, | |
| "grad_norm": 10.568164825439453, | |
| "learning_rate": 1.686968167301947e-05, | |
| "loss": 1.3619, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 1.5797056704744263, | |
| "grad_norm": 15.765814781188965, | |
| "learning_rate": 1.685937982898939e-05, | |
| "loss": 1.3524, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 1.58485129155089, | |
| "grad_norm": 11.065564155578613, | |
| "learning_rate": 1.684907798495931e-05, | |
| "loss": 1.1749, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 1.589996912627354, | |
| "grad_norm": 7.860668659210205, | |
| "learning_rate": 1.6838776140929226e-05, | |
| "loss": 1.205, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 1.595142533703818, | |
| "grad_norm": 2.4386684894561768, | |
| "learning_rate": 1.6828474296899148e-05, | |
| "loss": 1.297, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 1.595142533703818, | |
| "eval_accuracy": 0.9513155221939087, | |
| "eval_loss": 0.9780011177062988, | |
| "eval_runtime": 418.0332, | |
| "eval_samples_per_second": 82.647, | |
| "eval_steps_per_second": 2.584, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 1.6002881547802819, | |
| "grad_norm": 7.5391316413879395, | |
| "learning_rate": 1.6818172452869067e-05, | |
| "loss": 1.2469, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 1.605433775856746, | |
| "grad_norm": 9.402176856994629, | |
| "learning_rate": 1.680787060883898e-05, | |
| "loss": 1.2285, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 1.6105793969332098, | |
| "grad_norm": 5.171482563018799, | |
| "learning_rate": 1.6797568764808903e-05, | |
| "loss": 1.2963, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 1.6157250180096736, | |
| "grad_norm": 7.366409778594971, | |
| "learning_rate": 1.678726692077882e-05, | |
| "loss": 1.2406, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 1.6208706390861378, | |
| "grad_norm": 10.613348007202148, | |
| "learning_rate": 1.677696507674874e-05, | |
| "loss": 1.3049, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 1.6208706390861378, | |
| "eval_accuracy": 0.9511997699737549, | |
| "eval_loss": 0.9873180389404297, | |
| "eval_runtime": 417.7747, | |
| "eval_samples_per_second": 82.698, | |
| "eval_steps_per_second": 2.585, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 1.6260162601626016, | |
| "grad_norm": 3.9607322216033936, | |
| "learning_rate": 1.676666323271866e-05, | |
| "loss": 1.2174, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 1.6311618812390656, | |
| "grad_norm": 8.552703857421875, | |
| "learning_rate": 1.6756361388688577e-05, | |
| "loss": 1.2789, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 1.6363075023155296, | |
| "grad_norm": 5.216203689575195, | |
| "learning_rate": 1.6746059544658496e-05, | |
| "loss": 1.289, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 1.6414531233919933, | |
| "grad_norm": 7.981589317321777, | |
| "learning_rate": 1.6735757700628414e-05, | |
| "loss": 1.3242, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 1.6465987444684573, | |
| "grad_norm": 9.827128410339355, | |
| "learning_rate": 1.6725455856598332e-05, | |
| "loss": 1.2974, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 1.6465987444684573, | |
| "eval_accuracy": 0.9522417187690735, | |
| "eval_loss": 0.9755061268806458, | |
| "eval_runtime": 418.3094, | |
| "eval_samples_per_second": 82.592, | |
| "eval_steps_per_second": 2.582, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 1.6517443655449213, | |
| "grad_norm": 8.793742179870605, | |
| "learning_rate": 1.671515401256825e-05, | |
| "loss": 1.2741, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 1.656889986621385, | |
| "grad_norm": 4.681251049041748, | |
| "learning_rate": 1.670485216853817e-05, | |
| "loss": 1.1625, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 1.662035607697849, | |
| "grad_norm": 9.398008346557617, | |
| "learning_rate": 1.6694550324508088e-05, | |
| "loss": 1.2795, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 1.667181228774313, | |
| "grad_norm": 7.628296852111816, | |
| "learning_rate": 1.6684248480478006e-05, | |
| "loss": 1.2301, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 1.6723268498507768, | |
| "grad_norm": 7.104902267456055, | |
| "learning_rate": 1.6673946636447928e-05, | |
| "loss": 1.2348, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 1.6723268498507768, | |
| "eval_accuracy": 0.952791690826416, | |
| "eval_loss": 0.980122447013855, | |
| "eval_runtime": 418.027, | |
| "eval_samples_per_second": 82.648, | |
| "eval_steps_per_second": 2.584, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 1.677472470927241, | |
| "grad_norm": 6.678224563598633, | |
| "learning_rate": 1.6663644792417843e-05, | |
| "loss": 1.2408, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 1.6826180920037048, | |
| "grad_norm": 13.851053237915039, | |
| "learning_rate": 1.665334294838776e-05, | |
| "loss": 1.2477, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 1.6877637130801688, | |
| "grad_norm": 3.6658806800842285, | |
| "learning_rate": 1.6643041104357683e-05, | |
| "loss": 1.3386, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 1.6929093341566328, | |
| "grad_norm": 5.4644927978515625, | |
| "learning_rate": 1.6632739260327598e-05, | |
| "loss": 1.2346, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 1.6980549552330966, | |
| "grad_norm": 1.6028341054916382, | |
| "learning_rate": 1.662243741629752e-05, | |
| "loss": 1.2904, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 1.6980549552330966, | |
| "eval_accuracy": 0.9520391225814819, | |
| "eval_loss": 0.9905561208724976, | |
| "eval_runtime": 417.9366, | |
| "eval_samples_per_second": 82.666, | |
| "eval_steps_per_second": 2.584, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 1.7032005763095606, | |
| "grad_norm": 0.9734807014465332, | |
| "learning_rate": 1.661234160914804e-05, | |
| "loss": 1.2947, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 1.7083461973860246, | |
| "grad_norm": 3.4319236278533936, | |
| "learning_rate": 1.6602039765117958e-05, | |
| "loss": 1.2572, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 1.7134918184624883, | |
| "grad_norm": 3.019766092300415, | |
| "learning_rate": 1.6591737921087876e-05, | |
| "loss": 1.2738, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 1.7186374395389523, | |
| "grad_norm": 9.71827507019043, | |
| "learning_rate": 1.6581436077057794e-05, | |
| "loss": 1.2686, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 1.7237830606154163, | |
| "grad_norm": 5.171957969665527, | |
| "learning_rate": 1.6571134233027713e-05, | |
| "loss": 1.4041, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 1.7237830606154163, | |
| "eval_accuracy": 0.952791690826416, | |
| "eval_loss": 0.9791179895401001, | |
| "eval_runtime": 418.3742, | |
| "eval_samples_per_second": 82.579, | |
| "eval_steps_per_second": 2.581, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 1.72892868169188, | |
| "grad_norm": 5.277884006500244, | |
| "learning_rate": 1.656083238899763e-05, | |
| "loss": 1.2935, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 1.7340743027683443, | |
| "grad_norm": 10.89902400970459, | |
| "learning_rate": 1.655053054496755e-05, | |
| "loss": 1.2501, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 1.739219923844808, | |
| "grad_norm": 2.373206377029419, | |
| "learning_rate": 1.6540434737818072e-05, | |
| "loss": 1.3208, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 1.744365544921272, | |
| "grad_norm": 1.7645074129104614, | |
| "learning_rate": 1.653013289378799e-05, | |
| "loss": 1.2486, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 1.749511165997736, | |
| "grad_norm": 3.979423999786377, | |
| "learning_rate": 1.651983104975791e-05, | |
| "loss": 1.2587, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.749511165997736, | |
| "eval_accuracy": 0.9519522786140442, | |
| "eval_loss": 0.9862294793128967, | |
| "eval_runtime": 417.504, | |
| "eval_samples_per_second": 82.751, | |
| "eval_steps_per_second": 2.587, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.7546567870741998, | |
| "grad_norm": 3.5347177982330322, | |
| "learning_rate": 1.6509529205727824e-05, | |
| "loss": 1.3325, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 1.7598024081506638, | |
| "grad_norm": 5.752897262573242, | |
| "learning_rate": 1.6499227361697746e-05, | |
| "loss": 1.3104, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 1.7649480292271278, | |
| "grad_norm": 8.936431884765625, | |
| "learning_rate": 1.6488925517667664e-05, | |
| "loss": 1.2504, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 1.7700936503035916, | |
| "grad_norm": 11.348810195922852, | |
| "learning_rate": 1.6478623673637583e-05, | |
| "loss": 1.3153, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 1.7752392713800556, | |
| "grad_norm": 8.096456527709961, | |
| "learning_rate": 1.64683218296075e-05, | |
| "loss": 1.328, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 1.7752392713800556, | |
| "eval_accuracy": 0.9529942870140076, | |
| "eval_loss": 0.9803459644317627, | |
| "eval_runtime": 417.43, | |
| "eval_samples_per_second": 82.766, | |
| "eval_steps_per_second": 2.587, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 1.7803848924565195, | |
| "grad_norm": 4.984877586364746, | |
| "learning_rate": 1.645801998557742e-05, | |
| "loss": 1.3417, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 1.7855305135329833, | |
| "grad_norm": 8.615971565246582, | |
| "learning_rate": 1.6447718141547338e-05, | |
| "loss": 1.2486, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 1.7906761346094475, | |
| "grad_norm": 6.480031490325928, | |
| "learning_rate": 1.6437416297517256e-05, | |
| "loss": 1.2869, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 1.7958217556859113, | |
| "grad_norm": 3.220890522003174, | |
| "learning_rate": 1.6427114453487175e-05, | |
| "loss": 1.3599, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 1.800967376762375, | |
| "grad_norm": 6.310009956359863, | |
| "learning_rate": 1.6416812609457093e-05, | |
| "loss": 1.2822, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 1.800967376762375, | |
| "eval_accuracy": 0.9526180028915405, | |
| "eval_loss": 0.9846508502960205, | |
| "eval_runtime": 417.34, | |
| "eval_samples_per_second": 82.784, | |
| "eval_steps_per_second": 2.588, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 1.8061129978388393, | |
| "grad_norm": 9.1428804397583, | |
| "learning_rate": 1.640651076542701e-05, | |
| "loss": 1.3001, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 1.811258618915303, | |
| "grad_norm": 11.503830909729004, | |
| "learning_rate": 1.6396208921396934e-05, | |
| "loss": 1.0848, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 1.816404239991767, | |
| "grad_norm": 6.229241847991943, | |
| "learning_rate": 1.638590707736685e-05, | |
| "loss": 1.3171, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 1.821549861068231, | |
| "grad_norm": 6.697323799133301, | |
| "learning_rate": 1.637560523333677e-05, | |
| "loss": 1.3387, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 1.8266954821446948, | |
| "grad_norm": 12.814096450805664, | |
| "learning_rate": 1.636530338930669e-05, | |
| "loss": 1.2401, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 1.8266954821446948, | |
| "eval_accuracy": 0.9528206586837769, | |
| "eval_loss": 0.9803994297981262, | |
| "eval_runtime": 417.6448, | |
| "eval_samples_per_second": 82.723, | |
| "eval_steps_per_second": 2.586, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 1.8318411032211588, | |
| "grad_norm": 5.222059726715088, | |
| "learning_rate": 1.6355001545276604e-05, | |
| "loss": 1.2979, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 1.8369867242976228, | |
| "grad_norm": 3.9017868041992188, | |
| "learning_rate": 1.6344699701246526e-05, | |
| "loss": 1.2222, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 1.8421323453740865, | |
| "grad_norm": 7.524175643920898, | |
| "learning_rate": 1.6334397857216444e-05, | |
| "loss": 1.27, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 1.8472779664505505, | |
| "grad_norm": 4.3478593826293945, | |
| "learning_rate": 1.6324096013186362e-05, | |
| "loss": 1.3109, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 1.8524235875270145, | |
| "grad_norm": 8.614230155944824, | |
| "learning_rate": 1.631379416915628e-05, | |
| "loss": 1.2306, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.8524235875270145, | |
| "eval_accuracy": 0.9536600112915039, | |
| "eval_loss": 0.987566351890564, | |
| "eval_runtime": 417.5552, | |
| "eval_samples_per_second": 82.741, | |
| "eval_steps_per_second": 2.586, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.8575692086034783, | |
| "grad_norm": 7.108985900878906, | |
| "learning_rate": 1.63034923251262e-05, | |
| "loss": 1.1878, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 1.8627148296799425, | |
| "grad_norm": 10.433032989501953, | |
| "learning_rate": 1.6293190481096118e-05, | |
| "loss": 1.2398, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 1.8678604507564063, | |
| "grad_norm": 10.102560043334961, | |
| "learning_rate": 1.6282888637066036e-05, | |
| "loss": 1.2576, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 1.8730060718328703, | |
| "grad_norm": 4.380664348602295, | |
| "learning_rate": 1.6272586793035955e-05, | |
| "loss": 1.1579, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 1.8781516929093343, | |
| "grad_norm": 2.1999149322509766, | |
| "learning_rate": 1.6262284949005873e-05, | |
| "loss": 1.2889, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 1.8781516929093343, | |
| "eval_accuracy": 0.9519233703613281, | |
| "eval_loss": 0.9859423041343689, | |
| "eval_runtime": 417.7845, | |
| "eval_samples_per_second": 82.696, | |
| "eval_steps_per_second": 2.585, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 1.883297313985798, | |
| "grad_norm": 5.329191207885742, | |
| "learning_rate": 1.625198310497579e-05, | |
| "loss": 1.3331, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 1.888442935062262, | |
| "grad_norm": 17.370649337768555, | |
| "learning_rate": 1.624168126094571e-05, | |
| "loss": 1.2957, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 1.893588556138726, | |
| "grad_norm": 10.373506546020508, | |
| "learning_rate": 1.6231379416915628e-05, | |
| "loss": 1.2286, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 1.8987341772151898, | |
| "grad_norm": 13.445988655090332, | |
| "learning_rate": 1.622107757288555e-05, | |
| "loss": 1.2513, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 1.9038797982916538, | |
| "grad_norm": 11.915916442871094, | |
| "learning_rate": 1.6210775728855465e-05, | |
| "loss": 1.1702, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 1.9038797982916538, | |
| "eval_accuracy": 0.9540941715240479, | |
| "eval_loss": 0.9839755296707153, | |
| "eval_runtime": 417.3645, | |
| "eval_samples_per_second": 82.779, | |
| "eval_steps_per_second": 2.588, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 1.9090254193681178, | |
| "grad_norm": 13.058712005615234, | |
| "learning_rate": 1.6200473884825383e-05, | |
| "loss": 1.3181, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 1.9141710404445815, | |
| "grad_norm": 8.620599746704102, | |
| "learning_rate": 1.6190172040795305e-05, | |
| "loss": 1.1976, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 1.9193166615210457, | |
| "grad_norm": 7.074895858764648, | |
| "learning_rate": 1.617987019676522e-05, | |
| "loss": 1.3623, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 1.9244622825975095, | |
| "grad_norm": 10.293702125549316, | |
| "learning_rate": 1.6169568352735142e-05, | |
| "loss": 1.2594, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 1.9296079036739735, | |
| "grad_norm": 7.491464138031006, | |
| "learning_rate": 1.615926650870506e-05, | |
| "loss": 1.2902, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 1.9296079036739735, | |
| "eval_accuracy": 0.9522128105163574, | |
| "eval_loss": 0.9844051003456116, | |
| "eval_runtime": 418.2592, | |
| "eval_samples_per_second": 82.602, | |
| "eval_steps_per_second": 2.582, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 1.9347535247504375, | |
| "grad_norm": 2.0575156211853027, | |
| "learning_rate": 1.614896466467498e-05, | |
| "loss": 1.3283, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 1.9398991458269013, | |
| "grad_norm": 10.51094913482666, | |
| "learning_rate": 1.6138662820644897e-05, | |
| "loss": 1.2987, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 1.9450447669033653, | |
| "grad_norm": 6.296252727508545, | |
| "learning_rate": 1.6128567013495417e-05, | |
| "loss": 1.1987, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 1.9501903879798292, | |
| "grad_norm": 2.280179738998413, | |
| "learning_rate": 1.6118265169465335e-05, | |
| "loss": 1.2385, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 1.955336009056293, | |
| "grad_norm": 5.830591678619385, | |
| "learning_rate": 1.6107963325435253e-05, | |
| "loss": 1.2772, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 1.955336009056293, | |
| "eval_accuracy": 0.9533126950263977, | |
| "eval_loss": 0.9861400723457336, | |
| "eval_runtime": 418.0997, | |
| "eval_samples_per_second": 82.633, | |
| "eval_steps_per_second": 2.583, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 1.960481630132757, | |
| "grad_norm": 6.468533515930176, | |
| "learning_rate": 1.6097661481405172e-05, | |
| "loss": 1.1906, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 1.965627251209221, | |
| "grad_norm": 7.839109420776367, | |
| "learning_rate": 1.608735963737509e-05, | |
| "loss": 1.3041, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 1.9707728722856848, | |
| "grad_norm": 12.740795135498047, | |
| "learning_rate": 1.607705779334501e-05, | |
| "loss": 1.2345, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 1.975918493362149, | |
| "grad_norm": 7.1893134117126465, | |
| "learning_rate": 1.606675594931493e-05, | |
| "loss": 1.2586, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 1.9810641144386127, | |
| "grad_norm": 14.163928031921387, | |
| "learning_rate": 1.6056454105284846e-05, | |
| "loss": 1.196, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 1.9810641144386127, | |
| "eval_accuracy": 0.9521838426589966, | |
| "eval_loss": 0.9835113286972046, | |
| "eval_runtime": 417.4557, | |
| "eval_samples_per_second": 82.761, | |
| "eval_steps_per_second": 2.587, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 1.9862097355150765, | |
| "grad_norm": 5.9427618980407715, | |
| "learning_rate": 1.6046152261254767e-05, | |
| "loss": 1.2872, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 1.9913553565915407, | |
| "grad_norm": 14.67308235168457, | |
| "learning_rate": 1.6035850417224686e-05, | |
| "loss": 1.2449, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 1.9965009776680045, | |
| "grad_norm": 3.581702947616577, | |
| "learning_rate": 1.6025548573194604e-05, | |
| "loss": 1.2435, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 2.0016465987444683, | |
| "grad_norm": 13.742449760437012, | |
| "learning_rate": 1.6015246729164523e-05, | |
| "loss": 1.3096, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 2.0067922198209325, | |
| "grad_norm": 10.677633285522461, | |
| "learning_rate": 1.600494488513444e-05, | |
| "loss": 1.1697, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 2.0067922198209325, | |
| "eval_accuracy": 0.9514312744140625, | |
| "eval_loss": 1.0035802125930786, | |
| "eval_runtime": 418.5248, | |
| "eval_samples_per_second": 82.549, | |
| "eval_steps_per_second": 2.58, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 2.0119378408973962, | |
| "grad_norm": 6.8798418045043945, | |
| "learning_rate": 1.599464304110436e-05, | |
| "loss": 1.0556, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 2.0170834619738605, | |
| "grad_norm": 8.785711288452148, | |
| "learning_rate": 1.5984341197074278e-05, | |
| "loss": 1.1592, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 2.0222290830503242, | |
| "grad_norm": 11.857321739196777, | |
| "learning_rate": 1.5974039353044196e-05, | |
| "loss": 1.1808, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 2.027374704126788, | |
| "grad_norm": 3.2849769592285156, | |
| "learning_rate": 1.5963737509014115e-05, | |
| "loss": 1.141, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 2.032520325203252, | |
| "grad_norm": 7.293135166168213, | |
| "learning_rate": 1.5953435664984033e-05, | |
| "loss": 1.1139, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 2.032520325203252, | |
| "eval_accuracy": 0.9516628384590149, | |
| "eval_loss": 1.0205085277557373, | |
| "eval_runtime": 417.5267, | |
| "eval_samples_per_second": 82.747, | |
| "eval_steps_per_second": 2.587, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 2.037665946279716, | |
| "grad_norm": 10.044347763061523, | |
| "learning_rate": 1.594313382095395e-05, | |
| "loss": 1.1959, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 2.0428115673561797, | |
| "grad_norm": 0.14353907108306885, | |
| "learning_rate": 1.593283197692387e-05, | |
| "loss": 1.0762, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 2.047957188432644, | |
| "grad_norm": 9.524683952331543, | |
| "learning_rate": 1.592253013289379e-05, | |
| "loss": 1.3522, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 2.0531028095091077, | |
| "grad_norm": 12.576092720031738, | |
| "learning_rate": 1.5912228288863707e-05, | |
| "loss": 1.1175, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 2.0582484305855715, | |
| "grad_norm": 0.6325793862342834, | |
| "learning_rate": 1.5901926444833625e-05, | |
| "loss": 1.178, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 2.0582484305855715, | |
| "eval_accuracy": 0.951228678226471, | |
| "eval_loss": 1.018436074256897, | |
| "eval_runtime": 417.7277, | |
| "eval_samples_per_second": 82.707, | |
| "eval_steps_per_second": 2.585, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 2.0633940516620357, | |
| "grad_norm": 12.589435577392578, | |
| "learning_rate": 1.5891624600803547e-05, | |
| "loss": 1.1416, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 2.0685396727384995, | |
| "grad_norm": 4.007430076599121, | |
| "learning_rate": 1.5881322756773462e-05, | |
| "loss": 1.1523, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 2.0736852938149637, | |
| "grad_norm": 4.076907157897949, | |
| "learning_rate": 1.5871020912743384e-05, | |
| "loss": 1.2561, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 2.0788309148914275, | |
| "grad_norm": 1.5451477766036987, | |
| "learning_rate": 1.5860719068713302e-05, | |
| "loss": 1.119, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 2.0839765359678912, | |
| "grad_norm": 5.0513691902160645, | |
| "learning_rate": 1.5850417224683217e-05, | |
| "loss": 1.095, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 2.0839765359678912, | |
| "eval_accuracy": 0.95041823387146, | |
| "eval_loss": 1.0154516696929932, | |
| "eval_runtime": 418.0138, | |
| "eval_samples_per_second": 82.65, | |
| "eval_steps_per_second": 2.584, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 2.0891221570443554, | |
| "grad_norm": 9.095207214355469, | |
| "learning_rate": 1.584011538065314e-05, | |
| "loss": 1.1432, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 2.094267778120819, | |
| "grad_norm": 9.274755477905273, | |
| "learning_rate": 1.5829813536623058e-05, | |
| "loss": 1.1455, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 2.099413399197283, | |
| "grad_norm": 14.58219051361084, | |
| "learning_rate": 1.5819511692592976e-05, | |
| "loss": 1.0913, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 2.104559020273747, | |
| "grad_norm": 12.373740196228027, | |
| "learning_rate": 1.5809209848562894e-05, | |
| "loss": 1.1671, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 2.109704641350211, | |
| "grad_norm": 7.772844314575195, | |
| "learning_rate": 1.5798908004532813e-05, | |
| "loss": 1.2776, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 2.109704641350211, | |
| "eval_accuracy": 0.9514023661613464, | |
| "eval_loss": 1.0333930253982544, | |
| "eval_runtime": 418.0428, | |
| "eval_samples_per_second": 82.645, | |
| "eval_steps_per_second": 2.583, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 2.1148502624266747, | |
| "grad_norm": 7.367280006408691, | |
| "learning_rate": 1.578860616050273e-05, | |
| "loss": 1.3092, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 2.119995883503139, | |
| "grad_norm": 10.191935539245605, | |
| "learning_rate": 1.577830431647265e-05, | |
| "loss": 1.1981, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 2.1251415045796027, | |
| "grad_norm": 7.1885199546813965, | |
| "learning_rate": 1.5768002472442568e-05, | |
| "loss": 1.1399, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 2.1302871256560665, | |
| "grad_norm": 1.4416226148605347, | |
| "learning_rate": 1.5757700628412486e-05, | |
| "loss": 1.0976, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 2.1354327467325307, | |
| "grad_norm": 13.531189918518066, | |
| "learning_rate": 1.5747398784382405e-05, | |
| "loss": 1.1335, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 2.1354327467325307, | |
| "eval_accuracy": 0.9518365263938904, | |
| "eval_loss": 1.0136040449142456, | |
| "eval_runtime": 418.6163, | |
| "eval_samples_per_second": 82.531, | |
| "eval_steps_per_second": 2.58, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 2.1405783678089945, | |
| "grad_norm": 0.48753559589385986, | |
| "learning_rate": 1.5737096940352323e-05, | |
| "loss": 1.1567, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 2.1457239888854587, | |
| "grad_norm": 1.1756892204284668, | |
| "learning_rate": 1.5726795096322242e-05, | |
| "loss": 1.2536, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 2.1508696099619224, | |
| "grad_norm": 13.005239486694336, | |
| "learning_rate": 1.5716493252292164e-05, | |
| "loss": 1.1717, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 2.156015231038386, | |
| "grad_norm": 13.348917961120605, | |
| "learning_rate": 1.570619140826208e-05, | |
| "loss": 1.1433, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 2.1611608521148504, | |
| "grad_norm": 4.952757835388184, | |
| "learning_rate": 1.5695889564231997e-05, | |
| "loss": 1.1885, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 2.1611608521148504, | |
| "eval_accuracy": 0.951170802116394, | |
| "eval_loss": 1.0185319185256958, | |
| "eval_runtime": 418.1975, | |
| "eval_samples_per_second": 82.614, | |
| "eval_steps_per_second": 2.583, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 2.166306473191314, | |
| "grad_norm": 10.125651359558105, | |
| "learning_rate": 1.568558772020192e-05, | |
| "loss": 1.0543, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 2.171452094267778, | |
| "grad_norm": 5.2072062492370605, | |
| "learning_rate": 1.5675285876171834e-05, | |
| "loss": 1.1122, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 2.176597715344242, | |
| "grad_norm": 3.4542808532714844, | |
| "learning_rate": 1.5664984032141756e-05, | |
| "loss": 1.17, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 2.181743336420706, | |
| "grad_norm": 1.4935418367385864, | |
| "learning_rate": 1.5654682188111674e-05, | |
| "loss": 1.0757, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 2.1868889574971697, | |
| "grad_norm": 2.735926389694214, | |
| "learning_rate": 1.5644380344081593e-05, | |
| "loss": 1.3008, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 2.1868889574971697, | |
| "eval_accuracy": 0.9506497979164124, | |
| "eval_loss": 1.016100287437439, | |
| "eval_runtime": 417.7311, | |
| "eval_samples_per_second": 82.706, | |
| "eval_steps_per_second": 2.585, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 2.192034578573634, | |
| "grad_norm": 11.821269989013672, | |
| "learning_rate": 1.563407850005151e-05, | |
| "loss": 1.1723, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 2.1971801996500977, | |
| "grad_norm": 1.3524460792541504, | |
| "learning_rate": 1.562377665602143e-05, | |
| "loss": 1.2517, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 2.2023258207265615, | |
| "grad_norm": 6.308670520782471, | |
| "learning_rate": 1.5613474811991348e-05, | |
| "loss": 1.1834, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 2.2074714418030257, | |
| "grad_norm": 10.960680961608887, | |
| "learning_rate": 1.5603172967961266e-05, | |
| "loss": 1.1284, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 2.2126170628794894, | |
| "grad_norm": 8.426880836486816, | |
| "learning_rate": 1.5592871123931185e-05, | |
| "loss": 1.28, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 2.2126170628794894, | |
| "eval_accuracy": 0.9507076740264893, | |
| "eval_loss": 1.0217114686965942, | |
| "eval_runtime": 418.3067, | |
| "eval_samples_per_second": 82.593, | |
| "eval_steps_per_second": 2.582, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 2.2177626839559537, | |
| "grad_norm": 3.2604434490203857, | |
| "learning_rate": 1.5582569279901103e-05, | |
| "loss": 1.2478, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 2.2229083050324174, | |
| "grad_norm": 1.98189115524292, | |
| "learning_rate": 1.557226743587102e-05, | |
| "loss": 1.1798, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 2.228053926108881, | |
| "grad_norm": 4.054050445556641, | |
| "learning_rate": 1.5562171628721544e-05, | |
| "loss": 1.1218, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 2.2331995471853454, | |
| "grad_norm": 10.367090225219727, | |
| "learning_rate": 1.555186978469146e-05, | |
| "loss": 1.2787, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 2.238345168261809, | |
| "grad_norm": 12.966401100158691, | |
| "learning_rate": 1.554156794066138e-05, | |
| "loss": 1.1254, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 2.238345168261809, | |
| "eval_accuracy": 0.9507656097412109, | |
| "eval_loss": 1.0311578512191772, | |
| "eval_runtime": 417.8078, | |
| "eval_samples_per_second": 82.691, | |
| "eval_steps_per_second": 2.585, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 2.243490789338273, | |
| "grad_norm": 10.402215957641602, | |
| "learning_rate": 1.55312660966313e-05, | |
| "loss": 1.2375, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 2.248636410414737, | |
| "grad_norm": 15.226551055908203, | |
| "learning_rate": 1.5520964252601218e-05, | |
| "loss": 1.1074, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 2.253782031491201, | |
| "grad_norm": 7.523915767669678, | |
| "learning_rate": 1.5510662408571136e-05, | |
| "loss": 1.0927, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 2.2589276525676647, | |
| "grad_norm": 8.177473068237305, | |
| "learning_rate": 1.5500360564541055e-05, | |
| "loss": 1.1691, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 2.264073273644129, | |
| "grad_norm": 4.812458038330078, | |
| "learning_rate": 1.5490058720510973e-05, | |
| "loss": 1.1703, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 2.264073273644129, | |
| "eval_accuracy": 0.9499261975288391, | |
| "eval_loss": 1.0275415182113647, | |
| "eval_runtime": 417.8493, | |
| "eval_samples_per_second": 82.683, | |
| "eval_steps_per_second": 2.585, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 2.2692188947205927, | |
| "grad_norm": 5.703485012054443, | |
| "learning_rate": 1.547975687648089e-05, | |
| "loss": 1.2158, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 2.274364515797057, | |
| "grad_norm": 10.64054012298584, | |
| "learning_rate": 1.546945503245081e-05, | |
| "loss": 1.1026, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 2.2795101368735207, | |
| "grad_norm": 0.8261292576789856, | |
| "learning_rate": 1.5459153188420728e-05, | |
| "loss": 1.0644, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 2.2846557579499844, | |
| "grad_norm": 5.98064661026001, | |
| "learning_rate": 1.5448851344390647e-05, | |
| "loss": 1.1092, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 2.2898013790264486, | |
| "grad_norm": 10.404533386230469, | |
| "learning_rate": 1.543854950036057e-05, | |
| "loss": 1.1686, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 2.2898013790264486, | |
| "eval_accuracy": 0.9511997699737549, | |
| "eval_loss": 1.0342940092086792, | |
| "eval_runtime": 417.6196, | |
| "eval_samples_per_second": 82.728, | |
| "eval_steps_per_second": 2.586, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 2.2949470001029124, | |
| "grad_norm": 3.5781595706939697, | |
| "learning_rate": 1.5428247656330483e-05, | |
| "loss": 1.2711, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 2.300092621179376, | |
| "grad_norm": 5.956209182739258, | |
| "learning_rate": 1.5417945812300402e-05, | |
| "loss": 1.2942, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 2.3052382422558404, | |
| "grad_norm": 0.08046738803386688, | |
| "learning_rate": 1.5407643968270324e-05, | |
| "loss": 1.2073, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 2.310383863332304, | |
| "grad_norm": 6.365548610687256, | |
| "learning_rate": 1.539734212424024e-05, | |
| "loss": 1.2131, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 2.315529484408768, | |
| "grad_norm": 6.6707258224487305, | |
| "learning_rate": 1.538704028021016e-05, | |
| "loss": 1.1445, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 2.315529484408768, | |
| "eval_accuracy": 0.9516628384590149, | |
| "eval_loss": 1.0127946138381958, | |
| "eval_runtime": 417.7487, | |
| "eval_samples_per_second": 82.703, | |
| "eval_steps_per_second": 2.585, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 2.320675105485232, | |
| "grad_norm": 0.25063377618789673, | |
| "learning_rate": 1.5376738436180076e-05, | |
| "loss": 1.1553, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 2.325820726561696, | |
| "grad_norm": 8.28696060180664, | |
| "learning_rate": 1.5366436592149997e-05, | |
| "loss": 1.1512, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 2.33096634763816, | |
| "grad_norm": 1.361279845237732, | |
| "learning_rate": 1.5356134748119916e-05, | |
| "loss": 1.2069, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 2.336111968714624, | |
| "grad_norm": 1.9882014989852905, | |
| "learning_rate": 1.534583290408983e-05, | |
| "loss": 1.1345, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 2.3412575897910877, | |
| "grad_norm": 4.8411865234375, | |
| "learning_rate": 1.5335531060059753e-05, | |
| "loss": 1.1681, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 2.3412575897910877, | |
| "eval_accuracy": 0.9508813619613647, | |
| "eval_loss": 1.0100795030593872, | |
| "eval_runtime": 417.4967, | |
| "eval_samples_per_second": 82.753, | |
| "eval_steps_per_second": 2.587, | |
| "step": 22750 | |
| }, | |
| { | |
| "epoch": 2.346403210867552, | |
| "grad_norm": 6.883627414703369, | |
| "learning_rate": 1.532522921602967e-05, | |
| "loss": 1.1372, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 2.3515488319440156, | |
| "grad_norm": 9.81013298034668, | |
| "learning_rate": 1.531492737199959e-05, | |
| "loss": 1.1393, | |
| "step": 22850 | |
| }, | |
| { | |
| "epoch": 2.3566944530204794, | |
| "grad_norm": 7.514392852783203, | |
| "learning_rate": 1.5304625527969508e-05, | |
| "loss": 1.1327, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 2.3618400740969436, | |
| "grad_norm": 2.8904621601104736, | |
| "learning_rate": 1.5294323683939426e-05, | |
| "loss": 1.0903, | |
| "step": 22950 | |
| }, | |
| { | |
| "epoch": 2.3669856951734074, | |
| "grad_norm": 7.99860954284668, | |
| "learning_rate": 1.5284021839909345e-05, | |
| "loss": 1.1354, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 2.3669856951734074, | |
| "eval_accuracy": 0.9513444900512695, | |
| "eval_loss": 1.0172919034957886, | |
| "eval_runtime": 418.1914, | |
| "eval_samples_per_second": 82.615, | |
| "eval_steps_per_second": 2.583, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 2.372131316249871, | |
| "grad_norm": 3.2717432975769043, | |
| "learning_rate": 1.5273719995879263e-05, | |
| "loss": 1.2517, | |
| "step": 23050 | |
| }, | |
| { | |
| "epoch": 2.3772769373263354, | |
| "grad_norm": 4.3879594802856445, | |
| "learning_rate": 1.526341815184918e-05, | |
| "loss": 1.0634, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 2.382422558402799, | |
| "grad_norm": 8.286825180053711, | |
| "learning_rate": 1.52531163078191e-05, | |
| "loss": 1.2095, | |
| "step": 23150 | |
| }, | |
| { | |
| "epoch": 2.3875681794792634, | |
| "grad_norm": 2.0084967613220215, | |
| "learning_rate": 1.524281446378902e-05, | |
| "loss": 1.1686, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 2.392713800555727, | |
| "grad_norm": 5.974940776824951, | |
| "learning_rate": 1.5232512619758939e-05, | |
| "loss": 1.1063, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 2.392713800555727, | |
| "eval_accuracy": 0.9516628384590149, | |
| "eval_loss": 1.0242797136306763, | |
| "eval_runtime": 417.7754, | |
| "eval_samples_per_second": 82.698, | |
| "eval_steps_per_second": 2.585, | |
| "step": 23250 | |
| }, | |
| { | |
| "epoch": 2.397859421632191, | |
| "grad_norm": 7.380057334899902, | |
| "learning_rate": 1.5222210775728857e-05, | |
| "loss": 1.1309, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 2.403005042708655, | |
| "grad_norm": 1.6446843147277832, | |
| "learning_rate": 1.5211908931698775e-05, | |
| "loss": 1.1869, | |
| "step": 23350 | |
| }, | |
| { | |
| "epoch": 2.408150663785119, | |
| "grad_norm": 5.716843605041504, | |
| "learning_rate": 1.5201607087668696e-05, | |
| "loss": 1.1743, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 2.4132962848615827, | |
| "grad_norm": 2.141338586807251, | |
| "learning_rate": 1.5191305243638612e-05, | |
| "loss": 1.1001, | |
| "step": 23450 | |
| }, | |
| { | |
| "epoch": 2.418441905938047, | |
| "grad_norm": 1.5462799072265625, | |
| "learning_rate": 1.5181003399608532e-05, | |
| "loss": 1.1696, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 2.418441905938047, | |
| "eval_accuracy": 0.9524732828140259, | |
| "eval_loss": 1.0314745903015137, | |
| "eval_runtime": 418.5814, | |
| "eval_samples_per_second": 82.538, | |
| "eval_steps_per_second": 2.58, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 2.4235875270145106, | |
| "grad_norm": 10.894279479980469, | |
| "learning_rate": 1.5170701555578449e-05, | |
| "loss": 1.1493, | |
| "step": 23550 | |
| }, | |
| { | |
| "epoch": 2.4287331480909744, | |
| "grad_norm": 10.256954193115234, | |
| "learning_rate": 1.5160399711548367e-05, | |
| "loss": 1.1486, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 2.4338787691674386, | |
| "grad_norm": 14.089293479919434, | |
| "learning_rate": 1.5150097867518288e-05, | |
| "loss": 1.2302, | |
| "step": 23650 | |
| }, | |
| { | |
| "epoch": 2.4390243902439024, | |
| "grad_norm": 7.1174492835998535, | |
| "learning_rate": 1.5139796023488204e-05, | |
| "loss": 1.1427, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 2.4441700113203666, | |
| "grad_norm": 1.6686251163482666, | |
| "learning_rate": 1.5129494179458124e-05, | |
| "loss": 1.2123, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 2.4441700113203666, | |
| "eval_accuracy": 0.9509971141815186, | |
| "eval_loss": 1.0296884775161743, | |
| "eval_runtime": 418.2348, | |
| "eval_samples_per_second": 82.607, | |
| "eval_steps_per_second": 2.582, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 2.4493156323968304, | |
| "grad_norm": 0.8798663020133972, | |
| "learning_rate": 1.5119192335428043e-05, | |
| "loss": 1.1169, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 2.454461253473294, | |
| "grad_norm": 2.511453151702881, | |
| "learning_rate": 1.5108890491397961e-05, | |
| "loss": 1.1688, | |
| "step": 23850 | |
| }, | |
| { | |
| "epoch": 2.4596068745497583, | |
| "grad_norm": 8.896649360656738, | |
| "learning_rate": 1.509858864736788e-05, | |
| "loss": 1.0506, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 2.464752495626222, | |
| "grad_norm": 12.617236137390137, | |
| "learning_rate": 1.50882868033378e-05, | |
| "loss": 1.1965, | |
| "step": 23950 | |
| }, | |
| { | |
| "epoch": 2.469898116702686, | |
| "grad_norm": 14.843036651611328, | |
| "learning_rate": 1.5077984959307717e-05, | |
| "loss": 1.1253, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 2.469898116702686, | |
| "eval_accuracy": 0.9508234858512878, | |
| "eval_loss": 1.0238152742385864, | |
| "eval_runtime": 418.2418, | |
| "eval_samples_per_second": 82.605, | |
| "eval_steps_per_second": 2.582, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 2.47504373777915, | |
| "grad_norm": 13.253664016723633, | |
| "learning_rate": 1.5067683115277637e-05, | |
| "loss": 1.1957, | |
| "step": 24050 | |
| }, | |
| { | |
| "epoch": 2.480189358855614, | |
| "grad_norm": 4.080730438232422, | |
| "learning_rate": 1.5057381271247555e-05, | |
| "loss": 1.1395, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 2.4853349799320776, | |
| "grad_norm": 1.9019577503204346, | |
| "learning_rate": 1.5047079427217472e-05, | |
| "loss": 1.1238, | |
| "step": 24150 | |
| }, | |
| { | |
| "epoch": 2.490480601008542, | |
| "grad_norm": 10.57223129272461, | |
| "learning_rate": 1.5036983620067993e-05, | |
| "loss": 1.1342, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 2.4956262220850056, | |
| "grad_norm": 11.598908424377441, | |
| "learning_rate": 1.5026681776037913e-05, | |
| "loss": 1.1703, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 2.4956262220850056, | |
| "eval_accuracy": 0.9505629539489746, | |
| "eval_loss": 1.0218814611434937, | |
| "eval_runtime": 418.3594, | |
| "eval_samples_per_second": 82.582, | |
| "eval_steps_per_second": 2.582, | |
| "step": 24250 | |
| }, | |
| { | |
| "epoch": 2.50077184316147, | |
| "grad_norm": 9.800350189208984, | |
| "learning_rate": 1.501637993200783e-05, | |
| "loss": 1.0947, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 2.5059174642379336, | |
| "grad_norm": 3.784536361694336, | |
| "learning_rate": 1.500607808797775e-05, | |
| "loss": 1.1281, | |
| "step": 24350 | |
| }, | |
| { | |
| "epoch": 2.5110630853143974, | |
| "grad_norm": 2.499333620071411, | |
| "learning_rate": 1.4995776243947668e-05, | |
| "loss": 1.1029, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 2.516208706390861, | |
| "grad_norm": 9.453124046325684, | |
| "learning_rate": 1.4985474399917585e-05, | |
| "loss": 1.1784, | |
| "step": 24450 | |
| }, | |
| { | |
| "epoch": 2.5213543274673254, | |
| "grad_norm": 2.7490689754486084, | |
| "learning_rate": 1.4975172555887505e-05, | |
| "loss": 1.101, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 2.5213543274673254, | |
| "eval_accuracy": 0.9527627229690552, | |
| "eval_loss": 1.0266767740249634, | |
| "eval_runtime": 417.6079, | |
| "eval_samples_per_second": 82.731, | |
| "eval_steps_per_second": 2.586, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 2.526499948543789, | |
| "grad_norm": 8.228843688964844, | |
| "learning_rate": 1.4964870711857425e-05, | |
| "loss": 1.1231, | |
| "step": 24550 | |
| }, | |
| { | |
| "epoch": 2.5316455696202533, | |
| "grad_norm": 8.344508171081543, | |
| "learning_rate": 1.4954568867827342e-05, | |
| "loss": 1.1364, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 2.536791190696717, | |
| "grad_norm": 2.6875457763671875, | |
| "learning_rate": 1.494426702379726e-05, | |
| "loss": 1.1778, | |
| "step": 24650 | |
| }, | |
| { | |
| "epoch": 2.541936811773181, | |
| "grad_norm": 6.898427486419678, | |
| "learning_rate": 1.493396517976718e-05, | |
| "loss": 1.1089, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 2.547082432849645, | |
| "grad_norm": 3.228970766067505, | |
| "learning_rate": 1.4923663335737097e-05, | |
| "loss": 1.1626, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 2.547082432849645, | |
| "eval_accuracy": 0.9508234858512878, | |
| "eval_loss": 1.0254093408584595, | |
| "eval_runtime": 417.4897, | |
| "eval_samples_per_second": 82.754, | |
| "eval_steps_per_second": 2.587, | |
| "step": 24750 | |
| }, | |
| { | |
| "epoch": 2.552228053926109, | |
| "grad_norm": 4.928084850311279, | |
| "learning_rate": 1.4913361491707017e-05, | |
| "loss": 1.2019, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 2.557373675002573, | |
| "grad_norm": 15.422240257263184, | |
| "learning_rate": 1.4903059647676936e-05, | |
| "loss": 1.1503, | |
| "step": 24850 | |
| }, | |
| { | |
| "epoch": 2.562519296079037, | |
| "grad_norm": 11.451377868652344, | |
| "learning_rate": 1.4892757803646854e-05, | |
| "loss": 1.1697, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 2.5676649171555006, | |
| "grad_norm": 7.738549709320068, | |
| "learning_rate": 1.4882455959616772e-05, | |
| "loss": 1.0921, | |
| "step": 24950 | |
| }, | |
| { | |
| "epoch": 2.5728105382319644, | |
| "grad_norm": 2.7136483192443848, | |
| "learning_rate": 1.4872154115586692e-05, | |
| "loss": 1.3136, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 2.5728105382319644, | |
| "eval_accuracy": 0.9512865543365479, | |
| "eval_loss": 1.0222209692001343, | |
| "eval_runtime": 419.1601, | |
| "eval_samples_per_second": 82.424, | |
| "eval_steps_per_second": 2.577, | |
| "step": 25000 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 97170, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 250, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |
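
The state above is the `trainer_state.json` that the Hugging Face `transformers` Trainer writes into each checkpoint directory (the `TrainerControl` entry under `stateful_callbacks` is the giveaway). Below is a minimal sketch for inspecting it, assuming the file is saved locally as `trainer_state.json` (the path is a placeholder, not part of this run): it separates the training records (every `logging_steps` = 50 steps, carrying `loss`) from the eval records (every 250 steps, carrying `eval_loss` and `eval_accuracy`) and plots both.

```python
import json

import matplotlib.pyplot as plt

# Path is an assumption; point this at the checkpoint's trainer_state.json.
with open("trainer_state.json") as f:
    state = json.load(f)

# log_history mixes two record types: train logs (have a "loss" key) and
# eval logs (have "eval_loss" / "eval_accuracy" instead).
train_logs = [r for r in state["log_history"] if "loss" in r]
eval_logs = [r for r in state["log_history"] if "eval_loss" in r]

fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(10, 4))

# Loss curves: noisy per-50-step train loss vs. smoother eval loss.
ax_loss.plot([r["step"] for r in train_logs],
             [r["loss"] for r in train_logs], alpha=0.5, label="train loss")
ax_loss.plot([r["step"] for r in eval_logs],
             [r["eval_loss"] for r in eval_logs], label="eval loss")
ax_loss.set_xlabel("step")
ax_loss.set_ylabel("loss")
ax_loss.legend()

# Accuracy curve from the eval records only.
ax_acc.plot([r["step"] for r in eval_logs],
            [r["eval_accuracy"] for r in eval_logs])
ax_acc.set_xlabel("step")
ax_acc.set_ylabel("eval accuracy")

fig.tight_layout()
plt.show()
```

Read against this run's numbers, the plot would show `eval_accuracy` plateauing around 0.950 to 0.954 from roughly step 16000 onward while `eval_loss` drifts from about 0.98 up to 1.02 to 1.03 after the epoch-2 boundary (near step 19450), a typical early-overfitting pattern. With `max_steps` at 97170 and `num_train_epochs` at 10, stopping well before the configured end, or reloading the best eval checkpoint (saved every `save_steps` = 250), looks warranted.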