| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 250, | |
| "global_step": 22737, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.006597176408497163, | |
| "grad_norm": 25.778600692749023, | |
| "learning_rate": 9.600000000000001e-06, | |
| "loss": 3.9255, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.013194352816994326, | |
| "grad_norm": 15.797815322875977, | |
| "learning_rate": 1.9600000000000002e-05, | |
| "loss": 1.7945, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.01979152922549149, | |
| "grad_norm": 16.71779441833496, | |
| "learning_rate": 1.9957591553651104e-05, | |
| "loss": 1.5522, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.02638870563398865, | |
| "grad_norm": 14.723907470703125, | |
| "learning_rate": 1.9913416088704335e-05, | |
| "loss": 1.6267, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.032985882042485815, | |
| "grad_norm": 14.432233810424805, | |
| "learning_rate": 1.9869240623757565e-05, | |
| "loss": 1.5434, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.032985882042485815, | |
| "eval_accuracy": 0.8714285492897034, | |
| "eval_loss": 0.8546671867370605, | |
| "eval_runtime": 14.7911, | |
| "eval_samples_per_second": 165.64, | |
| "eval_steps_per_second": 5.206, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.03958305845098298, | |
| "grad_norm": 12.044981002807617, | |
| "learning_rate": 1.98250651588108e-05, | |
| "loss": 1.3113, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.04618023485948014, | |
| "grad_norm": 15.790678024291992, | |
| "learning_rate": 1.978088969386403e-05, | |
| "loss": 1.3674, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.0527774112679773, | |
| "grad_norm": 14.051512718200684, | |
| "learning_rate": 1.9736714228917263e-05, | |
| "loss": 1.3417, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.059374587676474466, | |
| "grad_norm": 16.866775512695312, | |
| "learning_rate": 1.9692538763970493e-05, | |
| "loss": 1.2831, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.06597176408497163, | |
| "grad_norm": 15.336055755615234, | |
| "learning_rate": 1.9648363299023723e-05, | |
| "loss": 1.2243, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.06597176408497163, | |
| "eval_accuracy": 0.882040798664093, | |
| "eval_loss": 0.7872514724731445, | |
| "eval_runtime": 14.0019, | |
| "eval_samples_per_second": 174.976, | |
| "eval_steps_per_second": 5.499, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.07256894049346879, | |
| "grad_norm": 21.990840911865234, | |
| "learning_rate": 1.9604187834076954e-05, | |
| "loss": 1.2276, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.07916611690196595, | |
| "grad_norm": 14.601304054260254, | |
| "learning_rate": 1.9560012369130184e-05, | |
| "loss": 1.2502, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.08576329331046312, | |
| "grad_norm": 10.78171157836914, | |
| "learning_rate": 1.9516720413482352e-05, | |
| "loss": 1.2247, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.09236046971896028, | |
| "grad_norm": 14.99619197845459, | |
| "learning_rate": 1.9472544948535586e-05, | |
| "loss": 1.178, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.09895764612745744, | |
| "grad_norm": 11.09481430053711, | |
| "learning_rate": 1.9428369483588816e-05, | |
| "loss": 1.2379, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.09895764612745744, | |
| "eval_accuracy": 0.8930612206459045, | |
| "eval_loss": 0.7503395080566406, | |
| "eval_runtime": 14.018, | |
| "eval_samples_per_second": 174.775, | |
| "eval_steps_per_second": 5.493, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.1055548225359546, | |
| "grad_norm": 12.924555778503418, | |
| "learning_rate": 1.938419401864205e-05, | |
| "loss": 1.3893, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.11215199894445177, | |
| "grad_norm": 16.87848663330078, | |
| "learning_rate": 1.934001855369528e-05, | |
| "loss": 1.1852, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 0.11874917535294893, | |
| "grad_norm": 17.876659393310547, | |
| "learning_rate": 1.929584308874851e-05, | |
| "loss": 1.1082, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.1253463517614461, | |
| "grad_norm": 14.923641204833984, | |
| "learning_rate": 1.925166762380174e-05, | |
| "loss": 0.9946, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 0.13194352816994326, | |
| "grad_norm": 20.28868865966797, | |
| "learning_rate": 1.9207492158854975e-05, | |
| "loss": 1.1834, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.13194352816994326, | |
| "eval_accuracy": 0.899591863155365, | |
| "eval_loss": 0.7308884859085083, | |
| "eval_runtime": 14.203, | |
| "eval_samples_per_second": 172.499, | |
| "eval_steps_per_second": 5.421, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.13854070457844042, | |
| "grad_norm": 13.742298126220703, | |
| "learning_rate": 1.9163316693908205e-05, | |
| "loss": 1.1556, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 0.14513788098693758, | |
| "grad_norm": 14.966007232666016, | |
| "learning_rate": 1.9119141228961435e-05, | |
| "loss": 1.0251, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.15173505739543475, | |
| "grad_norm": 16.00642967224121, | |
| "learning_rate": 1.907496576401467e-05, | |
| "loss": 1.1943, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 0.1583322338039319, | |
| "grad_norm": 13.92847728729248, | |
| "learning_rate": 1.90307902990679e-05, | |
| "loss": 1.086, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.16492941021242907, | |
| "grad_norm": 16.767595291137695, | |
| "learning_rate": 1.898661483412113e-05, | |
| "loss": 1.1236, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.16492941021242907, | |
| "eval_accuracy": 0.9008163213729858, | |
| "eval_loss": 0.6945549845695496, | |
| "eval_runtime": 14.235, | |
| "eval_samples_per_second": 172.11, | |
| "eval_steps_per_second": 5.409, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 0.17152658662092624, | |
| "grad_norm": 13.734739303588867, | |
| "learning_rate": 1.894243936917436e-05, | |
| "loss": 1.0485, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.1781237630294234, | |
| "grad_norm": 8.18282699584961, | |
| "learning_rate": 1.8898263904227594e-05, | |
| "loss": 0.9481, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 0.18472093943792056, | |
| "grad_norm": 13.874724388122559, | |
| "learning_rate": 1.8854088439280824e-05, | |
| "loss": 1.0898, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.19131811584641772, | |
| "grad_norm": 15.291620254516602, | |
| "learning_rate": 1.8809912974334058e-05, | |
| "loss": 1.0863, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 0.1979152922549149, | |
| "grad_norm": 21.6629581451416, | |
| "learning_rate": 1.8765737509387288e-05, | |
| "loss": 1.0756, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.1979152922549149, | |
| "eval_accuracy": 0.9036734700202942, | |
| "eval_loss": 0.6746897101402283, | |
| "eval_runtime": 14.4342, | |
| "eval_samples_per_second": 169.736, | |
| "eval_steps_per_second": 5.335, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.20451246866341205, | |
| "grad_norm": 14.420069694519043, | |
| "learning_rate": 1.872156204444052e-05, | |
| "loss": 0.9973, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 0.2111096450719092, | |
| "grad_norm": 7.541851043701172, | |
| "learning_rate": 1.867738657949375e-05, | |
| "loss": 1.1098, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.21770682148040638, | |
| "grad_norm": 15.377376556396484, | |
| "learning_rate": 1.8633211114546983e-05, | |
| "loss": 1.1745, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 0.22430399788890354, | |
| "grad_norm": 10.262870788574219, | |
| "learning_rate": 1.8589035649600213e-05, | |
| "loss": 0.9654, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.2309011742974007, | |
| "grad_norm": 11.869269371032715, | |
| "learning_rate": 1.8544860184653447e-05, | |
| "loss": 1.0919, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.2309011742974007, | |
| "eval_accuracy": 0.9093877673149109, | |
| "eval_loss": 0.649857223033905, | |
| "eval_runtime": 14.4377, | |
| "eval_samples_per_second": 169.695, | |
| "eval_steps_per_second": 5.333, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 0.23749835070589786, | |
| "grad_norm": 12.47613525390625, | |
| "learning_rate": 1.8500684719706677e-05, | |
| "loss": 1.0249, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.24409552711439503, | |
| "grad_norm": 11.694280624389648, | |
| "learning_rate": 1.8456509254759907e-05, | |
| "loss": 0.9863, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 0.2506927035228922, | |
| "grad_norm": 6.96587610244751, | |
| "learning_rate": 1.841233378981314e-05, | |
| "loss": 1.1091, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.25728987993138935, | |
| "grad_norm": 16.962194442749023, | |
| "learning_rate": 1.836815832486637e-05, | |
| "loss": 1.0989, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 0.2638870563398865, | |
| "grad_norm": 16.43683433532715, | |
| "learning_rate": 1.83239828599196e-05, | |
| "loss": 1.0662, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.2638870563398865, | |
| "eval_accuracy": 0.9065306186676025, | |
| "eval_loss": 0.6661304235458374, | |
| "eval_runtime": 14.2056, | |
| "eval_samples_per_second": 172.467, | |
| "eval_steps_per_second": 5.42, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.2704842327483837, | |
| "grad_norm": 12.090469360351562, | |
| "learning_rate": 1.8279807394972832e-05, | |
| "loss": 1.0456, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 0.27708140915688084, | |
| "grad_norm": 14.27798843383789, | |
| "learning_rate": 1.8235631930026066e-05, | |
| "loss": 1.1349, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.283678585565378, | |
| "grad_norm": 14.521726608276367, | |
| "learning_rate": 1.8191456465079296e-05, | |
| "loss": 1.0111, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 0.29027576197387517, | |
| "grad_norm": 9.772090911865234, | |
| "learning_rate": 1.814728100013253e-05, | |
| "loss": 1.026, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.29687293838237233, | |
| "grad_norm": 15.107865333557129, | |
| "learning_rate": 1.810310553518576e-05, | |
| "loss": 0.9415, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.29687293838237233, | |
| "eval_accuracy": 0.9073469638824463, | |
| "eval_loss": 0.6389794945716858, | |
| "eval_runtime": 14.5287, | |
| "eval_samples_per_second": 168.631, | |
| "eval_steps_per_second": 5.3, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 0.3034701147908695, | |
| "grad_norm": 15.88947582244873, | |
| "learning_rate": 1.805893007023899e-05, | |
| "loss": 0.9761, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.31006729119936666, | |
| "grad_norm": 13.472917556762695, | |
| "learning_rate": 1.801475460529222e-05, | |
| "loss": 0.9748, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 0.3166644676078638, | |
| "grad_norm": 14.00285530090332, | |
| "learning_rate": 1.7970579140345454e-05, | |
| "loss": 1.0238, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.323261644016361, | |
| "grad_norm": 15.622306823730469, | |
| "learning_rate": 1.7926403675398685e-05, | |
| "loss": 1.0456, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 0.32985882042485815, | |
| "grad_norm": 12.947722434997559, | |
| "learning_rate": 1.788222821045192e-05, | |
| "loss": 0.9895, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.32985882042485815, | |
| "eval_accuracy": 0.9110203981399536, | |
| "eval_loss": 0.6434822678565979, | |
| "eval_runtime": 14.6459, | |
| "eval_samples_per_second": 167.283, | |
| "eval_steps_per_second": 5.257, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.3364559968333553, | |
| "grad_norm": 16.64254379272461, | |
| "learning_rate": 1.783805274550515e-05, | |
| "loss": 0.8796, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 0.34305317324185247, | |
| "grad_norm": 12.790375709533691, | |
| "learning_rate": 1.779387728055838e-05, | |
| "loss": 1.0172, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.34965034965034963, | |
| "grad_norm": 13.025754928588867, | |
| "learning_rate": 1.774970181561161e-05, | |
| "loss": 1.014, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 0.3562475260588468, | |
| "grad_norm": 13.217813491821289, | |
| "learning_rate": 1.770552635066484e-05, | |
| "loss": 0.9748, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.36284470246734396, | |
| "grad_norm": 13.908524513244629, | |
| "learning_rate": 1.7661350885718073e-05, | |
| "loss": 0.9273, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.36284470246734396, | |
| "eval_accuracy": 0.9081632494926453, | |
| "eval_loss": 0.6303015947341919, | |
| "eval_runtime": 14.1305, | |
| "eval_samples_per_second": 173.384, | |
| "eval_steps_per_second": 5.449, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 0.3694418788758411, | |
| "grad_norm": 12.053607940673828, | |
| "learning_rate": 1.7617175420771304e-05, | |
| "loss": 1.0122, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.3760390552843383, | |
| "grad_norm": 13.809615135192871, | |
| "learning_rate": 1.7572999955824538e-05, | |
| "loss": 1.0054, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 0.38263623169283545, | |
| "grad_norm": 14.718282699584961, | |
| "learning_rate": 1.7528824490877768e-05, | |
| "loss": 0.8974, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.3892334081013326, | |
| "grad_norm": 16.11876678466797, | |
| "learning_rate": 1.7484649025930998e-05, | |
| "loss": 0.9396, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 0.3958305845098298, | |
| "grad_norm": 9.439668655395508, | |
| "learning_rate": 1.744047356098423e-05, | |
| "loss": 0.8734, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.3958305845098298, | |
| "eval_accuracy": 0.9048979878425598, | |
| "eval_loss": 0.6237688064575195, | |
| "eval_runtime": 14.1809, | |
| "eval_samples_per_second": 172.767, | |
| "eval_steps_per_second": 5.43, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.40242776091832694, | |
| "grad_norm": 10.298930168151855, | |
| "learning_rate": 1.7396298096037462e-05, | |
| "loss": 1.0048, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 0.4090249373268241, | |
| "grad_norm": 7.693696975708008, | |
| "learning_rate": 1.7352122631090693e-05, | |
| "loss": 0.9701, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.41562211373532126, | |
| "grad_norm": 16.300338745117188, | |
| "learning_rate": 1.7307947166143926e-05, | |
| "loss": 0.9924, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 0.4222192901438184, | |
| "grad_norm": 10.341270446777344, | |
| "learning_rate": 1.7263771701197157e-05, | |
| "loss": 0.9349, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.4288164665523156, | |
| "grad_norm": 14.08645248413086, | |
| "learning_rate": 1.721959623625039e-05, | |
| "loss": 0.974, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.4288164665523156, | |
| "eval_accuracy": 0.9118367433547974, | |
| "eval_loss": 0.6216471791267395, | |
| "eval_runtime": 14.3141, | |
| "eval_samples_per_second": 171.16, | |
| "eval_steps_per_second": 5.379, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 0.43541364296081275, | |
| "grad_norm": 20.50489044189453, | |
| "learning_rate": 1.717542077130362e-05, | |
| "loss": 1.0539, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.4420108193693099, | |
| "grad_norm": 17.268712997436523, | |
| "learning_rate": 1.713124530635685e-05, | |
| "loss": 0.9389, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 0.4486079957778071, | |
| "grad_norm": 12.712272644042969, | |
| "learning_rate": 1.708706984141008e-05, | |
| "loss": 0.9171, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.45520517218630424, | |
| "grad_norm": 12.377297401428223, | |
| "learning_rate": 1.704289437646331e-05, | |
| "loss": 0.9706, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 0.4618023485948014, | |
| "grad_norm": 18.502830505371094, | |
| "learning_rate": 1.6998718911516545e-05, | |
| "loss": 1.0124, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.4618023485948014, | |
| "eval_accuracy": 0.9065306186676025, | |
| "eval_loss": 0.6126046180725098, | |
| "eval_runtime": 14.1583, | |
| "eval_samples_per_second": 173.043, | |
| "eval_steps_per_second": 5.438, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.46839952500329857, | |
| "grad_norm": 17.543399810791016, | |
| "learning_rate": 1.6954543446569776e-05, | |
| "loss": 0.9215, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 0.47499670141179573, | |
| "grad_norm": 15.049899101257324, | |
| "learning_rate": 1.691036798162301e-05, | |
| "loss": 0.8563, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.4815938778202929, | |
| "grad_norm": 14.00575065612793, | |
| "learning_rate": 1.686619251667624e-05, | |
| "loss": 0.8249, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 0.48819105422879006, | |
| "grad_norm": 19.295759201049805, | |
| "learning_rate": 1.682201705172947e-05, | |
| "loss": 0.8794, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.4947882306372872, | |
| "grad_norm": 14.837241172790527, | |
| "learning_rate": 1.67778415867827e-05, | |
| "loss": 1.0013, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.4947882306372872, | |
| "eval_accuracy": 0.9077550768852234, | |
| "eval_loss": 0.6021705865859985, | |
| "eval_runtime": 14.1781, | |
| "eval_samples_per_second": 172.802, | |
| "eval_steps_per_second": 5.431, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 0.5013854070457844, | |
| "grad_norm": 12.423500061035156, | |
| "learning_rate": 1.6733666121835934e-05, | |
| "loss": 0.922, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.5079825834542816, | |
| "grad_norm": 13.505254745483398, | |
| "learning_rate": 1.6689490656889164e-05, | |
| "loss": 0.9168, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 0.5145797598627787, | |
| "grad_norm": 12.56449031829834, | |
| "learning_rate": 1.6645315191942398e-05, | |
| "loss": 0.9315, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.5211769362712759, | |
| "grad_norm": 11.239628791809082, | |
| "learning_rate": 1.660113972699563e-05, | |
| "loss": 0.9265, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 0.527774112679773, | |
| "grad_norm": 9.262091636657715, | |
| "learning_rate": 1.655696426204886e-05, | |
| "loss": 0.9453, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.527774112679773, | |
| "eval_accuracy": 0.9077550768852234, | |
| "eval_loss": 0.6083095669746399, | |
| "eval_runtime": 14.2575, | |
| "eval_samples_per_second": 171.839, | |
| "eval_steps_per_second": 5.401, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.5343712890882703, | |
| "grad_norm": 11.317748069763184, | |
| "learning_rate": 1.651278879710209e-05, | |
| "loss": 0.9585, | |
| "step": 4050 | |
| }, | |
| { | |
| "epoch": 0.5409684654967674, | |
| "grad_norm": 13.768712997436523, | |
| "learning_rate": 1.6468613332155323e-05, | |
| "loss": 0.9886, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.5475656419052646, | |
| "grad_norm": 11.504364967346191, | |
| "learning_rate": 1.6424437867208553e-05, | |
| "loss": 0.9081, | |
| "step": 4150 | |
| }, | |
| { | |
| "epoch": 0.5541628183137617, | |
| "grad_norm": 16.876300811767578, | |
| "learning_rate": 1.6380262402261787e-05, | |
| "loss": 0.8181, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.5607599947222589, | |
| "grad_norm": 15.651288986206055, | |
| "learning_rate": 1.6336086937315017e-05, | |
| "loss": 0.8806, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.5607599947222589, | |
| "eval_accuracy": 0.9118367433547974, | |
| "eval_loss": 0.5917608141899109, | |
| "eval_runtime": 14.7158, | |
| "eval_samples_per_second": 166.487, | |
| "eval_steps_per_second": 5.232, | |
| "step": 4250 | |
| }, | |
| { | |
| "epoch": 0.567357171130756, | |
| "grad_norm": 11.500801086425781, | |
| "learning_rate": 1.6291911472368248e-05, | |
| "loss": 0.858, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.5739543475392532, | |
| "grad_norm": 10.485420227050781, | |
| "learning_rate": 1.624773600742148e-05, | |
| "loss": 0.8781, | |
| "step": 4350 | |
| }, | |
| { | |
| "epoch": 0.5805515239477503, | |
| "grad_norm": 8.773555755615234, | |
| "learning_rate": 1.620356054247471e-05, | |
| "loss": 0.9059, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.5871487003562476, | |
| "grad_norm": 12.097881317138672, | |
| "learning_rate": 1.6159385077527942e-05, | |
| "loss": 0.8475, | |
| "step": 4450 | |
| }, | |
| { | |
| "epoch": 0.5937458767647447, | |
| "grad_norm": 9.051371574401855, | |
| "learning_rate": 1.6115209612581172e-05, | |
| "loss": 0.9649, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.5937458767647447, | |
| "eval_accuracy": 0.9057142734527588, | |
| "eval_loss": 0.5950626730918884, | |
| "eval_runtime": 14.9206, | |
| "eval_samples_per_second": 164.202, | |
| "eval_steps_per_second": 5.161, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.6003430531732419, | |
| "grad_norm": 15.799858093261719, | |
| "learning_rate": 1.6071034147634406e-05, | |
| "loss": 0.969, | |
| "step": 4550 | |
| }, | |
| { | |
| "epoch": 0.606940229581739, | |
| "grad_norm": 10.038565635681152, | |
| "learning_rate": 1.6026858682687636e-05, | |
| "loss": 0.8685, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.6135374059902362, | |
| "grad_norm": 14.452479362487793, | |
| "learning_rate": 1.598268321774087e-05, | |
| "loss": 0.9555, | |
| "step": 4650 | |
| }, | |
| { | |
| "epoch": 0.6201345823987333, | |
| "grad_norm": 14.48049259185791, | |
| "learning_rate": 1.59385077527941e-05, | |
| "loss": 0.9166, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.6267317588072305, | |
| "grad_norm": 10.772700309753418, | |
| "learning_rate": 1.589433228784733e-05, | |
| "loss": 0.877, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.6267317588072305, | |
| "eval_accuracy": 0.9073469638824463, | |
| "eval_loss": 0.5858258605003357, | |
| "eval_runtime": 14.792, | |
| "eval_samples_per_second": 165.63, | |
| "eval_steps_per_second": 5.206, | |
| "step": 4750 | |
| }, | |
| { | |
| "epoch": 0.6333289352157276, | |
| "grad_norm": 12.199923515319824, | |
| "learning_rate": 1.585015682290056e-05, | |
| "loss": 0.938, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.6399261116242249, | |
| "grad_norm": 11.47739315032959, | |
| "learning_rate": 1.5805981357953795e-05, | |
| "loss": 0.9211, | |
| "step": 4850 | |
| }, | |
| { | |
| "epoch": 0.646523288032722, | |
| "grad_norm": 12.546594619750977, | |
| "learning_rate": 1.5761805893007025e-05, | |
| "loss": 0.9699, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.6531204644412192, | |
| "grad_norm": 15.941895484924316, | |
| "learning_rate": 1.571763042806026e-05, | |
| "loss": 0.8818, | |
| "step": 4950 | |
| }, | |
| { | |
| "epoch": 0.6597176408497163, | |
| "grad_norm": 12.06876277923584, | |
| "learning_rate": 1.567345496311349e-05, | |
| "loss": 0.9814, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.6597176408497163, | |
| "eval_accuracy": 0.9175510406494141, | |
| "eval_loss": 0.5705481767654419, | |
| "eval_runtime": 14.3212, | |
| "eval_samples_per_second": 171.075, | |
| "eval_steps_per_second": 5.377, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.6663148172582135, | |
| "grad_norm": 11.047979354858398, | |
| "learning_rate": 1.562927949816672e-05, | |
| "loss": 0.8588, | |
| "step": 5050 | |
| }, | |
| { | |
| "epoch": 0.6729119936667106, | |
| "grad_norm": 13.39299488067627, | |
| "learning_rate": 1.558510403321995e-05, | |
| "loss": 0.8922, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.6795091700752078, | |
| "grad_norm": 11.451362609863281, | |
| "learning_rate": 1.554092856827318e-05, | |
| "loss": 1.0096, | |
| "step": 5150 | |
| }, | |
| { | |
| "epoch": 0.6861063464837049, | |
| "grad_norm": 3.436371326446533, | |
| "learning_rate": 1.5496753103326414e-05, | |
| "loss": 0.9217, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.6927035228922022, | |
| "grad_norm": 9.360651016235352, | |
| "learning_rate": 1.5452577638379644e-05, | |
| "loss": 0.9446, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.6927035228922022, | |
| "eval_accuracy": 0.9146938920021057, | |
| "eval_loss": 0.5739869475364685, | |
| "eval_runtime": 14.2053, | |
| "eval_samples_per_second": 172.471, | |
| "eval_steps_per_second": 5.421, | |
| "step": 5250 | |
| }, | |
| { | |
| "epoch": 0.6993006993006993, | |
| "grad_norm": 13.184717178344727, | |
| "learning_rate": 1.5408402173432878e-05, | |
| "loss": 0.9301, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.7058978757091965, | |
| "grad_norm": 9.54310417175293, | |
| "learning_rate": 1.5364226708486108e-05, | |
| "loss": 0.8436, | |
| "step": 5350 | |
| }, | |
| { | |
| "epoch": 0.7124950521176936, | |
| "grad_norm": 12.212594032287598, | |
| "learning_rate": 1.532005124353934e-05, | |
| "loss": 0.8547, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.7190922285261908, | |
| "grad_norm": 13.941079139709473, | |
| "learning_rate": 1.527587577859257e-05, | |
| "loss": 0.9552, | |
| "step": 5450 | |
| }, | |
| { | |
| "epoch": 0.7256894049346879, | |
| "grad_norm": 9.494156837463379, | |
| "learning_rate": 1.5232583822944737e-05, | |
| "loss": 0.9227, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.7256894049346879, | |
| "eval_accuracy": 0.9134693741798401, | |
| "eval_loss": 0.5912680625915527, | |
| "eval_runtime": 14.37, | |
| "eval_samples_per_second": 170.494, | |
| "eval_steps_per_second": 5.358, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.7322865813431851, | |
| "grad_norm": 15.922163963317871, | |
| "learning_rate": 1.518840835799797e-05, | |
| "loss": 0.8813, | |
| "step": 5550 | |
| }, | |
| { | |
| "epoch": 0.7388837577516822, | |
| "grad_norm": 5.2287068367004395, | |
| "learning_rate": 1.5144232893051201e-05, | |
| "loss": 0.8519, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.7454809341601795, | |
| "grad_norm": 13.272147178649902, | |
| "learning_rate": 1.5100057428104431e-05, | |
| "loss": 0.8223, | |
| "step": 5650 | |
| }, | |
| { | |
| "epoch": 0.7520781105686766, | |
| "grad_norm": 10.859210968017578, | |
| "learning_rate": 1.5055881963157663e-05, | |
| "loss": 0.8603, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.7586752869771738, | |
| "grad_norm": 13.087916374206543, | |
| "learning_rate": 1.5011706498210894e-05, | |
| "loss": 0.8208, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.7586752869771738, | |
| "eval_accuracy": 0.9151020646095276, | |
| "eval_loss": 0.5698295831680298, | |
| "eval_runtime": 14.3053, | |
| "eval_samples_per_second": 171.265, | |
| "eval_steps_per_second": 5.383, | |
| "step": 5750 | |
| }, | |
| { | |
| "epoch": 0.7652724633856709, | |
| "grad_norm": 11.395907402038574, | |
| "learning_rate": 1.4967531033264127e-05, | |
| "loss": 0.8542, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.7718696397941681, | |
| "grad_norm": 12.993699073791504, | |
| "learning_rate": 1.4923355568317358e-05, | |
| "loss": 0.7924, | |
| "step": 5850 | |
| }, | |
| { | |
| "epoch": 0.7784668162026652, | |
| "grad_norm": 12.30950927734375, | |
| "learning_rate": 1.487918010337059e-05, | |
| "loss": 0.9238, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.7850639926111624, | |
| "grad_norm": 14.112768173217773, | |
| "learning_rate": 1.483500463842382e-05, | |
| "loss": 0.8303, | |
| "step": 5950 | |
| }, | |
| { | |
| "epoch": 0.7916611690196595, | |
| "grad_norm": 12.148374557495117, | |
| "learning_rate": 1.4790829173477052e-05, | |
| "loss": 0.8254, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.7916611690196595, | |
| "eval_accuracy": 0.9159183502197266, | |
| "eval_loss": 0.5643152594566345, | |
| "eval_runtime": 14.2837, | |
| "eval_samples_per_second": 171.524, | |
| "eval_steps_per_second": 5.391, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.7982583454281568, | |
| "grad_norm": 22.816574096679688, | |
| "learning_rate": 1.4746653708530282e-05, | |
| "loss": 0.8556, | |
| "step": 6050 | |
| }, | |
| { | |
| "epoch": 0.8048555218366539, | |
| "grad_norm": 20.84126853942871, | |
| "learning_rate": 1.4702478243583516e-05, | |
| "loss": 0.9286, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.8114526982451511, | |
| "grad_norm": 10.403019905090332, | |
| "learning_rate": 1.4658302778636746e-05, | |
| "loss": 0.8776, | |
| "step": 6150 | |
| }, | |
| { | |
| "epoch": 0.8180498746536482, | |
| "grad_norm": 12.371121406555176, | |
| "learning_rate": 1.4614127313689978e-05, | |
| "loss": 0.8146, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.8246470510621454, | |
| "grad_norm": 8.280356407165527, | |
| "learning_rate": 1.4569951848743209e-05, | |
| "loss": 0.8469, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.8246470510621454, | |
| "eval_accuracy": 0.9126530885696411, | |
| "eval_loss": 0.5626720190048218, | |
| "eval_runtime": 14.5893, | |
| "eval_samples_per_second": 167.931, | |
| "eval_steps_per_second": 5.278, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 0.8312442274706425, | |
| "grad_norm": 12.503052711486816, | |
| "learning_rate": 1.452577638379644e-05, | |
| "loss": 0.9719, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.8378414038791397, | |
| "grad_norm": 8.480955123901367, | |
| "learning_rate": 1.4481600918849673e-05, | |
| "loss": 0.9297, | |
| "step": 6350 | |
| }, | |
| { | |
| "epoch": 0.8444385802876369, | |
| "grad_norm": 11.265844345092773, | |
| "learning_rate": 1.4437425453902903e-05, | |
| "loss": 0.896, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.8510357566961341, | |
| "grad_norm": 14.131726264953613, | |
| "learning_rate": 1.4393249988956135e-05, | |
| "loss": 0.8709, | |
| "step": 6450 | |
| }, | |
| { | |
| "epoch": 0.8576329331046312, | |
| "grad_norm": 15.310577392578125, | |
| "learning_rate": 1.4349958033308302e-05, | |
| "loss": 0.9436, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.8576329331046312, | |
| "eval_accuracy": 0.9159183502197266, | |
| "eval_loss": 0.5638322234153748, | |
| "eval_runtime": 14.7169, | |
| "eval_samples_per_second": 166.475, | |
| "eval_steps_per_second": 5.232, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.8642301095131284, | |
| "grad_norm": 9.686817169189453, | |
| "learning_rate": 1.4305782568361532e-05, | |
| "loss": 0.8938, | |
| "step": 6550 | |
| }, | |
| { | |
| "epoch": 0.8708272859216255, | |
| "grad_norm": 13.079803466796875, | |
| "learning_rate": 1.4261607103414766e-05, | |
| "loss": 0.8065, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.8774244623301227, | |
| "grad_norm": 8.047029495239258, | |
| "learning_rate": 1.4217431638467996e-05, | |
| "loss": 0.8281, | |
| "step": 6650 | |
| }, | |
| { | |
| "epoch": 0.8840216387386198, | |
| "grad_norm": 9.289030075073242, | |
| "learning_rate": 1.4173256173521228e-05, | |
| "loss": 0.8449, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.890618815147117, | |
| "grad_norm": 14.22453498840332, | |
| "learning_rate": 1.4129080708574458e-05, | |
| "loss": 0.813, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 0.890618815147117, | |
| "eval_accuracy": 0.9167346954345703, | |
| "eval_loss": 0.5693557262420654, | |
| "eval_runtime": 14.6084, | |
| "eval_samples_per_second": 167.711, | |
| "eval_steps_per_second": 5.271, | |
| "step": 6750 | |
| }, | |
| { | |
| "epoch": 0.8972159915556142, | |
| "grad_norm": 13.204727172851562, | |
| "learning_rate": 1.408490524362769e-05, | |
| "loss": 0.9052, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.9038131679641114, | |
| "grad_norm": 13.640901565551758, | |
| "learning_rate": 1.404072977868092e-05, | |
| "loss": 0.9501, | |
| "step": 6850 | |
| }, | |
| { | |
| "epoch": 0.9104103443726085, | |
| "grad_norm": 10.711437225341797, | |
| "learning_rate": 1.3996554313734155e-05, | |
| "loss": 0.9612, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.9170075207811057, | |
| "grad_norm": 13.457585334777832, | |
| "learning_rate": 1.3952378848787385e-05, | |
| "loss": 0.8649, | |
| "step": 6950 | |
| }, | |
| { | |
| "epoch": 0.9236046971896028, | |
| "grad_norm": 12.793722152709961, | |
| "learning_rate": 1.3908203383840615e-05, | |
| "loss": 0.7366, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.9236046971896028, | |
| "eval_accuracy": 0.9187754988670349, | |
| "eval_loss": 0.5691282153129578, | |
| "eval_runtime": 14.243, | |
| "eval_samples_per_second": 172.014, | |
| "eval_steps_per_second": 5.406, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.9302018735981, | |
| "grad_norm": 15.058178901672363, | |
| "learning_rate": 1.3864027918893847e-05, | |
| "loss": 0.9621, | |
| "step": 7050 | |
| }, | |
| { | |
| "epoch": 0.9367990500065971, | |
| "grad_norm": 14.427763938903809, | |
| "learning_rate": 1.3819852453947078e-05, | |
| "loss": 0.9154, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.9433962264150944, | |
| "grad_norm": 13.261103630065918, | |
| "learning_rate": 1.3775676989000311e-05, | |
| "loss": 0.8617, | |
| "step": 7150 | |
| }, | |
| { | |
| "epoch": 0.9499934028235915, | |
| "grad_norm": 12.778352737426758, | |
| "learning_rate": 1.3731501524053542e-05, | |
| "loss": 0.8629, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.9565905792320887, | |
| "grad_norm": 14.332444190979004, | |
| "learning_rate": 1.3687326059106774e-05, | |
| "loss": 0.899, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 0.9565905792320887, | |
| "eval_accuracy": 0.9159183502197266, | |
| "eval_loss": 0.5559064745903015, | |
| "eval_runtime": 14.2133, | |
| "eval_samples_per_second": 172.374, | |
| "eval_steps_per_second": 5.417, | |
| "step": 7250 | |
| }, | |
| { | |
| "epoch": 0.9631877556405858, | |
| "grad_norm": 8.828652381896973, | |
| "learning_rate": 1.3643150594160004e-05, | |
| "loss": 0.7766, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.969784932049083, | |
| "grad_norm": 11.421220779418945, | |
| "learning_rate": 1.3598975129213236e-05, | |
| "loss": 0.8968, | |
| "step": 7350 | |
| }, | |
| { | |
| "epoch": 0.9763821084575801, | |
| "grad_norm": 13.00658893585205, | |
| "learning_rate": 1.3554799664266466e-05, | |
| "loss": 0.8462, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.9829792848660773, | |
| "grad_norm": 6.505890369415283, | |
| "learning_rate": 1.35106241993197e-05, | |
| "loss": 0.8478, | |
| "step": 7450 | |
| }, | |
| { | |
| "epoch": 0.9895764612745744, | |
| "grad_norm": 9.694055557250977, | |
| "learning_rate": 1.346644873437293e-05, | |
| "loss": 0.8184, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.9895764612745744, | |
| "eval_accuracy": 0.9163265228271484, | |
| "eval_loss": 0.5564213395118713, | |
| "eval_runtime": 14.1149, | |
| "eval_samples_per_second": 173.576, | |
| "eval_steps_per_second": 5.455, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.9961736376830717, | |
| "grad_norm": 8.785748481750488, | |
| "learning_rate": 1.3422273269426162e-05, | |
| "loss": 0.8445, | |
| "step": 7550 | |
| }, | |
| { | |
| "epoch": 1.0027708140915688, | |
| "grad_norm": 12.255693435668945, | |
| "learning_rate": 1.3378097804479393e-05, | |
| "loss": 0.7305, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 1.0093679905000659, | |
| "grad_norm": 12.03491497039795, | |
| "learning_rate": 1.3333922339532626e-05, | |
| "loss": 0.695, | |
| "step": 7650 | |
| }, | |
| { | |
| "epoch": 1.0159651669085632, | |
| "grad_norm": 15.055414199829102, | |
| "learning_rate": 1.3289746874585857e-05, | |
| "loss": 0.779, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 1.0225623433170603, | |
| "grad_norm": 3.5831682682037354, | |
| "learning_rate": 1.3245571409639089e-05, | |
| "loss": 0.5876, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 1.0225623433170603, | |
| "eval_accuracy": 0.918367326259613, | |
| "eval_loss": 0.5775763392448425, | |
| "eval_runtime": 14.119, | |
| "eval_samples_per_second": 173.525, | |
| "eval_steps_per_second": 5.454, | |
| "step": 7750 | |
| }, | |
| { | |
| "epoch": 1.0291595197255574, | |
| "grad_norm": 14.637757301330566, | |
| "learning_rate": 1.3201395944692319e-05, | |
| "loss": 0.6372, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 1.0357566961340545, | |
| "grad_norm": 9.048910140991211, | |
| "learning_rate": 1.315722047974555e-05, | |
| "loss": 0.7066, | |
| "step": 7850 | |
| }, | |
| { | |
| "epoch": 1.0423538725425519, | |
| "grad_norm": 13.023659706115723, | |
| "learning_rate": 1.3113045014798781e-05, | |
| "loss": 0.6561, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 1.048951048951049, | |
| "grad_norm": 13.10300350189209, | |
| "learning_rate": 1.3068869549852012e-05, | |
| "loss": 0.6854, | |
| "step": 7950 | |
| }, | |
| { | |
| "epoch": 1.055548225359546, | |
| "grad_norm": 13.364474296569824, | |
| "learning_rate": 1.3024694084905245e-05, | |
| "loss": 0.7083, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.055548225359546, | |
| "eval_accuracy": 0.9212244749069214, | |
| "eval_loss": 0.5645425915718079, | |
| "eval_runtime": 14.1023, | |
| "eval_samples_per_second": 173.731, | |
| "eval_steps_per_second": 5.46, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.0621454017680432, | |
| "grad_norm": 10.857477188110352, | |
| "learning_rate": 1.2980518619958476e-05, | |
| "loss": 0.6618, | |
| "step": 8050 | |
| }, | |
| { | |
| "epoch": 1.0687425781765405, | |
| "grad_norm": 13.178641319274902, | |
| "learning_rate": 1.2936343155011708e-05, | |
| "loss": 0.6602, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 1.0753397545850376, | |
| "grad_norm": 8.929798126220703, | |
| "learning_rate": 1.2892167690064938e-05, | |
| "loss": 0.7141, | |
| "step": 8150 | |
| }, | |
| { | |
| "epoch": 1.0819369309935347, | |
| "grad_norm": 14.156282424926758, | |
| "learning_rate": 1.2847992225118172e-05, | |
| "loss": 0.7599, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 1.0885341074020318, | |
| "grad_norm": 11.46021842956543, | |
| "learning_rate": 1.2803816760171402e-05, | |
| "loss": 0.6307, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 1.0885341074020318, | |
| "eval_accuracy": 0.9159183502197266, | |
| "eval_loss": 0.5608085989952087, | |
| "eval_runtime": 14.088, | |
| "eval_samples_per_second": 173.907, | |
| "eval_steps_per_second": 5.466, | |
| "step": 8250 | |
| }, | |
| { | |
| "epoch": 1.0951312838105292, | |
| "grad_norm": 13.287457466125488, | |
| "learning_rate": 1.2759641295224634e-05, | |
| "loss": 0.6611, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 1.1017284602190263, | |
| "grad_norm": 7.9682793617248535, | |
| "learning_rate": 1.2715465830277864e-05, | |
| "loss": 0.6308, | |
| "step": 8350 | |
| }, | |
| { | |
| "epoch": 1.1083256366275234, | |
| "grad_norm": 8.86072826385498, | |
| "learning_rate": 1.2671290365331097e-05, | |
| "loss": 0.7035, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 1.1149228130360207, | |
| "grad_norm": 16.224716186523438, | |
| "learning_rate": 1.2627114900384327e-05, | |
| "loss": 0.683, | |
| "step": 8450 | |
| }, | |
| { | |
| "epoch": 1.1215199894445178, | |
| "grad_norm": 16.066835403442383, | |
| "learning_rate": 1.258293943543756e-05, | |
| "loss": 0.7077, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.1215199894445178, | |
| "eval_accuracy": 0.918367326259613, | |
| "eval_loss": 0.5556493401527405, | |
| "eval_runtime": 14.2677, | |
| "eval_samples_per_second": 171.717, | |
| "eval_steps_per_second": 5.397, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 1.128117165853015, | |
| "grad_norm": 16.001686096191406, | |
| "learning_rate": 1.2538763970490791e-05, | |
| "loss": 0.7153, | |
| "step": 8550 | |
| }, | |
| { | |
| "epoch": 1.134714342261512, | |
| "grad_norm": 10.751116752624512, | |
| "learning_rate": 1.2494588505544021e-05, | |
| "loss": 0.6186, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 1.1413115186700091, | |
| "grad_norm": 13.352745056152344, | |
| "learning_rate": 1.2450413040597253e-05, | |
| "loss": 0.6289, | |
| "step": 8650 | |
| }, | |
| { | |
| "epoch": 1.1479086950785065, | |
| "grad_norm": 13.567846298217773, | |
| "learning_rate": 1.2406237575650484e-05, | |
| "loss": 0.5718, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 1.1545058714870036, | |
| "grad_norm": 7.751793384552002, | |
| "learning_rate": 1.2362062110703717e-05, | |
| "loss": 0.5749, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 1.1545058714870036, | |
| "eval_accuracy": 0.9167346954345703, | |
| "eval_loss": 0.5695374011993408, | |
| "eval_runtime": 14.4147, | |
| "eval_samples_per_second": 169.965, | |
| "eval_steps_per_second": 5.342, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 1.1611030478955007, | |
| "grad_norm": 17.14850425720215, | |
| "learning_rate": 1.2317886645756948e-05, | |
| "loss": 0.6788, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 1.167700224303998, | |
| "grad_norm": 15.17955493927002, | |
| "learning_rate": 1.227371118081018e-05, | |
| "loss": 0.7731, | |
| "step": 8850 | |
| }, | |
| { | |
| "epoch": 1.174297400712495, | |
| "grad_norm": 15.97493839263916, | |
| "learning_rate": 1.222953571586341e-05, | |
| "loss": 0.6954, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 1.1808945771209922, | |
| "grad_norm": 13.843533515930176, | |
| "learning_rate": 1.2185360250916642e-05, | |
| "loss": 0.7404, | |
| "step": 8950 | |
| }, | |
| { | |
| "epoch": 1.1874917535294893, | |
| "grad_norm": 2.941951274871826, | |
| "learning_rate": 1.2141184785969872e-05, | |
| "loss": 0.6871, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.1874917535294893, | |
| "eval_accuracy": 0.9208163022994995, | |
| "eval_loss": 0.5665779709815979, | |
| "eval_runtime": 14.3831, | |
| "eval_samples_per_second": 170.339, | |
| "eval_steps_per_second": 5.354, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.1940889299379864, | |
| "grad_norm": 12.621596336364746, | |
| "learning_rate": 1.2097009321023106e-05, | |
| "loss": 0.6415, | |
| "step": 9050 | |
| }, | |
| { | |
| "epoch": 1.2006861063464838, | |
| "grad_norm": 15.431377410888672, | |
| "learning_rate": 1.2052833856076336e-05, | |
| "loss": 0.6517, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 1.2072832827549809, | |
| "grad_norm": 18.660377502441406, | |
| "learning_rate": 1.2008658391129568e-05, | |
| "loss": 0.7354, | |
| "step": 9150 | |
| }, | |
| { | |
| "epoch": 1.213880459163478, | |
| "grad_norm": 16.014867782592773, | |
| "learning_rate": 1.1964482926182799e-05, | |
| "loss": 0.7325, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 1.2204776355719753, | |
| "grad_norm": 15.354519844055176, | |
| "learning_rate": 1.192030746123603e-05, | |
| "loss": 0.6272, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 1.2204776355719753, | |
| "eval_accuracy": 0.9146938920021057, | |
| "eval_loss": 0.5714155435562134, | |
| "eval_runtime": 14.5346, | |
| "eval_samples_per_second": 168.563, | |
| "eval_steps_per_second": 5.298, | |
| "step": 9250 | |
| }, | |
| { | |
| "epoch": 1.2270748119804724, | |
| "grad_norm": 12.623982429504395, | |
| "learning_rate": 1.1876131996289261e-05, | |
| "loss": 0.7292, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 1.2336719883889695, | |
| "grad_norm": 9.906524658203125, | |
| "learning_rate": 1.1831956531342495e-05, | |
| "loss": 0.6325, | |
| "step": 9350 | |
| }, | |
| { | |
| "epoch": 1.2402691647974666, | |
| "grad_norm": 13.0123872756958, | |
| "learning_rate": 1.1787781066395725e-05, | |
| "loss": 0.6344, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 1.2468663412059637, | |
| "grad_norm": 11.591238975524902, | |
| "learning_rate": 1.1743605601448955e-05, | |
| "loss": 0.7218, | |
| "step": 9450 | |
| }, | |
| { | |
| "epoch": 1.253463517614461, | |
| "grad_norm": 6.004245758056641, | |
| "learning_rate": 1.1699430136502187e-05, | |
| "loss": 0.6815, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.253463517614461, | |
| "eval_accuracy": 0.9175510406494141, | |
| "eval_loss": 0.5650636553764343, | |
| "eval_runtime": 14.6735, | |
| "eval_samples_per_second": 166.967, | |
| "eval_steps_per_second": 5.248, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 1.2600606940229582, | |
| "grad_norm": 15.778864860534668, | |
| "learning_rate": 1.1655254671555418e-05, | |
| "loss": 0.7186, | |
| "step": 9550 | |
| }, | |
| { | |
| "epoch": 1.2666578704314553, | |
| "grad_norm": 9.6397123336792, | |
| "learning_rate": 1.1611079206608651e-05, | |
| "loss": 0.6145, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 1.2732550468399526, | |
| "grad_norm": 10.774910926818848, | |
| "learning_rate": 1.1566903741661882e-05, | |
| "loss": 0.7095, | |
| "step": 9650 | |
| }, | |
| { | |
| "epoch": 1.2798522232484497, | |
| "grad_norm": 7.923967361450195, | |
| "learning_rate": 1.1522728276715114e-05, | |
| "loss": 0.674, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 1.2864493996569468, | |
| "grad_norm": 15.660514831542969, | |
| "learning_rate": 1.1478552811768344e-05, | |
| "loss": 0.7405, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 1.2864493996569468, | |
| "eval_accuracy": 0.9200000166893005, | |
| "eval_loss": 0.5666268467903137, | |
| "eval_runtime": 14.8203, | |
| "eval_samples_per_second": 165.314, | |
| "eval_steps_per_second": 5.196, | |
| "step": 9750 | |
| }, | |
| { | |
| "epoch": 1.293046576065444, | |
| "grad_norm": 7.9538397789001465, | |
| "learning_rate": 1.1434377346821576e-05, | |
| "loss": 0.7186, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 1.299643752473941, | |
| "grad_norm": 10.569086074829102, | |
| "learning_rate": 1.1390201881874806e-05, | |
| "loss": 0.6352, | |
| "step": 9850 | |
| }, | |
| { | |
| "epoch": 1.3062409288824384, | |
| "grad_norm": 13.37822151184082, | |
| "learning_rate": 1.134602641692804e-05, | |
| "loss": 0.7077, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 1.3128381052909355, | |
| "grad_norm": 14.899065017700195, | |
| "learning_rate": 1.130185095198127e-05, | |
| "loss": 0.6873, | |
| "step": 9950 | |
| }, | |
| { | |
| "epoch": 1.3194352816994326, | |
| "grad_norm": 9.051203727722168, | |
| "learning_rate": 1.1257675487034503e-05, | |
| "loss": 0.5939, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.3194352816994326, | |
| "eval_accuracy": 0.9204081892967224, | |
| "eval_loss": 0.5752307176589966, | |
| "eval_runtime": 14.7189, | |
| "eval_samples_per_second": 166.453, | |
| "eval_steps_per_second": 5.231, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.32603245810793, | |
| "grad_norm": 9.774917602539062, | |
| "learning_rate": 1.1213500022087733e-05, | |
| "loss": 0.7171, | |
| "step": 10050 | |
| }, | |
| { | |
| "epoch": 1.332629634516427, | |
| "grad_norm": 12.088335037231445, | |
| "learning_rate": 1.1169324557140967e-05, | |
| "loss": 0.6592, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 1.3392268109249241, | |
| "grad_norm": 6.4445881843566895, | |
| "learning_rate": 1.1125149092194197e-05, | |
| "loss": 0.6631, | |
| "step": 10150 | |
| }, | |
| { | |
| "epoch": 1.3458239873334212, | |
| "grad_norm": 17.67377471923828, | |
| "learning_rate": 1.1080973627247427e-05, | |
| "loss": 0.7658, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 1.3524211637419183, | |
| "grad_norm": 9.594240188598633, | |
| "learning_rate": 1.103679816230066e-05, | |
| "loss": 0.6213, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 1.3524211637419183, | |
| "eval_accuracy": 0.9179591536521912, | |
| "eval_loss": 0.5678022503852844, | |
| "eval_runtime": 14.3553, | |
| "eval_samples_per_second": 170.669, | |
| "eval_steps_per_second": 5.364, | |
| "step": 10250 | |
| }, | |
| { | |
| "epoch": 1.3590183401504157, | |
| "grad_norm": 13.705334663391113, | |
| "learning_rate": 1.099262269735389e-05, | |
| "loss": 0.6486, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 1.3656155165589128, | |
| "grad_norm": 9.945523262023926, | |
| "learning_rate": 1.0948447232407122e-05, | |
| "loss": 0.662, | |
| "step": 10350 | |
| }, | |
| { | |
| "epoch": 1.3722126929674099, | |
| "grad_norm": 10.699588775634766, | |
| "learning_rate": 1.0904271767460352e-05, | |
| "loss": 0.6924, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 1.3788098693759072, | |
| "grad_norm": 11.88040828704834, | |
| "learning_rate": 1.0860096302513586e-05, | |
| "loss": 0.7106, | |
| "step": 10450 | |
| }, | |
| { | |
| "epoch": 1.3854070457844043, | |
| "grad_norm": 9.69964599609375, | |
| "learning_rate": 1.0815920837566816e-05, | |
| "loss": 0.7239, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.3854070457844043, | |
| "eval_accuracy": 0.918367326259613, | |
| "eval_loss": 0.5687153935432434, | |
| "eval_runtime": 15.0319, | |
| "eval_samples_per_second": 162.987, | |
| "eval_steps_per_second": 5.122, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 1.3920042221929014, | |
| "grad_norm": 7.956460475921631, | |
| "learning_rate": 1.0771745372620048e-05, | |
| "loss": 0.735, | |
| "step": 10550 | |
| }, | |
| { | |
| "epoch": 1.3986013986013985, | |
| "grad_norm": 15.08421802520752, | |
| "learning_rate": 1.0728453416972216e-05, | |
| "loss": 0.6784, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 1.4051985750098956, | |
| "grad_norm": 7.856141090393066, | |
| "learning_rate": 1.0684277952025447e-05, | |
| "loss": 0.6886, | |
| "step": 10650 | |
| }, | |
| { | |
| "epoch": 1.411795751418393, | |
| "grad_norm": 20.228710174560547, | |
| "learning_rate": 1.0640102487078679e-05, | |
| "loss": 0.649, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 1.41839292782689, | |
| "grad_norm": 8.827073097229004, | |
| "learning_rate": 1.0595927022131909e-05, | |
| "loss": 0.6133, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 1.41839292782689, | |
| "eval_accuracy": 0.9200000166893005, | |
| "eval_loss": 0.5682947039604187, | |
| "eval_runtime": 14.6806, | |
| "eval_samples_per_second": 166.887, | |
| "eval_steps_per_second": 5.245, | |
| "step": 10750 | |
| }, | |
| { | |
| "epoch": 1.4249901042353872, | |
| "grad_norm": 12.990625381469727, | |
| "learning_rate": 1.055175155718514e-05, | |
| "loss": 0.6635, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 1.4315872806438845, | |
| "grad_norm": 13.446993827819824, | |
| "learning_rate": 1.0507576092238371e-05, | |
| "loss": 0.6803, | |
| "step": 10850 | |
| }, | |
| { | |
| "epoch": 1.4381844570523816, | |
| "grad_norm": 16.174983978271484, | |
| "learning_rate": 1.0463400627291602e-05, | |
| "loss": 0.6497, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 1.4447816334608787, | |
| "grad_norm": 14.54861831665039, | |
| "learning_rate": 1.0419225162344835e-05, | |
| "loss": 0.6812, | |
| "step": 10950 | |
| }, | |
| { | |
| "epoch": 1.4513788098693758, | |
| "grad_norm": 15.023179054260254, | |
| "learning_rate": 1.0375049697398066e-05, | |
| "loss": 0.7493, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.4513788098693758, | |
| "eval_accuracy": 0.9220408201217651, | |
| "eval_loss": 0.5586878657341003, | |
| "eval_runtime": 14.6736, | |
| "eval_samples_per_second": 166.967, | |
| "eval_steps_per_second": 5.248, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.457975986277873, | |
| "grad_norm": 13.154565811157227, | |
| "learning_rate": 1.0330874232451298e-05, | |
| "loss": 0.6694, | |
| "step": 11050 | |
| }, | |
| { | |
| "epoch": 1.4645731626863703, | |
| "grad_norm": 10.797453880310059, | |
| "learning_rate": 1.0286698767504528e-05, | |
| "loss": 0.6782, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 1.4711703390948674, | |
| "grad_norm": 9.909940719604492, | |
| "learning_rate": 1.0242523302557762e-05, | |
| "loss": 0.6839, | |
| "step": 11150 | |
| }, | |
| { | |
| "epoch": 1.4777675155033645, | |
| "grad_norm": 10.230202674865723, | |
| "learning_rate": 1.0198347837610992e-05, | |
| "loss": 0.671, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 1.4843646919118618, | |
| "grad_norm": 10.81137752532959, | |
| "learning_rate": 1.0154172372664224e-05, | |
| "loss": 0.7648, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 1.4843646919118618, | |
| "eval_accuracy": 0.9208163022994995, | |
| "eval_loss": 0.5465655326843262, | |
| "eval_runtime": 14.3093, | |
| "eval_samples_per_second": 171.217, | |
| "eval_steps_per_second": 5.381, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 1.490961868320359, | |
| "grad_norm": 16.720306396484375, | |
| "learning_rate": 1.0109996907717454e-05, | |
| "loss": 0.7448, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 1.497559044728856, | |
| "grad_norm": 10.62578296661377, | |
| "learning_rate": 1.0065821442770686e-05, | |
| "loss": 0.6811, | |
| "step": 11350 | |
| }, | |
| { | |
| "epoch": 1.5041562211373534, | |
| "grad_norm": 12.68857479095459, | |
| "learning_rate": 1.0021645977823917e-05, | |
| "loss": 0.6984, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 1.5107533975458503, | |
| "grad_norm": 9.58633804321289, | |
| "learning_rate": 9.977470512877149e-06, | |
| "loss": 0.6676, | |
| "step": 11450 | |
| }, | |
| { | |
| "epoch": 1.5173505739543476, | |
| "grad_norm": 16.852190017700195, | |
| "learning_rate": 9.93329504793038e-06, | |
| "loss": 0.7054, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 1.5173505739543476, | |
| "eval_accuracy": 0.9204081892967224, | |
| "eval_loss": 0.5569261908531189, | |
| "eval_runtime": 14.3932, | |
| "eval_samples_per_second": 170.219, | |
| "eval_steps_per_second": 5.35, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 1.5239477503628447, | |
| "grad_norm": 12.476948738098145, | |
| "learning_rate": 9.889119582983611e-06, | |
| "loss": 0.6109, | |
| "step": 11550 | |
| }, | |
| { | |
| "epoch": 1.5305449267713418, | |
| "grad_norm": 15.437007904052734, | |
| "learning_rate": 9.844944118036843e-06, | |
| "loss": 0.7581, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 1.5371421031798391, | |
| "grad_norm": 14.643590927124023, | |
| "learning_rate": 9.800768653090075e-06, | |
| "loss": 0.7035, | |
| "step": 11650 | |
| }, | |
| { | |
| "epoch": 1.5437392795883362, | |
| "grad_norm": 8.441386222839355, | |
| "learning_rate": 9.756593188143305e-06, | |
| "loss": 0.6943, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 1.5503364559968333, | |
| "grad_norm": 17.568815231323242, | |
| "learning_rate": 9.713301232495472e-06, | |
| "loss": 0.6225, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 1.5503364559968333, | |
| "eval_accuracy": 0.922448992729187, | |
| "eval_loss": 0.5570839643478394, | |
| "eval_runtime": 14.4582, | |
| "eval_samples_per_second": 169.454, | |
| "eval_steps_per_second": 5.326, | |
| "step": 11750 | |
| }, | |
| { | |
| "epoch": 1.5569336324053307, | |
| "grad_norm": 9.131673812866211, | |
| "learning_rate": 9.669125767548704e-06, | |
| "loss": 0.661, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 1.5635308088138276, | |
| "grad_norm": 12.502500534057617, | |
| "learning_rate": 9.624950302601936e-06, | |
| "loss": 0.635, | |
| "step": 11850 | |
| }, | |
| { | |
| "epoch": 1.5701279852223249, | |
| "grad_norm": 16.2374210357666, | |
| "learning_rate": 9.580774837655166e-06, | |
| "loss": 0.613, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 1.576725161630822, | |
| "grad_norm": 14.896709442138672, | |
| "learning_rate": 9.536599372708398e-06, | |
| "loss": 0.6502, | |
| "step": 11950 | |
| }, | |
| { | |
| "epoch": 1.583322338039319, | |
| "grad_norm": 10.305893898010254, | |
| "learning_rate": 9.49242390776163e-06, | |
| "loss": 0.6935, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 1.583322338039319, | |
| "eval_accuracy": 0.9200000166893005, | |
| "eval_loss": 0.5578611493110657, | |
| "eval_runtime": 15.0346, | |
| "eval_samples_per_second": 162.958, | |
| "eval_steps_per_second": 5.122, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 1.5899195144478164, | |
| "grad_norm": 10.883207321166992, | |
| "learning_rate": 9.44824844281486e-06, | |
| "loss": 0.6147, | |
| "step": 12050 | |
| }, | |
| { | |
| "epoch": 1.5965166908563135, | |
| "grad_norm": 5.787095069885254, | |
| "learning_rate": 9.404072977868093e-06, | |
| "loss": 0.6575, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 1.6031138672648106, | |
| "grad_norm": 18.00186538696289, | |
| "learning_rate": 9.359897512921325e-06, | |
| "loss": 0.6837, | |
| "step": 12150 | |
| }, | |
| { | |
| "epoch": 1.609711043673308, | |
| "grad_norm": 11.488051414489746, | |
| "learning_rate": 9.315722047974555e-06, | |
| "loss": 0.7437, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 1.6163082200818049, | |
| "grad_norm": 8.060873031616211, | |
| "learning_rate": 9.271546583027787e-06, | |
| "loss": 0.6808, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 1.6163082200818049, | |
| "eval_accuracy": 0.9204081892967224, | |
| "eval_loss": 0.5507224202156067, | |
| "eval_runtime": 14.9999, | |
| "eval_samples_per_second": 163.335, | |
| "eval_steps_per_second": 5.133, | |
| "step": 12250 | |
| }, | |
| { | |
| "epoch": 1.6229053964903022, | |
| "grad_norm": 12.3642578125, | |
| "learning_rate": 9.227371118081019e-06, | |
| "loss": 0.6698, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 1.6295025728987993, | |
| "grad_norm": 15.267610549926758, | |
| "learning_rate": 9.183195653134251e-06, | |
| "loss": 0.6803, | |
| "step": 12350 | |
| }, | |
| { | |
| "epoch": 1.6360997493072964, | |
| "grad_norm": 14.562056541442871, | |
| "learning_rate": 9.139020188187481e-06, | |
| "loss": 0.676, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 1.6426969257157937, | |
| "grad_norm": 10.903446197509766, | |
| "learning_rate": 9.094844723240713e-06, | |
| "loss": 0.6418, | |
| "step": 12450 | |
| }, | |
| { | |
| "epoch": 1.6492941021242908, | |
| "grad_norm": 12.447013854980469, | |
| "learning_rate": 9.050669258293946e-06, | |
| "loss": 0.6042, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 1.6492941021242908, | |
| "eval_accuracy": 0.9187754988670349, | |
| "eval_loss": 0.556304395198822, | |
| "eval_runtime": 14.8826, | |
| "eval_samples_per_second": 164.622, | |
| "eval_steps_per_second": 5.174, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 1.655891278532788, | |
| "grad_norm": 14.32016658782959, | |
| "learning_rate": 9.006493793347176e-06, | |
| "loss": 0.7139, | |
| "step": 12550 | |
| }, | |
| { | |
| "epoch": 1.6624884549412853, | |
| "grad_norm": 10.937993049621582, | |
| "learning_rate": 8.962318328400406e-06, | |
| "loss": 0.6995, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 1.6690856313497822, | |
| "grad_norm": 9.443896293640137, | |
| "learning_rate": 8.918142863453638e-06, | |
| "loss": 0.6097, | |
| "step": 12650 | |
| }, | |
| { | |
| "epoch": 1.6756828077582795, | |
| "grad_norm": 13.650922775268555, | |
| "learning_rate": 8.87396739850687e-06, | |
| "loss": 0.6407, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 1.6822799841667766, | |
| "grad_norm": 13.268482208251953, | |
| "learning_rate": 8.8297919335601e-06, | |
| "loss": 0.5994, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 1.6822799841667766, | |
| "eval_accuracy": 0.9248979687690735, | |
| "eval_loss": 0.5621338486671448, | |
| "eval_runtime": 14.3912, | |
| "eval_samples_per_second": 170.243, | |
| "eval_steps_per_second": 5.35, | |
| "step": 12750 | |
| }, | |
| { | |
| "epoch": 1.6888771605752737, | |
| "grad_norm": 10.582622528076172, | |
| "learning_rate": 8.785616468613333e-06, | |
| "loss": 0.6642, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 1.695474336983771, | |
| "grad_norm": 11.034931182861328, | |
| "learning_rate": 8.741441003666565e-06, | |
| "loss": 0.6198, | |
| "step": 12850 | |
| }, | |
| { | |
| "epoch": 1.7020715133922681, | |
| "grad_norm": 13.703685760498047, | |
| "learning_rate": 8.697265538719795e-06, | |
| "loss": 0.6648, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 1.7086686898007653, | |
| "grad_norm": 10.968565940856934, | |
| "learning_rate": 8.653090073773027e-06, | |
| "loss": 0.5644, | |
| "step": 12950 | |
| }, | |
| { | |
| "epoch": 1.7152658662092626, | |
| "grad_norm": 12.422329902648926, | |
| "learning_rate": 8.608914608826259e-06, | |
| "loss": 0.6531, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 1.7152658662092626, | |
| "eval_accuracy": 0.9240816235542297, | |
| "eval_loss": 0.5617344975471497, | |
| "eval_runtime": 15.0031, | |
| "eval_samples_per_second": 163.299, | |
| "eval_steps_per_second": 5.132, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 1.7218630426177595, | |
| "grad_norm": 9.511701583862305, | |
| "learning_rate": 8.564739143879491e-06, | |
| "loss": 0.6159, | |
| "step": 13050 | |
| }, | |
| { | |
| "epoch": 1.7284602190262568, | |
| "grad_norm": 6.499239921569824, | |
| "learning_rate": 8.520563678932721e-06, | |
| "loss": 0.7855, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 1.735057395434754, | |
| "grad_norm": 7.864821910858154, | |
| "learning_rate": 8.476388213985953e-06, | |
| "loss": 0.6307, | |
| "step": 13150 | |
| }, | |
| { | |
| "epoch": 1.741654571843251, | |
| "grad_norm": 11.460110664367676, | |
| "learning_rate": 8.432212749039185e-06, | |
| "loss": 0.61, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 1.7482517482517483, | |
| "grad_norm": 12.433394432067871, | |
| "learning_rate": 8.388037284092416e-06, | |
| "loss": 0.6672, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 1.7482517482517483, | |
| "eval_accuracy": 0.9236734509468079, | |
| "eval_loss": 0.5589076280593872, | |
| "eval_runtime": 14.2338, | |
| "eval_samples_per_second": 172.125, | |
| "eval_steps_per_second": 5.41, | |
| "step": 13250 | |
| }, | |
| { | |
| "epoch": 1.7548489246602454, | |
| "grad_norm": 9.624537467956543, | |
| "learning_rate": 8.343861819145648e-06, | |
| "loss": 0.6002, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 1.7614461010687426, | |
| "grad_norm": 14.12790584564209, | |
| "learning_rate": 8.299686354198878e-06, | |
| "loss": 0.6638, | |
| "step": 13350 | |
| }, | |
| { | |
| "epoch": 1.7680432774772399, | |
| "grad_norm": 15.561441421508789, | |
| "learning_rate": 8.25551088925211e-06, | |
| "loss": 0.6112, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 1.7746404538857368, | |
| "grad_norm": 8.115078926086426, | |
| "learning_rate": 8.21133542430534e-06, | |
| "loss": 0.6236, | |
| "step": 13450 | |
| }, | |
| { | |
| "epoch": 1.781237630294234, | |
| "grad_norm": 5.141168117523193, | |
| "learning_rate": 8.167159959358572e-06, | |
| "loss": 0.6245, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 1.781237630294234, | |
| "eval_accuracy": 0.9220408201217651, | |
| "eval_loss": 0.557984471321106, | |
| "eval_runtime": 14.2809, | |
| "eval_samples_per_second": 171.557, | |
| "eval_steps_per_second": 5.392, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 1.7878348067027312, | |
| "grad_norm": 13.422981262207031, | |
| "learning_rate": 8.122984494411804e-06, | |
| "loss": 0.7146, | |
| "step": 13550 | |
| }, | |
| { | |
| "epoch": 1.7944319831112283, | |
| "grad_norm": 9.977944374084473, | |
| "learning_rate": 8.078809029465036e-06, | |
| "loss": 0.5969, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 1.8010291595197256, | |
| "grad_norm": 12.0841064453125, | |
| "learning_rate": 8.034633564518267e-06, | |
| "loss": 0.7246, | |
| "step": 13650 | |
| }, | |
| { | |
| "epoch": 1.8076263359282227, | |
| "grad_norm": 7.176680088043213, | |
| "learning_rate": 7.990458099571499e-06, | |
| "loss": 0.65, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 1.8142235123367199, | |
| "grad_norm": 16.529300689697266, | |
| "learning_rate": 7.94628263462473e-06, | |
| "loss": 0.7136, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 1.8142235123367199, | |
| "eval_accuracy": 0.9204081892967224, | |
| "eval_loss": 0.5532920360565186, | |
| "eval_runtime": 14.1588, | |
| "eval_samples_per_second": 173.038, | |
| "eval_steps_per_second": 5.438, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 1.8208206887452172, | |
| "grad_norm": 14.22460651397705, | |
| "learning_rate": 7.902107169677961e-06, | |
| "loss": 0.7062, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 1.827417865153714, | |
| "grad_norm": 12.760638236999512, | |
| "learning_rate": 7.857931704731193e-06, | |
| "loss": 0.6987, | |
| "step": 13850 | |
| }, | |
| { | |
| "epoch": 1.8340150415622114, | |
| "grad_norm": 18.206119537353516, | |
| "learning_rate": 7.813756239784425e-06, | |
| "loss": 0.6642, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 1.8406122179707085, | |
| "grad_norm": 10.713970184326172, | |
| "learning_rate": 7.769580774837655e-06, | |
| "loss": 0.6761, | |
| "step": 13950 | |
| }, | |
| { | |
| "epoch": 1.8472093943792056, | |
| "grad_norm": 15.882705688476562, | |
| "learning_rate": 7.725405309890888e-06, | |
| "loss": 0.6766, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 1.8472093943792056, | |
| "eval_accuracy": 0.9212244749069214, | |
| "eval_loss": 0.5655022263526917, | |
| "eval_runtime": 14.1648, | |
| "eval_samples_per_second": 172.964, | |
| "eval_steps_per_second": 5.436, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 1.853806570787703, | |
| "grad_norm": 14.32147216796875, | |
| "learning_rate": 7.68122984494412e-06, | |
| "loss": 0.5758, | |
| "step": 14050 | |
| }, | |
| { | |
| "epoch": 1.8604037471962, | |
| "grad_norm": 10.756980895996094, | |
| "learning_rate": 7.63705437999735e-06, | |
| "loss": 0.6594, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 1.8670009236046972, | |
| "grad_norm": 14.631691932678223, | |
| "learning_rate": 7.592878915050581e-06, | |
| "loss": 0.7866, | |
| "step": 14150 | |
| }, | |
| { | |
| "epoch": 1.8735981000131945, | |
| "grad_norm": 10.117656707763672, | |
| "learning_rate": 7.548703450103812e-06, | |
| "loss": 0.5798, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 1.8801952764216914, | |
| "grad_norm": 18.793254852294922, | |
| "learning_rate": 7.504527985157044e-06, | |
| "loss": 0.6472, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 1.8801952764216914, | |
| "eval_accuracy": 0.9212244749069214, | |
| "eval_loss": 0.5508715510368347, | |
| "eval_runtime": 14.2705, | |
| "eval_samples_per_second": 171.683, | |
| "eval_steps_per_second": 5.396, | |
| "step": 14250 | |
| }, | |
| { | |
| "epoch": 1.8867924528301887, | |
| "grad_norm": 6.77608585357666, | |
| "learning_rate": 7.460352520210275e-06, | |
| "loss": 0.7387, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 1.8933896292386858, | |
| "grad_norm": 9.065592765808105, | |
| "learning_rate": 7.416177055263507e-06, | |
| "loss": 0.6677, | |
| "step": 14350 | |
| }, | |
| { | |
| "epoch": 1.899986805647183, | |
| "grad_norm": 16.486297607421875, | |
| "learning_rate": 7.372001590316739e-06, | |
| "loss": 0.6697, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 1.9065839820556802, | |
| "grad_norm": 10.985074996948242, | |
| "learning_rate": 7.32782612536997e-06, | |
| "loss": 0.6711, | |
| "step": 14450 | |
| }, | |
| { | |
| "epoch": 1.9131811584641774, | |
| "grad_norm": 10.440354347229004, | |
| "learning_rate": 7.283650660423202e-06, | |
| "loss": 0.6988, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 1.9131811584641774, | |
| "eval_accuracy": 0.9228571653366089, | |
| "eval_loss": 0.5527560114860535, | |
| "eval_runtime": 14.6748, | |
| "eval_samples_per_second": 166.953, | |
| "eval_steps_per_second": 5.247, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 1.9197783348726745, | |
| "grad_norm": 10.955256462097168, | |
| "learning_rate": 7.239475195476433e-06, | |
| "loss": 0.6301, | |
| "step": 14550 | |
| }, | |
| { | |
| "epoch": 1.9263755112811718, | |
| "grad_norm": 12.683993339538574, | |
| "learning_rate": 7.195299730529665e-06, | |
| "loss": 0.6259, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 1.9329726876896687, | |
| "grad_norm": 12.907076835632324, | |
| "learning_rate": 7.151124265582896e-06, | |
| "loss": 0.6223, | |
| "step": 14650 | |
| }, | |
| { | |
| "epoch": 1.939569864098166, | |
| "grad_norm": 16.311803817749023, | |
| "learning_rate": 7.106948800636127e-06, | |
| "loss": 0.5702, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 1.9461670405066631, | |
| "grad_norm": 13.10996150970459, | |
| "learning_rate": 7.062773335689359e-06, | |
| "loss": 0.6324, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 1.9461670405066631, | |
| "eval_accuracy": 0.9253061413764954, | |
| "eval_loss": 0.5507711172103882, | |
| "eval_runtime": 14.1763, | |
| "eval_samples_per_second": 172.824, | |
| "eval_steps_per_second": 5.432, | |
| "step": 14750 | |
| }, | |
| { | |
| "epoch": 1.9527642169151602, | |
| "grad_norm": 10.274900436401367, | |
| "learning_rate": 7.0185978707425905e-06, | |
| "loss": 0.6409, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 1.9593613933236576, | |
| "grad_norm": 14.111708641052246, | |
| "learning_rate": 6.974422405795822e-06, | |
| "loss": 0.6609, | |
| "step": 14850 | |
| }, | |
| { | |
| "epoch": 1.9659585697321547, | |
| "grad_norm": 7.0049662590026855, | |
| "learning_rate": 6.930246940849053e-06, | |
| "loss": 0.6581, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 1.9725557461406518, | |
| "grad_norm": 8.995936393737793, | |
| "learning_rate": 6.886071475902284e-06, | |
| "loss": 0.6313, | |
| "step": 14950 | |
| }, | |
| { | |
| "epoch": 1.979152922549149, | |
| "grad_norm": 19.018213272094727, | |
| "learning_rate": 6.841896010955515e-06, | |
| "loss": 0.6191, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 1.979152922549149, | |
| "eval_accuracy": 0.9216326475143433, | |
| "eval_loss": 0.5451802611351013, | |
| "eval_runtime": 14.4726, | |
| "eval_samples_per_second": 169.286, | |
| "eval_steps_per_second": 5.32, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 1.985750098957646, | |
| "grad_norm": 14.76790714263916, | |
| "learning_rate": 6.797720546008747e-06, | |
| "loss": 0.6665, | |
| "step": 15050 | |
| }, | |
| { | |
| "epoch": 1.9923472753661433, | |
| "grad_norm": 5.290237903594971, | |
| "learning_rate": 6.753545081061978e-06, | |
| "loss": 0.5907, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 1.9989444517746404, | |
| "grad_norm": 14.515754699707031, | |
| "learning_rate": 6.70936961611521e-06, | |
| "loss": 0.6586, | |
| "step": 15150 | |
| }, | |
| { | |
| "epoch": 2.0055416281831375, | |
| "grad_norm": 5.708993434906006, | |
| "learning_rate": 6.665194151168442e-06, | |
| "loss": 0.5673, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 2.012138804591635, | |
| "grad_norm": 15.482401847839355, | |
| "learning_rate": 6.621018686221673e-06, | |
| "loss": 0.5516, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 2.012138804591635, | |
| "eval_accuracy": 0.923265278339386, | |
| "eval_loss": 0.5589110255241394, | |
| "eval_runtime": 14.2886, | |
| "eval_samples_per_second": 171.465, | |
| "eval_steps_per_second": 5.389, | |
| "step": 15250 | |
| }, | |
| { | |
| "epoch": 2.0187359810001317, | |
| "grad_norm": 5.612030506134033, | |
| "learning_rate": 6.576843221274905e-06, | |
| "loss": 0.5012, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 2.025333157408629, | |
| "grad_norm": 17.197193145751953, | |
| "learning_rate": 6.532667756328136e-06, | |
| "loss": 0.5227, | |
| "step": 15350 | |
| }, | |
| { | |
| "epoch": 2.0319303338171264, | |
| "grad_norm": 19.402557373046875, | |
| "learning_rate": 6.488492291381367e-06, | |
| "loss": 0.4449, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 2.0385275102256233, | |
| "grad_norm": 18.876649856567383, | |
| "learning_rate": 6.444316826434599e-06, | |
| "loss": 0.4862, | |
| "step": 15450 | |
| }, | |
| { | |
| "epoch": 2.0451246866341206, | |
| "grad_norm": 9.995197296142578, | |
| "learning_rate": 6.40014136148783e-06, | |
| "loss": 0.5413, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 2.0451246866341206, | |
| "eval_accuracy": 0.923265278339386, | |
| "eval_loss": 0.5642380714416504, | |
| "eval_runtime": 14.3185, | |
| "eval_samples_per_second": 171.108, | |
| "eval_steps_per_second": 5.378, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 2.051721863042618, | |
| "grad_norm": 9.785661697387695, | |
| "learning_rate": 6.355965896541062e-06, | |
| "loss": 0.5462, | |
| "step": 15550 | |
| }, | |
| { | |
| "epoch": 2.058319039451115, | |
| "grad_norm": 14.724440574645996, | |
| "learning_rate": 6.3117904315942935e-06, | |
| "loss": 0.5318, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 2.064916215859612, | |
| "grad_norm": 11.998701095581055, | |
| "learning_rate": 6.267614966647525e-06, | |
| "loss": 0.5706, | |
| "step": 15650 | |
| }, | |
| { | |
| "epoch": 2.071513392268109, | |
| "grad_norm": 3.4020655155181885, | |
| "learning_rate": 6.223439501700755e-06, | |
| "loss": 0.5055, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 2.0781105686766064, | |
| "grad_norm": 16.408964157104492, | |
| "learning_rate": 6.179264036753987e-06, | |
| "loss": 0.6141, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 2.0781105686766064, | |
| "eval_accuracy": 0.923265278339386, | |
| "eval_loss": 0.5610572695732117, | |
| "eval_runtime": 14.2227, | |
| "eval_samples_per_second": 172.26, | |
| "eval_steps_per_second": 5.414, | |
| "step": 15750 | |
| }, | |
| { | |
| "epoch": 2.0847077450851037, | |
| "grad_norm": 17.39078140258789, | |
| "learning_rate": 6.135088571807218e-06, | |
| "loss": 0.5247, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 2.0913049214936006, | |
| "grad_norm": 17.45914649963379, | |
| "learning_rate": 6.09091310686045e-06, | |
| "loss": 0.4817, | |
| "step": 15850 | |
| }, | |
| { | |
| "epoch": 2.097902097902098, | |
| "grad_norm": 13.649807929992676, | |
| "learning_rate": 6.0467376419136814e-06, | |
| "loss": 0.4599, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 2.1044992743105952, | |
| "grad_norm": 8.314746856689453, | |
| "learning_rate": 6.002562176966913e-06, | |
| "loss": 0.5676, | |
| "step": 15950 | |
| }, | |
| { | |
| "epoch": 2.111096450719092, | |
| "grad_norm": 14.881856918334961, | |
| "learning_rate": 5.958386712020145e-06, | |
| "loss": 0.3992, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 2.111096450719092, | |
| "eval_accuracy": 0.9236734509468079, | |
| "eval_loss": 0.5720360279083252, | |
| "eval_runtime": 14.2571, | |
| "eval_samples_per_second": 171.844, | |
| "eval_steps_per_second": 5.401, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 2.1176936271275895, | |
| "grad_norm": 21.33131217956543, | |
| "learning_rate": 5.914211247073376e-06, | |
| "loss": 0.5337, | |
| "step": 16050 | |
| }, | |
| { | |
| "epoch": 2.1242908035360863, | |
| "grad_norm": 14.612150192260742, | |
| "learning_rate": 5.870035782126608e-06, | |
| "loss": 0.4641, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 2.1308879799445837, | |
| "grad_norm": 19.05860137939453, | |
| "learning_rate": 5.825860317179839e-06, | |
| "loss": 0.5636, | |
| "step": 16150 | |
| }, | |
| { | |
| "epoch": 2.137485156353081, | |
| "grad_norm": 13.695535659790039, | |
| "learning_rate": 5.78168485223307e-06, | |
| "loss": 0.4811, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 2.144082332761578, | |
| "grad_norm": 11.873661041259766, | |
| "learning_rate": 5.737509387286302e-06, | |
| "loss": 0.499, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 2.144082332761578, | |
| "eval_accuracy": 0.9216326475143433, | |
| "eval_loss": 0.5672578811645508, | |
| "eval_runtime": 14.4037, | |
| "eval_samples_per_second": 170.096, | |
| "eval_steps_per_second": 5.346, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 2.150679509170075, | |
| "grad_norm": 10.252338409423828, | |
| "learning_rate": 5.693333922339533e-06, | |
| "loss": 0.5822, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 2.1572766855785726, | |
| "grad_norm": 21.956472396850586, | |
| "learning_rate": 5.6491584573927645e-06, | |
| "loss": 0.5935, | |
| "step": 16350 | |
| }, | |
| { | |
| "epoch": 2.1638738619870694, | |
| "grad_norm": 10.932018280029297, | |
| "learning_rate": 5.6049829924459966e-06, | |
| "loss": 0.5028, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 2.1704710383955668, | |
| "grad_norm": 11.411332130432129, | |
| "learning_rate": 5.560807527499228e-06, | |
| "loss": 0.5118, | |
| "step": 16450 | |
| }, | |
| { | |
| "epoch": 2.1770682148040637, | |
| "grad_norm": 12.977612495422363, | |
| "learning_rate": 5.516632062552458e-06, | |
| "loss": 0.5623, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 2.1770682148040637, | |
| "eval_accuracy": 0.9261224269866943, | |
| "eval_loss": 0.5655830502510071, | |
| "eval_runtime": 14.5501, | |
| "eval_samples_per_second": 168.384, | |
| "eval_steps_per_second": 5.292, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 2.183665391212561, | |
| "grad_norm": 16.45384979248047, | |
| "learning_rate": 5.47245659760569e-06, | |
| "loss": 0.481, | |
| "step": 16550 | |
| }, | |
| { | |
| "epoch": 2.1902625676210583, | |
| "grad_norm": 10.353941917419434, | |
| "learning_rate": 5.428281132658921e-06, | |
| "loss": 0.5461, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 2.196859744029555, | |
| "grad_norm": 13.859786987304688, | |
| "learning_rate": 5.3841056677121524e-06, | |
| "loss": 0.5802, | |
| "step": 16650 | |
| }, | |
| { | |
| "epoch": 2.2034569204380525, | |
| "grad_norm": 14.852931022644043, | |
| "learning_rate": 5.3399302027653845e-06, | |
| "loss": 0.5269, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 2.21005409684655, | |
| "grad_norm": 11.935972213745117, | |
| "learning_rate": 5.295754737818616e-06, | |
| "loss": 0.5022, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 2.21005409684655, | |
| "eval_accuracy": 0.9220408201217651, | |
| "eval_loss": 0.5671045184135437, | |
| "eval_runtime": 14.353, | |
| "eval_samples_per_second": 170.696, | |
| "eval_steps_per_second": 5.365, | |
| "step": 16750 | |
| }, | |
| { | |
| "epoch": 2.2166512732550467, | |
| "grad_norm": 6.890115261077881, | |
| "learning_rate": 5.251579272871848e-06, | |
| "loss": 0.5203, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 2.223248449663544, | |
| "grad_norm": 10.788956642150879, | |
| "learning_rate": 5.207403807925079e-06, | |
| "loss": 0.5461, | |
| "step": 16850 | |
| }, | |
| { | |
| "epoch": 2.2298456260720414, | |
| "grad_norm": 10.99864387512207, | |
| "learning_rate": 5.16322834297831e-06, | |
| "loss": 0.5711, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 2.2364428024805383, | |
| "grad_norm": 14.6043062210083, | |
| "learning_rate": 5.119052878031542e-06, | |
| "loss": 0.5615, | |
| "step": 16950 | |
| }, | |
| { | |
| "epoch": 2.2430399788890356, | |
| "grad_norm": 11.577956199645996, | |
| "learning_rate": 5.074877413084773e-06, | |
| "loss": 0.5748, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 2.2430399788890356, | |
| "eval_accuracy": 0.9257143139839172, | |
| "eval_loss": 0.560461699962616, | |
| "eval_runtime": 14.6816, | |
| "eval_samples_per_second": 166.875, | |
| "eval_steps_per_second": 5.245, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 2.2496371552975325, | |
| "grad_norm": 10.84367561340332, | |
| "learning_rate": 5.030701948138005e-06, | |
| "loss": 0.5272, | |
| "step": 17050 | |
| }, | |
| { | |
| "epoch": 2.25623433170603, | |
| "grad_norm": 9.228910446166992, | |
| "learning_rate": 4.9865264831912355e-06, | |
| "loss": 0.4401, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 2.262831508114527, | |
| "grad_norm": 5.861785411834717, | |
| "learning_rate": 4.9423510182444676e-06, | |
| "loss": 0.5158, | |
| "step": 17150 | |
| }, | |
| { | |
| "epoch": 2.269428684523024, | |
| "grad_norm": 19.159400939941406, | |
| "learning_rate": 4.898175553297699e-06, | |
| "loss": 0.5163, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 2.2760258609315214, | |
| "grad_norm": 9.03962516784668, | |
| "learning_rate": 4.85400008835093e-06, | |
| "loss": 0.5195, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 2.2760258609315214, | |
| "eval_accuracy": 0.9236734509468079, | |
| "eval_loss": 0.5647316575050354, | |
| "eval_runtime": 14.3918, | |
| "eval_samples_per_second": 170.236, | |
| "eval_steps_per_second": 5.35, | |
| "step": 17250 | |
| }, | |
| { | |
| "epoch": 2.2826230373400183, | |
| "grad_norm": 11.594718933105469, | |
| "learning_rate": 4.809824623404162e-06, | |
| "loss": 0.5235, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 2.2892202137485156, | |
| "grad_norm": 15.447309494018555, | |
| "learning_rate": 4.765649158457393e-06, | |
| "loss": 0.5335, | |
| "step": 17350 | |
| }, | |
| { | |
| "epoch": 2.295817390157013, | |
| "grad_norm": 8.447811126708984, | |
| "learning_rate": 4.721473693510625e-06, | |
| "loss": 0.4915, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 2.30241456656551, | |
| "grad_norm": 11.305243492126465, | |
| "learning_rate": 4.677298228563856e-06, | |
| "loss": 0.4915, | |
| "step": 17450 | |
| }, | |
| { | |
| "epoch": 2.309011742974007, | |
| "grad_norm": 16.881988525390625, | |
| "learning_rate": 4.6331227636170875e-06, | |
| "loss": 0.4959, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 2.309011742974007, | |
| "eval_accuracy": 0.923265278339386, | |
| "eval_loss": 0.5674872398376465, | |
| "eval_runtime": 14.3341, | |
| "eval_samples_per_second": 170.921, | |
| "eval_steps_per_second": 5.372, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 2.3156089193825045, | |
| "grad_norm": 8.994074821472168, | |
| "learning_rate": 4.588947298670319e-06, | |
| "loss": 0.5161, | |
| "step": 17550 | |
| }, | |
| { | |
| "epoch": 2.3222060957910013, | |
| "grad_norm": 14.556620597839355, | |
| "learning_rate": 4.54477183372355e-06, | |
| "loss": 0.4944, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 2.3288032721994987, | |
| "grad_norm": 15.484505653381348, | |
| "learning_rate": 4.500596368776782e-06, | |
| "loss": 0.5052, | |
| "step": 17650 | |
| }, | |
| { | |
| "epoch": 2.335400448607996, | |
| "grad_norm": 6.690243244171143, | |
| "learning_rate": 4.456420903830013e-06, | |
| "loss": 0.4937, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 2.341997625016493, | |
| "grad_norm": 12.30466365814209, | |
| "learning_rate": 4.4131289481821795e-06, | |
| "loss": 0.4695, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 2.341997625016493, | |
| "eval_accuracy": 0.9253061413764954, | |
| "eval_loss": 0.561543345451355, | |
| "eval_runtime": 14.3063, | |
| "eval_samples_per_second": 171.254, | |
| "eval_steps_per_second": 5.382, | |
| "step": 17750 | |
| }, | |
| { | |
| "epoch": 2.34859480142499, | |
| "grad_norm": 14.061612129211426, | |
| "learning_rate": 4.3689534832354116e-06, | |
| "loss": 0.5159, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 2.355191977833487, | |
| "grad_norm": 17.726974487304688, | |
| "learning_rate": 4.324778018288643e-06, | |
| "loss": 0.4992, | |
| "step": 17850 | |
| }, | |
| { | |
| "epoch": 2.3617891542419844, | |
| "grad_norm": 7.066623687744141, | |
| "learning_rate": 4.280602553341875e-06, | |
| "loss": 0.5288, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 2.3683863306504818, | |
| "grad_norm": 18.694576263427734, | |
| "learning_rate": 4.236427088395106e-06, | |
| "loss": 0.5247, | |
| "step": 17950 | |
| }, | |
| { | |
| "epoch": 2.3749835070589786, | |
| "grad_norm": 14.194579124450684, | |
| "learning_rate": 4.192251623448337e-06, | |
| "loss": 0.5491, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 2.3749835070589786, | |
| "eval_accuracy": 0.9257143139839172, | |
| "eval_loss": 0.5593844652175903, | |
| "eval_runtime": 14.4387, | |
| "eval_samples_per_second": 169.683, | |
| "eval_steps_per_second": 5.333, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 2.381580683467476, | |
| "grad_norm": 8.745909690856934, | |
| "learning_rate": 4.148076158501568e-06, | |
| "loss": 0.5332, | |
| "step": 18050 | |
| }, | |
| { | |
| "epoch": 2.388177859875973, | |
| "grad_norm": 7.993963241577148, | |
| "learning_rate": 4.1039006935547995e-06, | |
| "loss": 0.529, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 2.39477503628447, | |
| "grad_norm": 11.705142974853516, | |
| "learning_rate": 4.0597252286080315e-06, | |
| "loss": 0.5534, | |
| "step": 18150 | |
| }, | |
| { | |
| "epoch": 2.4013722126929675, | |
| "grad_norm": 15.30136775970459, | |
| "learning_rate": 4.015549763661263e-06, | |
| "loss": 0.5595, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 2.4079693891014644, | |
| "grad_norm": 14.33283805847168, | |
| "learning_rate": 3.971374298714495e-06, | |
| "loss": 0.573, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 2.4079693891014644, | |
| "eval_accuracy": 0.9261224269866943, | |
| "eval_loss": 0.5610310435295105, | |
| "eval_runtime": 14.5813, | |
| "eval_samples_per_second": 168.023, | |
| "eval_steps_per_second": 5.281, | |
| "step": 18250 | |
| }, | |
| { | |
| "epoch": 2.4145665655099617, | |
| "grad_norm": 15.337475776672363, | |
| "learning_rate": 3.928082343066661e-06, | |
| "loss": 0.4859, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 2.421163741918459, | |
| "grad_norm": 14.803478240966797, | |
| "learning_rate": 3.883906878119892e-06, | |
| "loss": 0.5019, | |
| "step": 18350 | |
| }, | |
| { | |
| "epoch": 2.427760918326956, | |
| "grad_norm": 17.378925323486328, | |
| "learning_rate": 3.8397314131731236e-06, | |
| "loss": 0.4771, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 2.4343580947354533, | |
| "grad_norm": 11.473735809326172, | |
| "learning_rate": 3.7955559482263556e-06, | |
| "loss": 0.5062, | |
| "step": 18450 | |
| }, | |
| { | |
| "epoch": 2.4409552711439506, | |
| "grad_norm": 14.394603729248047, | |
| "learning_rate": 3.7513804832795868e-06, | |
| "loss": 0.5342, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 2.4409552711439506, | |
| "eval_accuracy": 0.9228571653366089, | |
| "eval_loss": 0.5616511106491089, | |
| "eval_runtime": 14.3525, | |
| "eval_samples_per_second": 170.701, | |
| "eval_steps_per_second": 5.365, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 2.4475524475524475, | |
| "grad_norm": 13.148124694824219, | |
| "learning_rate": 3.7072050183328184e-06, | |
| "loss": 0.5275, | |
| "step": 18550 | |
| }, | |
| { | |
| "epoch": 2.454149623960945, | |
| "grad_norm": 18.74552345275879, | |
| "learning_rate": 3.66302955338605e-06, | |
| "loss": 0.576, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 2.4607468003694417, | |
| "grad_norm": 9.60922622680664, | |
| "learning_rate": 3.6188540884392807e-06, | |
| "loss": 0.5172, | |
| "step": 18650 | |
| }, | |
| { | |
| "epoch": 2.467343976777939, | |
| "grad_norm": 19.15401840209961, | |
| "learning_rate": 3.5746786234925123e-06, | |
| "loss": 0.5127, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 2.4739411531864364, | |
| "grad_norm": 14.698090553283691, | |
| "learning_rate": 3.530503158545744e-06, | |
| "loss": 0.4728, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 2.4739411531864364, | |
| "eval_accuracy": 0.9248979687690735, | |
| "eval_loss": 0.5650564432144165, | |
| "eval_runtime": 14.464, | |
| "eval_samples_per_second": 169.387, | |
| "eval_steps_per_second": 5.324, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 2.4805383295949333, | |
| "grad_norm": 11.760072708129883, | |
| "learning_rate": 3.4863276935989755e-06, | |
| "loss": 0.4256, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 2.4871355060034306, | |
| "grad_norm": 11.057052612304688, | |
| "learning_rate": 3.442152228652207e-06, | |
| "loss": 0.4493, | |
| "step": 18850 | |
| }, | |
| { | |
| "epoch": 2.4937326824119275, | |
| "grad_norm": 14.985106468200684, | |
| "learning_rate": 3.3979767637054383e-06, | |
| "loss": 0.4881, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 2.500329858820425, | |
| "grad_norm": 8.24613094329834, | |
| "learning_rate": 3.35380129875867e-06, | |
| "loss": 0.4843, | |
| "step": 18950 | |
| }, | |
| { | |
| "epoch": 2.506927035228922, | |
| "grad_norm": 12.78288459777832, | |
| "learning_rate": 3.3096258338119015e-06, | |
| "loss": 0.517, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 2.506927035228922, | |
| "eval_accuracy": 0.9248979687690735, | |
| "eval_loss": 0.5626258850097656, | |
| "eval_runtime": 14.0676, | |
| "eval_samples_per_second": 174.159, | |
| "eval_steps_per_second": 5.474, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 2.513524211637419, | |
| "grad_norm": 13.174288749694824, | |
| "learning_rate": 3.265450368865132e-06, | |
| "loss": 0.5927, | |
| "step": 19050 | |
| }, | |
| { | |
| "epoch": 2.5201213880459163, | |
| "grad_norm": 6.971681594848633, | |
| "learning_rate": 3.221274903918364e-06, | |
| "loss": 0.5687, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 2.5267185644544137, | |
| "grad_norm": 10.98085880279541, | |
| "learning_rate": 3.1770994389715954e-06, | |
| "loss": 0.5261, | |
| "step": 19150 | |
| }, | |
| { | |
| "epoch": 2.5333157408629106, | |
| "grad_norm": 15.29484748840332, | |
| "learning_rate": 3.132923974024827e-06, | |
| "loss": 0.5698, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 2.539912917271408, | |
| "grad_norm": 22.54600715637207, | |
| "learning_rate": 3.088748509078058e-06, | |
| "loss": 0.5593, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 2.539912917271408, | |
| "eval_accuracy": 0.9269387722015381, | |
| "eval_loss": 0.5581403374671936, | |
| "eval_runtime": 14.0758, | |
| "eval_samples_per_second": 174.057, | |
| "eval_steps_per_second": 5.47, | |
| "step": 19250 | |
| }, | |
| { | |
| "epoch": 2.546510093679905, | |
| "grad_norm": 10.993823051452637, | |
| "learning_rate": 3.0445730441312898e-06, | |
| "loss": 0.571, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 2.553107270088402, | |
| "grad_norm": 21.144847869873047, | |
| "learning_rate": 3.0003975791845214e-06, | |
| "loss": 0.5606, | |
| "step": 19350 | |
| }, | |
| { | |
| "epoch": 2.5597044464968994, | |
| "grad_norm": 16.376079559326172, | |
| "learning_rate": 2.956222114237753e-06, | |
| "loss": 0.4912, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 2.5663016229053968, | |
| "grad_norm": 13.594402313232422, | |
| "learning_rate": 2.9120466492909837e-06, | |
| "loss": 0.4805, | |
| "step": 19450 | |
| }, | |
| { | |
| "epoch": 2.5728987993138936, | |
| "grad_norm": 17.23542594909668, | |
| "learning_rate": 2.8678711843442153e-06, | |
| "loss": 0.5324, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 2.5728987993138936, | |
| "eval_accuracy": 0.9281632900238037, | |
| "eval_loss": 0.553718626499176, | |
| "eval_runtime": 14.1602, | |
| "eval_samples_per_second": 173.02, | |
| "eval_steps_per_second": 5.438, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 2.579495975722391, | |
| "grad_norm": 17.32400131225586, | |
| "learning_rate": 2.823695719397447e-06, | |
| "loss": 0.5584, | |
| "step": 19550 | |
| }, | |
| { | |
| "epoch": 2.586093152130888, | |
| "grad_norm": 5.780141830444336, | |
| "learning_rate": 2.7795202544506785e-06, | |
| "loss": 0.508, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 2.592690328539385, | |
| "grad_norm": 12.641766548156738, | |
| "learning_rate": 2.7353447895039097e-06, | |
| "loss": 0.5231, | |
| "step": 19650 | |
| }, | |
| { | |
| "epoch": 2.599287504947882, | |
| "grad_norm": 18.93987464904785, | |
| "learning_rate": 2.6920528338560762e-06, | |
| "loss": 0.557, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 2.6058846813563794, | |
| "grad_norm": 15.360589027404785, | |
| "learning_rate": 2.647877368909308e-06, | |
| "loss": 0.5338, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 2.6058846813563794, | |
| "eval_accuracy": 0.9257143139839172, | |
| "eval_loss": 0.551838219165802, | |
| "eval_runtime": 15.1358, | |
| "eval_samples_per_second": 161.868, | |
| "eval_steps_per_second": 5.087, | |
| "step": 19750 | |
| }, | |
| { | |
| "epoch": 2.6124818577648767, | |
| "grad_norm": 15.32451057434082, | |
| "learning_rate": 2.6037019039625394e-06, | |
| "loss": 0.5037, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 2.6190790341733736, | |
| "grad_norm": 11.314981460571289, | |
| "learning_rate": 2.559526439015771e-06, | |
| "loss": 0.6057, | |
| "step": 19850 | |
| }, | |
| { | |
| "epoch": 2.625676210581871, | |
| "grad_norm": 7.916543006896973, | |
| "learning_rate": 2.5153509740690026e-06, | |
| "loss": 0.5571, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 2.6322733869903683, | |
| "grad_norm": 17.10308837890625, | |
| "learning_rate": 2.4711755091222338e-06, | |
| "loss": 0.5177, | |
| "step": 19950 | |
| }, | |
| { | |
| "epoch": 2.638870563398865, | |
| "grad_norm": 16.19850730895996, | |
| "learning_rate": 2.427000044175465e-06, | |
| "loss": 0.4946, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 2.638870563398865, | |
| "eval_accuracy": 0.9253061413764954, | |
| "eval_loss": 0.5547569394111633, | |
| "eval_runtime": 14.6842, | |
| "eval_samples_per_second": 166.846, | |
| "eval_steps_per_second": 5.244, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 2.6454677398073625, | |
| "grad_norm": 9.776342391967773, | |
| "learning_rate": 2.3828245792286966e-06, | |
| "loss": 0.5256, | |
| "step": 20050 | |
| }, | |
| { | |
| "epoch": 2.65206491621586, | |
| "grad_norm": 14.741767883300781, | |
| "learning_rate": 2.338649114281928e-06, | |
| "loss": 0.5107, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 2.6586620926243567, | |
| "grad_norm": 10.714197158813477, | |
| "learning_rate": 2.2944736493351593e-06, | |
| "loss": 0.5988, | |
| "step": 20150 | |
| }, | |
| { | |
| "epoch": 2.665259269032854, | |
| "grad_norm": 16.533546447753906, | |
| "learning_rate": 2.250298184388391e-06, | |
| "loss": 0.4907, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 2.6718564454413514, | |
| "grad_norm": 18.46228790283203, | |
| "learning_rate": 2.2061227194416225e-06, | |
| "loss": 0.4697, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 2.6718564454413514, | |
| "eval_accuracy": 0.9269387722015381, | |
| "eval_loss": 0.5565572381019592, | |
| "eval_runtime": 15.4517, | |
| "eval_samples_per_second": 158.559, | |
| "eval_steps_per_second": 4.983, | |
| "step": 20250 | |
| }, | |
| { | |
| "epoch": 2.6784536218498483, | |
| "grad_norm": 11.330911636352539, | |
| "learning_rate": 2.1619472544948537e-06, | |
| "loss": 0.4897, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 2.6850507982583456, | |
| "grad_norm": 12.666998863220215, | |
| "learning_rate": 2.1177717895480853e-06, | |
| "loss": 0.5088, | |
| "step": 20350 | |
| }, | |
| { | |
| "epoch": 2.6916479746668425, | |
| "grad_norm": 21.95562171936035, | |
| "learning_rate": 2.0735963246013165e-06, | |
| "loss": 0.5442, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 2.69824515107534, | |
| "grad_norm": 13.3275785446167, | |
| "learning_rate": 2.029420859654548e-06, | |
| "loss": 0.536, | |
| "step": 20450 | |
| }, | |
| { | |
| "epoch": 2.7048423274838367, | |
| "grad_norm": 16.374469757080078, | |
| "learning_rate": 1.9852453947077792e-06, | |
| "loss": 0.551, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 2.7048423274838367, | |
| "eval_accuracy": 0.9269387722015381, | |
| "eval_loss": 0.5562152862548828, | |
| "eval_runtime": 14.1322, | |
| "eval_samples_per_second": 173.363, | |
| "eval_steps_per_second": 5.449, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 2.711439503892334, | |
| "grad_norm": 9.546135902404785, | |
| "learning_rate": 1.941069929761011e-06, | |
| "loss": 0.5038, | |
| "step": 20550 | |
| }, | |
| { | |
| "epoch": 2.7180366803008313, | |
| "grad_norm": 8.056339263916016, | |
| "learning_rate": 1.8968944648142424e-06, | |
| "loss": 0.502, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 2.7246338567093282, | |
| "grad_norm": 15.2578706741333, | |
| "learning_rate": 1.8527189998674738e-06, | |
| "loss": 0.5021, | |
| "step": 20650 | |
| }, | |
| { | |
| "epoch": 2.7312310331178256, | |
| "grad_norm": 9.090350151062012, | |
| "learning_rate": 1.808543534920705e-06, | |
| "loss": 0.5441, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 2.737828209526323, | |
| "grad_norm": 8.323760032653809, | |
| "learning_rate": 1.7643680699739366e-06, | |
| "loss": 0.4818, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 2.737828209526323, | |
| "eval_accuracy": 0.9285714030265808, | |
| "eval_loss": 0.554760754108429, | |
| "eval_runtime": 14.1428, | |
| "eval_samples_per_second": 173.233, | |
| "eval_steps_per_second": 5.444, | |
| "step": 20750 | |
| }, | |
| { | |
| "epoch": 2.7444253859348198, | |
| "grad_norm": 9.076456069946289, | |
| "learning_rate": 1.720192605027168e-06, | |
| "loss": 0.5012, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 2.751022562343317, | |
| "grad_norm": 11.11436939239502, | |
| "learning_rate": 1.6760171400803996e-06, | |
| "loss": 0.5294, | |
| "step": 20850 | |
| }, | |
| { | |
| "epoch": 2.7576197387518144, | |
| "grad_norm": 10.291386604309082, | |
| "learning_rate": 1.6318416751336307e-06, | |
| "loss": 0.4674, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 2.7642169151603113, | |
| "grad_norm": 19.83849334716797, | |
| "learning_rate": 1.5876662101868623e-06, | |
| "loss": 0.5436, | |
| "step": 20950 | |
| }, | |
| { | |
| "epoch": 2.7708140915688086, | |
| "grad_norm": 12.49002456665039, | |
| "learning_rate": 1.5434907452400937e-06, | |
| "loss": 0.4609, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 2.7708140915688086, | |
| "eval_accuracy": 0.9269387722015381, | |
| "eval_loss": 0.5537921190261841, | |
| "eval_runtime": 14.1539, | |
| "eval_samples_per_second": 173.097, | |
| "eval_steps_per_second": 5.44, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 2.777411267977306, | |
| "grad_norm": 11.501051902770996, | |
| "learning_rate": 1.4993152802933253e-06, | |
| "loss": 0.5015, | |
| "step": 21050 | |
| }, | |
| { | |
| "epoch": 2.784008444385803, | |
| "grad_norm": 11.332602500915527, | |
| "learning_rate": 1.4551398153465565e-06, | |
| "loss": 0.5299, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 2.7906056207943, | |
| "grad_norm": 14.520770072937012, | |
| "learning_rate": 1.410964350399788e-06, | |
| "loss": 0.4363, | |
| "step": 21150 | |
| }, | |
| { | |
| "epoch": 2.797202797202797, | |
| "grad_norm": 19.919044494628906, | |
| "learning_rate": 1.3667888854530195e-06, | |
| "loss": 0.5018, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 2.8037999736112944, | |
| "grad_norm": 17.189006805419922, | |
| "learning_rate": 1.322613420506251e-06, | |
| "loss": 0.5079, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 2.8037999736112944, | |
| "eval_accuracy": 0.9265305995941162, | |
| "eval_loss": 0.5549395680427551, | |
| "eval_runtime": 14.2355, | |
| "eval_samples_per_second": 172.105, | |
| "eval_steps_per_second": 5.409, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 2.8103971500197913, | |
| "grad_norm": 10.528189659118652, | |
| "learning_rate": 1.2784379555594823e-06, | |
| "loss": 0.4467, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 2.8169943264282886, | |
| "grad_norm": 10.16166877746582, | |
| "learning_rate": 1.2342624906127139e-06, | |
| "loss": 0.5769, | |
| "step": 21350 | |
| }, | |
| { | |
| "epoch": 2.823591502836786, | |
| "grad_norm": 5.988204002380371, | |
| "learning_rate": 1.1900870256659452e-06, | |
| "loss": 0.5323, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 2.830188679245283, | |
| "grad_norm": 18.418853759765625, | |
| "learning_rate": 1.1459115607191766e-06, | |
| "loss": 0.4714, | |
| "step": 21450 | |
| }, | |
| { | |
| "epoch": 2.83678585565378, | |
| "grad_norm": 12.283252716064453, | |
| "learning_rate": 1.1017360957724082e-06, | |
| "loss": 0.4491, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 2.83678585565378, | |
| "eval_accuracy": 0.9257143139839172, | |
| "eval_loss": 0.5538486242294312, | |
| "eval_runtime": 14.1053, | |
| "eval_samples_per_second": 173.693, | |
| "eval_steps_per_second": 5.459, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 2.8433830320622775, | |
| "grad_norm": 6.51999568939209, | |
| "learning_rate": 1.0575606308256394e-06, | |
| "loss": 0.4801, | |
| "step": 21550 | |
| }, | |
| { | |
| "epoch": 2.8499802084707744, | |
| "grad_norm": 12.273625373840332, | |
| "learning_rate": 1.013385165878871e-06, | |
| "loss": 0.5132, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 2.8565773848792717, | |
| "grad_norm": 14.393851280212402, | |
| "learning_rate": 9.692097009321024e-07, | |
| "loss": 0.4542, | |
| "step": 21650 | |
| }, | |
| { | |
| "epoch": 2.863174561287769, | |
| "grad_norm": 14.198440551757812, | |
| "learning_rate": 9.250342359853339e-07, | |
| "loss": 0.5015, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 2.869771737696266, | |
| "grad_norm": 17.55302619934082, | |
| "learning_rate": 8.808587710385653e-07, | |
| "loss": 0.4818, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 2.869771737696266, | |
| "eval_accuracy": 0.9277551174163818, | |
| "eval_loss": 0.5553678870201111, | |
| "eval_runtime": 14.0909, | |
| "eval_samples_per_second": 173.871, | |
| "eval_steps_per_second": 5.465, | |
| "step": 21750 | |
| }, | |
| { | |
| "epoch": 2.8763689141047633, | |
| "grad_norm": 14.505319595336914, | |
| "learning_rate": 8.366833060917967e-07, | |
| "loss": 0.5078, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 2.8829660905132606, | |
| "grad_norm": 15.411627769470215, | |
| "learning_rate": 7.92507841145028e-07, | |
| "loss": 0.508, | |
| "step": 21850 | |
| }, | |
| { | |
| "epoch": 2.8895632669217575, | |
| "grad_norm": 17.951257705688477, | |
| "learning_rate": 7.483323761982596e-07, | |
| "loss": 0.5331, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 2.896160443330255, | |
| "grad_norm": 8.112133979797363, | |
| "learning_rate": 7.041569112514909e-07, | |
| "loss": 0.5185, | |
| "step": 21950 | |
| }, | |
| { | |
| "epoch": 2.9027576197387517, | |
| "grad_norm": 16.92367172241211, | |
| "learning_rate": 6.599814463047224e-07, | |
| "loss": 0.4469, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 2.9027576197387517, | |
| "eval_accuracy": 0.9265305995941162, | |
| "eval_loss": 0.555115818977356, | |
| "eval_runtime": 14.0889, | |
| "eval_samples_per_second": 173.896, | |
| "eval_steps_per_second": 5.465, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 2.909354796147249, | |
| "grad_norm": 7.048780918121338, | |
| "learning_rate": 6.158059813579539e-07, | |
| "loss": 0.4762, | |
| "step": 22050 | |
| }, | |
| { | |
| "epoch": 2.915951972555746, | |
| "grad_norm": 16.79896354675293, | |
| "learning_rate": 5.716305164111853e-07, | |
| "loss": 0.5799, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 2.9225491489642432, | |
| "grad_norm": 10.826476097106934, | |
| "learning_rate": 5.274550514644168e-07, | |
| "loss": 0.4978, | |
| "step": 22150 | |
| }, | |
| { | |
| "epoch": 2.9291463253727406, | |
| "grad_norm": 12.840262413024902, | |
| "learning_rate": 4.832795865176481e-07, | |
| "loss": 0.566, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 2.9357435017812374, | |
| "grad_norm": 20.16173553466797, | |
| "learning_rate": 4.391041215708796e-07, | |
| "loss": 0.5837, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 2.9357435017812374, | |
| "eval_accuracy": 0.9269387722015381, | |
| "eval_loss": 0.5531713962554932, | |
| "eval_runtime": 14.0502, | |
| "eval_samples_per_second": 174.375, | |
| "eval_steps_per_second": 5.48, | |
| "step": 22250 | |
| }, | |
| { | |
| "epoch": 2.9423406781897348, | |
| "grad_norm": 12.399968147277832, | |
| "learning_rate": 3.94928656624111e-07, | |
| "loss": 0.5401, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 2.948937854598232, | |
| "grad_norm": 18.302248001098633, | |
| "learning_rate": 3.5075319167734247e-07, | |
| "loss": 0.523, | |
| "step": 22350 | |
| }, | |
| { | |
| "epoch": 2.955535031006729, | |
| "grad_norm": 13.304845809936523, | |
| "learning_rate": 3.0657772673057385e-07, | |
| "loss": 0.5913, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 2.9621322074152263, | |
| "grad_norm": 18.745372772216797, | |
| "learning_rate": 2.624022617838053e-07, | |
| "loss": 0.4701, | |
| "step": 22450 | |
| }, | |
| { | |
| "epoch": 2.9687293838237236, | |
| "grad_norm": 14.230325698852539, | |
| "learning_rate": 2.1822679683703673e-07, | |
| "loss": 0.5568, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 2.9687293838237236, | |
| "eval_accuracy": 0.92734694480896, | |
| "eval_loss": 0.5528694987297058, | |
| "eval_runtime": 14.0688, | |
| "eval_samples_per_second": 174.144, | |
| "eval_steps_per_second": 5.473, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 2.9753265602322205, | |
| "grad_norm": 13.408769607543945, | |
| "learning_rate": 1.7405133189026817e-07, | |
| "loss": 0.5266, | |
| "step": 22550 | |
| }, | |
| { | |
| "epoch": 2.981923736640718, | |
| "grad_norm": 14.969887733459473, | |
| "learning_rate": 1.298758669434996e-07, | |
| "loss": 0.4969, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 2.988520913049215, | |
| "grad_norm": 3.428957939147949, | |
| "learning_rate": 8.570040199673103e-08, | |
| "loss": 0.4917, | |
| "step": 22650 | |
| }, | |
| { | |
| "epoch": 2.995118089457712, | |
| "grad_norm": 9.547283172607422, | |
| "learning_rate": 4.152493704996245e-08, | |
| "loss": 0.5128, | |
| "step": 22700 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 22737, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 250, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 16, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |