| { |
| "best_metric": 0.08412499725818634, |
| "best_model_checkpoint": "./fine-tuned/checkpoint-7500", |
| "epoch": 0.6, |
| "eval_steps": 500, |
| "global_step": 7500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.004, |
| "grad_norm": 0.24035809934139252, |
| "learning_rate": 2.99412e-05, |
| "loss": 0.3973, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.008, |
| "grad_norm": 0.39832672476768494, |
| "learning_rate": 2.98812e-05, |
| "loss": 0.1126, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.012, |
| "grad_norm": 0.2938326299190521, |
| "learning_rate": 2.9821200000000002e-05, |
| "loss": 0.0932, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.016, |
| "grad_norm": 0.18936102092266083, |
| "learning_rate": 2.9761200000000002e-05, |
| "loss": 0.0897, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 0.18386273086071014, |
| "learning_rate": 2.9701200000000003e-05, |
| "loss": 0.092, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.024, |
| "grad_norm": 0.1968618482351303, |
| "learning_rate": 2.96412e-05, |
| "loss": 0.0845, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.028, |
| "grad_norm": 0.18028958141803741, |
| "learning_rate": 2.95812e-05, |
| "loss": 0.0792, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.032, |
| "grad_norm": 0.2615596652030945, |
| "learning_rate": 2.95212e-05, |
| "loss": 0.0796, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.036, |
| "grad_norm": 0.1913922131061554, |
| "learning_rate": 2.9461200000000002e-05, |
| "loss": 0.0798, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 0.2351102977991104, |
| "learning_rate": 2.9401200000000002e-05, |
| "loss": 0.081, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.04, |
| "eval_loss": 0.09441258758306503, |
| "eval_runtime": 88.0185, |
| "eval_samples_per_second": 22.722, |
| "eval_steps_per_second": 5.681, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.044, |
| "grad_norm": 0.1859619915485382, |
| "learning_rate": 2.9341200000000003e-05, |
| "loss": 0.077, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.048, |
| "grad_norm": 0.11936317384243011, |
| "learning_rate": 2.92812e-05, |
| "loss": 0.0727, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.052, |
| "grad_norm": 0.2207396775484085, |
| "learning_rate": 2.92212e-05, |
| "loss": 0.0743, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.056, |
| "grad_norm": 0.18488994240760803, |
| "learning_rate": 2.91612e-05, |
| "loss": 0.0824, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 0.22228538990020752, |
| "learning_rate": 2.9101200000000002e-05, |
| "loss": 0.0716, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.064, |
| "grad_norm": 0.24138867855072021, |
| "learning_rate": 2.9041200000000002e-05, |
| "loss": 0.0814, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.068, |
| "grad_norm": 0.25113552808761597, |
| "learning_rate": 2.89812e-05, |
| "loss": 0.076, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.072, |
| "grad_norm": 0.8853724598884583, |
| "learning_rate": 2.89212e-05, |
| "loss": 0.0781, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.076, |
| "grad_norm": 0.1753206104040146, |
| "learning_rate": 2.88612e-05, |
| "loss": 0.084, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.1704334318637848, |
| "learning_rate": 2.88012e-05, |
| "loss": 0.0769, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.08, |
| "eval_loss": 0.09160277992486954, |
| "eval_runtime": 88.0495, |
| "eval_samples_per_second": 22.714, |
| "eval_steps_per_second": 5.679, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.084, |
| "grad_norm": 0.16729697585105896, |
| "learning_rate": 2.8741200000000002e-05, |
| "loss": 0.0823, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.088, |
| "grad_norm": 0.14851506054401398, |
| "learning_rate": 2.86812e-05, |
| "loss": 0.0785, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.092, |
| "grad_norm": 0.22481797635555267, |
| "learning_rate": 2.86212e-05, |
| "loss": 0.0786, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.096, |
| "grad_norm": 0.13808289170265198, |
| "learning_rate": 2.85612e-05, |
| "loss": 0.0785, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.17833128571510315, |
| "learning_rate": 2.85012e-05, |
| "loss": 0.0737, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.104, |
| "grad_norm": 0.14926594495773315, |
| "learning_rate": 2.84412e-05, |
| "loss": 0.0767, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.108, |
| "grad_norm": 0.19346196949481964, |
| "learning_rate": 2.8381200000000002e-05, |
| "loss": 0.077, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.112, |
| "grad_norm": 0.2675027847290039, |
| "learning_rate": 2.83212e-05, |
| "loss": 0.0806, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.116, |
| "grad_norm": 0.19048169255256653, |
| "learning_rate": 2.82612e-05, |
| "loss": 0.0756, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.1875162124633789, |
| "learning_rate": 2.82012e-05, |
| "loss": 0.0823, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.12, |
| "eval_loss": 0.09038107097148895, |
| "eval_runtime": 87.8777, |
| "eval_samples_per_second": 22.759, |
| "eval_steps_per_second": 5.69, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.124, |
| "grad_norm": 0.19083499908447266, |
| "learning_rate": 2.81412e-05, |
| "loss": 0.0736, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.128, |
| "grad_norm": 0.19123569130897522, |
| "learning_rate": 2.80812e-05, |
| "loss": 0.0766, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.132, |
| "grad_norm": 0.24691827595233917, |
| "learning_rate": 2.80212e-05, |
| "loss": 0.0798, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.136, |
| "grad_norm": 0.17910048365592957, |
| "learning_rate": 2.79612e-05, |
| "loss": 0.0687, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 0.1740667223930359, |
| "learning_rate": 2.79012e-05, |
| "loss": 0.0758, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.144, |
| "grad_norm": 0.15178219974040985, |
| "learning_rate": 2.78412e-05, |
| "loss": 0.0732, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.148, |
| "grad_norm": 0.1904926896095276, |
| "learning_rate": 2.77812e-05, |
| "loss": 0.0734, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.152, |
| "grad_norm": 0.2795208990573883, |
| "learning_rate": 2.77212e-05, |
| "loss": 0.076, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.156, |
| "grad_norm": 0.18160228431224823, |
| "learning_rate": 2.76612e-05, |
| "loss": 0.0716, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 0.15877611935138702, |
| "learning_rate": 2.76012e-05, |
| "loss": 0.0773, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.16, |
| "eval_loss": 0.08910445868968964, |
| "eval_runtime": 87.8635, |
| "eval_samples_per_second": 22.763, |
| "eval_steps_per_second": 5.691, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.164, |
| "grad_norm": 0.2411368191242218, |
| "learning_rate": 2.75412e-05, |
| "loss": 0.0786, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.168, |
| "grad_norm": 0.16663742065429688, |
| "learning_rate": 2.74812e-05, |
| "loss": 0.0724, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.172, |
| "grad_norm": 0.23420193791389465, |
| "learning_rate": 2.74212e-05, |
| "loss": 0.0653, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.176, |
| "grad_norm": 0.1807372272014618, |
| "learning_rate": 2.7361199999999998e-05, |
| "loss": 0.0676, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 0.16474364697933197, |
| "learning_rate": 2.73012e-05, |
| "loss": 0.0767, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.184, |
| "grad_norm": 0.17184095084667206, |
| "learning_rate": 2.72412e-05, |
| "loss": 0.0658, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.188, |
| "grad_norm": 0.16993258893489838, |
| "learning_rate": 2.71812e-05, |
| "loss": 0.0755, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.192, |
| "grad_norm": 0.1555277407169342, |
| "learning_rate": 2.71212e-05, |
| "loss": 0.0698, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.196, |
| "grad_norm": 0.09040562808513641, |
| "learning_rate": 2.7061199999999998e-05, |
| "loss": 0.0757, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 0.12910398840904236, |
| "learning_rate": 2.7001199999999998e-05, |
| "loss": 0.0688, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.2, |
| "eval_loss": 0.08841572701931, |
| "eval_runtime": 87.7555, |
| "eval_samples_per_second": 22.791, |
| "eval_steps_per_second": 5.698, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.204, |
| "grad_norm": 0.14937053620815277, |
| "learning_rate": 2.69412e-05, |
| "loss": 0.0727, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.208, |
| "grad_norm": 0.15660254657268524, |
| "learning_rate": 2.68812e-05, |
| "loss": 0.0656, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.212, |
| "grad_norm": 0.09593763947486877, |
| "learning_rate": 2.68212e-05, |
| "loss": 0.0726, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.216, |
| "grad_norm": 0.25192323327064514, |
| "learning_rate": 2.67624e-05, |
| "loss": 0.0753, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 0.1267642378807068, |
| "learning_rate": 2.67024e-05, |
| "loss": 0.0707, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.224, |
| "grad_norm": 0.13844658434391022, |
| "learning_rate": 2.66424e-05, |
| "loss": 0.0711, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.228, |
| "grad_norm": 0.15095186233520508, |
| "learning_rate": 2.65824e-05, |
| "loss": 0.0696, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.232, |
| "grad_norm": 0.09553442895412445, |
| "learning_rate": 2.65224e-05, |
| "loss": 0.0709, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.236, |
| "grad_norm": 0.21425922214984894, |
| "learning_rate": 2.64624e-05, |
| "loss": 0.0662, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 0.10706017166376114, |
| "learning_rate": 2.64024e-05, |
| "loss": 0.0721, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.24, |
| "eval_loss": 0.08755213767290115, |
| "eval_runtime": 87.7619, |
| "eval_samples_per_second": 22.789, |
| "eval_steps_per_second": 5.697, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.244, |
| "grad_norm": 0.18366344273090363, |
| "learning_rate": 2.63424e-05, |
| "loss": 0.0781, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.248, |
| "grad_norm": 0.15975314378738403, |
| "learning_rate": 2.62824e-05, |
| "loss": 0.0771, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.252, |
| "grad_norm": 0.14510446786880493, |
| "learning_rate": 2.6222399999999998e-05, |
| "loss": 0.0734, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.256, |
| "grad_norm": 0.10040156543254852, |
| "learning_rate": 2.61624e-05, |
| "loss": 0.0624, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 0.1453912854194641, |
| "learning_rate": 2.61024e-05, |
| "loss": 0.0661, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.264, |
| "grad_norm": 0.13999666273593903, |
| "learning_rate": 2.60424e-05, |
| "loss": 0.0694, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.268, |
| "grad_norm": 0.13396582007408142, |
| "learning_rate": 2.59824e-05, |
| "loss": 0.0692, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.272, |
| "grad_norm": 0.1334969699382782, |
| "learning_rate": 2.59224e-05, |
| "loss": 0.0629, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.276, |
| "grad_norm": 0.16296976804733276, |
| "learning_rate": 2.5862399999999998e-05, |
| "loss": 0.0679, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 0.1321544647216797, |
| "learning_rate": 2.58024e-05, |
| "loss": 0.0764, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.28, |
| "eval_loss": 0.0869474709033966, |
| "eval_runtime": 87.6679, |
| "eval_samples_per_second": 22.813, |
| "eval_steps_per_second": 5.703, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.284, |
| "grad_norm": 0.13956592977046967, |
| "learning_rate": 2.57424e-05, |
| "loss": 0.0749, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.288, |
| "grad_norm": 0.2318839579820633, |
| "learning_rate": 2.56824e-05, |
| "loss": 0.0658, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.292, |
| "grad_norm": 0.10267651081085205, |
| "learning_rate": 2.56224e-05, |
| "loss": 0.0651, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.296, |
| "grad_norm": 0.13903647661209106, |
| "learning_rate": 2.5562399999999998e-05, |
| "loss": 0.0684, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 0.07369179278612137, |
| "learning_rate": 2.5502399999999998e-05, |
| "loss": 0.0643, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.304, |
| "grad_norm": 0.18283645808696747, |
| "learning_rate": 2.54424e-05, |
| "loss": 0.0745, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.308, |
| "grad_norm": 0.1031743511557579, |
| "learning_rate": 2.5382400000000003e-05, |
| "loss": 0.0678, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.312, |
| "grad_norm": 0.2246129959821701, |
| "learning_rate": 2.5322400000000003e-05, |
| "loss": 0.0701, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.316, |
| "grad_norm": 0.1434032917022705, |
| "learning_rate": 2.52624e-05, |
| "loss": 0.071, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 0.11730857193470001, |
| "learning_rate": 2.52024e-05, |
| "loss": 0.0696, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.32, |
| "eval_loss": 0.0867869183421135, |
| "eval_runtime": 87.7801, |
| "eval_samples_per_second": 22.784, |
| "eval_steps_per_second": 5.696, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.324, |
| "grad_norm": 0.07968447357416153, |
| "learning_rate": 2.5142400000000002e-05, |
| "loss": 0.0635, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.328, |
| "grad_norm": 0.1641731858253479, |
| "learning_rate": 2.5082400000000002e-05, |
| "loss": 0.0667, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.332, |
| "grad_norm": 0.15083415806293488, |
| "learning_rate": 2.5022400000000003e-05, |
| "loss": 0.071, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.336, |
| "grad_norm": 0.16534006595611572, |
| "learning_rate": 2.4962400000000003e-05, |
| "loss": 0.0735, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 0.1629945933818817, |
| "learning_rate": 2.49024e-05, |
| "loss": 0.0641, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.344, |
| "grad_norm": 0.11393357813358307, |
| "learning_rate": 2.48424e-05, |
| "loss": 0.0679, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.348, |
| "grad_norm": 0.10581399500370026, |
| "learning_rate": 2.4782400000000002e-05, |
| "loss": 0.0666, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.352, |
| "grad_norm": 0.1571837067604065, |
| "learning_rate": 2.4722400000000002e-05, |
| "loss": 0.0619, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.356, |
| "grad_norm": 0.11944606155157089, |
| "learning_rate": 2.4662400000000003e-05, |
| "loss": 0.0666, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 0.17318391799926758, |
| "learning_rate": 2.46024e-05, |
| "loss": 0.0684, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.36, |
| "eval_loss": 0.08653330057859421, |
| "eval_runtime": 87.7823, |
| "eval_samples_per_second": 22.784, |
| "eval_steps_per_second": 5.696, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.364, |
| "grad_norm": 0.09011202305555344, |
| "learning_rate": 2.45424e-05, |
| "loss": 0.0699, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.368, |
| "grad_norm": 0.1441943347454071, |
| "learning_rate": 2.44824e-05, |
| "loss": 0.0634, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.372, |
| "grad_norm": 0.17736917734146118, |
| "learning_rate": 2.4422400000000002e-05, |
| "loss": 0.0688, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.376, |
| "grad_norm": 0.16143649816513062, |
| "learning_rate": 2.4362400000000002e-05, |
| "loss": 0.0575, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 0.1789257675409317, |
| "learning_rate": 2.43024e-05, |
| "loss": 0.067, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.384, |
| "grad_norm": 0.13589045405387878, |
| "learning_rate": 2.42424e-05, |
| "loss": 0.0713, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.388, |
| "grad_norm": 0.12760789692401886, |
| "learning_rate": 2.41824e-05, |
| "loss": 0.0644, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.392, |
| "grad_norm": 0.17507490515708923, |
| "learning_rate": 2.41224e-05, |
| "loss": 0.0634, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.396, |
| "grad_norm": 0.15503354370594025, |
| "learning_rate": 2.4062400000000002e-05, |
| "loss": 0.0654, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 0.12309867143630981, |
| "learning_rate": 2.4002400000000002e-05, |
| "loss": 0.0698, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.4, |
| "eval_loss": 0.0860014408826828, |
| "eval_runtime": 87.7287, |
| "eval_samples_per_second": 22.798, |
| "eval_steps_per_second": 5.699, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.404, |
| "grad_norm": 0.1547604650259018, |
| "learning_rate": 2.39424e-05, |
| "loss": 0.0691, |
| "step": 5050 |
| }, |
| { |
| "epoch": 0.408, |
| "grad_norm": 0.17712494730949402, |
| "learning_rate": 2.38824e-05, |
| "loss": 0.064, |
| "step": 5100 |
| }, |
| { |
| "epoch": 0.412, |
| "grad_norm": 0.16606800258159637, |
| "learning_rate": 2.38224e-05, |
| "loss": 0.0687, |
| "step": 5150 |
| }, |
| { |
| "epoch": 0.416, |
| "grad_norm": 0.18862195312976837, |
| "learning_rate": 2.37624e-05, |
| "loss": 0.0662, |
| "step": 5200 |
| }, |
| { |
| "epoch": 0.42, |
| "grad_norm": 0.1778397411108017, |
| "learning_rate": 2.3702400000000002e-05, |
| "loss": 0.0632, |
| "step": 5250 |
| }, |
| { |
| "epoch": 0.424, |
| "grad_norm": 0.15363118052482605, |
| "learning_rate": 2.36424e-05, |
| "loss": 0.0617, |
| "step": 5300 |
| }, |
| { |
| "epoch": 0.428, |
| "grad_norm": 0.1690954864025116, |
| "learning_rate": 2.35824e-05, |
| "loss": 0.0717, |
| "step": 5350 |
| }, |
| { |
| "epoch": 0.432, |
| "grad_norm": 0.19384829699993134, |
| "learning_rate": 2.35224e-05, |
| "loss": 0.0669, |
| "step": 5400 |
| }, |
| { |
| "epoch": 0.436, |
| "grad_norm": 0.20759643614292145, |
| "learning_rate": 2.34624e-05, |
| "loss": 0.0662, |
| "step": 5450 |
| }, |
| { |
| "epoch": 0.44, |
| "grad_norm": 0.16528765857219696, |
| "learning_rate": 2.34024e-05, |
| "loss": 0.0737, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.44, |
| "eval_loss": 0.08539459854364395, |
| "eval_runtime": 87.7109, |
| "eval_samples_per_second": 22.802, |
| "eval_steps_per_second": 5.701, |
| "step": 5500 |
| }, |
| { |
| "epoch": 0.444, |
| "grad_norm": 0.09632231295108795, |
| "learning_rate": 2.3342400000000002e-05, |
| "loss": 0.0621, |
| "step": 5550 |
| }, |
| { |
| "epoch": 0.448, |
| "grad_norm": 0.15080232918262482, |
| "learning_rate": 2.32824e-05, |
| "loss": 0.0674, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.452, |
| "grad_norm": 0.14794333279132843, |
| "learning_rate": 2.32224e-05, |
| "loss": 0.0662, |
| "step": 5650 |
| }, |
| { |
| "epoch": 0.456, |
| "grad_norm": 0.15014077723026276, |
| "learning_rate": 2.31624e-05, |
| "loss": 0.0574, |
| "step": 5700 |
| }, |
| { |
| "epoch": 0.46, |
| "grad_norm": 0.10953331738710403, |
| "learning_rate": 2.31024e-05, |
| "loss": 0.0684, |
| "step": 5750 |
| }, |
| { |
| "epoch": 0.464, |
| "grad_norm": 0.17434537410736084, |
| "learning_rate": 2.30424e-05, |
| "loss": 0.0687, |
| "step": 5800 |
| }, |
| { |
| "epoch": 0.468, |
| "grad_norm": 0.12245655059814453, |
| "learning_rate": 2.29824e-05, |
| "loss": 0.0689, |
| "step": 5850 |
| }, |
| { |
| "epoch": 0.472, |
| "grad_norm": 0.10118559747934341, |
| "learning_rate": 2.29224e-05, |
| "loss": 0.0609, |
| "step": 5900 |
| }, |
| { |
| "epoch": 0.476, |
| "grad_norm": 0.14768172800540924, |
| "learning_rate": 2.28624e-05, |
| "loss": 0.0717, |
| "step": 5950 |
| }, |
| { |
| "epoch": 0.48, |
| "grad_norm": 0.18510164320468903, |
| "learning_rate": 2.28024e-05, |
| "loss": 0.0716, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.48, |
| "eval_loss": 0.08568704128265381, |
| "eval_runtime": 87.7082, |
| "eval_samples_per_second": 22.803, |
| "eval_steps_per_second": 5.701, |
| "step": 6000 |
| }, |
| { |
| "epoch": 0.484, |
| "grad_norm": 0.09803249686956406, |
| "learning_rate": 2.27424e-05, |
| "loss": 0.0693, |
| "step": 6050 |
| }, |
| { |
| "epoch": 0.488, |
| "grad_norm": 0.1030198410153389, |
| "learning_rate": 2.2682399999999998e-05, |
| "loss": 0.065, |
| "step": 6100 |
| }, |
| { |
| "epoch": 0.492, |
| "grad_norm": 0.15723823010921478, |
| "learning_rate": 2.26224e-05, |
| "loss": 0.063, |
| "step": 6150 |
| }, |
| { |
| "epoch": 0.496, |
| "grad_norm": 0.1828642040491104, |
| "learning_rate": 2.25624e-05, |
| "loss": 0.0665, |
| "step": 6200 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 0.11824677884578705, |
| "learning_rate": 2.25024e-05, |
| "loss": 0.0656, |
| "step": 6250 |
| }, |
| { |
| "epoch": 0.504, |
| "grad_norm": 0.07376304268836975, |
| "learning_rate": 2.24424e-05, |
| "loss": 0.0664, |
| "step": 6300 |
| }, |
| { |
| "epoch": 0.508, |
| "grad_norm": 0.09230540692806244, |
| "learning_rate": 2.23824e-05, |
| "loss": 0.0668, |
| "step": 6350 |
| }, |
| { |
| "epoch": 0.512, |
| "grad_norm": 0.1208396852016449, |
| "learning_rate": 2.2322399999999998e-05, |
| "loss": 0.0642, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.516, |
| "grad_norm": 0.12650001049041748, |
| "learning_rate": 2.22624e-05, |
| "loss": 0.0656, |
| "step": 6450 |
| }, |
| { |
| "epoch": 0.52, |
| "grad_norm": 0.11461616307497025, |
| "learning_rate": 2.22024e-05, |
| "loss": 0.0676, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.52, |
| "eval_loss": 0.08497656136751175, |
| "eval_runtime": 87.668, |
| "eval_samples_per_second": 22.813, |
| "eval_steps_per_second": 5.703, |
| "step": 6500 |
| }, |
| { |
| "epoch": 0.524, |
| "grad_norm": 0.24468739330768585, |
| "learning_rate": 2.21424e-05, |
| "loss": 0.0688, |
| "step": 6550 |
| }, |
| { |
| "epoch": 0.528, |
| "grad_norm": 0.17887485027313232, |
| "learning_rate": 2.20824e-05, |
| "loss": 0.0645, |
| "step": 6600 |
| }, |
| { |
| "epoch": 0.532, |
| "grad_norm": 0.12986980378627777, |
| "learning_rate": 2.2022399999999998e-05, |
| "loss": 0.0609, |
| "step": 6650 |
| }, |
| { |
| "epoch": 0.536, |
| "grad_norm": 0.25361281633377075, |
| "learning_rate": 2.1962399999999998e-05, |
| "loss": 0.0603, |
| "step": 6700 |
| }, |
| { |
| "epoch": 0.54, |
| "grad_norm": 0.1815791130065918, |
| "learning_rate": 2.19024e-05, |
| "loss": 0.0659, |
| "step": 6750 |
| }, |
| { |
| "epoch": 0.544, |
| "grad_norm": 0.12782719731330872, |
| "learning_rate": 2.18424e-05, |
| "loss": 0.0641, |
| "step": 6800 |
| }, |
| { |
| "epoch": 0.548, |
| "grad_norm": 0.1801528036594391, |
| "learning_rate": 2.17824e-05, |
| "loss": 0.0666, |
| "step": 6850 |
| }, |
| { |
| "epoch": 0.552, |
| "grad_norm": 0.1247314065694809, |
| "learning_rate": 2.17224e-05, |
| "loss": 0.0592, |
| "step": 6900 |
| }, |
| { |
| "epoch": 0.556, |
| "grad_norm": 0.19411933422088623, |
| "learning_rate": 2.16624e-05, |
| "loss": 0.0688, |
| "step": 6950 |
| }, |
| { |
| "epoch": 0.56, |
| "grad_norm": 0.11316727846860886, |
| "learning_rate": 2.1602400000000002e-05, |
| "loss": 0.0635, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.56, |
| "eval_loss": 0.08500248938798904, |
| "eval_runtime": 87.7238, |
| "eval_samples_per_second": 22.799, |
| "eval_steps_per_second": 5.7, |
| "step": 7000 |
| }, |
| { |
| "epoch": 0.564, |
| "grad_norm": 0.14090943336486816, |
| "learning_rate": 2.1542400000000002e-05, |
| "loss": 0.0648, |
| "step": 7050 |
| }, |
| { |
| "epoch": 0.568, |
| "grad_norm": 0.16401338577270508, |
| "learning_rate": 2.1482400000000003e-05, |
| "loss": 0.064, |
| "step": 7100 |
| }, |
| { |
| "epoch": 0.572, |
| "grad_norm": 0.14420969784259796, |
| "learning_rate": 2.1422400000000003e-05, |
| "loss": 0.0627, |
| "step": 7150 |
| }, |
| { |
| "epoch": 0.576, |
| "grad_norm": 0.3247956931591034, |
| "learning_rate": 2.13624e-05, |
| "loss": 0.0621, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.58, |
| "grad_norm": 0.1672086864709854, |
| "learning_rate": 2.13024e-05, |
| "loss": 0.0673, |
| "step": 7250 |
| }, |
| { |
| "epoch": 0.584, |
| "grad_norm": 0.16130150854587555, |
| "learning_rate": 2.1242400000000002e-05, |
| "loss": 0.0604, |
| "step": 7300 |
| }, |
| { |
| "epoch": 0.588, |
| "grad_norm": 0.201412633061409, |
| "learning_rate": 2.1182400000000002e-05, |
| "loss": 0.0687, |
| "step": 7350 |
| }, |
| { |
| "epoch": 0.592, |
| "grad_norm": 0.17173218727111816, |
| "learning_rate": 2.1122400000000003e-05, |
| "loss": 0.0626, |
| "step": 7400 |
| }, |
| { |
| "epoch": 0.596, |
| "grad_norm": 0.14102816581726074, |
| "learning_rate": 2.10624e-05, |
| "loss": 0.0662, |
| "step": 7450 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 0.12381123006343842, |
| "learning_rate": 2.10024e-05, |
| "loss": 0.0676, |
| "step": 7500 |
| }, |
| { |
| "epoch": 0.6, |
| "eval_loss": 0.08412499725818634, |
| "eval_runtime": 87.8551, |
| "eval_samples_per_second": 22.765, |
| "eval_steps_per_second": 5.691, |
| "step": 7500 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 25000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.82687367168e+16, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|