| { |
| "best_metric": 0.0860014408826828, |
| "best_model_checkpoint": "./fine-tuned/checkpoint-5000", |
| "epoch": 0.4, |
| "eval_steps": 500, |
| "global_step": 5000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.004, |
| "grad_norm": 0.24035809934139252, |
| "learning_rate": 2.99412e-05, |
| "loss": 0.3973, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.008, |
| "grad_norm": 0.39832672476768494, |
| "learning_rate": 2.98812e-05, |
| "loss": 0.1126, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.012, |
| "grad_norm": 0.2938326299190521, |
| "learning_rate": 2.9821200000000002e-05, |
| "loss": 0.0932, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.016, |
| "grad_norm": 0.18936102092266083, |
| "learning_rate": 2.9761200000000002e-05, |
| "loss": 0.0897, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.02, |
| "grad_norm": 0.18386273086071014, |
| "learning_rate": 2.9701200000000003e-05, |
| "loss": 0.092, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.024, |
| "grad_norm": 0.1968618482351303, |
| "learning_rate": 2.96412e-05, |
| "loss": 0.0845, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.028, |
| "grad_norm": 0.18028958141803741, |
| "learning_rate": 2.95812e-05, |
| "loss": 0.0792, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.032, |
| "grad_norm": 0.2615596652030945, |
| "learning_rate": 2.95212e-05, |
| "loss": 0.0796, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.036, |
| "grad_norm": 0.1913922131061554, |
| "learning_rate": 2.9461200000000002e-05, |
| "loss": 0.0798, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.04, |
| "grad_norm": 0.2351102977991104, |
| "learning_rate": 2.9401200000000002e-05, |
| "loss": 0.081, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.04, |
| "eval_loss": 0.09441258758306503, |
| "eval_runtime": 88.0185, |
| "eval_samples_per_second": 22.722, |
| "eval_steps_per_second": 5.681, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.044, |
| "grad_norm": 0.1859619915485382, |
| "learning_rate": 2.9341200000000003e-05, |
| "loss": 0.077, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.048, |
| "grad_norm": 0.11936317384243011, |
| "learning_rate": 2.92812e-05, |
| "loss": 0.0727, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.052, |
| "grad_norm": 0.2207396775484085, |
| "learning_rate": 2.92212e-05, |
| "loss": 0.0743, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.056, |
| "grad_norm": 0.18488994240760803, |
| "learning_rate": 2.91612e-05, |
| "loss": 0.0824, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.06, |
| "grad_norm": 0.22228538990020752, |
| "learning_rate": 2.9101200000000002e-05, |
| "loss": 0.0716, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.064, |
| "grad_norm": 0.24138867855072021, |
| "learning_rate": 2.9041200000000002e-05, |
| "loss": 0.0814, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.068, |
| "grad_norm": 0.25113552808761597, |
| "learning_rate": 2.89812e-05, |
| "loss": 0.076, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.072, |
| "grad_norm": 0.8853724598884583, |
| "learning_rate": 2.89212e-05, |
| "loss": 0.0781, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.076, |
| "grad_norm": 0.1753206104040146, |
| "learning_rate": 2.88612e-05, |
| "loss": 0.084, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.08, |
| "grad_norm": 0.1704334318637848, |
| "learning_rate": 2.88012e-05, |
| "loss": 0.0769, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.08, |
| "eval_loss": 0.09160277992486954, |
| "eval_runtime": 88.0495, |
| "eval_samples_per_second": 22.714, |
| "eval_steps_per_second": 5.679, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.084, |
| "grad_norm": 0.16729697585105896, |
| "learning_rate": 2.8741200000000002e-05, |
| "loss": 0.0823, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.088, |
| "grad_norm": 0.14851506054401398, |
| "learning_rate": 2.86812e-05, |
| "loss": 0.0785, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.092, |
| "grad_norm": 0.22481797635555267, |
| "learning_rate": 2.86212e-05, |
| "loss": 0.0786, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.096, |
| "grad_norm": 0.13808289170265198, |
| "learning_rate": 2.85612e-05, |
| "loss": 0.0785, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 0.17833128571510315, |
| "learning_rate": 2.85012e-05, |
| "loss": 0.0737, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.104, |
| "grad_norm": 0.14926594495773315, |
| "learning_rate": 2.84412e-05, |
| "loss": 0.0767, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.108, |
| "grad_norm": 0.19346196949481964, |
| "learning_rate": 2.8381200000000002e-05, |
| "loss": 0.077, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.112, |
| "grad_norm": 0.2675027847290039, |
| "learning_rate": 2.83212e-05, |
| "loss": 0.0806, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.116, |
| "grad_norm": 0.19048169255256653, |
| "learning_rate": 2.82612e-05, |
| "loss": 0.0756, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.12, |
| "grad_norm": 0.1875162124633789, |
| "learning_rate": 2.82012e-05, |
| "loss": 0.0823, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.12, |
| "eval_loss": 0.09038107097148895, |
| "eval_runtime": 87.8777, |
| "eval_samples_per_second": 22.759, |
| "eval_steps_per_second": 5.69, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.124, |
| "grad_norm": 0.19083499908447266, |
| "learning_rate": 2.81412e-05, |
| "loss": 0.0736, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.128, |
| "grad_norm": 0.19123569130897522, |
| "learning_rate": 2.80812e-05, |
| "loss": 0.0766, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.132, |
| "grad_norm": 0.24691827595233917, |
| "learning_rate": 2.80212e-05, |
| "loss": 0.0798, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.136, |
| "grad_norm": 0.17910048365592957, |
| "learning_rate": 2.79612e-05, |
| "loss": 0.0687, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.14, |
| "grad_norm": 0.1740667223930359, |
| "learning_rate": 2.79012e-05, |
| "loss": 0.0758, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.144, |
| "grad_norm": 0.15178219974040985, |
| "learning_rate": 2.78412e-05, |
| "loss": 0.0732, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.148, |
| "grad_norm": 0.1904926896095276, |
| "learning_rate": 2.77812e-05, |
| "loss": 0.0734, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.152, |
| "grad_norm": 0.2795208990573883, |
| "learning_rate": 2.77212e-05, |
| "loss": 0.076, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.156, |
| "grad_norm": 0.18160228431224823, |
| "learning_rate": 2.76612e-05, |
| "loss": 0.0716, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.16, |
| "grad_norm": 0.15877611935138702, |
| "learning_rate": 2.76012e-05, |
| "loss": 0.0773, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.16, |
| "eval_loss": 0.08910445868968964, |
| "eval_runtime": 87.8635, |
| "eval_samples_per_second": 22.763, |
| "eval_steps_per_second": 5.691, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.164, |
| "grad_norm": 0.2411368191242218, |
| "learning_rate": 2.75412e-05, |
| "loss": 0.0786, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.168, |
| "grad_norm": 0.16663742065429688, |
| "learning_rate": 2.74812e-05, |
| "loss": 0.0724, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.172, |
| "grad_norm": 0.23420193791389465, |
| "learning_rate": 2.74212e-05, |
| "loss": 0.0653, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.176, |
| "grad_norm": 0.1807372272014618, |
| "learning_rate": 2.7361199999999998e-05, |
| "loss": 0.0676, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.18, |
| "grad_norm": 0.16474364697933197, |
| "learning_rate": 2.73012e-05, |
| "loss": 0.0767, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.184, |
| "grad_norm": 0.17184095084667206, |
| "learning_rate": 2.72412e-05, |
| "loss": 0.0658, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.188, |
| "grad_norm": 0.16993258893489838, |
| "learning_rate": 2.71812e-05, |
| "loss": 0.0755, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.192, |
| "grad_norm": 0.1555277407169342, |
| "learning_rate": 2.71212e-05, |
| "loss": 0.0698, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.196, |
| "grad_norm": 0.09040562808513641, |
| "learning_rate": 2.7061199999999998e-05, |
| "loss": 0.0757, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 0.12910398840904236, |
| "learning_rate": 2.7001199999999998e-05, |
| "loss": 0.0688, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.2, |
| "eval_loss": 0.08841572701931, |
| "eval_runtime": 87.7555, |
| "eval_samples_per_second": 22.791, |
| "eval_steps_per_second": 5.698, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.204, |
| "grad_norm": 0.14937053620815277, |
| "learning_rate": 2.69412e-05, |
| "loss": 0.0727, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.208, |
| "grad_norm": 0.15660254657268524, |
| "learning_rate": 2.68812e-05, |
| "loss": 0.0656, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.212, |
| "grad_norm": 0.09593763947486877, |
| "learning_rate": 2.68212e-05, |
| "loss": 0.0726, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.216, |
| "grad_norm": 0.25192323327064514, |
| "learning_rate": 2.67624e-05, |
| "loss": 0.0753, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.22, |
| "grad_norm": 0.1267642378807068, |
| "learning_rate": 2.67024e-05, |
| "loss": 0.0707, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.224, |
| "grad_norm": 0.13844658434391022, |
| "learning_rate": 2.66424e-05, |
| "loss": 0.0711, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.228, |
| "grad_norm": 0.15095186233520508, |
| "learning_rate": 2.65824e-05, |
| "loss": 0.0696, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.232, |
| "grad_norm": 0.09553442895412445, |
| "learning_rate": 2.65224e-05, |
| "loss": 0.0709, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.236, |
| "grad_norm": 0.21425922214984894, |
| "learning_rate": 2.64624e-05, |
| "loss": 0.0662, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.24, |
| "grad_norm": 0.10706017166376114, |
| "learning_rate": 2.64024e-05, |
| "loss": 0.0721, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.24, |
| "eval_loss": 0.08755213767290115, |
| "eval_runtime": 87.7619, |
| "eval_samples_per_second": 22.789, |
| "eval_steps_per_second": 5.697, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.244, |
| "grad_norm": 0.18366344273090363, |
| "learning_rate": 2.63424e-05, |
| "loss": 0.0781, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.248, |
| "grad_norm": 0.15975314378738403, |
| "learning_rate": 2.62824e-05, |
| "loss": 0.0771, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.252, |
| "grad_norm": 0.14510446786880493, |
| "learning_rate": 2.6222399999999998e-05, |
| "loss": 0.0734, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.256, |
| "grad_norm": 0.10040156543254852, |
| "learning_rate": 2.61624e-05, |
| "loss": 0.0624, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.26, |
| "grad_norm": 0.1453912854194641, |
| "learning_rate": 2.61024e-05, |
| "loss": 0.0661, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.264, |
| "grad_norm": 0.13999666273593903, |
| "learning_rate": 2.60424e-05, |
| "loss": 0.0694, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.268, |
| "grad_norm": 0.13396582007408142, |
| "learning_rate": 2.59824e-05, |
| "loss": 0.0692, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.272, |
| "grad_norm": 0.1334969699382782, |
| "learning_rate": 2.59224e-05, |
| "loss": 0.0629, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.276, |
| "grad_norm": 0.16296976804733276, |
| "learning_rate": 2.5862399999999998e-05, |
| "loss": 0.0679, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.28, |
| "grad_norm": 0.1321544647216797, |
| "learning_rate": 2.58024e-05, |
| "loss": 0.0764, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.28, |
| "eval_loss": 0.0869474709033966, |
| "eval_runtime": 87.6679, |
| "eval_samples_per_second": 22.813, |
| "eval_steps_per_second": 5.703, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.284, |
| "grad_norm": 0.13956592977046967, |
| "learning_rate": 2.57424e-05, |
| "loss": 0.0749, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.288, |
| "grad_norm": 0.2318839579820633, |
| "learning_rate": 2.56824e-05, |
| "loss": 0.0658, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.292, |
| "grad_norm": 0.10267651081085205, |
| "learning_rate": 2.56224e-05, |
| "loss": 0.0651, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.296, |
| "grad_norm": 0.13903647661209106, |
| "learning_rate": 2.5562399999999998e-05, |
| "loss": 0.0684, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 0.07369179278612137, |
| "learning_rate": 2.5502399999999998e-05, |
| "loss": 0.0643, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.304, |
| "grad_norm": 0.18283645808696747, |
| "learning_rate": 2.54424e-05, |
| "loss": 0.0745, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.308, |
| "grad_norm": 0.1031743511557579, |
| "learning_rate": 2.5382400000000003e-05, |
| "loss": 0.0678, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.312, |
| "grad_norm": 0.2246129959821701, |
| "learning_rate": 2.5322400000000003e-05, |
| "loss": 0.0701, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.316, |
| "grad_norm": 0.1434032917022705, |
| "learning_rate": 2.52624e-05, |
| "loss": 0.071, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.32, |
| "grad_norm": 0.11730857193470001, |
| "learning_rate": 2.52024e-05, |
| "loss": 0.0696, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.32, |
| "eval_loss": 0.0867869183421135, |
| "eval_runtime": 87.7801, |
| "eval_samples_per_second": 22.784, |
| "eval_steps_per_second": 5.696, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.324, |
| "grad_norm": 0.07968447357416153, |
| "learning_rate": 2.5142400000000002e-05, |
| "loss": 0.0635, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.328, |
| "grad_norm": 0.1641731858253479, |
| "learning_rate": 2.5082400000000002e-05, |
| "loss": 0.0667, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.332, |
| "grad_norm": 0.15083415806293488, |
| "learning_rate": 2.5022400000000003e-05, |
| "loss": 0.071, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.336, |
| "grad_norm": 0.16534006595611572, |
| "learning_rate": 2.4962400000000003e-05, |
| "loss": 0.0735, |
| "step": 4200 |
| }, |
| { |
| "epoch": 0.34, |
| "grad_norm": 0.1629945933818817, |
| "learning_rate": 2.49024e-05, |
| "loss": 0.0641, |
| "step": 4250 |
| }, |
| { |
| "epoch": 0.344, |
| "grad_norm": 0.11393357813358307, |
| "learning_rate": 2.48424e-05, |
| "loss": 0.0679, |
| "step": 4300 |
| }, |
| { |
| "epoch": 0.348, |
| "grad_norm": 0.10581399500370026, |
| "learning_rate": 2.4782400000000002e-05, |
| "loss": 0.0666, |
| "step": 4350 |
| }, |
| { |
| "epoch": 0.352, |
| "grad_norm": 0.1571837067604065, |
| "learning_rate": 2.4722400000000002e-05, |
| "loss": 0.0619, |
| "step": 4400 |
| }, |
| { |
| "epoch": 0.356, |
| "grad_norm": 0.11944606155157089, |
| "learning_rate": 2.4662400000000003e-05, |
| "loss": 0.0666, |
| "step": 4450 |
| }, |
| { |
| "epoch": 0.36, |
| "grad_norm": 0.17318391799926758, |
| "learning_rate": 2.46024e-05, |
| "loss": 0.0684, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.36, |
| "eval_loss": 0.08653330057859421, |
| "eval_runtime": 87.7823, |
| "eval_samples_per_second": 22.784, |
| "eval_steps_per_second": 5.696, |
| "step": 4500 |
| }, |
| { |
| "epoch": 0.364, |
| "grad_norm": 0.09011202305555344, |
| "learning_rate": 2.45424e-05, |
| "loss": 0.0699, |
| "step": 4550 |
| }, |
| { |
| "epoch": 0.368, |
| "grad_norm": 0.1441943347454071, |
| "learning_rate": 2.44824e-05, |
| "loss": 0.0634, |
| "step": 4600 |
| }, |
| { |
| "epoch": 0.372, |
| "grad_norm": 0.17736917734146118, |
| "learning_rate": 2.4422400000000002e-05, |
| "loss": 0.0688, |
| "step": 4650 |
| }, |
| { |
| "epoch": 0.376, |
| "grad_norm": 0.16143649816513062, |
| "learning_rate": 2.4362400000000002e-05, |
| "loss": 0.0575, |
| "step": 4700 |
| }, |
| { |
| "epoch": 0.38, |
| "grad_norm": 0.1789257675409317, |
| "learning_rate": 2.43024e-05, |
| "loss": 0.067, |
| "step": 4750 |
| }, |
| { |
| "epoch": 0.384, |
| "grad_norm": 0.13589045405387878, |
| "learning_rate": 2.42424e-05, |
| "loss": 0.0713, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.388, |
| "grad_norm": 0.12760789692401886, |
| "learning_rate": 2.41824e-05, |
| "loss": 0.0644, |
| "step": 4850 |
| }, |
| { |
| "epoch": 0.392, |
| "grad_norm": 0.17507490515708923, |
| "learning_rate": 2.41224e-05, |
| "loss": 0.0634, |
| "step": 4900 |
| }, |
| { |
| "epoch": 0.396, |
| "grad_norm": 0.15503354370594025, |
| "learning_rate": 2.4062400000000002e-05, |
| "loss": 0.0654, |
| "step": 4950 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 0.12309867143630981, |
| "learning_rate": 2.4002400000000002e-05, |
| "loss": 0.0698, |
| "step": 5000 |
| }, |
| { |
| "epoch": 0.4, |
| "eval_loss": 0.0860014408826828, |
| "eval_runtime": 87.7287, |
| "eval_samples_per_second": 22.798, |
| "eval_steps_per_second": 5.699, |
| "step": 5000 |
| } |
| ], |
| "logging_steps": 50, |
| "max_steps": 25000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.21791578112e+16, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|