| { | |
| "best_global_step": 95000, | |
| "best_metric": 2.405331611633301, | |
| "best_model_checkpoint": "output/checkpoint-95000", | |
| "epoch": 76.0, | |
| "eval_steps": 500, | |
| "global_step": 95000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 1.3970232009887695, | |
| "learning_rate": 4.9800000000000004e-05, | |
| "loss": 6.725, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 1.4344754219055176, | |
| "learning_rate": 4.96e-05, | |
| "loss": 6.1958, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 6.076746940612793, | |
| "eval_runtime": 61.2746, | |
| "eval_samples_per_second": 163.2, | |
| "eval_steps_per_second": 5.108, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 1.2, | |
| "grad_norm": 1.259356141090393, | |
| "learning_rate": 4.94e-05, | |
| "loss": 6.118, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.6, | |
| "grad_norm": 1.1414670944213867, | |
| "learning_rate": 4.92e-05, | |
| "loss": 6.0705, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 1.2967591285705566, | |
| "learning_rate": 4.9e-05, | |
| "loss": 6.0444, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 6.011667251586914, | |
| "eval_runtime": 61.2402, | |
| "eval_samples_per_second": 163.291, | |
| "eval_steps_per_second": 5.111, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.4, | |
| "grad_norm": 1.572110652923584, | |
| "learning_rate": 4.88e-05, | |
| "loss": 6.0122, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 2.8, | |
| "grad_norm": 1.6025742292404175, | |
| "learning_rate": 4.86e-05, | |
| "loss": 5.991, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 5.93717622756958, | |
| "eval_runtime": 61.2634, | |
| "eval_samples_per_second": 163.23, | |
| "eval_steps_per_second": 5.109, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 3.2, | |
| "grad_norm": 1.4724808931350708, | |
| "learning_rate": 4.8400000000000004e-05, | |
| "loss": 5.9594, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 3.6, | |
| "grad_norm": 1.7168885469436646, | |
| "learning_rate": 4.82e-05, | |
| "loss": 5.8787, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "grad_norm": 1.6915048360824585, | |
| "learning_rate": 4.8e-05, | |
| "loss": 5.7669, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 5.624746322631836, | |
| "eval_runtime": 61.2596, | |
| "eval_samples_per_second": 163.24, | |
| "eval_steps_per_second": 5.109, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 4.4, | |
| "grad_norm": 2.1659460067749023, | |
| "learning_rate": 4.78e-05, | |
| "loss": 5.593, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 4.8, | |
| "grad_norm": 2.476262331008911, | |
| "learning_rate": 4.76e-05, | |
| "loss": 5.422, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 5.117976665496826, | |
| "eval_runtime": 61.2737, | |
| "eval_samples_per_second": 163.202, | |
| "eval_steps_per_second": 5.108, | |
| "step": 6250 | |
| }, | |
| { | |
| "epoch": 5.2, | |
| "grad_norm": 2.4147756099700928, | |
| "learning_rate": 4.74e-05, | |
| "loss": 5.2305, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 5.6, | |
| "grad_norm": 2.6457254886627197, | |
| "learning_rate": 4.72e-05, | |
| "loss": 4.9511, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "grad_norm": 2.7153351306915283, | |
| "learning_rate": 4.7e-05, | |
| "loss": 4.7339, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 4.504222869873047, | |
| "eval_runtime": 61.2685, | |
| "eval_samples_per_second": 163.216, | |
| "eval_steps_per_second": 5.109, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 6.4, | |
| "grad_norm": 2.6116857528686523, | |
| "learning_rate": 4.6800000000000006e-05, | |
| "loss": 4.5647, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 6.8, | |
| "grad_norm": 2.3627805709838867, | |
| "learning_rate": 4.660000000000001e-05, | |
| "loss": 4.432, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_loss": 4.213283061981201, | |
| "eval_runtime": 61.2562, | |
| "eval_samples_per_second": 163.249, | |
| "eval_steps_per_second": 5.11, | |
| "step": 8750 | |
| }, | |
| { | |
| "epoch": 7.2, | |
| "grad_norm": 2.548539161682129, | |
| "learning_rate": 4.64e-05, | |
| "loss": 4.3363, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 7.6, | |
| "grad_norm": 2.6018478870391846, | |
| "learning_rate": 4.6200000000000005e-05, | |
| "loss": 4.2403, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "grad_norm": 2.2613413333892822, | |
| "learning_rate": 4.600000000000001e-05, | |
| "loss": 4.1724, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 4.003890514373779, | |
| "eval_runtime": 61.4625, | |
| "eval_samples_per_second": 162.701, | |
| "eval_steps_per_second": 5.093, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 8.4, | |
| "grad_norm": 2.538825273513794, | |
| "learning_rate": 4.58e-05, | |
| "loss": 4.0805, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 8.8, | |
| "grad_norm": 2.3357536792755127, | |
| "learning_rate": 4.5600000000000004e-05, | |
| "loss": 4.0316, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_loss": 3.875485897064209, | |
| "eval_runtime": 61.2731, | |
| "eval_samples_per_second": 163.204, | |
| "eval_steps_per_second": 5.108, | |
| "step": 11250 | |
| }, | |
| { | |
| "epoch": 9.2, | |
| "grad_norm": 2.7909834384918213, | |
| "learning_rate": 4.5400000000000006e-05, | |
| "loss": 3.9733, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 9.6, | |
| "grad_norm": 2.770125150680542, | |
| "learning_rate": 4.52e-05, | |
| "loss": 3.9146, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "grad_norm": 2.6937015056610107, | |
| "learning_rate": 4.5e-05, | |
| "loss": 3.8765, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_loss": 3.7373440265655518, | |
| "eval_runtime": 61.2543, | |
| "eval_samples_per_second": 163.254, | |
| "eval_steps_per_second": 5.11, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 10.4, | |
| "grad_norm": 2.762148380279541, | |
| "learning_rate": 4.4800000000000005e-05, | |
| "loss": 3.8169, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 10.8, | |
| "grad_norm": 2.428678035736084, | |
| "learning_rate": 4.46e-05, | |
| "loss": 3.7702, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "eval_loss": 3.6505491733551025, | |
| "eval_runtime": 61.3079, | |
| "eval_samples_per_second": 163.111, | |
| "eval_steps_per_second": 5.105, | |
| "step": 13750 | |
| }, | |
| { | |
| "epoch": 11.2, | |
| "grad_norm": 2.57837176322937, | |
| "learning_rate": 4.44e-05, | |
| "loss": 3.7268, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 11.6, | |
| "grad_norm": 2.6295292377471924, | |
| "learning_rate": 4.4200000000000004e-05, | |
| "loss": 3.6882, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "grad_norm": 2.4153482913970947, | |
| "learning_rate": 4.4000000000000006e-05, | |
| "loss": 3.6646, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_loss": 3.5465199947357178, | |
| "eval_runtime": 61.2796, | |
| "eval_samples_per_second": 163.187, | |
| "eval_steps_per_second": 5.108, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 12.4, | |
| "grad_norm": 2.5010106563568115, | |
| "learning_rate": 4.38e-05, | |
| "loss": 3.617, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 12.8, | |
| "grad_norm": 2.515644073486328, | |
| "learning_rate": 4.36e-05, | |
| "loss": 3.5834, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "eval_loss": 3.478050708770752, | |
| "eval_runtime": 61.4335, | |
| "eval_samples_per_second": 162.778, | |
| "eval_steps_per_second": 5.095, | |
| "step": 16250 | |
| }, | |
| { | |
| "epoch": 13.2, | |
| "grad_norm": 3.00858211517334, | |
| "learning_rate": 4.3400000000000005e-05, | |
| "loss": 3.5563, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 13.6, | |
| "grad_norm": 2.528775453567505, | |
| "learning_rate": 4.32e-05, | |
| "loss": 3.5116, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "grad_norm": 2.637566089630127, | |
| "learning_rate": 4.3e-05, | |
| "loss": 3.4912, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "eval_loss": 3.401700019836426, | |
| "eval_runtime": 61.2641, | |
| "eval_samples_per_second": 163.228, | |
| "eval_steps_per_second": 5.109, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 14.4, | |
| "grad_norm": 2.4223690032958984, | |
| "learning_rate": 4.2800000000000004e-05, | |
| "loss": 3.4544, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 14.8, | |
| "grad_norm": 2.7772583961486816, | |
| "learning_rate": 4.26e-05, | |
| "loss": 3.437, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_loss": 3.337768077850342, | |
| "eval_runtime": 61.2955, | |
| "eval_samples_per_second": 163.144, | |
| "eval_steps_per_second": 5.106, | |
| "step": 18750 | |
| }, | |
| { | |
| "epoch": 15.2, | |
| "grad_norm": 2.417879104614258, | |
| "learning_rate": 4.24e-05, | |
| "loss": 3.4121, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 15.6, | |
| "grad_norm": 2.525871753692627, | |
| "learning_rate": 4.22e-05, | |
| "loss": 3.3835, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "grad_norm": 2.49729061126709, | |
| "learning_rate": 4.2e-05, | |
| "loss": 3.3679, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_loss": 3.2797868251800537, | |
| "eval_runtime": 61.2076, | |
| "eval_samples_per_second": 163.378, | |
| "eval_steps_per_second": 5.114, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 16.4, | |
| "grad_norm": 2.6784422397613525, | |
| "learning_rate": 4.18e-05, | |
| "loss": 3.3326, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 16.8, | |
| "grad_norm": 2.5526607036590576, | |
| "learning_rate": 4.16e-05, | |
| "loss": 3.3217, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "eval_loss": 3.233443021774292, | |
| "eval_runtime": 61.2163, | |
| "eval_samples_per_second": 163.355, | |
| "eval_steps_per_second": 5.113, | |
| "step": 21250 | |
| }, | |
| { | |
| "epoch": 17.2, | |
| "grad_norm": 2.685425281524658, | |
| "learning_rate": 4.14e-05, | |
| "loss": 3.2826, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 17.6, | |
| "grad_norm": 2.7774910926818848, | |
| "learning_rate": 4.12e-05, | |
| "loss": 3.2716, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "grad_norm": 2.404536008834839, | |
| "learning_rate": 4.1e-05, | |
| "loss": 3.2583, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "eval_loss": 3.1886167526245117, | |
| "eval_runtime": 61.4623, | |
| "eval_samples_per_second": 162.701, | |
| "eval_steps_per_second": 5.093, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 18.4, | |
| "grad_norm": 2.5723462104797363, | |
| "learning_rate": 4.08e-05, | |
| "loss": 3.2232, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 18.8, | |
| "grad_norm": 2.5137925148010254, | |
| "learning_rate": 4.0600000000000004e-05, | |
| "loss": 3.2212, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "eval_loss": 3.1394808292388916, | |
| "eval_runtime": 61.2181, | |
| "eval_samples_per_second": 163.35, | |
| "eval_steps_per_second": 5.113, | |
| "step": 23750 | |
| }, | |
| { | |
| "epoch": 19.2, | |
| "grad_norm": 2.7127292156219482, | |
| "learning_rate": 4.0400000000000006e-05, | |
| "loss": 3.1803, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 19.6, | |
| "grad_norm": 2.5438954830169678, | |
| "learning_rate": 4.02e-05, | |
| "loss": 3.1692, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "grad_norm": 2.503960132598877, | |
| "learning_rate": 4e-05, | |
| "loss": 3.1697, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_loss": 3.112931251525879, | |
| "eval_runtime": 61.2263, | |
| "eval_samples_per_second": 163.329, | |
| "eval_steps_per_second": 5.112, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 20.4, | |
| "grad_norm": 2.501166820526123, | |
| "learning_rate": 3.9800000000000005e-05, | |
| "loss": 3.1396, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 20.8, | |
| "grad_norm": 2.7387282848358154, | |
| "learning_rate": 3.960000000000001e-05, | |
| "loss": 3.1209, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 21.0, | |
| "eval_loss": 3.0616869926452637, | |
| "eval_runtime": 61.2523, | |
| "eval_samples_per_second": 163.259, | |
| "eval_steps_per_second": 5.11, | |
| "step": 26250 | |
| }, | |
| { | |
| "epoch": 21.2, | |
| "grad_norm": 2.641589641571045, | |
| "learning_rate": 3.94e-05, | |
| "loss": 3.1079, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 21.6, | |
| "grad_norm": 2.8861656188964844, | |
| "learning_rate": 3.9200000000000004e-05, | |
| "loss": 3.0907, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "grad_norm": 2.9420666694641113, | |
| "learning_rate": 3.9000000000000006e-05, | |
| "loss": 3.0772, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "eval_loss": 3.029193639755249, | |
| "eval_runtime": 61.3008, | |
| "eval_samples_per_second": 163.13, | |
| "eval_steps_per_second": 5.106, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 22.4, | |
| "grad_norm": 2.6903202533721924, | |
| "learning_rate": 3.88e-05, | |
| "loss": 3.0618, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 22.8, | |
| "grad_norm": 2.4258873462677, | |
| "learning_rate": 3.86e-05, | |
| "loss": 3.0482, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 23.0, | |
| "eval_loss": 2.998987913131714, | |
| "eval_runtime": 61.2446, | |
| "eval_samples_per_second": 163.28, | |
| "eval_steps_per_second": 5.111, | |
| "step": 28750 | |
| }, | |
| { | |
| "epoch": 23.2, | |
| "grad_norm": 2.8380045890808105, | |
| "learning_rate": 3.8400000000000005e-05, | |
| "loss": 3.0271, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 23.6, | |
| "grad_norm": 2.936922788619995, | |
| "learning_rate": 3.82e-05, | |
| "loss": 3.016, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "grad_norm": 2.829270362854004, | |
| "learning_rate": 3.8e-05, | |
| "loss": 3.0102, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "eval_loss": 2.972419023513794, | |
| "eval_runtime": 61.3686, | |
| "eval_samples_per_second": 162.95, | |
| "eval_steps_per_second": 5.1, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 24.4, | |
| "grad_norm": 2.6595258712768555, | |
| "learning_rate": 3.7800000000000004e-05, | |
| "loss": 2.9857, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 24.8, | |
| "grad_norm": 2.9528753757476807, | |
| "learning_rate": 3.76e-05, | |
| "loss": 2.9833, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "eval_loss": 2.943739414215088, | |
| "eval_runtime": 61.233, | |
| "eval_samples_per_second": 163.311, | |
| "eval_steps_per_second": 5.112, | |
| "step": 31250 | |
| }, | |
| { | |
| "epoch": 25.2, | |
| "grad_norm": 2.8180456161499023, | |
| "learning_rate": 3.74e-05, | |
| "loss": 2.9631, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 25.6, | |
| "grad_norm": 3.0738282203674316, | |
| "learning_rate": 3.72e-05, | |
| "loss": 2.9521, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 26.0, | |
| "grad_norm": 3.0022482872009277, | |
| "learning_rate": 3.7e-05, | |
| "loss": 2.9429, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 26.0, | |
| "eval_loss": 2.918297290802002, | |
| "eval_runtime": 61.2635, | |
| "eval_samples_per_second": 163.229, | |
| "eval_steps_per_second": 5.109, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 26.4, | |
| "grad_norm": 2.5933921337127686, | |
| "learning_rate": 3.68e-05, | |
| "loss": 2.9205, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 26.8, | |
| "grad_norm": 2.725785493850708, | |
| "learning_rate": 3.66e-05, | |
| "loss": 2.9138, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 27.0, | |
| "eval_loss": 2.8853907585144043, | |
| "eval_runtime": 61.2544, | |
| "eval_samples_per_second": 163.254, | |
| "eval_steps_per_second": 5.11, | |
| "step": 33750 | |
| }, | |
| { | |
| "epoch": 27.2, | |
| "grad_norm": 2.8924100399017334, | |
| "learning_rate": 3.6400000000000004e-05, | |
| "loss": 2.9061, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 27.6, | |
| "grad_norm": 2.9126296043395996, | |
| "learning_rate": 3.62e-05, | |
| "loss": 2.8858, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 28.0, | |
| "grad_norm": 2.768714189529419, | |
| "learning_rate": 3.6e-05, | |
| "loss": 2.8815, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 28.0, | |
| "eval_loss": 2.8625736236572266, | |
| "eval_runtime": 61.266, | |
| "eval_samples_per_second": 163.223, | |
| "eval_steps_per_second": 5.109, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 28.4, | |
| "grad_norm": 2.8135673999786377, | |
| "learning_rate": 3.58e-05, | |
| "loss": 2.8635, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 28.8, | |
| "grad_norm": 2.8309783935546875, | |
| "learning_rate": 3.56e-05, | |
| "loss": 2.848, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 29.0, | |
| "eval_loss": 2.8396055698394775, | |
| "eval_runtime": 61.5143, | |
| "eval_samples_per_second": 162.564, | |
| "eval_steps_per_second": 5.088, | |
| "step": 36250 | |
| }, | |
| { | |
| "epoch": 29.2, | |
| "grad_norm": 2.9002137184143066, | |
| "learning_rate": 3.54e-05, | |
| "loss": 2.8476, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 29.6, | |
| "grad_norm": 2.79744815826416, | |
| "learning_rate": 3.52e-05, | |
| "loss": 2.8341, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "grad_norm": 3.041928768157959, | |
| "learning_rate": 3.5e-05, | |
| "loss": 2.8276, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "eval_loss": 2.805696487426758, | |
| "eval_runtime": 61.2638, | |
| "eval_samples_per_second": 163.229, | |
| "eval_steps_per_second": 5.109, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 30.4, | |
| "grad_norm": 2.7281506061553955, | |
| "learning_rate": 3.48e-05, | |
| "loss": 2.8067, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 30.8, | |
| "grad_norm": 2.6974635124206543, | |
| "learning_rate": 3.46e-05, | |
| "loss": 2.8114, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 31.0, | |
| "eval_loss": 2.7848477363586426, | |
| "eval_runtime": 61.2633, | |
| "eval_samples_per_second": 163.23, | |
| "eval_steps_per_second": 5.109, | |
| "step": 38750 | |
| }, | |
| { | |
| "epoch": 31.2, | |
| "grad_norm": 3.201719045639038, | |
| "learning_rate": 3.4399999999999996e-05, | |
| "loss": 2.7928, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 31.6, | |
| "grad_norm": 2.7298779487609863, | |
| "learning_rate": 3.4200000000000005e-05, | |
| "loss": 2.7814, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 32.0, | |
| "grad_norm": 2.873957872390747, | |
| "learning_rate": 3.4000000000000007e-05, | |
| "loss": 2.7806, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 32.0, | |
| "eval_loss": 2.773723840713501, | |
| "eval_runtime": 61.1994, | |
| "eval_samples_per_second": 163.4, | |
| "eval_steps_per_second": 5.114, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 32.4, | |
| "grad_norm": 2.733128547668457, | |
| "learning_rate": 3.38e-05, | |
| "loss": 2.7619, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 32.8, | |
| "grad_norm": 2.825956106185913, | |
| "learning_rate": 3.3600000000000004e-05, | |
| "loss": 2.7608, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 33.0, | |
| "eval_loss": 2.7562143802642822, | |
| "eval_runtime": 61.2126, | |
| "eval_samples_per_second": 163.365, | |
| "eval_steps_per_second": 5.113, | |
| "step": 41250 | |
| }, | |
| { | |
| "epoch": 33.2, | |
| "grad_norm": 2.9455862045288086, | |
| "learning_rate": 3.3400000000000005e-05, | |
| "loss": 2.742, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 33.6, | |
| "grad_norm": 2.9023234844207764, | |
| "learning_rate": 3.32e-05, | |
| "loss": 2.7347, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 34.0, | |
| "grad_norm": 2.7190394401550293, | |
| "learning_rate": 3.3e-05, | |
| "loss": 2.7366, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 34.0, | |
| "eval_loss": 2.7386631965637207, | |
| "eval_runtime": 61.2856, | |
| "eval_samples_per_second": 163.17, | |
| "eval_steps_per_second": 5.107, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 34.4, | |
| "grad_norm": 2.9011335372924805, | |
| "learning_rate": 3.2800000000000004e-05, | |
| "loss": 2.7113, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 34.8, | |
| "grad_norm": 2.6606903076171875, | |
| "learning_rate": 3.26e-05, | |
| "loss": 2.7157, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 35.0, | |
| "eval_loss": 2.713857889175415, | |
| "eval_runtime": 61.228, | |
| "eval_samples_per_second": 163.324, | |
| "eval_steps_per_second": 5.112, | |
| "step": 43750 | |
| }, | |
| { | |
| "epoch": 35.2, | |
| "grad_norm": 2.7517592906951904, | |
| "learning_rate": 3.24e-05, | |
| "loss": 2.7046, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 35.6, | |
| "grad_norm": 3.1010994911193848, | |
| "learning_rate": 3.2200000000000003e-05, | |
| "loss": 2.6955, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 36.0, | |
| "grad_norm": 2.7236251831054688, | |
| "learning_rate": 3.2000000000000005e-05, | |
| "loss": 2.6894, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 36.0, | |
| "eval_loss": 2.707624673843384, | |
| "eval_runtime": 61.2324, | |
| "eval_samples_per_second": 163.312, | |
| "eval_steps_per_second": 5.112, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 36.4, | |
| "grad_norm": 2.961221933364868, | |
| "learning_rate": 3.18e-05, | |
| "loss": 2.6825, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 36.8, | |
| "grad_norm": 2.9229893684387207, | |
| "learning_rate": 3.16e-05, | |
| "loss": 2.6708, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 37.0, | |
| "eval_loss": 2.686065196990967, | |
| "eval_runtime": 61.2242, | |
| "eval_samples_per_second": 163.334, | |
| "eval_steps_per_second": 5.112, | |
| "step": 46250 | |
| }, | |
| { | |
| "epoch": 37.2, | |
| "grad_norm": 3.1210429668426514, | |
| "learning_rate": 3.1400000000000004e-05, | |
| "loss": 2.6717, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 37.6, | |
| "grad_norm": 2.785784959793091, | |
| "learning_rate": 3.12e-05, | |
| "loss": 2.6594, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 38.0, | |
| "grad_norm": 2.8745310306549072, | |
| "learning_rate": 3.1e-05, | |
| "loss": 2.6577, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 38.0, | |
| "eval_loss": 2.6833102703094482, | |
| "eval_runtime": 61.2268, | |
| "eval_samples_per_second": 163.327, | |
| "eval_steps_per_second": 5.112, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 38.4, | |
| "grad_norm": 2.848191022872925, | |
| "learning_rate": 3.08e-05, | |
| "loss": 2.6407, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 38.8, | |
| "grad_norm": 2.9842936992645264, | |
| "learning_rate": 3.06e-05, | |
| "loss": 2.6437, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 39.0, | |
| "eval_loss": 2.6599860191345215, | |
| "eval_runtime": 61.3483, | |
| "eval_samples_per_second": 163.004, | |
| "eval_steps_per_second": 5.102, | |
| "step": 48750 | |
| }, | |
| { | |
| "epoch": 39.2, | |
| "grad_norm": 2.9027788639068604, | |
| "learning_rate": 3.04e-05, | |
| "loss": 2.6275, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 39.6, | |
| "grad_norm": 2.8184854984283447, | |
| "learning_rate": 3.02e-05, | |
| "loss": 2.6188, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 40.0, | |
| "grad_norm": 2.8621346950531006, | |
| "learning_rate": 3e-05, | |
| "loss": 2.6255, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 40.0, | |
| "eval_loss": 2.6438729763031006, | |
| "eval_runtime": 61.2542, | |
| "eval_samples_per_second": 163.254, | |
| "eval_steps_per_second": 5.11, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 40.4, | |
| "grad_norm": 2.661895275115967, | |
| "learning_rate": 2.98e-05, | |
| "loss": 2.6124, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 40.8, | |
| "grad_norm": 2.913214683532715, | |
| "learning_rate": 2.96e-05, | |
| "loss": 2.6128, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 41.0, | |
| "eval_loss": 2.634756088256836, | |
| "eval_runtime": 61.2199, | |
| "eval_samples_per_second": 163.346, | |
| "eval_steps_per_second": 5.113, | |
| "step": 51250 | |
| }, | |
| { | |
| "epoch": 41.2, | |
| "grad_norm": 2.8805391788482666, | |
| "learning_rate": 2.94e-05, | |
| "loss": 2.5944, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 41.6, | |
| "grad_norm": 3.3086748123168945, | |
| "learning_rate": 2.9199999999999998e-05, | |
| "loss": 2.5968, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 42.0, | |
| "grad_norm": 2.8824660778045654, | |
| "learning_rate": 2.9e-05, | |
| "loss": 2.5929, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 42.0, | |
| "eval_loss": 2.6286351680755615, | |
| "eval_runtime": 61.2491, | |
| "eval_samples_per_second": 163.268, | |
| "eval_steps_per_second": 5.11, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 42.4, | |
| "grad_norm": 3.0503251552581787, | |
| "learning_rate": 2.88e-05, | |
| "loss": 2.5691, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 42.8, | |
| "grad_norm": 2.777174711227417, | |
| "learning_rate": 2.86e-05, | |
| "loss": 2.5861, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 43.0, | |
| "eval_loss": 2.605225086212158, | |
| "eval_runtime": 61.226, | |
| "eval_samples_per_second": 163.329, | |
| "eval_steps_per_second": 5.112, | |
| "step": 53750 | |
| }, | |
| { | |
| "epoch": 43.2, | |
| "grad_norm": 2.6447839736938477, | |
| "learning_rate": 2.84e-05, | |
| "loss": 2.5709, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 43.6, | |
| "grad_norm": 2.7956857681274414, | |
| "learning_rate": 2.8199999999999998e-05, | |
| "loss": 2.5672, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 44.0, | |
| "grad_norm": 3.081850528717041, | |
| "learning_rate": 2.8000000000000003e-05, | |
| "loss": 2.5665, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 44.0, | |
| "eval_loss": 2.588907480239868, | |
| "eval_runtime": 61.2915, | |
| "eval_samples_per_second": 163.155, | |
| "eval_steps_per_second": 5.107, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 44.4, | |
| "grad_norm": 2.979550838470459, | |
| "learning_rate": 2.7800000000000005e-05, | |
| "loss": 2.5553, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 44.8, | |
| "grad_norm": 2.8044862747192383, | |
| "learning_rate": 2.7600000000000003e-05, | |
| "loss": 2.5428, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 45.0, | |
| "eval_loss": 2.5998992919921875, | |
| "eval_runtime": 61.2365, | |
| "eval_samples_per_second": 163.301, | |
| "eval_steps_per_second": 5.111, | |
| "step": 56250 | |
| }, | |
| { | |
| "epoch": 45.2, | |
| "grad_norm": 2.7438066005706787, | |
| "learning_rate": 2.7400000000000002e-05, | |
| "loss": 2.5435, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 45.6, | |
| "grad_norm": 2.8336572647094727, | |
| "learning_rate": 2.7200000000000004e-05, | |
| "loss": 2.5333, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 46.0, | |
| "grad_norm": 2.8354380130767822, | |
| "learning_rate": 2.7000000000000002e-05, | |
| "loss": 2.5385, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 46.0, | |
| "eval_loss": 2.5822722911834717, | |
| "eval_runtime": 61.2292, | |
| "eval_samples_per_second": 163.321, | |
| "eval_steps_per_second": 5.112, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 46.4, | |
| "grad_norm": 3.035831928253174, | |
| "learning_rate": 2.6800000000000004e-05, | |
| "loss": 2.5185, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 46.8, | |
| "grad_norm": 3.1147818565368652, | |
| "learning_rate": 2.6600000000000003e-05, | |
| "loss": 2.5273, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 47.0, | |
| "eval_loss": 2.5733859539031982, | |
| "eval_runtime": 61.2138, | |
| "eval_samples_per_second": 163.362, | |
| "eval_steps_per_second": 5.113, | |
| "step": 58750 | |
| }, | |
| { | |
| "epoch": 47.2, | |
| "grad_norm": 2.987078905105591, | |
| "learning_rate": 2.64e-05, | |
| "loss": 2.5207, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 47.6, | |
| "grad_norm": 2.7736542224884033, | |
| "learning_rate": 2.6200000000000003e-05, | |
| "loss": 2.5038, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 48.0, | |
| "grad_norm": 2.8549487590789795, | |
| "learning_rate": 2.6000000000000002e-05, | |
| "loss": 2.5175, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 48.0, | |
| "eval_loss": 2.561167001724243, | |
| "eval_runtime": 61.2382, | |
| "eval_samples_per_second": 163.297, | |
| "eval_steps_per_second": 5.111, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 48.4, | |
| "grad_norm": 3.058418035507202, | |
| "learning_rate": 2.58e-05, | |
| "loss": 2.4956, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 48.8, | |
| "grad_norm": 2.955167293548584, | |
| "learning_rate": 2.5600000000000002e-05, | |
| "loss": 2.5064, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 49.0, | |
| "eval_loss": 2.555574417114258, | |
| "eval_runtime": 61.2242, | |
| "eval_samples_per_second": 163.334, | |
| "eval_steps_per_second": 5.112, | |
| "step": 61250 | |
| }, | |
| { | |
| "epoch": 49.2, | |
| "grad_norm": 2.9953746795654297, | |
| "learning_rate": 2.54e-05, | |
| "loss": 2.4899, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 49.6, | |
| "grad_norm": 3.176257610321045, | |
| "learning_rate": 2.5200000000000003e-05, | |
| "loss": 2.4862, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 50.0, | |
| "grad_norm": 3.131290912628174, | |
| "learning_rate": 2.5e-05, | |
| "loss": 2.4931, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 50.0, | |
| "eval_loss": 2.5496270656585693, | |
| "eval_runtime": 61.2301, | |
| "eval_samples_per_second": 163.319, | |
| "eval_steps_per_second": 5.112, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 50.4, | |
| "grad_norm": 2.871742010116577, | |
| "learning_rate": 2.48e-05, | |
| "loss": 2.4794, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 50.8, | |
| "grad_norm": 2.7357499599456787, | |
| "learning_rate": 2.46e-05, | |
| "loss": 2.4802, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 51.0, | |
| "eval_loss": 2.528775691986084, | |
| "eval_runtime": 61.2246, | |
| "eval_samples_per_second": 163.333, | |
| "eval_steps_per_second": 5.112, | |
| "step": 63750 | |
| }, | |
| { | |
| "epoch": 51.2, | |
| "grad_norm": 2.903444528579712, | |
| "learning_rate": 2.44e-05, | |
| "loss": 2.4712, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 51.6, | |
| "grad_norm": 3.0363874435424805, | |
| "learning_rate": 2.4200000000000002e-05, | |
| "loss": 2.4673, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 52.0, | |
| "grad_norm": 2.935192823410034, | |
| "learning_rate": 2.4e-05, | |
| "loss": 2.4604, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 52.0, | |
| "eval_loss": 2.5387279987335205, | |
| "eval_runtime": 61.5389, | |
| "eval_samples_per_second": 162.499, | |
| "eval_steps_per_second": 5.086, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 52.4, | |
| "grad_norm": 2.8958323001861572, | |
| "learning_rate": 2.38e-05, | |
| "loss": 2.4495, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 52.8, | |
| "grad_norm": 2.9605252742767334, | |
| "learning_rate": 2.36e-05, | |
| "loss": 2.4585, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 53.0, | |
| "eval_loss": 2.5214128494262695, | |
| "eval_runtime": 61.2198, | |
| "eval_samples_per_second": 163.346, | |
| "eval_steps_per_second": 5.113, | |
| "step": 66250 | |
| }, | |
| { | |
| "epoch": 53.2, | |
| "grad_norm": 2.8114893436431885, | |
| "learning_rate": 2.3400000000000003e-05, | |
| "loss": 2.4386, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 53.6, | |
| "grad_norm": 2.9136197566986084, | |
| "learning_rate": 2.32e-05, | |
| "loss": 2.442, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 54.0, | |
| "grad_norm": 2.891444683074951, | |
| "learning_rate": 2.3000000000000003e-05, | |
| "loss": 2.4517, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 54.0, | |
| "eval_loss": 2.520853042602539, | |
| "eval_runtime": 61.2251, | |
| "eval_samples_per_second": 163.332, | |
| "eval_steps_per_second": 5.112, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 54.4, | |
| "grad_norm": 2.8234751224517822, | |
| "learning_rate": 2.2800000000000002e-05, | |
| "loss": 2.4323, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 54.8, | |
| "grad_norm": 2.6390535831451416, | |
| "learning_rate": 2.26e-05, | |
| "loss": 2.4389, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 55.0, | |
| "eval_loss": 2.508610725402832, | |
| "eval_runtime": 61.2599, | |
| "eval_samples_per_second": 163.239, | |
| "eval_steps_per_second": 5.109, | |
| "step": 68750 | |
| }, | |
| { | |
| "epoch": 55.2, | |
| "grad_norm": 2.766709566116333, | |
| "learning_rate": 2.2400000000000002e-05, | |
| "loss": 2.4248, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 55.6, | |
| "grad_norm": 2.9248626232147217, | |
| "learning_rate": 2.22e-05, | |
| "loss": 2.4301, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 56.0, | |
| "grad_norm": 3.262801170349121, | |
| "learning_rate": 2.2000000000000003e-05, | |
| "loss": 2.4225, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 56.0, | |
| "eval_loss": 2.5070488452911377, | |
| "eval_runtime": 61.2656, | |
| "eval_samples_per_second": 163.224, | |
| "eval_steps_per_second": 5.109, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 56.4, | |
| "grad_norm": 3.05898118019104, | |
| "learning_rate": 2.18e-05, | |
| "loss": 2.414, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 56.8, | |
| "grad_norm": 2.8624179363250732, | |
| "learning_rate": 2.16e-05, | |
| "loss": 2.4145, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 57.0, | |
| "eval_loss": 2.4903688430786133, | |
| "eval_runtime": 61.3344, | |
| "eval_samples_per_second": 163.041, | |
| "eval_steps_per_second": 5.103, | |
| "step": 71250 | |
| }, | |
| { | |
| "epoch": 57.2, | |
| "grad_norm": 2.8718996047973633, | |
| "learning_rate": 2.1400000000000002e-05, | |
| "loss": 2.4152, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 57.6, | |
| "grad_norm": 3.02909517288208, | |
| "learning_rate": 2.12e-05, | |
| "loss": 2.4107, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 58.0, | |
| "grad_norm": 3.092914581298828, | |
| "learning_rate": 2.1e-05, | |
| "loss": 2.4058, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 58.0, | |
| "eval_loss": 2.4901366233825684, | |
| "eval_runtime": 61.3186, | |
| "eval_samples_per_second": 163.083, | |
| "eval_steps_per_second": 5.104, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 58.4, | |
| "grad_norm": 3.0763566493988037, | |
| "learning_rate": 2.08e-05, | |
| "loss": 2.3941, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 58.8, | |
| "grad_norm": 2.809154748916626, | |
| "learning_rate": 2.06e-05, | |
| "loss": 2.398, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 59.0, | |
| "eval_loss": 2.491445302963257, | |
| "eval_runtime": 61.2316, | |
| "eval_samples_per_second": 163.314, | |
| "eval_steps_per_second": 5.112, | |
| "step": 73750 | |
| }, | |
| { | |
| "epoch": 59.2, | |
| "grad_norm": 2.948514699935913, | |
| "learning_rate": 2.04e-05, | |
| "loss": 2.3977, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 59.6, | |
| "grad_norm": 2.9708919525146484, | |
| "learning_rate": 2.0200000000000003e-05, | |
| "loss": 2.3849, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 60.0, | |
| "grad_norm": 3.0955567359924316, | |
| "learning_rate": 2e-05, | |
| "loss": 2.3847, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 60.0, | |
| "eval_loss": 2.481839418411255, | |
| "eval_runtime": 61.2301, | |
| "eval_samples_per_second": 163.318, | |
| "eval_steps_per_second": 5.112, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 60.4, | |
| "grad_norm": 2.8540799617767334, | |
| "learning_rate": 1.9800000000000004e-05, | |
| "loss": 2.3831, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 60.8, | |
| "grad_norm": 2.79695987701416, | |
| "learning_rate": 1.9600000000000002e-05, | |
| "loss": 2.3833, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 61.0, | |
| "eval_loss": 2.4663026332855225, | |
| "eval_runtime": 61.2384, | |
| "eval_samples_per_second": 163.296, | |
| "eval_steps_per_second": 5.111, | |
| "step": 76250 | |
| }, | |
| { | |
| "epoch": 61.2, | |
| "grad_norm": 3.144937515258789, | |
| "learning_rate": 1.94e-05, | |
| "loss": 2.3802, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 61.6, | |
| "grad_norm": 3.230477809906006, | |
| "learning_rate": 1.9200000000000003e-05, | |
| "loss": 2.369, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 62.0, | |
| "grad_norm": 3.04854416847229, | |
| "learning_rate": 1.9e-05, | |
| "loss": 2.3772, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 62.0, | |
| "eval_loss": 2.475891351699829, | |
| "eval_runtime": 61.2453, | |
| "eval_samples_per_second": 163.278, | |
| "eval_steps_per_second": 5.111, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 62.4, | |
| "grad_norm": 2.8289458751678467, | |
| "learning_rate": 1.88e-05, | |
| "loss": 2.3607, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 62.8, | |
| "grad_norm": 2.9443745613098145, | |
| "learning_rate": 1.86e-05, | |
| "loss": 2.3711, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 63.0, | |
| "eval_loss": 2.45942759513855, | |
| "eval_runtime": 61.2995, | |
| "eval_samples_per_second": 163.133, | |
| "eval_steps_per_second": 5.106, | |
| "step": 78750 | |
| }, | |
| { | |
| "epoch": 63.2, | |
| "grad_norm": 3.4686219692230225, | |
| "learning_rate": 1.84e-05, | |
| "loss": 2.3635, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 63.6, | |
| "grad_norm": 2.9754679203033447, | |
| "learning_rate": 1.8200000000000002e-05, | |
| "loss": 2.3544, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 64.0, | |
| "grad_norm": 2.959327459335327, | |
| "learning_rate": 1.8e-05, | |
| "loss": 2.3603, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 64.0, | |
| "eval_loss": 2.454479455947876, | |
| "eval_runtime": 61.2094, | |
| "eval_samples_per_second": 163.374, | |
| "eval_steps_per_second": 5.114, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 64.4, | |
| "grad_norm": 3.086289882659912, | |
| "learning_rate": 1.78e-05, | |
| "loss": 2.348, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 64.8, | |
| "grad_norm": 2.840416193008423, | |
| "learning_rate": 1.76e-05, | |
| "loss": 2.3546, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 65.0, | |
| "eval_loss": 2.4469666481018066, | |
| "eval_runtime": 61.2942, | |
| "eval_samples_per_second": 163.147, | |
| "eval_steps_per_second": 5.107, | |
| "step": 81250 | |
| }, | |
| { | |
| "epoch": 65.2, | |
| "grad_norm": 3.062312364578247, | |
| "learning_rate": 1.74e-05, | |
| "loss": 2.3485, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 65.6, | |
| "grad_norm": 2.8818204402923584, | |
| "learning_rate": 1.7199999999999998e-05, | |
| "loss": 2.3434, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 66.0, | |
| "grad_norm": 2.827214002609253, | |
| "learning_rate": 1.7000000000000003e-05, | |
| "loss": 2.3497, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 66.0, | |
| "eval_loss": 2.4515304565429688, | |
| "eval_runtime": 61.2229, | |
| "eval_samples_per_second": 163.338, | |
| "eval_steps_per_second": 5.112, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 66.4, | |
| "grad_norm": 2.871967315673828, | |
| "learning_rate": 1.6800000000000002e-05, | |
| "loss": 2.3415, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 66.8, | |
| "grad_norm": 3.2059788703918457, | |
| "learning_rate": 1.66e-05, | |
| "loss": 2.3328, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 67.0, | |
| "eval_loss": 2.4459116458892822, | |
| "eval_runtime": 61.2313, | |
| "eval_samples_per_second": 163.315, | |
| "eval_steps_per_second": 5.112, | |
| "step": 83750 | |
| }, | |
| { | |
| "epoch": 67.2, | |
| "grad_norm": 2.958317518234253, | |
| "learning_rate": 1.6400000000000002e-05, | |
| "loss": 2.3355, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 67.6, | |
| "grad_norm": 3.2304327487945557, | |
| "learning_rate": 1.62e-05, | |
| "loss": 2.329, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 68.0, | |
| "grad_norm": 3.0539510250091553, | |
| "learning_rate": 1.6000000000000003e-05, | |
| "loss": 2.3281, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 68.0, | |
| "eval_loss": 2.4377968311309814, | |
| "eval_runtime": 61.2555, | |
| "eval_samples_per_second": 163.251, | |
| "eval_steps_per_second": 5.11, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 68.4, | |
| "grad_norm": 3.338268995285034, | |
| "learning_rate": 1.58e-05, | |
| "loss": 2.3221, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 68.8, | |
| "grad_norm": 3.2005858421325684, | |
| "learning_rate": 1.56e-05, | |
| "loss": 2.3272, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 69.0, | |
| "eval_loss": 2.4320311546325684, | |
| "eval_runtime": 61.3445, | |
| "eval_samples_per_second": 163.014, | |
| "eval_steps_per_second": 5.102, | |
| "step": 86250 | |
| }, | |
| { | |
| "epoch": 69.2, | |
| "grad_norm": 3.010361671447754, | |
| "learning_rate": 1.54e-05, | |
| "loss": 2.312, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 69.6, | |
| "grad_norm": 3.110023021697998, | |
| "learning_rate": 1.52e-05, | |
| "loss": 2.3175, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 70.0, | |
| "grad_norm": 2.972137212753296, | |
| "learning_rate": 1.5e-05, | |
| "loss": 2.3172, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 70.0, | |
| "eval_loss": 2.428523063659668, | |
| "eval_runtime": 61.2024, | |
| "eval_samples_per_second": 163.392, | |
| "eval_steps_per_second": 5.114, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 70.4, | |
| "grad_norm": 2.8720614910125732, | |
| "learning_rate": 1.48e-05, | |
| "loss": 2.3103, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 70.8, | |
| "grad_norm": 2.9163401126861572, | |
| "learning_rate": 1.4599999999999999e-05, | |
| "loss": 2.3128, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 71.0, | |
| "eval_loss": 2.4292280673980713, | |
| "eval_runtime": 61.2279, | |
| "eval_samples_per_second": 163.324, | |
| "eval_steps_per_second": 5.112, | |
| "step": 88750 | |
| }, | |
| { | |
| "epoch": 71.2, | |
| "grad_norm": 2.890698194503784, | |
| "learning_rate": 1.44e-05, | |
| "loss": 2.304, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 71.6, | |
| "grad_norm": 3.2436020374298096, | |
| "learning_rate": 1.42e-05, | |
| "loss": 2.3067, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 72.0, | |
| "grad_norm": 2.849823236465454, | |
| "learning_rate": 1.4000000000000001e-05, | |
| "loss": 2.3082, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 72.0, | |
| "eval_loss": 2.41927170753479, | |
| "eval_runtime": 61.2261, | |
| "eval_samples_per_second": 163.329, | |
| "eval_steps_per_second": 5.112, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 72.4, | |
| "grad_norm": 3.191131353378296, | |
| "learning_rate": 1.3800000000000002e-05, | |
| "loss": 2.2925, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 72.8, | |
| "grad_norm": 3.203021764755249, | |
| "learning_rate": 1.3600000000000002e-05, | |
| "loss": 2.3004, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 73.0, | |
| "eval_loss": 2.4278526306152344, | |
| "eval_runtime": 61.2358, | |
| "eval_samples_per_second": 163.303, | |
| "eval_steps_per_second": 5.111, | |
| "step": 91250 | |
| }, | |
| { | |
| "epoch": 73.2, | |
| "grad_norm": 3.002692461013794, | |
| "learning_rate": 1.3400000000000002e-05, | |
| "loss": 2.2949, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 73.6, | |
| "grad_norm": 3.137084722518921, | |
| "learning_rate": 1.32e-05, | |
| "loss": 2.2898, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 74.0, | |
| "grad_norm": 3.325843095779419, | |
| "learning_rate": 1.3000000000000001e-05, | |
| "loss": 2.2955, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 74.0, | |
| "eval_loss": 2.409475326538086, | |
| "eval_runtime": 61.3551, | |
| "eval_samples_per_second": 162.986, | |
| "eval_steps_per_second": 5.101, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 74.4, | |
| "grad_norm": 3.207425594329834, | |
| "learning_rate": 1.2800000000000001e-05, | |
| "loss": 2.2895, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 74.8, | |
| "grad_norm": 3.2221837043762207, | |
| "learning_rate": 1.2600000000000001e-05, | |
| "loss": 2.2849, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 75.0, | |
| "eval_loss": 2.4111411571502686, | |
| "eval_runtime": 61.2648, | |
| "eval_samples_per_second": 163.226, | |
| "eval_steps_per_second": 5.109, | |
| "step": 93750 | |
| }, | |
| { | |
| "epoch": 75.2, | |
| "grad_norm": 3.0594756603240967, | |
| "learning_rate": 1.24e-05, | |
| "loss": 2.2791, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 75.6, | |
| "grad_norm": 3.013936996459961, | |
| "learning_rate": 1.22e-05, | |
| "loss": 2.2935, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 76.0, | |
| "grad_norm": 3.296487808227539, | |
| "learning_rate": 1.2e-05, | |
| "loss": 2.2816, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 76.0, | |
| "eval_loss": 2.405331611633301, | |
| "eval_runtime": 61.2866, | |
| "eval_samples_per_second": 163.168, | |
| "eval_steps_per_second": 5.107, | |
| "step": 95000 | |
| } | |
| ], | |
| "logging_steps": 500, | |
| "max_steps": 125000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 100, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 2, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 4.0316781723648e+17, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |