{ "best_global_step": 95000, "best_metric": 2.405331611633301, "best_model_checkpoint": "output/checkpoint-95000", "epoch": 76.0, "eval_steps": 500, "global_step": 95000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.4, "grad_norm": 1.3970232009887695, "learning_rate": 4.9800000000000004e-05, "loss": 6.725, "step": 500 }, { "epoch": 0.8, "grad_norm": 1.4344754219055176, "learning_rate": 4.96e-05, "loss": 6.1958, "step": 1000 }, { "epoch": 1.0, "eval_loss": 6.076746940612793, "eval_runtime": 61.2746, "eval_samples_per_second": 163.2, "eval_steps_per_second": 5.108, "step": 1250 }, { "epoch": 1.2, "grad_norm": 1.259356141090393, "learning_rate": 4.94e-05, "loss": 6.118, "step": 1500 }, { "epoch": 1.6, "grad_norm": 1.1414670944213867, "learning_rate": 4.92e-05, "loss": 6.0705, "step": 2000 }, { "epoch": 2.0, "grad_norm": 1.2967591285705566, "learning_rate": 4.9e-05, "loss": 6.0444, "step": 2500 }, { "epoch": 2.0, "eval_loss": 6.011667251586914, "eval_runtime": 61.2402, "eval_samples_per_second": 163.291, "eval_steps_per_second": 5.111, "step": 2500 }, { "epoch": 2.4, "grad_norm": 1.572110652923584, "learning_rate": 4.88e-05, "loss": 6.0122, "step": 3000 }, { "epoch": 2.8, "grad_norm": 1.6025742292404175, "learning_rate": 4.86e-05, "loss": 5.991, "step": 3500 }, { "epoch": 3.0, "eval_loss": 5.93717622756958, "eval_runtime": 61.2634, "eval_samples_per_second": 163.23, "eval_steps_per_second": 5.109, "step": 3750 }, { "epoch": 3.2, "grad_norm": 1.4724808931350708, "learning_rate": 4.8400000000000004e-05, "loss": 5.9594, "step": 4000 }, { "epoch": 3.6, "grad_norm": 1.7168885469436646, "learning_rate": 4.82e-05, "loss": 5.8787, "step": 4500 }, { "epoch": 4.0, "grad_norm": 1.6915048360824585, "learning_rate": 4.8e-05, "loss": 5.7669, "step": 5000 }, { "epoch": 4.0, "eval_loss": 5.624746322631836, "eval_runtime": 61.2596, "eval_samples_per_second": 163.24, "eval_steps_per_second": 5.109, "step": 5000 }, { "epoch": 4.4, "grad_norm": 2.1659460067749023, "learning_rate": 4.78e-05, "loss": 5.593, "step": 5500 }, { "epoch": 4.8, "grad_norm": 2.476262331008911, "learning_rate": 4.76e-05, "loss": 5.422, "step": 6000 }, { "epoch": 5.0, "eval_loss": 5.117976665496826, "eval_runtime": 61.2737, "eval_samples_per_second": 163.202, "eval_steps_per_second": 5.108, "step": 6250 }, { "epoch": 5.2, "grad_norm": 2.4147756099700928, "learning_rate": 4.74e-05, "loss": 5.2305, "step": 6500 }, { "epoch": 5.6, "grad_norm": 2.6457254886627197, "learning_rate": 4.72e-05, "loss": 4.9511, "step": 7000 }, { "epoch": 6.0, "grad_norm": 2.7153351306915283, "learning_rate": 4.7e-05, "loss": 4.7339, "step": 7500 }, { "epoch": 6.0, "eval_loss": 4.504222869873047, "eval_runtime": 61.2685, "eval_samples_per_second": 163.216, "eval_steps_per_second": 5.109, "step": 7500 }, { "epoch": 6.4, "grad_norm": 2.6116857528686523, "learning_rate": 4.6800000000000006e-05, "loss": 4.5647, "step": 8000 }, { "epoch": 6.8, "grad_norm": 2.3627805709838867, "learning_rate": 4.660000000000001e-05, "loss": 4.432, "step": 8500 }, { "epoch": 7.0, "eval_loss": 4.213283061981201, "eval_runtime": 61.2562, "eval_samples_per_second": 163.249, "eval_steps_per_second": 5.11, "step": 8750 }, { "epoch": 7.2, "grad_norm": 2.548539161682129, "learning_rate": 4.64e-05, "loss": 4.3363, "step": 9000 }, { "epoch": 7.6, "grad_norm": 2.6018478870391846, "learning_rate": 4.6200000000000005e-05, "loss": 4.2403, "step": 9500 }, { "epoch": 8.0, "grad_norm": 2.2613413333892822, "learning_rate": 4.600000000000001e-05, "loss": 4.1724, "step": 10000 }, { "epoch": 8.0, "eval_loss": 4.003890514373779, "eval_runtime": 61.4625, "eval_samples_per_second": 162.701, "eval_steps_per_second": 5.093, "step": 10000 }, { "epoch": 8.4, "grad_norm": 2.538825273513794, "learning_rate": 4.58e-05, "loss": 4.0805, "step": 10500 }, { "epoch": 8.8, "grad_norm": 2.3357536792755127, "learning_rate": 4.5600000000000004e-05, "loss": 4.0316, "step": 11000 }, { "epoch": 9.0, "eval_loss": 3.875485897064209, "eval_runtime": 61.2731, "eval_samples_per_second": 163.204, "eval_steps_per_second": 5.108, "step": 11250 }, { "epoch": 9.2, "grad_norm": 2.7909834384918213, "learning_rate": 4.5400000000000006e-05, "loss": 3.9733, "step": 11500 }, { "epoch": 9.6, "grad_norm": 2.770125150680542, "learning_rate": 4.52e-05, "loss": 3.9146, "step": 12000 }, { "epoch": 10.0, "grad_norm": 2.6937015056610107, "learning_rate": 4.5e-05, "loss": 3.8765, "step": 12500 }, { "epoch": 10.0, "eval_loss": 3.7373440265655518, "eval_runtime": 61.2543, "eval_samples_per_second": 163.254, "eval_steps_per_second": 5.11, "step": 12500 }, { "epoch": 10.4, "grad_norm": 2.762148380279541, "learning_rate": 4.4800000000000005e-05, "loss": 3.8169, "step": 13000 }, { "epoch": 10.8, "grad_norm": 2.428678035736084, "learning_rate": 4.46e-05, "loss": 3.7702, "step": 13500 }, { "epoch": 11.0, "eval_loss": 3.6505491733551025, "eval_runtime": 61.3079, "eval_samples_per_second": 163.111, "eval_steps_per_second": 5.105, "step": 13750 }, { "epoch": 11.2, "grad_norm": 2.57837176322937, "learning_rate": 4.44e-05, "loss": 3.7268, "step": 14000 }, { "epoch": 11.6, "grad_norm": 2.6295292377471924, "learning_rate": 4.4200000000000004e-05, "loss": 3.6882, "step": 14500 }, { "epoch": 12.0, "grad_norm": 2.4153482913970947, "learning_rate": 4.4000000000000006e-05, "loss": 3.6646, "step": 15000 }, { "epoch": 12.0, "eval_loss": 3.5465199947357178, "eval_runtime": 61.2796, "eval_samples_per_second": 163.187, "eval_steps_per_second": 5.108, "step": 15000 }, { "epoch": 12.4, "grad_norm": 2.5010106563568115, "learning_rate": 4.38e-05, "loss": 3.617, "step": 15500 }, { "epoch": 12.8, "grad_norm": 2.515644073486328, "learning_rate": 4.36e-05, "loss": 3.5834, "step": 16000 }, { "epoch": 13.0, "eval_loss": 3.478050708770752, "eval_runtime": 61.4335, "eval_samples_per_second": 162.778, "eval_steps_per_second": 5.095, "step": 16250 }, { "epoch": 13.2, "grad_norm": 3.00858211517334, "learning_rate": 4.3400000000000005e-05, "loss": 3.5563, "step": 16500 }, { "epoch": 13.6, "grad_norm": 2.528775453567505, "learning_rate": 4.32e-05, "loss": 3.5116, "step": 17000 }, { "epoch": 14.0, "grad_norm": 2.637566089630127, "learning_rate": 4.3e-05, "loss": 3.4912, "step": 17500 }, { "epoch": 14.0, "eval_loss": 3.401700019836426, "eval_runtime": 61.2641, "eval_samples_per_second": 163.228, "eval_steps_per_second": 5.109, "step": 17500 }, { "epoch": 14.4, "grad_norm": 2.4223690032958984, "learning_rate": 4.2800000000000004e-05, "loss": 3.4544, "step": 18000 }, { "epoch": 14.8, "grad_norm": 2.7772583961486816, "learning_rate": 4.26e-05, "loss": 3.437, "step": 18500 }, { "epoch": 15.0, "eval_loss": 3.337768077850342, "eval_runtime": 61.2955, "eval_samples_per_second": 163.144, "eval_steps_per_second": 5.106, "step": 18750 }, { "epoch": 15.2, "grad_norm": 2.417879104614258, "learning_rate": 4.24e-05, "loss": 3.4121, "step": 19000 }, { "epoch": 15.6, "grad_norm": 2.525871753692627, "learning_rate": 4.22e-05, "loss": 3.3835, "step": 19500 }, { "epoch": 16.0, "grad_norm": 2.49729061126709, "learning_rate": 4.2e-05, "loss": 3.3679, "step": 20000 }, { "epoch": 16.0, "eval_loss": 3.2797868251800537, "eval_runtime": 61.2076, "eval_samples_per_second": 163.378, "eval_steps_per_second": 5.114, "step": 20000 }, { "epoch": 16.4, "grad_norm": 2.6784422397613525, "learning_rate": 4.18e-05, "loss": 3.3326, "step": 20500 }, { "epoch": 16.8, "grad_norm": 2.5526607036590576, "learning_rate": 4.16e-05, "loss": 3.3217, "step": 21000 }, { "epoch": 17.0, "eval_loss": 3.233443021774292, "eval_runtime": 61.2163, "eval_samples_per_second": 163.355, "eval_steps_per_second": 5.113, "step": 21250 }, { "epoch": 17.2, "grad_norm": 2.685425281524658, "learning_rate": 4.14e-05, "loss": 3.2826, "step": 21500 }, { "epoch": 17.6, "grad_norm": 2.7774910926818848, "learning_rate": 4.12e-05, "loss": 3.2716, "step": 22000 }, { "epoch": 18.0, "grad_norm": 2.404536008834839, "learning_rate": 4.1e-05, "loss": 3.2583, "step": 22500 }, { "epoch": 18.0, "eval_loss": 3.1886167526245117, "eval_runtime": 61.4623, "eval_samples_per_second": 162.701, "eval_steps_per_second": 5.093, "step": 22500 }, { "epoch": 18.4, "grad_norm": 2.5723462104797363, "learning_rate": 4.08e-05, "loss": 3.2232, "step": 23000 }, { "epoch": 18.8, "grad_norm": 2.5137925148010254, "learning_rate": 4.0600000000000004e-05, "loss": 3.2212, "step": 23500 }, { "epoch": 19.0, "eval_loss": 3.1394808292388916, "eval_runtime": 61.2181, "eval_samples_per_second": 163.35, "eval_steps_per_second": 5.113, "step": 23750 }, { "epoch": 19.2, "grad_norm": 2.7127292156219482, "learning_rate": 4.0400000000000006e-05, "loss": 3.1803, "step": 24000 }, { "epoch": 19.6, "grad_norm": 2.5438954830169678, "learning_rate": 4.02e-05, "loss": 3.1692, "step": 24500 }, { "epoch": 20.0, "grad_norm": 2.503960132598877, "learning_rate": 4e-05, "loss": 3.1697, "step": 25000 }, { "epoch": 20.0, "eval_loss": 3.112931251525879, "eval_runtime": 61.2263, "eval_samples_per_second": 163.329, "eval_steps_per_second": 5.112, "step": 25000 }, { "epoch": 20.4, "grad_norm": 2.501166820526123, "learning_rate": 3.9800000000000005e-05, "loss": 3.1396, "step": 25500 }, { "epoch": 20.8, "grad_norm": 2.7387282848358154, "learning_rate": 3.960000000000001e-05, "loss": 3.1209, "step": 26000 }, { "epoch": 21.0, "eval_loss": 3.0616869926452637, "eval_runtime": 61.2523, "eval_samples_per_second": 163.259, "eval_steps_per_second": 5.11, "step": 26250 }, { "epoch": 21.2, "grad_norm": 2.641589641571045, "learning_rate": 3.94e-05, "loss": 3.1079, "step": 26500 }, { "epoch": 21.6, "grad_norm": 2.8861656188964844, "learning_rate": 3.9200000000000004e-05, "loss": 3.0907, "step": 27000 }, { "epoch": 22.0, "grad_norm": 2.9420666694641113, "learning_rate": 3.9000000000000006e-05, "loss": 3.0772, "step": 27500 }, { "epoch": 22.0, "eval_loss": 3.029193639755249, "eval_runtime": 61.3008, "eval_samples_per_second": 163.13, "eval_steps_per_second": 5.106, "step": 27500 }, { "epoch": 22.4, "grad_norm": 2.6903202533721924, "learning_rate": 3.88e-05, "loss": 3.0618, "step": 28000 }, { "epoch": 22.8, "grad_norm": 2.4258873462677, "learning_rate": 3.86e-05, "loss": 3.0482, "step": 28500 }, { "epoch": 23.0, "eval_loss": 2.998987913131714, "eval_runtime": 61.2446, "eval_samples_per_second": 163.28, "eval_steps_per_second": 5.111, "step": 28750 }, { "epoch": 23.2, "grad_norm": 2.8380045890808105, "learning_rate": 3.8400000000000005e-05, "loss": 3.0271, "step": 29000 }, { "epoch": 23.6, "grad_norm": 2.936922788619995, "learning_rate": 3.82e-05, "loss": 3.016, "step": 29500 }, { "epoch": 24.0, "grad_norm": 2.829270362854004, "learning_rate": 3.8e-05, "loss": 3.0102, "step": 30000 }, { "epoch": 24.0, "eval_loss": 2.972419023513794, "eval_runtime": 61.3686, "eval_samples_per_second": 162.95, "eval_steps_per_second": 5.1, "step": 30000 }, { "epoch": 24.4, "grad_norm": 2.6595258712768555, "learning_rate": 3.7800000000000004e-05, "loss": 2.9857, "step": 30500 }, { "epoch": 24.8, "grad_norm": 2.9528753757476807, "learning_rate": 3.76e-05, "loss": 2.9833, "step": 31000 }, { "epoch": 25.0, "eval_loss": 2.943739414215088, "eval_runtime": 61.233, "eval_samples_per_second": 163.311, "eval_steps_per_second": 5.112, "step": 31250 }, { "epoch": 25.2, "grad_norm": 2.8180456161499023, "learning_rate": 3.74e-05, "loss": 2.9631, "step": 31500 }, { "epoch": 25.6, "grad_norm": 3.0738282203674316, "learning_rate": 3.72e-05, "loss": 2.9521, "step": 32000 }, { "epoch": 26.0, "grad_norm": 3.0022482872009277, "learning_rate": 3.7e-05, "loss": 2.9429, "step": 32500 }, { "epoch": 26.0, "eval_loss": 2.918297290802002, "eval_runtime": 61.2635, "eval_samples_per_second": 163.229, "eval_steps_per_second": 5.109, "step": 32500 }, { "epoch": 26.4, "grad_norm": 2.5933921337127686, "learning_rate": 3.68e-05, "loss": 2.9205, "step": 33000 }, { "epoch": 26.8, "grad_norm": 2.725785493850708, "learning_rate": 3.66e-05, "loss": 2.9138, "step": 33500 }, { "epoch": 27.0, "eval_loss": 2.8853907585144043, "eval_runtime": 61.2544, "eval_samples_per_second": 163.254, "eval_steps_per_second": 5.11, "step": 33750 }, { "epoch": 27.2, "grad_norm": 2.8924100399017334, "learning_rate": 3.6400000000000004e-05, "loss": 2.9061, "step": 34000 }, { "epoch": 27.6, "grad_norm": 2.9126296043395996, "learning_rate": 3.62e-05, "loss": 2.8858, "step": 34500 }, { "epoch": 28.0, "grad_norm": 2.768714189529419, "learning_rate": 3.6e-05, "loss": 2.8815, "step": 35000 }, { "epoch": 28.0, "eval_loss": 2.8625736236572266, "eval_runtime": 61.266, "eval_samples_per_second": 163.223, "eval_steps_per_second": 5.109, "step": 35000 }, { "epoch": 28.4, "grad_norm": 2.8135673999786377, "learning_rate": 3.58e-05, "loss": 2.8635, "step": 35500 }, { "epoch": 28.8, "grad_norm": 2.8309783935546875, "learning_rate": 3.56e-05, "loss": 2.848, "step": 36000 }, { "epoch": 29.0, "eval_loss": 2.8396055698394775, "eval_runtime": 61.5143, "eval_samples_per_second": 162.564, "eval_steps_per_second": 5.088, "step": 36250 }, { "epoch": 29.2, "grad_norm": 2.9002137184143066, "learning_rate": 3.54e-05, "loss": 2.8476, "step": 36500 }, { "epoch": 29.6, "grad_norm": 2.79744815826416, "learning_rate": 3.52e-05, "loss": 2.8341, "step": 37000 }, { "epoch": 30.0, "grad_norm": 3.041928768157959, "learning_rate": 3.5e-05, "loss": 2.8276, "step": 37500 }, { "epoch": 30.0, "eval_loss": 2.805696487426758, "eval_runtime": 61.2638, "eval_samples_per_second": 163.229, "eval_steps_per_second": 5.109, "step": 37500 }, { "epoch": 30.4, "grad_norm": 2.7281506061553955, "learning_rate": 3.48e-05, "loss": 2.8067, "step": 38000 }, { "epoch": 30.8, "grad_norm": 2.6974635124206543, "learning_rate": 3.46e-05, "loss": 2.8114, "step": 38500 }, { "epoch": 31.0, "eval_loss": 2.7848477363586426, "eval_runtime": 61.2633, "eval_samples_per_second": 163.23, "eval_steps_per_second": 5.109, "step": 38750 }, { "epoch": 31.2, "grad_norm": 3.201719045639038, "learning_rate": 3.4399999999999996e-05, "loss": 2.7928, "step": 39000 }, { "epoch": 31.6, "grad_norm": 2.7298779487609863, "learning_rate": 3.4200000000000005e-05, "loss": 2.7814, "step": 39500 }, { "epoch": 32.0, "grad_norm": 2.873957872390747, "learning_rate": 3.4000000000000007e-05, "loss": 2.7806, "step": 40000 }, { "epoch": 32.0, "eval_loss": 2.773723840713501, "eval_runtime": 61.1994, "eval_samples_per_second": 163.4, "eval_steps_per_second": 5.114, "step": 40000 }, { "epoch": 32.4, "grad_norm": 2.733128547668457, "learning_rate": 3.38e-05, "loss": 2.7619, "step": 40500 }, { "epoch": 32.8, "grad_norm": 2.825956106185913, "learning_rate": 3.3600000000000004e-05, "loss": 2.7608, "step": 41000 }, { "epoch": 33.0, "eval_loss": 2.7562143802642822, "eval_runtime": 61.2126, "eval_samples_per_second": 163.365, "eval_steps_per_second": 5.113, "step": 41250 }, { "epoch": 33.2, "grad_norm": 2.9455862045288086, "learning_rate": 3.3400000000000005e-05, "loss": 2.742, "step": 41500 }, { "epoch": 33.6, "grad_norm": 2.9023234844207764, "learning_rate": 3.32e-05, "loss": 2.7347, "step": 42000 }, { "epoch": 34.0, "grad_norm": 2.7190394401550293, "learning_rate": 3.3e-05, "loss": 2.7366, "step": 42500 }, { "epoch": 34.0, "eval_loss": 2.7386631965637207, "eval_runtime": 61.2856, "eval_samples_per_second": 163.17, "eval_steps_per_second": 5.107, "step": 42500 }, { "epoch": 34.4, "grad_norm": 2.9011335372924805, "learning_rate": 3.2800000000000004e-05, "loss": 2.7113, "step": 43000 }, { "epoch": 34.8, "grad_norm": 2.6606903076171875, "learning_rate": 3.26e-05, "loss": 2.7157, "step": 43500 }, { "epoch": 35.0, "eval_loss": 2.713857889175415, "eval_runtime": 61.228, "eval_samples_per_second": 163.324, "eval_steps_per_second": 5.112, "step": 43750 }, { "epoch": 35.2, "grad_norm": 2.7517592906951904, "learning_rate": 3.24e-05, "loss": 2.7046, "step": 44000 }, { "epoch": 35.6, "grad_norm": 3.1010994911193848, "learning_rate": 3.2200000000000003e-05, "loss": 2.6955, "step": 44500 }, { "epoch": 36.0, "grad_norm": 2.7236251831054688, "learning_rate": 3.2000000000000005e-05, "loss": 2.6894, "step": 45000 }, { "epoch": 36.0, "eval_loss": 2.707624673843384, "eval_runtime": 61.2324, "eval_samples_per_second": 163.312, "eval_steps_per_second": 5.112, "step": 45000 }, { "epoch": 36.4, "grad_norm": 2.961221933364868, "learning_rate": 3.18e-05, "loss": 2.6825, "step": 45500 }, { "epoch": 36.8, "grad_norm": 2.9229893684387207, "learning_rate": 3.16e-05, "loss": 2.6708, "step": 46000 }, { "epoch": 37.0, "eval_loss": 2.686065196990967, "eval_runtime": 61.2242, "eval_samples_per_second": 163.334, "eval_steps_per_second": 5.112, "step": 46250 }, { "epoch": 37.2, "grad_norm": 3.1210429668426514, "learning_rate": 3.1400000000000004e-05, "loss": 2.6717, "step": 46500 }, { "epoch": 37.6, "grad_norm": 2.785784959793091, "learning_rate": 3.12e-05, "loss": 2.6594, "step": 47000 }, { "epoch": 38.0, "grad_norm": 2.8745310306549072, "learning_rate": 3.1e-05, "loss": 2.6577, "step": 47500 }, { "epoch": 38.0, "eval_loss": 2.6833102703094482, "eval_runtime": 61.2268, "eval_samples_per_second": 163.327, "eval_steps_per_second": 5.112, "step": 47500 }, { "epoch": 38.4, "grad_norm": 2.848191022872925, "learning_rate": 3.08e-05, "loss": 2.6407, "step": 48000 }, { "epoch": 38.8, "grad_norm": 2.9842936992645264, "learning_rate": 3.06e-05, "loss": 2.6437, "step": 48500 }, { "epoch": 39.0, "eval_loss": 2.6599860191345215, "eval_runtime": 61.3483, "eval_samples_per_second": 163.004, "eval_steps_per_second": 5.102, "step": 48750 }, { "epoch": 39.2, "grad_norm": 2.9027788639068604, "learning_rate": 3.04e-05, "loss": 2.6275, "step": 49000 }, { "epoch": 39.6, "grad_norm": 2.8184854984283447, "learning_rate": 3.02e-05, "loss": 2.6188, "step": 49500 }, { "epoch": 40.0, "grad_norm": 2.8621346950531006, "learning_rate": 3e-05, "loss": 2.6255, "step": 50000 }, { "epoch": 40.0, "eval_loss": 2.6438729763031006, "eval_runtime": 61.2542, "eval_samples_per_second": 163.254, "eval_steps_per_second": 5.11, "step": 50000 }, { "epoch": 40.4, "grad_norm": 2.661895275115967, "learning_rate": 2.98e-05, "loss": 2.6124, "step": 50500 }, { "epoch": 40.8, "grad_norm": 2.913214683532715, "learning_rate": 2.96e-05, "loss": 2.6128, "step": 51000 }, { "epoch": 41.0, "eval_loss": 2.634756088256836, "eval_runtime": 61.2199, "eval_samples_per_second": 163.346, "eval_steps_per_second": 5.113, "step": 51250 }, { "epoch": 41.2, "grad_norm": 2.8805391788482666, "learning_rate": 2.94e-05, "loss": 2.5944, "step": 51500 }, { "epoch": 41.6, "grad_norm": 3.3086748123168945, "learning_rate": 2.9199999999999998e-05, "loss": 2.5968, "step": 52000 }, { "epoch": 42.0, "grad_norm": 2.8824660778045654, "learning_rate": 2.9e-05, "loss": 2.5929, "step": 52500 }, { "epoch": 42.0, "eval_loss": 2.6286351680755615, "eval_runtime": 61.2491, "eval_samples_per_second": 163.268, "eval_steps_per_second": 5.11, "step": 52500 }, { "epoch": 42.4, "grad_norm": 3.0503251552581787, "learning_rate": 2.88e-05, "loss": 2.5691, "step": 53000 }, { "epoch": 42.8, "grad_norm": 2.777174711227417, "learning_rate": 2.86e-05, "loss": 2.5861, "step": 53500 }, { "epoch": 43.0, "eval_loss": 2.605225086212158, "eval_runtime": 61.226, "eval_samples_per_second": 163.329, "eval_steps_per_second": 5.112, "step": 53750 }, { "epoch": 43.2, "grad_norm": 2.6447839736938477, "learning_rate": 2.84e-05, "loss": 2.5709, "step": 54000 }, { "epoch": 43.6, "grad_norm": 2.7956857681274414, "learning_rate": 2.8199999999999998e-05, "loss": 2.5672, "step": 54500 }, { "epoch": 44.0, "grad_norm": 3.081850528717041, "learning_rate": 2.8000000000000003e-05, "loss": 2.5665, "step": 55000 }, { "epoch": 44.0, "eval_loss": 2.588907480239868, "eval_runtime": 61.2915, "eval_samples_per_second": 163.155, "eval_steps_per_second": 5.107, "step": 55000 }, { "epoch": 44.4, "grad_norm": 2.979550838470459, "learning_rate": 2.7800000000000005e-05, "loss": 2.5553, "step": 55500 }, { "epoch": 44.8, "grad_norm": 2.8044862747192383, "learning_rate": 2.7600000000000003e-05, "loss": 2.5428, "step": 56000 }, { "epoch": 45.0, "eval_loss": 2.5998992919921875, "eval_runtime": 61.2365, "eval_samples_per_second": 163.301, "eval_steps_per_second": 5.111, "step": 56250 }, { "epoch": 45.2, "grad_norm": 2.7438066005706787, "learning_rate": 2.7400000000000002e-05, "loss": 2.5435, "step": 56500 }, { "epoch": 45.6, "grad_norm": 2.8336572647094727, "learning_rate": 2.7200000000000004e-05, "loss": 2.5333, "step": 57000 }, { "epoch": 46.0, "grad_norm": 2.8354380130767822, "learning_rate": 2.7000000000000002e-05, "loss": 2.5385, "step": 57500 }, { "epoch": 46.0, "eval_loss": 2.5822722911834717, "eval_runtime": 61.2292, "eval_samples_per_second": 163.321, "eval_steps_per_second": 5.112, "step": 57500 }, { "epoch": 46.4, "grad_norm": 3.035831928253174, "learning_rate": 2.6800000000000004e-05, "loss": 2.5185, "step": 58000 }, { "epoch": 46.8, "grad_norm": 3.1147818565368652, "learning_rate": 2.6600000000000003e-05, "loss": 2.5273, "step": 58500 }, { "epoch": 47.0, "eval_loss": 2.5733859539031982, "eval_runtime": 61.2138, "eval_samples_per_second": 163.362, "eval_steps_per_second": 5.113, "step": 58750 }, { "epoch": 47.2, "grad_norm": 2.987078905105591, "learning_rate": 2.64e-05, "loss": 2.5207, "step": 59000 }, { "epoch": 47.6, "grad_norm": 2.7736542224884033, "learning_rate": 2.6200000000000003e-05, "loss": 2.5038, "step": 59500 }, { "epoch": 48.0, "grad_norm": 2.8549487590789795, "learning_rate": 2.6000000000000002e-05, "loss": 2.5175, "step": 60000 }, { "epoch": 48.0, "eval_loss": 2.561167001724243, "eval_runtime": 61.2382, "eval_samples_per_second": 163.297, "eval_steps_per_second": 5.111, "step": 60000 }, { "epoch": 48.4, "grad_norm": 3.058418035507202, "learning_rate": 2.58e-05, "loss": 2.4956, "step": 60500 }, { "epoch": 48.8, "grad_norm": 2.955167293548584, "learning_rate": 2.5600000000000002e-05, "loss": 2.5064, "step": 61000 }, { "epoch": 49.0, "eval_loss": 2.555574417114258, "eval_runtime": 61.2242, "eval_samples_per_second": 163.334, "eval_steps_per_second": 5.112, "step": 61250 }, { "epoch": 49.2, "grad_norm": 2.9953746795654297, "learning_rate": 2.54e-05, "loss": 2.4899, "step": 61500 }, { "epoch": 49.6, "grad_norm": 3.176257610321045, "learning_rate": 2.5200000000000003e-05, "loss": 2.4862, "step": 62000 }, { "epoch": 50.0, "grad_norm": 3.131290912628174, "learning_rate": 2.5e-05, "loss": 2.4931, "step": 62500 }, { "epoch": 50.0, "eval_loss": 2.5496270656585693, "eval_runtime": 61.2301, "eval_samples_per_second": 163.319, "eval_steps_per_second": 5.112, "step": 62500 }, { "epoch": 50.4, "grad_norm": 2.871742010116577, "learning_rate": 2.48e-05, "loss": 2.4794, "step": 63000 }, { "epoch": 50.8, "grad_norm": 2.7357499599456787, "learning_rate": 2.46e-05, "loss": 2.4802, "step": 63500 }, { "epoch": 51.0, "eval_loss": 2.528775691986084, "eval_runtime": 61.2246, "eval_samples_per_second": 163.333, "eval_steps_per_second": 5.112, "step": 63750 }, { "epoch": 51.2, "grad_norm": 2.903444528579712, "learning_rate": 2.44e-05, "loss": 2.4712, "step": 64000 }, { "epoch": 51.6, "grad_norm": 3.0363874435424805, "learning_rate": 2.4200000000000002e-05, "loss": 2.4673, "step": 64500 }, { "epoch": 52.0, "grad_norm": 2.935192823410034, "learning_rate": 2.4e-05, "loss": 2.4604, "step": 65000 }, { "epoch": 52.0, "eval_loss": 2.5387279987335205, "eval_runtime": 61.5389, "eval_samples_per_second": 162.499, "eval_steps_per_second": 5.086, "step": 65000 }, { "epoch": 52.4, "grad_norm": 2.8958323001861572, "learning_rate": 2.38e-05, "loss": 2.4495, "step": 65500 }, { "epoch": 52.8, "grad_norm": 2.9605252742767334, "learning_rate": 2.36e-05, "loss": 2.4585, "step": 66000 }, { "epoch": 53.0, "eval_loss": 2.5214128494262695, "eval_runtime": 61.2198, "eval_samples_per_second": 163.346, "eval_steps_per_second": 5.113, "step": 66250 }, { "epoch": 53.2, "grad_norm": 2.8114893436431885, "learning_rate": 2.3400000000000003e-05, "loss": 2.4386, "step": 66500 }, { "epoch": 53.6, "grad_norm": 2.9136197566986084, "learning_rate": 2.32e-05, "loss": 2.442, "step": 67000 }, { "epoch": 54.0, "grad_norm": 2.891444683074951, "learning_rate": 2.3000000000000003e-05, "loss": 2.4517, "step": 67500 }, { "epoch": 54.0, "eval_loss": 2.520853042602539, "eval_runtime": 61.2251, "eval_samples_per_second": 163.332, "eval_steps_per_second": 5.112, "step": 67500 }, { "epoch": 54.4, "grad_norm": 2.8234751224517822, "learning_rate": 2.2800000000000002e-05, "loss": 2.4323, "step": 68000 }, { "epoch": 54.8, "grad_norm": 2.6390535831451416, "learning_rate": 2.26e-05, "loss": 2.4389, "step": 68500 }, { "epoch": 55.0, "eval_loss": 2.508610725402832, "eval_runtime": 61.2599, "eval_samples_per_second": 163.239, "eval_steps_per_second": 5.109, "step": 68750 }, { "epoch": 55.2, "grad_norm": 2.766709566116333, "learning_rate": 2.2400000000000002e-05, "loss": 2.4248, "step": 69000 }, { "epoch": 55.6, "grad_norm": 2.9248626232147217, "learning_rate": 2.22e-05, "loss": 2.4301, "step": 69500 }, { "epoch": 56.0, "grad_norm": 3.262801170349121, "learning_rate": 2.2000000000000003e-05, "loss": 2.4225, "step": 70000 }, { "epoch": 56.0, "eval_loss": 2.5070488452911377, "eval_runtime": 61.2656, "eval_samples_per_second": 163.224, "eval_steps_per_second": 5.109, "step": 70000 }, { "epoch": 56.4, "grad_norm": 3.05898118019104, "learning_rate": 2.18e-05, "loss": 2.414, "step": 70500 }, { "epoch": 56.8, "grad_norm": 2.8624179363250732, "learning_rate": 2.16e-05, "loss": 2.4145, "step": 71000 }, { "epoch": 57.0, "eval_loss": 2.4903688430786133, "eval_runtime": 61.3344, "eval_samples_per_second": 163.041, "eval_steps_per_second": 5.103, "step": 71250 }, { "epoch": 57.2, "grad_norm": 2.8718996047973633, "learning_rate": 2.1400000000000002e-05, "loss": 2.4152, "step": 71500 }, { "epoch": 57.6, "grad_norm": 3.02909517288208, "learning_rate": 2.12e-05, "loss": 2.4107, "step": 72000 }, { "epoch": 58.0, "grad_norm": 3.092914581298828, "learning_rate": 2.1e-05, "loss": 2.4058, "step": 72500 }, { "epoch": 58.0, "eval_loss": 2.4901366233825684, "eval_runtime": 61.3186, "eval_samples_per_second": 163.083, "eval_steps_per_second": 5.104, "step": 72500 }, { "epoch": 58.4, "grad_norm": 3.0763566493988037, "learning_rate": 2.08e-05, "loss": 2.3941, "step": 73000 }, { "epoch": 58.8, "grad_norm": 2.809154748916626, "learning_rate": 2.06e-05, "loss": 2.398, "step": 73500 }, { "epoch": 59.0, "eval_loss": 2.491445302963257, "eval_runtime": 61.2316, "eval_samples_per_second": 163.314, "eval_steps_per_second": 5.112, "step": 73750 }, { "epoch": 59.2, "grad_norm": 2.948514699935913, "learning_rate": 2.04e-05, "loss": 2.3977, "step": 74000 }, { "epoch": 59.6, "grad_norm": 2.9708919525146484, "learning_rate": 2.0200000000000003e-05, "loss": 2.3849, "step": 74500 }, { "epoch": 60.0, "grad_norm": 3.0955567359924316, "learning_rate": 2e-05, "loss": 2.3847, "step": 75000 }, { "epoch": 60.0, "eval_loss": 2.481839418411255, "eval_runtime": 61.2301, "eval_samples_per_second": 163.318, "eval_steps_per_second": 5.112, "step": 75000 }, { "epoch": 60.4, "grad_norm": 2.8540799617767334, "learning_rate": 1.9800000000000004e-05, "loss": 2.3831, "step": 75500 }, { "epoch": 60.8, "grad_norm": 2.79695987701416, "learning_rate": 1.9600000000000002e-05, "loss": 2.3833, "step": 76000 }, { "epoch": 61.0, "eval_loss": 2.4663026332855225, "eval_runtime": 61.2384, "eval_samples_per_second": 163.296, "eval_steps_per_second": 5.111, "step": 76250 }, { "epoch": 61.2, "grad_norm": 3.144937515258789, "learning_rate": 1.94e-05, "loss": 2.3802, "step": 76500 }, { "epoch": 61.6, "grad_norm": 3.230477809906006, "learning_rate": 1.9200000000000003e-05, "loss": 2.369, "step": 77000 }, { "epoch": 62.0, "grad_norm": 3.04854416847229, "learning_rate": 1.9e-05, "loss": 2.3772, "step": 77500 }, { "epoch": 62.0, "eval_loss": 2.475891351699829, "eval_runtime": 61.2453, "eval_samples_per_second": 163.278, "eval_steps_per_second": 5.111, "step": 77500 }, { "epoch": 62.4, "grad_norm": 2.8289458751678467, "learning_rate": 1.88e-05, "loss": 2.3607, "step": 78000 }, { "epoch": 62.8, "grad_norm": 2.9443745613098145, "learning_rate": 1.86e-05, "loss": 2.3711, "step": 78500 }, { "epoch": 63.0, "eval_loss": 2.45942759513855, "eval_runtime": 61.2995, "eval_samples_per_second": 163.133, "eval_steps_per_second": 5.106, "step": 78750 }, { "epoch": 63.2, "grad_norm": 3.4686219692230225, "learning_rate": 1.84e-05, "loss": 2.3635, "step": 79000 }, { "epoch": 63.6, "grad_norm": 2.9754679203033447, "learning_rate": 1.8200000000000002e-05, "loss": 2.3544, "step": 79500 }, { "epoch": 64.0, "grad_norm": 2.959327459335327, "learning_rate": 1.8e-05, "loss": 2.3603, "step": 80000 }, { "epoch": 64.0, "eval_loss": 2.454479455947876, "eval_runtime": 61.2094, "eval_samples_per_second": 163.374, "eval_steps_per_second": 5.114, "step": 80000 }, { "epoch": 64.4, "grad_norm": 3.086289882659912, "learning_rate": 1.78e-05, "loss": 2.348, "step": 80500 }, { "epoch": 64.8, "grad_norm": 2.840416193008423, "learning_rate": 1.76e-05, "loss": 2.3546, "step": 81000 }, { "epoch": 65.0, "eval_loss": 2.4469666481018066, "eval_runtime": 61.2942, "eval_samples_per_second": 163.147, "eval_steps_per_second": 5.107, "step": 81250 }, { "epoch": 65.2, "grad_norm": 3.062312364578247, "learning_rate": 1.74e-05, "loss": 2.3485, "step": 81500 }, { "epoch": 65.6, "grad_norm": 2.8818204402923584, "learning_rate": 1.7199999999999998e-05, "loss": 2.3434, "step": 82000 }, { "epoch": 66.0, "grad_norm": 2.827214002609253, "learning_rate": 1.7000000000000003e-05, "loss": 2.3497, "step": 82500 }, { "epoch": 66.0, "eval_loss": 2.4515304565429688, "eval_runtime": 61.2229, "eval_samples_per_second": 163.338, "eval_steps_per_second": 5.112, "step": 82500 }, { "epoch": 66.4, "grad_norm": 2.871967315673828, "learning_rate": 1.6800000000000002e-05, "loss": 2.3415, "step": 83000 }, { "epoch": 66.8, "grad_norm": 3.2059788703918457, "learning_rate": 1.66e-05, "loss": 2.3328, "step": 83500 }, { "epoch": 67.0, "eval_loss": 2.4459116458892822, "eval_runtime": 61.2313, "eval_samples_per_second": 163.315, "eval_steps_per_second": 5.112, "step": 83750 }, { "epoch": 67.2, "grad_norm": 2.958317518234253, "learning_rate": 1.6400000000000002e-05, "loss": 2.3355, "step": 84000 }, { "epoch": 67.6, "grad_norm": 3.2304327487945557, "learning_rate": 1.62e-05, "loss": 2.329, "step": 84500 }, { "epoch": 68.0, "grad_norm": 3.0539510250091553, "learning_rate": 1.6000000000000003e-05, "loss": 2.3281, "step": 85000 }, { "epoch": 68.0, "eval_loss": 2.4377968311309814, "eval_runtime": 61.2555, "eval_samples_per_second": 163.251, "eval_steps_per_second": 5.11, "step": 85000 }, { "epoch": 68.4, "grad_norm": 3.338268995285034, "learning_rate": 1.58e-05, "loss": 2.3221, "step": 85500 }, { "epoch": 68.8, "grad_norm": 3.2005858421325684, "learning_rate": 1.56e-05, "loss": 2.3272, "step": 86000 }, { "epoch": 69.0, "eval_loss": 2.4320311546325684, "eval_runtime": 61.3445, "eval_samples_per_second": 163.014, "eval_steps_per_second": 5.102, "step": 86250 }, { "epoch": 69.2, "grad_norm": 3.010361671447754, "learning_rate": 1.54e-05, "loss": 2.312, "step": 86500 }, { "epoch": 69.6, "grad_norm": 3.110023021697998, "learning_rate": 1.52e-05, "loss": 2.3175, "step": 87000 }, { "epoch": 70.0, "grad_norm": 2.972137212753296, "learning_rate": 1.5e-05, "loss": 2.3172, "step": 87500 }, { "epoch": 70.0, "eval_loss": 2.428523063659668, "eval_runtime": 61.2024, "eval_samples_per_second": 163.392, "eval_steps_per_second": 5.114, "step": 87500 }, { "epoch": 70.4, "grad_norm": 2.8720614910125732, "learning_rate": 1.48e-05, "loss": 2.3103, "step": 88000 }, { "epoch": 70.8, "grad_norm": 2.9163401126861572, "learning_rate": 1.4599999999999999e-05, "loss": 2.3128, "step": 88500 }, { "epoch": 71.0, "eval_loss": 2.4292280673980713, "eval_runtime": 61.2279, "eval_samples_per_second": 163.324, "eval_steps_per_second": 5.112, "step": 88750 }, { "epoch": 71.2, "grad_norm": 2.890698194503784, "learning_rate": 1.44e-05, "loss": 2.304, "step": 89000 }, { "epoch": 71.6, "grad_norm": 3.2436020374298096, "learning_rate": 1.42e-05, "loss": 2.3067, "step": 89500 }, { "epoch": 72.0, "grad_norm": 2.849823236465454, "learning_rate": 1.4000000000000001e-05, "loss": 2.3082, "step": 90000 }, { "epoch": 72.0, "eval_loss": 2.41927170753479, "eval_runtime": 61.2261, "eval_samples_per_second": 163.329, "eval_steps_per_second": 5.112, "step": 90000 }, { "epoch": 72.4, "grad_norm": 3.191131353378296, "learning_rate": 1.3800000000000002e-05, "loss": 2.2925, "step": 90500 }, { "epoch": 72.8, "grad_norm": 3.203021764755249, "learning_rate": 1.3600000000000002e-05, "loss": 2.3004, "step": 91000 }, { "epoch": 73.0, "eval_loss": 2.4278526306152344, "eval_runtime": 61.2358, "eval_samples_per_second": 163.303, "eval_steps_per_second": 5.111, "step": 91250 }, { "epoch": 73.2, "grad_norm": 3.002692461013794, "learning_rate": 1.3400000000000002e-05, "loss": 2.2949, "step": 91500 }, { "epoch": 73.6, "grad_norm": 3.137084722518921, "learning_rate": 1.32e-05, "loss": 2.2898, "step": 92000 }, { "epoch": 74.0, "grad_norm": 3.325843095779419, "learning_rate": 1.3000000000000001e-05, "loss": 2.2955, "step": 92500 }, { "epoch": 74.0, "eval_loss": 2.409475326538086, "eval_runtime": 61.3551, "eval_samples_per_second": 162.986, "eval_steps_per_second": 5.101, "step": 92500 }, { "epoch": 74.4, "grad_norm": 3.207425594329834, "learning_rate": 1.2800000000000001e-05, "loss": 2.2895, "step": 93000 }, { "epoch": 74.8, "grad_norm": 3.2221837043762207, "learning_rate": 1.2600000000000001e-05, "loss": 2.2849, "step": 93500 }, { "epoch": 75.0, "eval_loss": 2.4111411571502686, "eval_runtime": 61.2648, "eval_samples_per_second": 163.226, "eval_steps_per_second": 5.109, "step": 93750 }, { "epoch": 75.2, "grad_norm": 3.0594756603240967, "learning_rate": 1.24e-05, "loss": 2.2791, "step": 94000 }, { "epoch": 75.6, "grad_norm": 3.013936996459961, "learning_rate": 1.22e-05, "loss": 2.2935, "step": 94500 }, { "epoch": 76.0, "grad_norm": 3.296487808227539, "learning_rate": 1.2e-05, "loss": 2.2816, "step": 95000 }, { "epoch": 76.0, "eval_loss": 2.405331611633301, "eval_runtime": 61.2866, "eval_samples_per_second": 163.168, "eval_steps_per_second": 5.107, "step": 95000 } ], "logging_steps": 500, "max_steps": 125000, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 2, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.0316781723648e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }