{
  "best_global_step": 19990,
  "best_metric": 0.566254198551178,
  "best_model_checkpoint": "/media/user/Expansion1/granite-embedding-107m-multilingual-chat-difficulty/checkpoint-19990",
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 49975,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05002501250625312,
      "grad_norm": 14.432242393493652,
      "learning_rate": 4.9500750375187595e-05,
      "loss": 0.7612,
      "num_input_tokens_seen": 512000,
      "step": 500,
      "train_runtime": 10.716,
      "train_tokens_per_second": 47778.963
    },
    {
      "epoch": 0.10005002501250625,
      "grad_norm": 4.861369609832764,
      "learning_rate": 4.900050025012506e-05,
      "loss": 0.682,
      "num_input_tokens_seen": 1024000,
      "step": 1000,
      "train_runtime": 21.117,
      "train_tokens_per_second": 48491.697
    },
    {
      "epoch": 0.1500750375187594,
      "grad_norm": 8.015384674072266,
      "learning_rate": 4.8500250125062535e-05,
      "loss": 0.6637,
      "num_input_tokens_seen": 1536000,
      "step": 1500,
      "train_runtime": 31.546,
      "train_tokens_per_second": 48690.793
    },
    {
      "epoch": 0.2001000500250125,
      "grad_norm": 8.866910934448242,
      "learning_rate": 4.8e-05,
      "loss": 0.6346,
      "num_input_tokens_seen": 2048000,
      "step": 2000,
      "train_runtime": 41.8744,
      "train_tokens_per_second": 48908.201
    },
    {
      "epoch": 0.25012506253126565,
      "grad_norm": 8.685997009277344,
      "learning_rate": 4.7499749874937475e-05,
      "loss": 0.6475,
      "num_input_tokens_seen": 2560000,
      "step": 2500,
      "train_runtime": 52.2059,
      "train_tokens_per_second": 49036.559
    },
    {
      "epoch": 0.3001500750375188,
      "grad_norm": 11.758138656616211,
      "learning_rate": 4.699949974987494e-05,
      "loss": 0.6349,
      "num_input_tokens_seen": 3072000,
      "step": 3000,
      "train_runtime": 62.5742,
      "train_tokens_per_second": 49093.701
    },
    {
      "epoch": 0.3501750875437719,
      "grad_norm": 8.118245124816895,
      "learning_rate": 4.649924962481241e-05,
      "loss": 0.6317,
      "num_input_tokens_seen": 3584000,
      "step": 3500,
      "train_runtime": 72.9422,
      "train_tokens_per_second": 49134.817
    },
    {
      "epoch": 0.400200100050025,
      "grad_norm": 6.139720916748047,
      "learning_rate": 4.599899949974988e-05,
      "loss": 0.6167,
      "num_input_tokens_seen": 4096000,
      "step": 4000,
      "train_runtime": 83.3179,
      "train_tokens_per_second": 49161.134
    },
    {
      "epoch": 0.4502251125562781,
      "grad_norm": 8.408066749572754,
      "learning_rate": 4.549874937468734e-05,
      "loss": 0.5989,
      "num_input_tokens_seen": 4608000,
      "step": 4500,
      "train_runtime": 93.7746,
      "train_tokens_per_second": 49139.121
    },
    {
      "epoch": 0.5002501250625313,
      "grad_norm": 10.197962760925293,
      "learning_rate": 4.4998499249624814e-05,
      "loss": 0.6064,
      "num_input_tokens_seen": 5120000,
      "step": 5000,
      "train_runtime": 104.1644,
      "train_tokens_per_second": 49153.06
    },
    {
      "epoch": 0.5502751375687844,
      "grad_norm": 4.074629306793213,
      "learning_rate": 4.449824912456229e-05,
      "loss": 0.6092,
      "num_input_tokens_seen": 5632000,
      "step": 5500,
      "train_runtime": 114.5316,
      "train_tokens_per_second": 49174.215
    },
    {
      "epoch": 0.6003001500750376,
      "grad_norm": 3.91839861869812,
      "learning_rate": 4.3997998999499754e-05,
      "loss": 0.589,
      "num_input_tokens_seen": 6144000,
      "step": 6000,
      "train_runtime": 124.9251,
      "train_tokens_per_second": 49181.485
    },
    {
      "epoch": 0.6503251625812907,
      "grad_norm": 8.543745040893555,
      "learning_rate": 4.349774887443722e-05,
      "loss": 0.6039,
      "num_input_tokens_seen": 6656000,
      "step": 6500,
      "train_runtime": 135.6035,
      "train_tokens_per_second": 49084.295
    },
    {
      "epoch": 0.7003501750875438,
      "grad_norm": 7.670341491699219,
      "learning_rate": 4.299749874937469e-05,
      "loss": 0.6106,
      "num_input_tokens_seen": 7168000,
      "step": 7000,
      "train_runtime": 146.0175,
      "train_tokens_per_second": 49090.01
    },
    {
      "epoch": 0.7503751875937968,
      "grad_norm": 7.7239580154418945,
      "learning_rate": 4.249724862431216e-05,
      "loss": 0.6287,
      "num_input_tokens_seen": 7680000,
      "step": 7500,
      "train_runtime": 156.4134,
      "train_tokens_per_second": 49100.637
    },
    {
      "epoch": 0.80040020010005,
      "grad_norm": 11.006497383117676,
      "learning_rate": 4.199699849924963e-05,
      "loss": 0.5984,
      "num_input_tokens_seen": 8192000,
      "step": 8000,
      "train_runtime": 166.8094,
      "train_tokens_per_second": 49109.928
    },
    {
      "epoch": 0.8504252126063031,
      "grad_norm": 6.032561779022217,
      "learning_rate": 4.1496748374187094e-05,
      "loss": 0.5775,
      "num_input_tokens_seen": 8704000,
      "step": 8500,
      "train_runtime": 177.6017,
      "train_tokens_per_second": 49008.538
    },
    {
      "epoch": 0.9004502251125562,
      "grad_norm": 19.184879302978516,
      "learning_rate": 4.099649824912457e-05,
      "loss": 0.6008,
      "num_input_tokens_seen": 9216000,
      "step": 9000,
      "train_runtime": 188.21,
      "train_tokens_per_second": 48966.59
    },
    {
      "epoch": 0.9504752376188094,
      "grad_norm": 21.708553314208984,
      "learning_rate": 4.049624812406203e-05,
      "loss": 0.5781,
      "num_input_tokens_seen": 9728000,
      "step": 9500,
      "train_runtime": 198.7447,
      "train_tokens_per_second": 48947.209
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.5996330380439758,
      "eval_mse": 0.599633070872774,
      "eval_runtime": 7.71,
      "eval_samples_per_second": 2592.618,
      "eval_steps_per_second": 324.126,
      "num_input_tokens_seen": 10234624,
      "step": 9995
    },
    {
      "epoch": 1.0005002501250626,
      "grad_norm": 14.216784477233887,
      "learning_rate": 3.99959979989995e-05,
      "loss": 0.5931,
      "num_input_tokens_seen": 10239744,
      "step": 10000,
      "train_runtime": 218.5356,
      "train_tokens_per_second": 46856.177
    },
    {
      "epoch": 1.0505252626313157,
      "grad_norm": 12.273909568786621,
      "learning_rate": 3.9495747873936974e-05,
      "loss": 0.5232,
      "num_input_tokens_seen": 10751744,
      "step": 10500,
      "train_runtime": 228.9682,
      "train_tokens_per_second": 46957.358
    },
    {
      "epoch": 1.1005502751375689,
      "grad_norm": 5.408884525299072,
      "learning_rate": 3.899549774887444e-05,
      "loss": 0.5491,
      "num_input_tokens_seen": 11263744,
      "step": 11000,
      "train_runtime": 239.4335,
      "train_tokens_per_second": 47043.302
    },
    {
      "epoch": 1.150575287643822,
      "grad_norm": 13.357819557189941,
      "learning_rate": 3.849524762381191e-05,
      "loss": 0.5141,
      "num_input_tokens_seen": 11775744,
      "step": 11500,
      "train_runtime": 249.8249,
      "train_tokens_per_second": 47135.996
    },
    {
      "epoch": 1.2006003001500751,
      "grad_norm": 6.6150641441345215,
      "learning_rate": 3.7994997498749374e-05,
      "loss": 0.5161,
      "num_input_tokens_seen": 12287744,
      "step": 12000,
      "train_runtime": 260.1103,
      "train_tokens_per_second": 47240.518
    },
    {
      "epoch": 1.2506253126563283,
      "grad_norm": 6.556243896484375,
      "learning_rate": 3.749474737368685e-05,
      "loss": 0.5046,
      "num_input_tokens_seen": 12799744,
      "step": 12500,
      "train_runtime": 270.3312,
      "train_tokens_per_second": 47348.381
    },
    {
      "epoch": 1.3006503251625814,
      "grad_norm": 15.295032501220703,
      "learning_rate": 3.6994497248624314e-05,
      "loss": 0.5032,
      "num_input_tokens_seen": 13311744,
      "step": 13000,
      "train_runtime": 280.9075,
      "train_tokens_per_second": 47388.352
    },
    {
      "epoch": 1.3506753376688345,
      "grad_norm": 5.232257843017578,
      "learning_rate": 3.649424712356178e-05,
      "loss": 0.5209,
      "num_input_tokens_seen": 13823744,
      "step": 13500,
      "train_runtime": 291.1832,
      "train_tokens_per_second": 47474.391
    },
    {
      "epoch": 1.4007003501750876,
      "grad_norm": 4.907810688018799,
      "learning_rate": 3.5993996998499254e-05,
      "loss": 0.5166,
      "num_input_tokens_seen": 14335744,
      "step": 14000,
      "train_runtime": 301.5311,
      "train_tokens_per_second": 47543.176
    },
    {
      "epoch": 1.4507253626813408,
      "grad_norm": 2.214416742324829,
      "learning_rate": 3.549374687343672e-05,
      "loss": 0.508,
      "num_input_tokens_seen": 14847744,
      "step": 14500,
      "train_runtime": 311.993,
      "train_tokens_per_second": 47589.991
    },
    {
      "epoch": 1.500750375187594,
      "grad_norm": 7.953266620635986,
      "learning_rate": 3.499349674837419e-05,
      "loss": 0.5163,
      "num_input_tokens_seen": 15359744,
      "step": 15000,
      "train_runtime": 322.4354,
      "train_tokens_per_second": 47636.659
    },
    {
      "epoch": 1.550775387693847,
      "grad_norm": 27.234952926635742,
      "learning_rate": 3.449324662331166e-05,
      "loss": 0.5105,
      "num_input_tokens_seen": 15871744,
      "step": 15500,
      "train_runtime": 332.8782,
      "train_tokens_per_second": 47680.34
    },
    {
      "epoch": 1.6008004002001002,
      "grad_norm": 8.335816383361816,
      "learning_rate": 3.399299649824913e-05,
      "loss": 0.5177,
      "num_input_tokens_seen": 16383744,
      "step": 16000,
      "train_runtime": 343.188,
      "train_tokens_per_second": 47739.856
    },
    {
      "epoch": 1.6508254127063533,
      "grad_norm": 12.731109619140625,
      "learning_rate": 3.3492746373186594e-05,
      "loss": 0.5282,
      "num_input_tokens_seen": 16895744,
      "step": 16500,
      "train_runtime": 353.7522,
      "train_tokens_per_second": 47761.528
    },
    {
      "epoch": 1.7008504252126064,
      "grad_norm": 2.0694406032562256,
      "learning_rate": 3.299249624812407e-05,
      "loss": 0.5217,
      "num_input_tokens_seen": 17407744,
      "step": 17000,
      "train_runtime": 364.3803,
      "train_tokens_per_second": 47773.564
    },
    {
      "epoch": 1.7508754377188596,
      "grad_norm": 6.190609455108643,
      "learning_rate": 3.2492246123061534e-05,
      "loss": 0.5185,
      "num_input_tokens_seen": 17919744,
      "step": 17500,
      "train_runtime": 374.8689,
      "train_tokens_per_second": 47802.702
    },
    {
      "epoch": 1.8009004502251127,
      "grad_norm": 6.968355655670166,
      "learning_rate": 3.1991995997999e-05,
      "loss": 0.5025,
      "num_input_tokens_seen": 18431744,
      "step": 18000,
      "train_runtime": 385.1924,
      "train_tokens_per_second": 47850.75
    },
    {
      "epoch": 1.8509254627313658,
      "grad_norm": 2.6967575550079346,
      "learning_rate": 3.149174587293647e-05,
      "loss": 0.5118,
      "num_input_tokens_seen": 18943744,
      "step": 18500,
      "train_runtime": 395.8463,
      "train_tokens_per_second": 47856.316
    },
    {
      "epoch": 1.900950475237619,
      "grad_norm": 13.992359161376953,
      "learning_rate": 3.099149574787394e-05,
      "loss": 0.5209,
      "num_input_tokens_seen": 19455744,
      "step": 19000,
      "train_runtime": 406.5011,
      "train_tokens_per_second": 47861.481
    },
    {
      "epoch": 1.950975487743872,
      "grad_norm": 4.206520080566406,
      "learning_rate": 3.049124562281141e-05,
      "loss": 0.5063,
      "num_input_tokens_seen": 19967744,
      "step": 19500,
      "train_runtime": 416.9681,
      "train_tokens_per_second": 47887.945
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.566254198551178,
      "eval_mse": 0.5662542309499291,
      "eval_runtime": 7.8118,
      "eval_samples_per_second": 2558.806,
      "eval_steps_per_second": 319.899,
      "num_input_tokens_seen": 20469248,
      "step": 19990
    },
    {
      "epoch": 2.001000500250125,
      "grad_norm": 5.060875415802002,
      "learning_rate": 2.9990995497748873e-05,
      "loss": 0.5111,
      "num_input_tokens_seen": 20479488,
      "step": 20000,
      "train_runtime": 437.0238,
      "train_tokens_per_second": 46861.264
    },
    {
      "epoch": 2.0510255127563783,
      "grad_norm": 4.569818019866943,
      "learning_rate": 2.9490745372686347e-05,
      "loss": 0.4432,
      "num_input_tokens_seen": 20991488,
      "step": 20500,
      "train_runtime": 447.3167,
      "train_tokens_per_second": 46927.57
    },
    {
      "epoch": 2.1010505252626315,
      "grad_norm": 6.6488142013549805,
      "learning_rate": 2.899049524762381e-05,
      "loss": 0.418,
      "num_input_tokens_seen": 21503488,
      "step": 21000,
      "train_runtime": 457.6456,
      "train_tokens_per_second": 46987.204
    },
    {
      "epoch": 2.1510755377688846,
      "grad_norm": 6.992687702178955,
      "learning_rate": 2.8490245122561283e-05,
      "loss": 0.4222,
      "num_input_tokens_seen": 22015488,
      "step": 21500,
      "train_runtime": 467.9444,
      "train_tokens_per_second": 47047.231
    },
    {
      "epoch": 2.2011005502751377,
      "grad_norm": 9.515182495117188,
      "learning_rate": 2.7989994997498753e-05,
      "loss": 0.4325,
      "num_input_tokens_seen": 22527488,
      "step": 22000,
      "train_runtime": 478.2535,
      "train_tokens_per_second": 47103.653
    },
    {
      "epoch": 2.251125562781391,
      "grad_norm": 9.444127082824707,
      "learning_rate": 2.748974487243622e-05,
      "loss": 0.42,
      "num_input_tokens_seen": 23039488,
      "step": 22500,
      "train_runtime": 488.5556,
      "train_tokens_per_second": 47158.378
    },
    {
      "epoch": 2.301150575287644,
      "grad_norm": 9.03792953491211,
      "learning_rate": 2.698949474737369e-05,
      "loss": 0.4123,
      "num_input_tokens_seen": 23551488,
      "step": 23000,
      "train_runtime": 498.876,
      "train_tokens_per_second": 47209.103
    },
    {
      "epoch": 2.351175587793897,
      "grad_norm": 8.053046226501465,
      "learning_rate": 2.6489244622311153e-05,
      "loss": 0.4179,
      "num_input_tokens_seen": 24063488,
      "step": 23500,
      "train_runtime": 509.2037,
      "train_tokens_per_second": 47257.098
    },
    {
      "epoch": 2.4012006003001503,
      "grad_norm": 13.128437995910645,
      "learning_rate": 2.5988994497248627e-05,
      "loss": 0.4239,
      "num_input_tokens_seen": 24575488,
      "step": 24000,
      "train_runtime": 519.5215,
      "train_tokens_per_second": 47304.079
    },
    {
      "epoch": 2.4512256128064034,
      "grad_norm": 10.007084846496582,
      "learning_rate": 2.5488744372186097e-05,
      "loss": 0.4316,
      "num_input_tokens_seen": 25087488,
      "step": 24500,
      "train_runtime": 529.8242,
      "train_tokens_per_second": 47350.591
    },
    {
      "epoch": 2.5012506253126565,
      "grad_norm": 11.051094055175781,
      "learning_rate": 2.4988494247123563e-05,
      "loss": 0.4233,
      "num_input_tokens_seen": 25599488,
      "step": 25000,
      "train_runtime": 540.1399,
      "train_tokens_per_second": 47394.182
    },
    {
      "epoch": 2.551275637818909,
      "grad_norm": 6.066195487976074,
      "learning_rate": 2.4488244122061033e-05,
      "loss": 0.4251,
      "num_input_tokens_seen": 26111488,
      "step": 25500,
      "train_runtime": 550.4474,
      "train_tokens_per_second": 47436.848
    },
    {
      "epoch": 2.6013006503251628,
      "grad_norm": 2.6425974369049072,
      "learning_rate": 2.39879939969985e-05,
      "loss": 0.4479,
      "num_input_tokens_seen": 26623488,
      "step": 26000,
      "train_runtime": 560.7806,
      "train_tokens_per_second": 47475.766
    },
    {
      "epoch": 2.6513256628314155,
      "grad_norm": 11.04046630859375,
      "learning_rate": 2.348774387193597e-05,
      "loss": 0.4457,
      "num_input_tokens_seen": 27135488,
      "step": 26500,
      "train_runtime": 571.1223,
      "train_tokens_per_second": 47512.567
    },
    {
      "epoch": 2.701350675337669,
      "grad_norm": 14.067301750183105,
      "learning_rate": 2.2987493746873436e-05,
      "loss": 0.4323,
      "num_input_tokens_seen": 27647488,
      "step": 27000,
      "train_runtime": 581.4573,
      "train_tokens_per_second": 47548.615
    },
    {
      "epoch": 2.7513756878439217,
      "grad_norm": 6.148231029510498,
      "learning_rate": 2.2487243621810906e-05,
      "loss": 0.4314,
      "num_input_tokens_seen": 28159488,
      "step": 27500,
      "train_runtime": 591.7788,
      "train_tokens_per_second": 47584.484
    },
    {
      "epoch": 2.8014007003501753,
      "grad_norm": 4.049166202545166,
      "learning_rate": 2.1986993496748376e-05,
      "loss": 0.4245,
      "num_input_tokens_seen": 28671488,
      "step": 28000,
      "train_runtime": 602.1254,
      "train_tokens_per_second": 47617.141
    },
    {
      "epoch": 2.851425712856428,
      "grad_norm": 10.769721031188965,
      "learning_rate": 2.1486743371685843e-05,
      "loss": 0.432,
      "num_input_tokens_seen": 29183488,
      "step": 28500,
      "train_runtime": 612.4701,
      "train_tokens_per_second": 47648.836
    },
    {
      "epoch": 2.9014507253626816,
      "grad_norm": 7.408927917480469,
      "learning_rate": 2.0986493246623313e-05,
      "loss": 0.4296,
      "num_input_tokens_seen": 29695488,
      "step": 29000,
      "train_runtime": 622.8209,
      "train_tokens_per_second": 47679.02
    },
    {
      "epoch": 2.9514757378689342,
      "grad_norm": 15.281734466552734,
      "learning_rate": 2.048624312156078e-05,
      "loss": 0.4124,
      "num_input_tokens_seen": 30207488,
      "step": 29500,
      "train_runtime": 633.1949,
      "train_tokens_per_second": 47706.46
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.6329430937767029,
      "eval_mse": 0.6329430625566823,
      "eval_runtime": 7.4256,
      "eval_samples_per_second": 2691.912,
      "eval_steps_per_second": 336.539,
      "num_input_tokens_seen": 30703872,
      "step": 29985
    },
    {
      "epoch": 3.0015007503751874,
      "grad_norm": 5.859447479248047,
      "learning_rate": 1.998599299649825e-05,
      "loss": 0.3997,
      "num_input_tokens_seen": 30719232,
      "step": 30000,
      "train_runtime": 652.5502,
      "train_tokens_per_second": 47075.658
    },
    {
      "epoch": 3.0515257628814405,
      "grad_norm": 34.13383102416992,
      "learning_rate": 1.948574287143572e-05,
      "loss": 0.3636,
      "num_input_tokens_seen": 31231232,
      "step": 30500,
      "train_runtime": 662.8658,
      "train_tokens_per_second": 47115.47
    },
    {
      "epoch": 3.1015507753876936,
      "grad_norm": 7.527888298034668,
      "learning_rate": 1.8985492746373186e-05,
      "loss": 0.3514,
      "num_input_tokens_seen": 31743232,
      "step": 31000,
      "train_runtime": 673.2071,
      "train_tokens_per_second": 47152.251
    },
    {
      "epoch": 3.1515757878939468,
      "grad_norm": 13.83651351928711,
      "learning_rate": 1.8485242621310656e-05,
      "loss": 0.3339,
      "num_input_tokens_seen": 32255232,
      "step": 31500,
      "train_runtime": 683.5282,
      "train_tokens_per_second": 47189.32
    },
    {
      "epoch": 3.2016008004002,
      "grad_norm": 16.254074096679688,
      "learning_rate": 1.7984992496248123e-05,
      "loss": 0.3544,
      "num_input_tokens_seen": 32767232,
      "step": 32000,
      "train_runtime": 693.8526,
      "train_tokens_per_second": 47225.062
    },
    {
      "epoch": 3.251625812906453,
      "grad_norm": 11.702266693115234,
      "learning_rate": 1.7484742371185596e-05,
      "loss": 0.3475,
      "num_input_tokens_seen": 33279232,
      "step": 32500,
      "train_runtime": 704.191,
      "train_tokens_per_second": 47258.812
    },
    {
      "epoch": 3.3016508254127066,
      "grad_norm": 6.544358253479004,
      "learning_rate": 1.6984492246123063e-05,
      "loss": 0.3251,
      "num_input_tokens_seen": 33791232,
      "step": 33000,
      "train_runtime": 714.5267,
      "train_tokens_per_second": 47291.768
    },
    {
      "epoch": 3.3516758379189593,
      "grad_norm": 6.692401885986328,
      "learning_rate": 1.648424212106053e-05,
      "loss": 0.3596,
      "num_input_tokens_seen": 34303232,
      "step": 33500,
      "train_runtime": 724.8425,
      "train_tokens_per_second": 47325.083
    },
    {
      "epoch": 3.4017008504252124,
      "grad_norm": 1.8304575681686401,
      "learning_rate": 1.5983991995998e-05,
      "loss": 0.3368,
      "num_input_tokens_seen": 34815232,
      "step": 34000,
      "train_runtime": 735.1924,
      "train_tokens_per_second": 47355.269
    },
    {
      "epoch": 3.4517258629314655,
      "grad_norm": 16.374961853027344,
      "learning_rate": 1.548374187093547e-05,
      "loss": 0.355,
      "num_input_tokens_seen": 35327232,
      "step": 34500,
      "train_runtime": 745.5365,
      "train_tokens_per_second": 47384.977
    },
    {
      "epoch": 3.501750875437719,
      "grad_norm": 10.639739036560059,
      "learning_rate": 1.4983491745872938e-05,
      "loss": 0.3686,
      "num_input_tokens_seen": 35839232,
      "step": 35000,
      "train_runtime": 755.8868,
      "train_tokens_per_second": 47413.491
    },
    {
      "epoch": 3.551775887943972,
      "grad_norm": 7.099308967590332,
      "learning_rate": 1.4483241620810406e-05,
      "loss": 0.3526,
      "num_input_tokens_seen": 36351232,
      "step": 35500,
      "train_runtime": 766.2123,
      "train_tokens_per_second": 47442.767
    },
    {
      "epoch": 3.6018009004502254,
      "grad_norm": 5.288228988647461,
      "learning_rate": 1.3982991495747874e-05,
      "loss": 0.3478,
      "num_input_tokens_seen": 36863232,
      "step": 36000,
      "train_runtime": 776.5473,
      "train_tokens_per_second": 47470.685
    },
    {
      "epoch": 3.651825912956478,
      "grad_norm": 5.8353471755981445,
      "learning_rate": 1.3482741370685342e-05,
      "loss": 0.3551,
      "num_input_tokens_seen": 37375232,
      "step": 36500,
      "train_runtime": 786.8701,
      "train_tokens_per_second": 47498.606
    },
    {
      "epoch": 3.701850925462731,
      "grad_norm": 5.141830921173096,
      "learning_rate": 1.2982491245622812e-05,
      "loss": 0.3564,
      "num_input_tokens_seen": 37887232,
      "step": 37000,
      "train_runtime": 797.4137,
      "train_tokens_per_second": 47512.64
    },
    {
      "epoch": 3.7518759379689843,
      "grad_norm": 9.891301155090332,
      "learning_rate": 1.248224112056028e-05,
      "loss": 0.3287,
      "num_input_tokens_seen": 38399232,
      "step": 37500,
      "train_runtime": 807.7554,
      "train_tokens_per_second": 47538.195
    },
    {
      "epoch": 3.8019009504752375,
      "grad_norm": 12.06251049041748,
      "learning_rate": 1.1981990995497749e-05,
      "loss": 0.3685,
      "num_input_tokens_seen": 38911232,
      "step": 38000,
      "train_runtime": 818.0758,
      "train_tokens_per_second": 47564.337
    },
    {
      "epoch": 3.8519259629814906,
      "grad_norm": 3.9367198944091797,
      "learning_rate": 1.1481740870435217e-05,
      "loss": 0.3376,
      "num_input_tokens_seen": 39423232,
      "step": 38500,
      "train_runtime": 828.4313,
      "train_tokens_per_second": 47587.81
    },
    {
      "epoch": 3.9019509754877437,
      "grad_norm": 12.005465507507324,
      "learning_rate": 1.0981490745372687e-05,
      "loss": 0.3435,
      "num_input_tokens_seen": 39935232,
      "step": 39000,
      "train_runtime": 838.7871,
      "train_tokens_per_second": 47610.69
    },
    {
      "epoch": 3.951975987993997,
      "grad_norm": 8.57763671875,
      "learning_rate": 1.0481240620310156e-05,
      "loss": 0.3463,
      "num_input_tokens_seen": 40447232,
      "step": 39500,
      "train_runtime": 849.1478,
      "train_tokens_per_second": 47632.736
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.6491973996162415,
      "eval_mse": 0.6491973448647838,
      "eval_runtime": 7.4218,
      "eval_samples_per_second": 2693.27,
      "eval_steps_per_second": 336.709,
      "num_input_tokens_seen": 40938496,
      "step": 39980
    },
    {
      "epoch": 4.00200100050025,
      "grad_norm": 8.412797927856445,
      "learning_rate": 9.980990495247624e-06,
      "loss": 0.3346,
      "num_input_tokens_seen": 40958976,
      "step": 40000,
      "train_runtime": 868.7008,
      "train_tokens_per_second": 47149.696
    },
    {
      "epoch": 4.052026013006503,
      "grad_norm": 17.825761795043945,
      "learning_rate": 9.480740370185092e-06,
      "loss": 0.3034,
      "num_input_tokens_seen": 41470976,
      "step": 40500,
      "train_runtime": 878.9875,
      "train_tokens_per_second": 47180.395
    },
    {
      "epoch": 4.102051025512757,
      "grad_norm": 10.180594444274902,
      "learning_rate": 8.980490245122562e-06,
      "loss": 0.2971,
      "num_input_tokens_seen": 41982976,
      "step": 41000,
      "train_runtime": 889.3152,
      "train_tokens_per_second": 47208.207
    },
    {
      "epoch": 4.152076038019009,
      "grad_norm": 6.496691703796387,
      "learning_rate": 8.48024012006003e-06,
      "loss": 0.29,
      "num_input_tokens_seen": 42494976,
      "step": 41500,
      "train_runtime": 899.6157,
      "train_tokens_per_second": 47236.812
    },
    {
      "epoch": 4.202101050525263,
      "grad_norm": 7.926106929779053,
      "learning_rate": 7.979989994997499e-06,
      "loss": 0.2877,
      "num_input_tokens_seen": 43006976,
      "step": 42000,
      "train_runtime": 909.9204,
      "train_tokens_per_second": 47264.546
    },
    {
      "epoch": 4.252126063031516,
      "grad_norm": 9.483957290649414,
      "learning_rate": 7.479739869934968e-06,
      "loss": 0.3057,
      "num_input_tokens_seen": 43518976,
      "step": 42500,
      "train_runtime": 920.2145,
      "train_tokens_per_second": 47292.21
    },
    {
      "epoch": 4.302151075537769,
      "grad_norm": 13.678934097290039,
      "learning_rate": 6.979489744872436e-06,
      "loss": 0.2989,
      "num_input_tokens_seen": 44030976,
      "step": 43000,
      "train_runtime": 930.5087,
      "train_tokens_per_second": 47319.254
    },
    {
      "epoch": 4.352176088044022,
      "grad_norm": 4.871670246124268,
      "learning_rate": 6.479239619809905e-06,
      "loss": 0.2948,
      "num_input_tokens_seen": 44542976,
      "step": 43500,
      "train_runtime": 940.808,
      "train_tokens_per_second": 47345.449
    },
    {
      "epoch": 4.4022011005502755,
      "grad_norm": 11.43335247039795,
      "learning_rate": 5.978989494747374e-06,
      "loss": 0.301,
      "num_input_tokens_seen": 45054976,
      "step": 44000,
      "train_runtime": 951.1249,
      "train_tokens_per_second": 47370.197
    },
    {
      "epoch": 4.452226113056528,
      "grad_norm": 4.706591606140137,
      "learning_rate": 5.478739369684843e-06,
      "loss": 0.2929,
      "num_input_tokens_seen": 45566976,
      "step": 44500,
      "train_runtime": 961.4381,
      "train_tokens_per_second": 47394.601
    },
    {
      "epoch": 4.502251125562782,
      "grad_norm": 7.695474624633789,
      "learning_rate": 4.978489244622311e-06,
      "loss": 0.3027,
      "num_input_tokens_seen": 46078976,
      "step": 45000,
      "train_runtime": 971.7628,
      "train_tokens_per_second": 47417.925
    },
    {
      "epoch": 4.552276138069034,
      "grad_norm": 3.7478439807891846,
      "learning_rate": 4.47823911955978e-06,
      "loss": 0.317,
      "num_input_tokens_seen": 46590976,
      "step": 45500,
      "train_runtime": 982.0466,
      "train_tokens_per_second": 47442.735
    },
    {
      "epoch": 4.602301150575288,
      "grad_norm": 5.191857814788818,
      "learning_rate": 3.977988994497249e-06,
      "loss": 0.2869,
      "num_input_tokens_seen": 47102976,
      "step": 46000,
      "train_runtime": 992.3427,
      "train_tokens_per_second": 47466.441
    },
    {
      "epoch": 4.652326163081541,
      "grad_norm": 9.192943572998047,
      "learning_rate": 3.4777388694347177e-06,
      "loss": 0.2741,
      "num_input_tokens_seen": 47614976,
      "step": 46500,
      "train_runtime": 1002.6605,
      "train_tokens_per_second": 47488.632
    },
    {
      "epoch": 4.702351175587794,
      "grad_norm": 11.236271858215332,
      "learning_rate": 2.9774887443721864e-06,
      "loss": 0.2967,
      "num_input_tokens_seen": 48126976,
      "step": 47000,
      "train_runtime": 1012.9854,
      "train_tokens_per_second": 47510.041
    },
    {
      "epoch": 4.752376188094047,
      "grad_norm": 4.665492534637451,
      "learning_rate": 2.477238619309655e-06,
      "loss": 0.29,
      "num_input_tokens_seen": 48638976,
      "step": 47500,
      "train_runtime": 1023.2749,
      "train_tokens_per_second": 47532.66
    },
    {
      "epoch": 4.8024012006003005,
      "grad_norm": 14.87586784362793,
      "learning_rate": 1.9769884942471234e-06,
      "loss": 0.2883,
      "num_input_tokens_seen": 49150976,
      "step": 48000,
      "train_runtime": 1033.6207,
      "train_tokens_per_second": 47552.238
    },
    {
      "epoch": 4.852426213106553,
      "grad_norm": 12.128450393676758,
      "learning_rate": 1.4767383691845923e-06,
      "loss": 0.2925,
      "num_input_tokens_seen": 49662976,
      "step": 48500,
      "train_runtime": 1043.9662,
      "train_tokens_per_second": 47571.44
    },
    {
      "epoch": 4.902451225612807,
      "grad_norm": 7.318789958953857,
      "learning_rate": 9.76488244122061e-07,
      "loss": 0.2972,
      "num_input_tokens_seen": 50174976,
      "step": 49000,
      "train_runtime": 1054.2994,
      "train_tokens_per_second": 47590.821
    },
    {
      "epoch": 4.9524762381190595,
      "grad_norm": 6.287776947021484,
      "learning_rate": 4.762381190595298e-07,
      "loss": 0.3124,
      "num_input_tokens_seen": 50686976,
      "step": 49500,
      "train_runtime": 1064.6538,
      "train_tokens_per_second": 47608.88
    },
    {
      "epoch": 5.0,
      "eval_loss": 0.6694201231002808,
      "eval_mse": 0.66942013065375,
      "eval_runtime": 7.3591,
      "eval_samples_per_second": 2716.211,
      "eval_steps_per_second": 339.577,
      "num_input_tokens_seen": 51173120,
      "step": 49975
    },
    {
      "epoch": 5.0,
      "num_input_tokens_seen": 51173120,
      "step": 49975,
      "total_flos": 3314721551485440.0,
      "train_loss": 0.4421676393006073,
      "train_runtime": 1083.6381,
      "train_samples_per_second": 368.933,
      "train_steps_per_second": 46.118
    }
  ],
  "logging_steps": 500,
  "max_steps": 49975,
  "num_input_tokens_seen": 51173120,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3314721551485440.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}