{ "best_global_step": 19990, "best_metric": 0.566254198551178, "best_model_checkpoint": "/media/user/Expansion1/granite-embedding-107m-multilingual-chat-difficulty/checkpoint-19990", "epoch": 5.0, "eval_steps": 500, "global_step": 49975, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05002501250625312, "grad_norm": 14.432242393493652, "learning_rate": 4.9500750375187595e-05, "loss": 0.7612, "num_input_tokens_seen": 512000, "step": 500, "train_runtime": 10.716, "train_tokens_per_second": 47778.963 }, { "epoch": 0.10005002501250625, "grad_norm": 4.861369609832764, "learning_rate": 4.900050025012506e-05, "loss": 0.682, "num_input_tokens_seen": 1024000, "step": 1000, "train_runtime": 21.117, "train_tokens_per_second": 48491.697 }, { "epoch": 0.1500750375187594, "grad_norm": 8.015384674072266, "learning_rate": 4.8500250125062535e-05, "loss": 0.6637, "num_input_tokens_seen": 1536000, "step": 1500, "train_runtime": 31.546, "train_tokens_per_second": 48690.793 }, { "epoch": 0.2001000500250125, "grad_norm": 8.866910934448242, "learning_rate": 4.8e-05, "loss": 0.6346, "num_input_tokens_seen": 2048000, "step": 2000, "train_runtime": 41.8744, "train_tokens_per_second": 48908.201 }, { "epoch": 0.25012506253126565, "grad_norm": 8.685997009277344, "learning_rate": 4.7499749874937475e-05, "loss": 0.6475, "num_input_tokens_seen": 2560000, "step": 2500, "train_runtime": 52.2059, "train_tokens_per_second": 49036.559 }, { "epoch": 0.3001500750375188, "grad_norm": 11.758138656616211, "learning_rate": 4.699949974987494e-05, "loss": 0.6349, "num_input_tokens_seen": 3072000, "step": 3000, "train_runtime": 62.5742, "train_tokens_per_second": 49093.701 }, { "epoch": 0.3501750875437719, "grad_norm": 8.118245124816895, "learning_rate": 4.649924962481241e-05, "loss": 0.6317, "num_input_tokens_seen": 3584000, "step": 3500, "train_runtime": 72.9422, "train_tokens_per_second": 49134.817 }, { "epoch": 0.400200100050025, "grad_norm": 6.139720916748047, "learning_rate": 4.599899949974988e-05, "loss": 0.6167, "num_input_tokens_seen": 4096000, "step": 4000, "train_runtime": 83.3179, "train_tokens_per_second": 49161.134 }, { "epoch": 0.4502251125562781, "grad_norm": 8.408066749572754, "learning_rate": 4.549874937468734e-05, "loss": 0.5989, "num_input_tokens_seen": 4608000, "step": 4500, "train_runtime": 93.7746, "train_tokens_per_second": 49139.121 }, { "epoch": 0.5002501250625313, "grad_norm": 10.197962760925293, "learning_rate": 4.4998499249624814e-05, "loss": 0.6064, "num_input_tokens_seen": 5120000, "step": 5000, "train_runtime": 104.1644, "train_tokens_per_second": 49153.06 }, { "epoch": 0.5502751375687844, "grad_norm": 4.074629306793213, "learning_rate": 4.449824912456229e-05, "loss": 0.6092, "num_input_tokens_seen": 5632000, "step": 5500, "train_runtime": 114.5316, "train_tokens_per_second": 49174.215 }, { "epoch": 0.6003001500750376, "grad_norm": 3.91839861869812, "learning_rate": 4.3997998999499754e-05, "loss": 0.589, "num_input_tokens_seen": 6144000, "step": 6000, "train_runtime": 124.9251, "train_tokens_per_second": 49181.485 }, { "epoch": 0.6503251625812907, "grad_norm": 8.543745040893555, "learning_rate": 4.349774887443722e-05, "loss": 0.6039, "num_input_tokens_seen": 6656000, "step": 6500, "train_runtime": 135.6035, "train_tokens_per_second": 49084.295 }, { "epoch": 0.7003501750875438, "grad_norm": 7.670341491699219, "learning_rate": 4.299749874937469e-05, "loss": 0.6106, "num_input_tokens_seen": 7168000, "step": 7000, "train_runtime": 146.0175, "train_tokens_per_second": 49090.01 }, { "epoch": 0.7503751875937968, "grad_norm": 7.7239580154418945, "learning_rate": 4.249724862431216e-05, "loss": 0.6287, "num_input_tokens_seen": 7680000, "step": 7500, "train_runtime": 156.4134, "train_tokens_per_second": 49100.637 }, { "epoch": 0.80040020010005, "grad_norm": 11.006497383117676, "learning_rate": 4.199699849924963e-05, "loss": 0.5984, "num_input_tokens_seen": 8192000, "step": 8000, "train_runtime": 166.8094, "train_tokens_per_second": 49109.928 }, { "epoch": 0.8504252126063031, "grad_norm": 6.032561779022217, "learning_rate": 4.1496748374187094e-05, "loss": 0.5775, "num_input_tokens_seen": 8704000, "step": 8500, "train_runtime": 177.6017, "train_tokens_per_second": 49008.538 }, { "epoch": 0.9004502251125562, "grad_norm": 19.184879302978516, "learning_rate": 4.099649824912457e-05, "loss": 0.6008, "num_input_tokens_seen": 9216000, "step": 9000, "train_runtime": 188.21, "train_tokens_per_second": 48966.59 }, { "epoch": 0.9504752376188094, "grad_norm": 21.708553314208984, "learning_rate": 4.049624812406203e-05, "loss": 0.5781, "num_input_tokens_seen": 9728000, "step": 9500, "train_runtime": 198.7447, "train_tokens_per_second": 48947.209 }, { "epoch": 1.0, "eval_loss": 0.5996330380439758, "eval_mse": 0.599633070872774, "eval_runtime": 7.71, "eval_samples_per_second": 2592.618, "eval_steps_per_second": 324.126, "num_input_tokens_seen": 10234624, "step": 9995 }, { "epoch": 1.0005002501250626, "grad_norm": 14.216784477233887, "learning_rate": 3.99959979989995e-05, "loss": 0.5931, "num_input_tokens_seen": 10239744, "step": 10000, "train_runtime": 218.5356, "train_tokens_per_second": 46856.177 }, { "epoch": 1.0505252626313157, "grad_norm": 12.273909568786621, "learning_rate": 3.9495747873936974e-05, "loss": 0.5232, "num_input_tokens_seen": 10751744, "step": 10500, "train_runtime": 228.9682, "train_tokens_per_second": 46957.358 }, { "epoch": 1.1005502751375689, "grad_norm": 5.408884525299072, "learning_rate": 3.899549774887444e-05, "loss": 0.5491, "num_input_tokens_seen": 11263744, "step": 11000, "train_runtime": 239.4335, "train_tokens_per_second": 47043.302 }, { "epoch": 1.150575287643822, "grad_norm": 13.357819557189941, "learning_rate": 3.849524762381191e-05, "loss": 0.5141, "num_input_tokens_seen": 11775744, "step": 11500, "train_runtime": 249.8249, "train_tokens_per_second": 47135.996 }, { "epoch": 1.2006003001500751, "grad_norm": 6.6150641441345215, "learning_rate": 3.7994997498749374e-05, "loss": 0.5161, "num_input_tokens_seen": 12287744, "step": 12000, "train_runtime": 260.1103, "train_tokens_per_second": 47240.518 }, { "epoch": 1.2506253126563283, "grad_norm": 6.556243896484375, "learning_rate": 3.749474737368685e-05, "loss": 0.5046, "num_input_tokens_seen": 12799744, "step": 12500, "train_runtime": 270.3312, "train_tokens_per_second": 47348.381 }, { "epoch": 1.3006503251625814, "grad_norm": 15.295032501220703, "learning_rate": 3.6994497248624314e-05, "loss": 0.5032, "num_input_tokens_seen": 13311744, "step": 13000, "train_runtime": 280.9075, "train_tokens_per_second": 47388.352 }, { "epoch": 1.3506753376688345, "grad_norm": 5.232257843017578, "learning_rate": 3.649424712356178e-05, "loss": 0.5209, "num_input_tokens_seen": 13823744, "step": 13500, "train_runtime": 291.1832, "train_tokens_per_second": 47474.391 }, { "epoch": 1.4007003501750876, "grad_norm": 4.907810688018799, "learning_rate": 3.5993996998499254e-05, "loss": 0.5166, "num_input_tokens_seen": 14335744, "step": 14000, "train_runtime": 301.5311, "train_tokens_per_second": 47543.176 }, { "epoch": 1.4507253626813408, "grad_norm": 2.214416742324829, "learning_rate": 3.549374687343672e-05, "loss": 0.508, "num_input_tokens_seen": 14847744, "step": 14500, "train_runtime": 311.993, "train_tokens_per_second": 47589.991 }, { "epoch": 1.500750375187594, "grad_norm": 7.953266620635986, "learning_rate": 3.499349674837419e-05, "loss": 0.5163, "num_input_tokens_seen": 15359744, "step": 15000, "train_runtime": 322.4354, "train_tokens_per_second": 47636.659 }, { "epoch": 1.550775387693847, "grad_norm": 27.234952926635742, "learning_rate": 3.449324662331166e-05, "loss": 0.5105, "num_input_tokens_seen": 15871744, "step": 15500, "train_runtime": 332.8782, "train_tokens_per_second": 47680.34 }, { "epoch": 1.6008004002001002, "grad_norm": 8.335816383361816, "learning_rate": 3.399299649824913e-05, "loss": 0.5177, "num_input_tokens_seen": 16383744, "step": 16000, "train_runtime": 343.188, "train_tokens_per_second": 47739.856 }, { "epoch": 1.6508254127063533, "grad_norm": 12.731109619140625, "learning_rate": 3.3492746373186594e-05, "loss": 0.5282, "num_input_tokens_seen": 16895744, "step": 16500, "train_runtime": 353.7522, "train_tokens_per_second": 47761.528 }, { "epoch": 1.7008504252126064, "grad_norm": 2.0694406032562256, "learning_rate": 3.299249624812407e-05, "loss": 0.5217, "num_input_tokens_seen": 17407744, "step": 17000, "train_runtime": 364.3803, "train_tokens_per_second": 47773.564 }, { "epoch": 1.7508754377188596, "grad_norm": 6.190609455108643, "learning_rate": 3.2492246123061534e-05, "loss": 0.5185, "num_input_tokens_seen": 17919744, "step": 17500, "train_runtime": 374.8689, "train_tokens_per_second": 47802.702 }, { "epoch": 1.8009004502251127, "grad_norm": 6.968355655670166, "learning_rate": 3.1991995997999e-05, "loss": 0.5025, "num_input_tokens_seen": 18431744, "step": 18000, "train_runtime": 385.1924, "train_tokens_per_second": 47850.75 }, { "epoch": 1.8509254627313658, "grad_norm": 2.6967575550079346, "learning_rate": 3.149174587293647e-05, "loss": 0.5118, "num_input_tokens_seen": 18943744, "step": 18500, "train_runtime": 395.8463, "train_tokens_per_second": 47856.316 }, { "epoch": 1.900950475237619, "grad_norm": 13.992359161376953, "learning_rate": 3.099149574787394e-05, "loss": 0.5209, "num_input_tokens_seen": 19455744, "step": 19000, "train_runtime": 406.5011, "train_tokens_per_second": 47861.481 }, { "epoch": 1.950975487743872, "grad_norm": 4.206520080566406, "learning_rate": 3.049124562281141e-05, "loss": 0.5063, "num_input_tokens_seen": 19967744, "step": 19500, "train_runtime": 416.9681, "train_tokens_per_second": 47887.945 }, { "epoch": 2.0, "eval_loss": 0.566254198551178, "eval_mse": 0.5662542309499291, "eval_runtime": 7.8118, "eval_samples_per_second": 2558.806, "eval_steps_per_second": 319.899, "num_input_tokens_seen": 20469248, "step": 19990 }, { "epoch": 2.001000500250125, "grad_norm": 5.060875415802002, "learning_rate": 2.9990995497748873e-05, "loss": 0.5111, "num_input_tokens_seen": 20479488, "step": 20000, "train_runtime": 437.0238, "train_tokens_per_second": 46861.264 }, { "epoch": 2.0510255127563783, "grad_norm": 4.569818019866943, "learning_rate": 2.9490745372686347e-05, "loss": 0.4432, "num_input_tokens_seen": 20991488, "step": 20500, "train_runtime": 447.3167, "train_tokens_per_second": 46927.57 }, { "epoch": 2.1010505252626315, "grad_norm": 6.6488142013549805, "learning_rate": 2.899049524762381e-05, "loss": 0.418, "num_input_tokens_seen": 21503488, "step": 21000, "train_runtime": 457.6456, "train_tokens_per_second": 46987.204 }, { "epoch": 2.1510755377688846, "grad_norm": 6.992687702178955, "learning_rate": 2.8490245122561283e-05, "loss": 0.4222, "num_input_tokens_seen": 22015488, "step": 21500, "train_runtime": 467.9444, "train_tokens_per_second": 47047.231 }, { "epoch": 2.2011005502751377, "grad_norm": 9.515182495117188, "learning_rate": 2.7989994997498753e-05, "loss": 0.4325, "num_input_tokens_seen": 22527488, "step": 22000, "train_runtime": 478.2535, "train_tokens_per_second": 47103.653 }, { "epoch": 2.251125562781391, "grad_norm": 9.444127082824707, "learning_rate": 2.748974487243622e-05, "loss": 0.42, "num_input_tokens_seen": 23039488, "step": 22500, "train_runtime": 488.5556, "train_tokens_per_second": 47158.378 }, { "epoch": 2.301150575287644, "grad_norm": 9.03792953491211, "learning_rate": 2.698949474737369e-05, "loss": 0.4123, "num_input_tokens_seen": 23551488, "step": 23000, "train_runtime": 498.876, "train_tokens_per_second": 47209.103 }, { "epoch": 2.351175587793897, "grad_norm": 8.053046226501465, "learning_rate": 2.6489244622311153e-05, "loss": 0.4179, "num_input_tokens_seen": 24063488, "step": 23500, "train_runtime": 509.2037, "train_tokens_per_second": 47257.098 }, { "epoch": 2.4012006003001503, "grad_norm": 13.128437995910645, "learning_rate": 2.5988994497248627e-05, "loss": 0.4239, "num_input_tokens_seen": 24575488, "step": 24000, "train_runtime": 519.5215, "train_tokens_per_second": 47304.079 }, { "epoch": 2.4512256128064034, "grad_norm": 10.007084846496582, "learning_rate": 2.5488744372186097e-05, "loss": 0.4316, "num_input_tokens_seen": 25087488, "step": 24500, "train_runtime": 529.8242, "train_tokens_per_second": 47350.591 }, { "epoch": 2.5012506253126565, "grad_norm": 11.051094055175781, "learning_rate": 2.4988494247123563e-05, "loss": 0.4233, "num_input_tokens_seen": 25599488, "step": 25000, "train_runtime": 540.1399, "train_tokens_per_second": 47394.182 }, { "epoch": 2.551275637818909, "grad_norm": 6.066195487976074, "learning_rate": 2.4488244122061033e-05, "loss": 0.4251, "num_input_tokens_seen": 26111488, "step": 25500, "train_runtime": 550.4474, "train_tokens_per_second": 47436.848 }, { "epoch": 2.6013006503251628, "grad_norm": 2.6425974369049072, "learning_rate": 2.39879939969985e-05, "loss": 0.4479, "num_input_tokens_seen": 26623488, "step": 26000, "train_runtime": 560.7806, "train_tokens_per_second": 47475.766 }, { "epoch": 2.6513256628314155, "grad_norm": 11.04046630859375, "learning_rate": 2.348774387193597e-05, "loss": 0.4457, "num_input_tokens_seen": 27135488, "step": 26500, "train_runtime": 571.1223, "train_tokens_per_second": 47512.567 }, { "epoch": 2.701350675337669, "grad_norm": 14.067301750183105, "learning_rate": 2.2987493746873436e-05, "loss": 0.4323, "num_input_tokens_seen": 27647488, "step": 27000, "train_runtime": 581.4573, "train_tokens_per_second": 47548.615 }, { "epoch": 2.7513756878439217, "grad_norm": 6.148231029510498, "learning_rate": 2.2487243621810906e-05, "loss": 0.4314, "num_input_tokens_seen": 28159488, "step": 27500, "train_runtime": 591.7788, "train_tokens_per_second": 47584.484 }, { "epoch": 2.8014007003501753, "grad_norm": 4.049166202545166, "learning_rate": 2.1986993496748376e-05, "loss": 0.4245, "num_input_tokens_seen": 28671488, "step": 28000, "train_runtime": 602.1254, "train_tokens_per_second": 47617.141 }, { "epoch": 2.851425712856428, "grad_norm": 10.769721031188965, "learning_rate": 2.1486743371685843e-05, "loss": 0.432, "num_input_tokens_seen": 29183488, "step": 28500, "train_runtime": 612.4701, "train_tokens_per_second": 47648.836 }, { "epoch": 2.9014507253626816, "grad_norm": 7.408927917480469, "learning_rate": 2.0986493246623313e-05, "loss": 0.4296, "num_input_tokens_seen": 29695488, "step": 29000, "train_runtime": 622.8209, "train_tokens_per_second": 47679.02 }, { "epoch": 2.9514757378689342, "grad_norm": 15.281734466552734, "learning_rate": 2.048624312156078e-05, "loss": 0.4124, "num_input_tokens_seen": 30207488, "step": 29500, "train_runtime": 633.1949, "train_tokens_per_second": 47706.46 }, { "epoch": 3.0, "eval_loss": 0.6329430937767029, "eval_mse": 0.6329430625566823, "eval_runtime": 7.4256, "eval_samples_per_second": 2691.912, "eval_steps_per_second": 336.539, "num_input_tokens_seen": 30703872, "step": 29985 }, { "epoch": 3.0015007503751874, "grad_norm": 5.859447479248047, "learning_rate": 1.998599299649825e-05, "loss": 0.3997, "num_input_tokens_seen": 30719232, "step": 30000, "train_runtime": 652.5502, "train_tokens_per_second": 47075.658 }, { "epoch": 3.0515257628814405, "grad_norm": 34.13383102416992, "learning_rate": 1.948574287143572e-05, "loss": 0.3636, "num_input_tokens_seen": 31231232, "step": 30500, "train_runtime": 662.8658, "train_tokens_per_second": 47115.47 }, { "epoch": 3.1015507753876936, "grad_norm": 7.527888298034668, "learning_rate": 1.8985492746373186e-05, "loss": 0.3514, "num_input_tokens_seen": 31743232, "step": 31000, "train_runtime": 673.2071, "train_tokens_per_second": 47152.251 }, { "epoch": 3.1515757878939468, "grad_norm": 13.83651351928711, "learning_rate": 1.8485242621310656e-05, "loss": 0.3339, "num_input_tokens_seen": 32255232, "step": 31500, "train_runtime": 683.5282, "train_tokens_per_second": 47189.32 }, { "epoch": 3.2016008004002, "grad_norm": 16.254074096679688, "learning_rate": 1.7984992496248123e-05, "loss": 0.3544, "num_input_tokens_seen": 32767232, "step": 32000, "train_runtime": 693.8526, "train_tokens_per_second": 47225.062 }, { "epoch": 3.251625812906453, "grad_norm": 11.702266693115234, "learning_rate": 1.7484742371185596e-05, "loss": 0.3475, "num_input_tokens_seen": 33279232, "step": 32500, "train_runtime": 704.191, "train_tokens_per_second": 47258.812 }, { "epoch": 3.3016508254127066, "grad_norm": 6.544358253479004, "learning_rate": 1.6984492246123063e-05, "loss": 0.3251, "num_input_tokens_seen": 33791232, "step": 33000, "train_runtime": 714.5267, "train_tokens_per_second": 47291.768 }, { "epoch": 3.3516758379189593, "grad_norm": 6.692401885986328, "learning_rate": 1.648424212106053e-05, "loss": 0.3596, "num_input_tokens_seen": 34303232, "step": 33500, "train_runtime": 724.8425, "train_tokens_per_second": 47325.083 }, { "epoch": 3.4017008504252124, "grad_norm": 1.8304575681686401, "learning_rate": 1.5983991995998e-05, "loss": 0.3368, "num_input_tokens_seen": 34815232, "step": 34000, "train_runtime": 735.1924, "train_tokens_per_second": 47355.269 }, { "epoch": 3.4517258629314655, "grad_norm": 16.374961853027344, "learning_rate": 1.548374187093547e-05, "loss": 0.355, "num_input_tokens_seen": 35327232, "step": 34500, "train_runtime": 745.5365, "train_tokens_per_second": 47384.977 }, { "epoch": 3.501750875437719, "grad_norm": 10.639739036560059, "learning_rate": 1.4983491745872938e-05, "loss": 0.3686, "num_input_tokens_seen": 35839232, "step": 35000, "train_runtime": 755.8868, "train_tokens_per_second": 47413.491 }, { "epoch": 3.551775887943972, "grad_norm": 7.099308967590332, "learning_rate": 1.4483241620810406e-05, "loss": 0.3526, "num_input_tokens_seen": 36351232, "step": 35500, "train_runtime": 766.2123, "train_tokens_per_second": 47442.767 }, { "epoch": 3.6018009004502254, "grad_norm": 5.288228988647461, "learning_rate": 1.3982991495747874e-05, "loss": 0.3478, "num_input_tokens_seen": 36863232, "step": 36000, "train_runtime": 776.5473, "train_tokens_per_second": 47470.685 }, { "epoch": 3.651825912956478, "grad_norm": 5.8353471755981445, "learning_rate": 1.3482741370685342e-05, "loss": 0.3551, "num_input_tokens_seen": 37375232, "step": 36500, "train_runtime": 786.8701, "train_tokens_per_second": 47498.606 }, { "epoch": 3.701850925462731, "grad_norm": 5.141830921173096, "learning_rate": 1.2982491245622812e-05, "loss": 0.3564, "num_input_tokens_seen": 37887232, "step": 37000, "train_runtime": 797.4137, "train_tokens_per_second": 47512.64 }, { "epoch": 3.7518759379689843, "grad_norm": 9.891301155090332, "learning_rate": 1.248224112056028e-05, "loss": 0.3287, "num_input_tokens_seen": 38399232, "step": 37500, "train_runtime": 807.7554, "train_tokens_per_second": 47538.195 }, { "epoch": 3.8019009504752375, "grad_norm": 12.06251049041748, "learning_rate": 1.1981990995497749e-05, "loss": 0.3685, "num_input_tokens_seen": 38911232, "step": 38000, "train_runtime": 818.0758, "train_tokens_per_second": 47564.337 }, { "epoch": 3.8519259629814906, "grad_norm": 3.9367198944091797, "learning_rate": 1.1481740870435217e-05, "loss": 0.3376, "num_input_tokens_seen": 39423232, "step": 38500, "train_runtime": 828.4313, "train_tokens_per_second": 47587.81 }, { "epoch": 3.9019509754877437, "grad_norm": 12.005465507507324, "learning_rate": 1.0981490745372687e-05, "loss": 0.3435, "num_input_tokens_seen": 39935232, "step": 39000, "train_runtime": 838.7871, "train_tokens_per_second": 47610.69 }, { "epoch": 3.951975987993997, "grad_norm": 8.57763671875, "learning_rate": 1.0481240620310156e-05, "loss": 0.3463, "num_input_tokens_seen": 40447232, "step": 39500, "train_runtime": 849.1478, "train_tokens_per_second": 47632.736 }, { "epoch": 4.0, "eval_loss": 0.6491973996162415, "eval_mse": 0.6491973448647838, "eval_runtime": 7.4218, "eval_samples_per_second": 2693.27, "eval_steps_per_second": 336.709, "num_input_tokens_seen": 40938496, "step": 39980 }, { "epoch": 4.00200100050025, "grad_norm": 8.412797927856445, "learning_rate": 9.980990495247624e-06, "loss": 0.3346, "num_input_tokens_seen": 40958976, "step": 40000, "train_runtime": 868.7008, "train_tokens_per_second": 47149.696 }, { "epoch": 4.052026013006503, "grad_norm": 17.825761795043945, "learning_rate": 9.480740370185092e-06, "loss": 0.3034, "num_input_tokens_seen": 41470976, "step": 40500, "train_runtime": 878.9875, "train_tokens_per_second": 47180.395 }, { "epoch": 4.102051025512757, "grad_norm": 10.180594444274902, "learning_rate": 8.980490245122562e-06, "loss": 0.2971, "num_input_tokens_seen": 41982976, "step": 41000, "train_runtime": 889.3152, "train_tokens_per_second": 47208.207 }, { "epoch": 4.152076038019009, "grad_norm": 6.496691703796387, "learning_rate": 8.48024012006003e-06, "loss": 0.29, "num_input_tokens_seen": 42494976, "step": 41500, "train_runtime": 899.6157, "train_tokens_per_second": 47236.812 }, { "epoch": 4.202101050525263, "grad_norm": 7.926106929779053, "learning_rate": 7.979989994997499e-06, "loss": 0.2877, "num_input_tokens_seen": 43006976, "step": 42000, "train_runtime": 909.9204, "train_tokens_per_second": 47264.546 }, { "epoch": 4.252126063031516, "grad_norm": 9.483957290649414, "learning_rate": 7.479739869934968e-06, "loss": 0.3057, "num_input_tokens_seen": 43518976, "step": 42500, "train_runtime": 920.2145, "train_tokens_per_second": 47292.21 }, { "epoch": 4.302151075537769, "grad_norm": 13.678934097290039, "learning_rate": 6.979489744872436e-06, "loss": 0.2989, "num_input_tokens_seen": 44030976, "step": 43000, "train_runtime": 930.5087, "train_tokens_per_second": 47319.254 }, { "epoch": 4.352176088044022, "grad_norm": 4.871670246124268, "learning_rate": 6.479239619809905e-06, "loss": 0.2948, "num_input_tokens_seen": 44542976, "step": 43500, "train_runtime": 940.808, "train_tokens_per_second": 47345.449 }, { "epoch": 4.4022011005502755, "grad_norm": 11.43335247039795, "learning_rate": 5.978989494747374e-06, "loss": 0.301, "num_input_tokens_seen": 45054976, "step": 44000, "train_runtime": 951.1249, "train_tokens_per_second": 47370.197 }, { "epoch": 4.452226113056528, "grad_norm": 4.706591606140137, "learning_rate": 5.478739369684843e-06, "loss": 0.2929, "num_input_tokens_seen": 45566976, "step": 44500, "train_runtime": 961.4381, "train_tokens_per_second": 47394.601 }, { "epoch": 4.502251125562782, "grad_norm": 7.695474624633789, "learning_rate": 4.978489244622311e-06, "loss": 0.3027, "num_input_tokens_seen": 46078976, "step": 45000, "train_runtime": 971.7628, "train_tokens_per_second": 47417.925 }, { "epoch": 4.552276138069034, "grad_norm": 3.7478439807891846, "learning_rate": 4.47823911955978e-06, "loss": 0.317, "num_input_tokens_seen": 46590976, "step": 45500, "train_runtime": 982.0466, "train_tokens_per_second": 47442.735 }, { "epoch": 4.602301150575288, "grad_norm": 5.191857814788818, "learning_rate": 3.977988994497249e-06, "loss": 0.2869, "num_input_tokens_seen": 47102976, "step": 46000, "train_runtime": 992.3427, "train_tokens_per_second": 47466.441 }, { "epoch": 4.652326163081541, "grad_norm": 9.192943572998047, "learning_rate": 3.4777388694347177e-06, "loss": 0.2741, "num_input_tokens_seen": 47614976, "step": 46500, "train_runtime": 1002.6605, "train_tokens_per_second": 47488.632 }, { "epoch": 4.702351175587794, "grad_norm": 11.236271858215332, "learning_rate": 2.9774887443721864e-06, "loss": 0.2967, "num_input_tokens_seen": 48126976, "step": 47000, "train_runtime": 1012.9854, "train_tokens_per_second": 47510.041 }, { "epoch": 4.752376188094047, "grad_norm": 4.665492534637451, "learning_rate": 2.477238619309655e-06, "loss": 0.29, "num_input_tokens_seen": 48638976, "step": 47500, "train_runtime": 1023.2749, "train_tokens_per_second": 47532.66 }, { "epoch": 4.8024012006003005, "grad_norm": 14.87586784362793, "learning_rate": 1.9769884942471234e-06, "loss": 0.2883, "num_input_tokens_seen": 49150976, "step": 48000, "train_runtime": 1033.6207, "train_tokens_per_second": 47552.238 }, { "epoch": 4.852426213106553, "grad_norm": 12.128450393676758, "learning_rate": 1.4767383691845923e-06, "loss": 0.2925, "num_input_tokens_seen": 49662976, "step": 48500, "train_runtime": 1043.9662, "train_tokens_per_second": 47571.44 }, { "epoch": 4.902451225612807, "grad_norm": 7.318789958953857, "learning_rate": 9.76488244122061e-07, "loss": 0.2972, "num_input_tokens_seen": 50174976, "step": 49000, "train_runtime": 1054.2994, "train_tokens_per_second": 47590.821 }, { "epoch": 4.9524762381190595, "grad_norm": 6.287776947021484, "learning_rate": 4.762381190595298e-07, "loss": 0.3124, "num_input_tokens_seen": 50686976, "step": 49500, "train_runtime": 1064.6538, "train_tokens_per_second": 47608.88 }, { "epoch": 5.0, "eval_loss": 0.6694201231002808, "eval_mse": 0.66942013065375, "eval_runtime": 7.3591, "eval_samples_per_second": 2716.211, "eval_steps_per_second": 339.577, "num_input_tokens_seen": 51173120, "step": 49975 }, { "epoch": 5.0, "num_input_tokens_seen": 51173120, "step": 49975, "total_flos": 3314721551485440.0, "train_loss": 0.4421676393006073, "train_runtime": 1083.6381, "train_samples_per_second": 368.933, "train_steps_per_second": 46.118 } ], "logging_steps": 500, "max_steps": 49975, "num_input_tokens_seen": 51173120, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3314721551485440.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }