| { | |
| "best_global_step": 4000, | |
| "best_metric": 0.19092191755771637, | |
| "best_model_checkpoint": "/home/flytekit/n0w0f/data/mattext_ckpt/results/2026-02-05/18-01-14/pretrain/checkpoints/robocrys_rep_test-pretrain/checkpoint-4000", | |
| "epoch": 8.602150537634408, | |
| "eval_steps": 50, | |
| "global_step": 4000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.10752688172043011, | |
| "grad_norm": 1.1888866424560547, | |
| "learning_rate": 0.00019957849462365592, | |
| "loss": 5.97920654296875, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.10752688172043011, | |
| "eval_loss": 4.124914646148682, | |
| "eval_runtime": 60.5178, | |
| "eval_samples_per_second": 314.023, | |
| "eval_steps_per_second": 39.261, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.21505376344086022, | |
| "grad_norm": 0.9824994802474976, | |
| "learning_rate": 0.00019914838709677422, | |
| "loss": 3.916483154296875, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.21505376344086022, | |
| "eval_loss": 3.675534248352051, | |
| "eval_runtime": 61.1234, | |
| "eval_samples_per_second": 310.912, | |
| "eval_steps_per_second": 38.872, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.3225806451612903, | |
| "grad_norm": 0.867065966129303, | |
| "learning_rate": 0.00019871827956989248, | |
| "loss": 3.620672302246094, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.3225806451612903, | |
| "eval_loss": 3.4746599197387695, | |
| "eval_runtime": 61.4793, | |
| "eval_samples_per_second": 309.112, | |
| "eval_steps_per_second": 38.647, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.43010752688172044, | |
| "grad_norm": 1.192267894744873, | |
| "learning_rate": 0.00019828817204301075, | |
| "loss": 3.471976013183594, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.43010752688172044, | |
| "eval_loss": 3.353644371032715, | |
| "eval_runtime": 60.5187, | |
| "eval_samples_per_second": 314.019, | |
| "eval_steps_per_second": 39.261, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.5376344086021505, | |
| "grad_norm": 1.0798981189727783, | |
| "learning_rate": 0.00019785806451612904, | |
| "loss": 3.360224609375, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.5376344086021505, | |
| "eval_loss": 3.247636079788208, | |
| "eval_runtime": 61.527, | |
| "eval_samples_per_second": 308.873, | |
| "eval_steps_per_second": 38.617, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.6451612903225806, | |
| "grad_norm": 1.3051457405090332, | |
| "learning_rate": 0.00019742795698924733, | |
| "loss": 3.262052307128906, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.6451612903225806, | |
| "eval_loss": 3.1502654552459717, | |
| "eval_runtime": 60.999, | |
| "eval_samples_per_second": 311.546, | |
| "eval_steps_per_second": 38.951, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.7526881720430108, | |
| "grad_norm": 1.1396135091781616, | |
| "learning_rate": 0.0001969978494623656, | |
| "loss": 3.225200500488281, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.7526881720430108, | |
| "eval_loss": 3.094292163848877, | |
| "eval_runtime": 61.381, | |
| "eval_samples_per_second": 309.607, | |
| "eval_steps_per_second": 38.709, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.8602150537634409, | |
| "grad_norm": 1.0816289186477661, | |
| "learning_rate": 0.0001965677419354839, | |
| "loss": 3.1344537353515625, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.8602150537634409, | |
| "eval_loss": 3.0037944316864014, | |
| "eval_runtime": 61.1417, | |
| "eval_samples_per_second": 310.819, | |
| "eval_steps_per_second": 38.861, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.967741935483871, | |
| "grad_norm": 1.220457673072815, | |
| "learning_rate": 0.00019613763440860216, | |
| "loss": 3.024658203125, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.967741935483871, | |
| "eval_loss": 2.9253640174865723, | |
| "eval_runtime": 61.6823, | |
| "eval_samples_per_second": 308.095, | |
| "eval_steps_per_second": 38.52, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 1.075268817204301, | |
| "grad_norm": 1.18031644821167, | |
| "learning_rate": 0.00019570752688172045, | |
| "loss": 2.9539215087890627, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.075268817204301, | |
| "eval_loss": 2.827315092086792, | |
| "eval_runtime": 64.027, | |
| "eval_samples_per_second": 296.812, | |
| "eval_steps_per_second": 37.109, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.1827956989247312, | |
| "grad_norm": 1.4481481313705444, | |
| "learning_rate": 0.00019527741935483872, | |
| "loss": 2.8536431884765623, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.1827956989247312, | |
| "eval_loss": 2.6743366718292236, | |
| "eval_runtime": 60.9092, | |
| "eval_samples_per_second": 312.005, | |
| "eval_steps_per_second": 39.009, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.2903225806451613, | |
| "grad_norm": 1.5985803604125977, | |
| "learning_rate": 0.00019484731182795698, | |
| "loss": 2.7353704833984374, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.2903225806451613, | |
| "eval_loss": 2.4861812591552734, | |
| "eval_runtime": 61.6826, | |
| "eval_samples_per_second": 308.093, | |
| "eval_steps_per_second": 38.52, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.3978494623655915, | |
| "grad_norm": 2.046145439147949, | |
| "learning_rate": 0.00019441720430107528, | |
| "loss": 2.464430084228516, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.3978494623655915, | |
| "eval_loss": 2.0265886783599854, | |
| "eval_runtime": 61.2709, | |
| "eval_samples_per_second": 310.164, | |
| "eval_steps_per_second": 38.779, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.5053763440860215, | |
| "grad_norm": 1.8674232959747314, | |
| "learning_rate": 0.00019398709677419354, | |
| "loss": 1.9112973022460937, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.5053763440860215, | |
| "eval_loss": 1.3678908348083496, | |
| "eval_runtime": 62.2031, | |
| "eval_samples_per_second": 305.515, | |
| "eval_steps_per_second": 38.197, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.6129032258064515, | |
| "grad_norm": 1.708408236503601, | |
| "learning_rate": 0.00019355698924731184, | |
| "loss": 1.4241523742675781, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.6129032258064515, | |
| "eval_loss": 1.0675994157791138, | |
| "eval_runtime": 62.2, | |
| "eval_samples_per_second": 305.53, | |
| "eval_steps_per_second": 38.199, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.7204301075268817, | |
| "grad_norm": 1.6592656373977661, | |
| "learning_rate": 0.00019312688172043013, | |
| "loss": 1.2252975463867188, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.7204301075268817, | |
| "eval_loss": 0.9175282716751099, | |
| "eval_runtime": 61.3094, | |
| "eval_samples_per_second": 309.969, | |
| "eval_steps_per_second": 38.754, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.827956989247312, | |
| "grad_norm": 1.2984247207641602, | |
| "learning_rate": 0.0001926967741935484, | |
| "loss": 1.0399230194091797, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.827956989247312, | |
| "eval_loss": 0.8346064686775208, | |
| "eval_runtime": 61.1605, | |
| "eval_samples_per_second": 310.724, | |
| "eval_steps_per_second": 38.849, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.935483870967742, | |
| "grad_norm": 1.1744712591171265, | |
| "learning_rate": 0.0001922666666666667, | |
| "loss": 0.9568134307861328, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.935483870967742, | |
| "eval_loss": 0.7724924087524414, | |
| "eval_runtime": 62.2824, | |
| "eval_samples_per_second": 305.126, | |
| "eval_steps_per_second": 38.149, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 2.043010752688172, | |
| "grad_norm": 1.2494049072265625, | |
| "learning_rate": 0.00019183655913978495, | |
| "loss": 0.8979853820800782, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 2.043010752688172, | |
| "eval_loss": 0.7325491905212402, | |
| "eval_runtime": 62.8935, | |
| "eval_samples_per_second": 302.161, | |
| "eval_steps_per_second": 37.778, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 2.150537634408602, | |
| "grad_norm": 1.0687495470046997, | |
| "learning_rate": 0.00019140645161290322, | |
| "loss": 0.8724540710449219, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.150537634408602, | |
| "eval_loss": 0.6943864822387695, | |
| "eval_runtime": 64.2005, | |
| "eval_samples_per_second": 296.01, | |
| "eval_steps_per_second": 37.009, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 2.258064516129032, | |
| "grad_norm": 0.9108296036720276, | |
| "learning_rate": 0.0001909763440860215, | |
| "loss": 0.8106794738769532, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 2.258064516129032, | |
| "eval_loss": 0.666123628616333, | |
| "eval_runtime": 60.9142, | |
| "eval_samples_per_second": 311.98, | |
| "eval_steps_per_second": 39.006, | |
| "step": 1050 | |
| }, | |
| { | |
| "epoch": 2.3655913978494625, | |
| "grad_norm": 0.8529163002967834, | |
| "learning_rate": 0.00019054623655913978, | |
| "loss": 0.7816014862060547, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 2.3655913978494625, | |
| "eval_loss": 0.6435992121696472, | |
| "eval_runtime": 61.9346, | |
| "eval_samples_per_second": 306.84, | |
| "eval_steps_per_second": 38.363, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 2.4731182795698925, | |
| "grad_norm": 0.9023746848106384, | |
| "learning_rate": 0.00019011612903225807, | |
| "loss": 0.7448858642578124, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 2.4731182795698925, | |
| "eval_loss": 0.6147477626800537, | |
| "eval_runtime": 60.7037, | |
| "eval_samples_per_second": 313.062, | |
| "eval_steps_per_second": 39.141, | |
| "step": 1150 | |
| }, | |
| { | |
| "epoch": 2.5806451612903225, | |
| "grad_norm": 0.7893891930580139, | |
| "learning_rate": 0.00018968602150537636, | |
| "loss": 0.7744358062744141, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.5806451612903225, | |
| "eval_loss": 0.6008749604225159, | |
| "eval_runtime": 62.0421, | |
| "eval_samples_per_second": 306.308, | |
| "eval_steps_per_second": 38.297, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 2.688172043010753, | |
| "grad_norm": 0.8543435335159302, | |
| "learning_rate": 0.00018925591397849463, | |
| "loss": 0.698813705444336, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.688172043010753, | |
| "eval_loss": 0.5843669176101685, | |
| "eval_runtime": 61.7236, | |
| "eval_samples_per_second": 307.889, | |
| "eval_steps_per_second": 38.494, | |
| "step": 1250 | |
| }, | |
| { | |
| "epoch": 2.795698924731183, | |
| "grad_norm": 0.862782895565033, | |
| "learning_rate": 0.00018882580645161292, | |
| "loss": 0.7231275939941406, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.795698924731183, | |
| "eval_loss": 0.560819149017334, | |
| "eval_runtime": 61.272, | |
| "eval_samples_per_second": 310.158, | |
| "eval_steps_per_second": 38.778, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 2.903225806451613, | |
| "grad_norm": 0.8126527667045593, | |
| "learning_rate": 0.0001883956989247312, | |
| "loss": 0.6607036590576172, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 2.903225806451613, | |
| "eval_loss": 0.5523199439048767, | |
| "eval_runtime": 61.41, | |
| "eval_samples_per_second": 309.461, | |
| "eval_steps_per_second": 38.691, | |
| "step": 1350 | |
| }, | |
| { | |
| "epoch": 3.010752688172043, | |
| "grad_norm": 0.8788714408874512, | |
| "learning_rate": 0.00018796559139784945, | |
| "loss": 0.658017349243164, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 3.010752688172043, | |
| "eval_loss": 0.5504087805747986, | |
| "eval_runtime": 61.2893, | |
| "eval_samples_per_second": 310.07, | |
| "eval_steps_per_second": 38.767, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 3.118279569892473, | |
| "grad_norm": 0.8354722857475281, | |
| "learning_rate": 0.00018753548387096775, | |
| "loss": 0.6500599670410157, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 3.118279569892473, | |
| "eval_loss": 0.5395110845565796, | |
| "eval_runtime": 60.5063, | |
| "eval_samples_per_second": 314.083, | |
| "eval_steps_per_second": 39.269, | |
| "step": 1450 | |
| }, | |
| { | |
| "epoch": 3.225806451612903, | |
| "grad_norm": 0.8122305870056152, | |
| "learning_rate": 0.000187105376344086, | |
| "loss": 0.6230792999267578, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 3.225806451612903, | |
| "eval_loss": 0.5187473297119141, | |
| "eval_runtime": 60.7322, | |
| "eval_samples_per_second": 312.915, | |
| "eval_steps_per_second": 39.123, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 3.3333333333333335, | |
| "grad_norm": 0.673494815826416, | |
| "learning_rate": 0.0001866752688172043, | |
| "loss": 0.6118016052246094, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 3.3333333333333335, | |
| "eval_loss": 0.5081239938735962, | |
| "eval_runtime": 60.5862, | |
| "eval_samples_per_second": 313.669, | |
| "eval_steps_per_second": 39.217, | |
| "step": 1550 | |
| }, | |
| { | |
| "epoch": 3.4408602150537635, | |
| "grad_norm": 0.8055212497711182, | |
| "learning_rate": 0.0001862451612903226, | |
| "loss": 0.6122843170166016, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 3.4408602150537635, | |
| "eval_loss": 0.49499744176864624, | |
| "eval_runtime": 60.6568, | |
| "eval_samples_per_second": 313.304, | |
| "eval_steps_per_second": 39.171, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 3.5483870967741935, | |
| "grad_norm": 0.7935542464256287, | |
| "learning_rate": 0.00018581505376344087, | |
| "loss": 0.5825344467163086, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 3.5483870967741935, | |
| "eval_loss": 0.48452192544937134, | |
| "eval_runtime": 60.5763, | |
| "eval_samples_per_second": 313.72, | |
| "eval_steps_per_second": 39.223, | |
| "step": 1650 | |
| }, | |
| { | |
| "epoch": 3.6559139784946235, | |
| "grad_norm": 0.6395400166511536, | |
| "learning_rate": 0.00018538494623655916, | |
| "loss": 0.5727723693847656, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 3.6559139784946235, | |
| "eval_loss": 0.4738766551017761, | |
| "eval_runtime": 60.5051, | |
| "eval_samples_per_second": 314.089, | |
| "eval_steps_per_second": 39.269, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 3.763440860215054, | |
| "grad_norm": 0.6544663906097412, | |
| "learning_rate": 0.00018495483870967742, | |
| "loss": 0.5858316421508789, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 3.763440860215054, | |
| "eval_loss": 0.4562221169471741, | |
| "eval_runtime": 60.4697, | |
| "eval_samples_per_second": 314.273, | |
| "eval_steps_per_second": 39.292, | |
| "step": 1750 | |
| }, | |
| { | |
| "epoch": 3.870967741935484, | |
| "grad_norm": 0.773256778717041, | |
| "learning_rate": 0.00018452473118279572, | |
| "loss": 0.5555976867675781, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 3.870967741935484, | |
| "eval_loss": 0.4462752342224121, | |
| "eval_runtime": 61.139, | |
| "eval_samples_per_second": 310.833, | |
| "eval_steps_per_second": 38.862, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 3.978494623655914, | |
| "grad_norm": 0.6679997444152832, | |
| "learning_rate": 0.00018409462365591398, | |
| "loss": 0.5079600143432618, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 3.978494623655914, | |
| "eval_loss": 0.43978169560432434, | |
| "eval_runtime": 60.5103, | |
| "eval_samples_per_second": 314.062, | |
| "eval_steps_per_second": 39.266, | |
| "step": 1850 | |
| }, | |
| { | |
| "epoch": 4.086021505376344, | |
| "grad_norm": 0.7930998206138611, | |
| "learning_rate": 0.00018366451612903225, | |
| "loss": 0.5580390548706055, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 4.086021505376344, | |
| "eval_loss": 0.4352206587791443, | |
| "eval_runtime": 60.8357, | |
| "eval_samples_per_second": 312.382, | |
| "eval_steps_per_second": 39.056, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 4.193548387096774, | |
| "grad_norm": 0.6607942581176758, | |
| "learning_rate": 0.00018323440860215054, | |
| "loss": 0.49173324584960937, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 4.193548387096774, | |
| "eval_loss": 0.4238659143447876, | |
| "eval_runtime": 60.9872, | |
| "eval_samples_per_second": 311.606, | |
| "eval_steps_per_second": 38.959, | |
| "step": 1950 | |
| }, | |
| { | |
| "epoch": 4.301075268817204, | |
| "grad_norm": 0.6287643909454346, | |
| "learning_rate": 0.00018280430107526884, | |
| "loss": 0.4687882232666016, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 4.301075268817204, | |
| "eval_loss": 0.4168907403945923, | |
| "eval_runtime": 61.005, | |
| "eval_samples_per_second": 311.515, | |
| "eval_steps_per_second": 38.948, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 4.408602150537634, | |
| "grad_norm": 0.6433095932006836, | |
| "learning_rate": 0.0001823741935483871, | |
| "loss": 0.4763982009887695, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 4.408602150537634, | |
| "eval_loss": 0.4120262861251831, | |
| "eval_runtime": 61.5507, | |
| "eval_samples_per_second": 308.753, | |
| "eval_steps_per_second": 38.602, | |
| "step": 2050 | |
| }, | |
| { | |
| "epoch": 4.516129032258064, | |
| "grad_norm": 0.76325523853302, | |
| "learning_rate": 0.0001819440860215054, | |
| "loss": 0.5169943237304687, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 4.516129032258064, | |
| "eval_loss": 0.40777090191841125, | |
| "eval_runtime": 61.9659, | |
| "eval_samples_per_second": 306.685, | |
| "eval_steps_per_second": 38.344, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 4.623655913978495, | |
| "grad_norm": 0.7534022331237793, | |
| "learning_rate": 0.00018151397849462366, | |
| "loss": 0.4840876770019531, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 4.623655913978495, | |
| "eval_loss": 0.396854966878891, | |
| "eval_runtime": 61.4429, | |
| "eval_samples_per_second": 309.295, | |
| "eval_steps_per_second": 38.67, | |
| "step": 2150 | |
| }, | |
| { | |
| "epoch": 4.731182795698925, | |
| "grad_norm": 0.688862144947052, | |
| "learning_rate": 0.00018108387096774195, | |
| "loss": 0.46516273498535154, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 4.731182795698925, | |
| "eval_loss": 0.38546594977378845, | |
| "eval_runtime": 60.8637, | |
| "eval_samples_per_second": 312.239, | |
| "eval_steps_per_second": 39.038, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 4.838709677419355, | |
| "grad_norm": 0.5328208208084106, | |
| "learning_rate": 0.00018065376344086022, | |
| "loss": 0.5028326034545898, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 4.838709677419355, | |
| "eval_loss": 0.37445569038391113, | |
| "eval_runtime": 61.5819, | |
| "eval_samples_per_second": 308.597, | |
| "eval_steps_per_second": 38.583, | |
| "step": 2250 | |
| }, | |
| { | |
| "epoch": 4.946236559139785, | |
| "grad_norm": 0.5857045650482178, | |
| "learning_rate": 0.00018022365591397848, | |
| "loss": 0.43645286560058594, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 4.946236559139785, | |
| "eval_loss": 0.3690737187862396, | |
| "eval_runtime": 61.4895, | |
| "eval_samples_per_second": 309.061, | |
| "eval_steps_per_second": 38.641, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 5.053763440860215, | |
| "grad_norm": 0.6344749331474304, | |
| "learning_rate": 0.00017979354838709678, | |
| "loss": 0.42147178649902345, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 5.053763440860215, | |
| "eval_loss": 0.3570445775985718, | |
| "eval_runtime": 62.1748, | |
| "eval_samples_per_second": 305.654, | |
| "eval_steps_per_second": 38.215, | |
| "step": 2350 | |
| }, | |
| { | |
| "epoch": 5.161290322580645, | |
| "grad_norm": 0.6610215306282043, | |
| "learning_rate": 0.00017936344086021507, | |
| "loss": 0.4157654571533203, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 5.161290322580645, | |
| "eval_loss": 0.3497065603733063, | |
| "eval_runtime": 61.6389, | |
| "eval_samples_per_second": 308.312, | |
| "eval_steps_per_second": 38.547, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 5.268817204301075, | |
| "grad_norm": 0.5334368348121643, | |
| "learning_rate": 0.00017893333333333336, | |
| "loss": 0.4012648391723633, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 5.268817204301075, | |
| "eval_loss": 0.33196908235549927, | |
| "eval_runtime": 64.4623, | |
| "eval_samples_per_second": 294.808, | |
| "eval_steps_per_second": 36.859, | |
| "step": 2450 | |
| }, | |
| { | |
| "epoch": 5.376344086021505, | |
| "grad_norm": 0.7559072971343994, | |
| "learning_rate": 0.00017850322580645163, | |
| "loss": 0.4343834686279297, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 5.376344086021505, | |
| "eval_loss": 0.31756916642189026, | |
| "eval_runtime": 64.0899, | |
| "eval_samples_per_second": 296.521, | |
| "eval_steps_per_second": 37.073, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 5.483870967741936, | |
| "grad_norm": 0.6970711946487427, | |
| "learning_rate": 0.0001780731182795699, | |
| "loss": 0.3609016799926758, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 5.483870967741936, | |
| "eval_loss": 0.3129482567310333, | |
| "eval_runtime": 64.2007, | |
| "eval_samples_per_second": 296.009, | |
| "eval_steps_per_second": 37.009, | |
| "step": 2550 | |
| }, | |
| { | |
| "epoch": 5.591397849462366, | |
| "grad_norm": 0.7393150329589844, | |
| "learning_rate": 0.0001776430107526882, | |
| "loss": 0.36085220336914064, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 5.591397849462366, | |
| "eval_loss": 0.29907363653182983, | |
| "eval_runtime": 64.2974, | |
| "eval_samples_per_second": 295.564, | |
| "eval_steps_per_second": 36.953, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 5.698924731182796, | |
| "grad_norm": 0.6760246157646179, | |
| "learning_rate": 0.00017721290322580645, | |
| "loss": 0.3354073715209961, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 5.698924731182796, | |
| "eval_loss": 0.28903692960739136, | |
| "eval_runtime": 64.2379, | |
| "eval_samples_per_second": 295.838, | |
| "eval_steps_per_second": 36.988, | |
| "step": 2650 | |
| }, | |
| { | |
| "epoch": 5.806451612903226, | |
| "grad_norm": 0.6342934370040894, | |
| "learning_rate": 0.00017678279569892472, | |
| "loss": 0.33487789154052733, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 5.806451612903226, | |
| "eval_loss": 0.2763662040233612, | |
| "eval_runtime": 63.0262, | |
| "eval_samples_per_second": 301.525, | |
| "eval_steps_per_second": 37.699, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 5.913978494623656, | |
| "grad_norm": 0.6288059949874878, | |
| "learning_rate": 0.00017635268817204301, | |
| "loss": 0.3166103744506836, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 5.913978494623656, | |
| "eval_loss": 0.27043381333351135, | |
| "eval_runtime": 63.0792, | |
| "eval_samples_per_second": 301.272, | |
| "eval_steps_per_second": 37.667, | |
| "step": 2750 | |
| }, | |
| { | |
| "epoch": 6.021505376344086, | |
| "grad_norm": 0.8228830695152283, | |
| "learning_rate": 0.0001759225806451613, | |
| "loss": 0.3166475486755371, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 6.021505376344086, | |
| "eval_loss": 0.26023828983306885, | |
| "eval_runtime": 64.4666, | |
| "eval_samples_per_second": 294.788, | |
| "eval_steps_per_second": 36.856, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 6.129032258064516, | |
| "grad_norm": 0.6261463165283203, | |
| "learning_rate": 0.0001754924731182796, | |
| "loss": 0.30168416976928714, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 6.129032258064516, | |
| "eval_loss": 0.2530518174171448, | |
| "eval_runtime": 63.8775, | |
| "eval_samples_per_second": 297.507, | |
| "eval_steps_per_second": 37.196, | |
| "step": 2850 | |
| }, | |
| { | |
| "epoch": 6.236559139784946, | |
| "grad_norm": 0.7265720367431641, | |
| "learning_rate": 0.00017506236559139787, | |
| "loss": 0.29341196060180663, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 6.236559139784946, | |
| "eval_loss": 0.24442243576049805, | |
| "eval_runtime": 63.2991, | |
| "eval_samples_per_second": 300.226, | |
| "eval_steps_per_second": 37.536, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 6.344086021505376, | |
| "grad_norm": 0.5499133467674255, | |
| "learning_rate": 0.00017463225806451613, | |
| "loss": 0.2850730323791504, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 6.344086021505376, | |
| "eval_loss": 0.237361341714859, | |
| "eval_runtime": 64.5725, | |
| "eval_samples_per_second": 294.305, | |
| "eval_steps_per_second": 36.796, | |
| "step": 2950 | |
| }, | |
| { | |
| "epoch": 6.451612903225806, | |
| "grad_norm": 0.7466527223587036, | |
| "learning_rate": 0.00017420215053763442, | |
| "loss": 0.2737441635131836, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 6.451612903225806, | |
| "eval_loss": 0.22867611050605774, | |
| "eval_runtime": 64.8912, | |
| "eval_samples_per_second": 292.86, | |
| "eval_steps_per_second": 36.615, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 6.559139784946236, | |
| "grad_norm": 0.605771005153656, | |
| "learning_rate": 0.0001737720430107527, | |
| "loss": 0.26982501983642576, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 6.559139784946236, | |
| "eval_loss": 0.22686000168323517, | |
| "eval_runtime": 64.8566, | |
| "eval_samples_per_second": 293.016, | |
| "eval_steps_per_second": 36.635, | |
| "step": 3050 | |
| }, | |
| { | |
| "epoch": 6.666666666666667, | |
| "grad_norm": 0.6927595138549805, | |
| "learning_rate": 0.00017334193548387096, | |
| "loss": 0.2592777633666992, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 6.666666666666667, | |
| "eval_loss": 0.22359216213226318, | |
| "eval_runtime": 64.9559, | |
| "eval_samples_per_second": 292.568, | |
| "eval_steps_per_second": 36.579, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 6.774193548387097, | |
| "grad_norm": 0.6070519685745239, | |
| "learning_rate": 0.00017291182795698925, | |
| "loss": 0.2539858436584473, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 6.774193548387097, | |
| "eval_loss": 0.22382962703704834, | |
| "eval_runtime": 64.9172, | |
| "eval_samples_per_second": 292.742, | |
| "eval_steps_per_second": 36.6, | |
| "step": 3150 | |
| }, | |
| { | |
| "epoch": 6.881720430107527, | |
| "grad_norm": 0.7206361889839172, | |
| "learning_rate": 0.00017248172043010754, | |
| "loss": 0.2550803184509277, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 6.881720430107527, | |
| "eval_loss": 0.22055239975452423, | |
| "eval_runtime": 65.5818, | |
| "eval_samples_per_second": 289.775, | |
| "eval_steps_per_second": 36.23, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 6.989247311827957, | |
| "grad_norm": 0.6855896711349487, | |
| "learning_rate": 0.00017205161290322584, | |
| "loss": 0.2432615852355957, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 6.989247311827957, | |
| "eval_loss": 0.21467819809913635, | |
| "eval_runtime": 66.2905, | |
| "eval_samples_per_second": 286.677, | |
| "eval_steps_per_second": 35.842, | |
| "step": 3250 | |
| }, | |
| { | |
| "epoch": 7.096774193548387, | |
| "grad_norm": 0.5612008571624756, | |
| "learning_rate": 0.0001716215053763441, | |
| "loss": 0.24562849044799806, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 7.096774193548387, | |
| "eval_loss": 0.21375121176242828, | |
| "eval_runtime": 66.0151, | |
| "eval_samples_per_second": 287.874, | |
| "eval_steps_per_second": 35.992, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 7.204301075268817, | |
| "grad_norm": 0.7433006763458252, | |
| "learning_rate": 0.00017119139784946237, | |
| "loss": 0.2393852424621582, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 7.204301075268817, | |
| "eval_loss": 0.20871323347091675, | |
| "eval_runtime": 61.9563, | |
| "eval_samples_per_second": 306.732, | |
| "eval_steps_per_second": 38.35, | |
| "step": 3350 | |
| }, | |
| { | |
| "epoch": 7.311827956989247, | |
| "grad_norm": 0.6491153836250305, | |
| "learning_rate": 0.00017076129032258066, | |
| "loss": 0.24959787368774414, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 7.311827956989247, | |
| "eval_loss": 0.21120016276836395, | |
| "eval_runtime": 60.6864, | |
| "eval_samples_per_second": 313.151, | |
| "eval_steps_per_second": 39.152, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 7.419354838709677, | |
| "grad_norm": 0.5620025992393494, | |
| "learning_rate": 0.00017033118279569893, | |
| "loss": 0.2320168685913086, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 7.419354838709677, | |
| "eval_loss": 0.20816229283809662, | |
| "eval_runtime": 61.036, | |
| "eval_samples_per_second": 311.357, | |
| "eval_steps_per_second": 38.928, | |
| "step": 3450 | |
| }, | |
| { | |
| "epoch": 7.526881720430108, | |
| "grad_norm": 0.6183444261550903, | |
| "learning_rate": 0.00016990107526881722, | |
| "loss": 0.2322225570678711, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 7.526881720430108, | |
| "eval_loss": 0.20497609674930573, | |
| "eval_runtime": 60.5328, | |
| "eval_samples_per_second": 313.946, | |
| "eval_steps_per_second": 39.251, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 7.634408602150538, | |
| "grad_norm": 0.5328448414802551, | |
| "learning_rate": 0.00016947096774193548, | |
| "loss": 0.23304037094116212, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 7.634408602150538, | |
| "eval_loss": 0.20321960747241974, | |
| "eval_runtime": 62.1711, | |
| "eval_samples_per_second": 305.672, | |
| "eval_steps_per_second": 38.217, | |
| "step": 3550 | |
| }, | |
| { | |
| "epoch": 7.741935483870968, | |
| "grad_norm": 0.5241938829421997, | |
| "learning_rate": 0.00016904086021505378, | |
| "loss": 0.22476686477661134, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 7.741935483870968, | |
| "eval_loss": 0.2034502625465393, | |
| "eval_runtime": 64.8022, | |
| "eval_samples_per_second": 293.262, | |
| "eval_steps_per_second": 36.665, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 7.849462365591398, | |
| "grad_norm": 0.5440294742584229, | |
| "learning_rate": 0.00016861075268817207, | |
| "loss": 0.227796630859375, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 7.849462365591398, | |
| "eval_loss": 0.20562465488910675, | |
| "eval_runtime": 65.1543, | |
| "eval_samples_per_second": 291.677, | |
| "eval_steps_per_second": 36.467, | |
| "step": 3650 | |
| }, | |
| { | |
| "epoch": 7.956989247311828, | |
| "grad_norm": 0.5037738680839539, | |
| "learning_rate": 0.00016818064516129034, | |
| "loss": 0.23125221252441405, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 7.956989247311828, | |
| "eval_loss": 0.20223356783390045, | |
| "eval_runtime": 65.5561, | |
| "eval_samples_per_second": 289.889, | |
| "eval_steps_per_second": 36.244, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 8.064516129032258, | |
| "grad_norm": 0.843550980091095, | |
| "learning_rate": 0.0001677505376344086, | |
| "loss": 0.2236369514465332, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 8.064516129032258, | |
| "eval_loss": 0.19716867804527283, | |
| "eval_runtime": 66.4534, | |
| "eval_samples_per_second": 285.975, | |
| "eval_steps_per_second": 35.754, | |
| "step": 3750 | |
| }, | |
| { | |
| "epoch": 8.172043010752688, | |
| "grad_norm": 0.5562386512756348, | |
| "learning_rate": 0.0001673204301075269, | |
| "loss": 0.22720510482788087, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 8.172043010752688, | |
| "eval_loss": 0.1974799931049347, | |
| "eval_runtime": 66.0022, | |
| "eval_samples_per_second": 287.93, | |
| "eval_steps_per_second": 35.999, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 8.279569892473118, | |
| "grad_norm": 0.5003981590270996, | |
| "learning_rate": 0.00016689032258064516, | |
| "loss": 0.22547555923461915, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 8.279569892473118, | |
| "eval_loss": 0.19821035861968994, | |
| "eval_runtime": 60.464, | |
| "eval_samples_per_second": 314.303, | |
| "eval_steps_per_second": 39.296, | |
| "step": 3850 | |
| }, | |
| { | |
| "epoch": 8.387096774193548, | |
| "grad_norm": 0.4629065692424774, | |
| "learning_rate": 0.00016646021505376345, | |
| "loss": 0.22113780975341796, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 8.387096774193548, | |
| "eval_loss": 0.1924905627965927, | |
| "eval_runtime": 60.595, | |
| "eval_samples_per_second": 313.623, | |
| "eval_steps_per_second": 39.211, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 8.494623655913978, | |
| "grad_norm": 0.5043092966079712, | |
| "learning_rate": 0.00016603010752688172, | |
| "loss": 0.21599315643310546, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 8.494623655913978, | |
| "eval_loss": 0.19553141295909882, | |
| "eval_runtime": 60.5, | |
| "eval_samples_per_second": 314.116, | |
| "eval_steps_per_second": 39.273, | |
| "step": 3950 | |
| }, | |
| { | |
| "epoch": 8.602150537634408, | |
| "grad_norm": 0.6413733959197998, | |
| "learning_rate": 0.0001656, | |
| "loss": 0.2173159408569336, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 8.602150537634408, | |
| "eval_loss": 0.19092191755771637, | |
| "eval_runtime": 60.5854, | |
| "eval_samples_per_second": 313.673, | |
| "eval_steps_per_second": 39.217, | |
| "step": 4000 | |
| } | |
| ], | |
| "logging_steps": 50, | |
| "max_steps": 23250, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 50, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.685471179194368e+16, | |
| "train_batch_size": 64, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |