{ "best_global_step": 4000, "best_metric": 0.19092191755771637, "best_model_checkpoint": "/home/flytekit/n0w0f/data/mattext_ckpt/results/2026-02-05/18-01-14/pretrain/checkpoints/robocrys_rep_test-pretrain/checkpoint-4000", "epoch": 8.602150537634408, "eval_steps": 50, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.10752688172043011, "grad_norm": 1.1888866424560547, "learning_rate": 0.00019957849462365592, "loss": 5.97920654296875, "step": 50 }, { "epoch": 0.10752688172043011, "eval_loss": 4.124914646148682, "eval_runtime": 60.5178, "eval_samples_per_second": 314.023, "eval_steps_per_second": 39.261, "step": 50 }, { "epoch": 0.21505376344086022, "grad_norm": 0.9824994802474976, "learning_rate": 0.00019914838709677422, "loss": 3.916483154296875, "step": 100 }, { "epoch": 0.21505376344086022, "eval_loss": 3.675534248352051, "eval_runtime": 61.1234, "eval_samples_per_second": 310.912, "eval_steps_per_second": 38.872, "step": 100 }, { "epoch": 0.3225806451612903, "grad_norm": 0.867065966129303, "learning_rate": 0.00019871827956989248, "loss": 3.620672302246094, "step": 150 }, { "epoch": 0.3225806451612903, "eval_loss": 3.4746599197387695, "eval_runtime": 61.4793, "eval_samples_per_second": 309.112, "eval_steps_per_second": 38.647, "step": 150 }, { "epoch": 0.43010752688172044, "grad_norm": 1.192267894744873, "learning_rate": 0.00019828817204301075, "loss": 3.471976013183594, "step": 200 }, { "epoch": 0.43010752688172044, "eval_loss": 3.353644371032715, "eval_runtime": 60.5187, "eval_samples_per_second": 314.019, "eval_steps_per_second": 39.261, "step": 200 }, { "epoch": 0.5376344086021505, "grad_norm": 1.0798981189727783, "learning_rate": 0.00019785806451612904, "loss": 3.360224609375, "step": 250 }, { "epoch": 0.5376344086021505, "eval_loss": 3.247636079788208, "eval_runtime": 61.527, "eval_samples_per_second": 308.873, "eval_steps_per_second": 38.617, "step": 250 }, { "epoch": 0.6451612903225806, "grad_norm": 1.3051457405090332, "learning_rate": 0.00019742795698924733, "loss": 3.262052307128906, "step": 300 }, { "epoch": 0.6451612903225806, "eval_loss": 3.1502654552459717, "eval_runtime": 60.999, "eval_samples_per_second": 311.546, "eval_steps_per_second": 38.951, "step": 300 }, { "epoch": 0.7526881720430108, "grad_norm": 1.1396135091781616, "learning_rate": 0.0001969978494623656, "loss": 3.225200500488281, "step": 350 }, { "epoch": 0.7526881720430108, "eval_loss": 3.094292163848877, "eval_runtime": 61.381, "eval_samples_per_second": 309.607, "eval_steps_per_second": 38.709, "step": 350 }, { "epoch": 0.8602150537634409, "grad_norm": 1.0816289186477661, "learning_rate": 0.0001965677419354839, "loss": 3.1344537353515625, "step": 400 }, { "epoch": 0.8602150537634409, "eval_loss": 3.0037944316864014, "eval_runtime": 61.1417, "eval_samples_per_second": 310.819, "eval_steps_per_second": 38.861, "step": 400 }, { "epoch": 0.967741935483871, "grad_norm": 1.220457673072815, "learning_rate": 0.00019613763440860216, "loss": 3.024658203125, "step": 450 }, { "epoch": 0.967741935483871, "eval_loss": 2.9253640174865723, "eval_runtime": 61.6823, "eval_samples_per_second": 308.095, "eval_steps_per_second": 38.52, "step": 450 }, { "epoch": 1.075268817204301, "grad_norm": 1.18031644821167, "learning_rate": 0.00019570752688172045, "loss": 2.9539215087890627, "step": 500 }, { "epoch": 1.075268817204301, "eval_loss": 2.827315092086792, "eval_runtime": 64.027, "eval_samples_per_second": 296.812, "eval_steps_per_second": 37.109, "step": 500 }, { "epoch": 1.1827956989247312, "grad_norm": 1.4481481313705444, "learning_rate": 0.00019527741935483872, "loss": 2.8536431884765623, "step": 550 }, { "epoch": 1.1827956989247312, "eval_loss": 2.6743366718292236, "eval_runtime": 60.9092, "eval_samples_per_second": 312.005, "eval_steps_per_second": 39.009, "step": 550 }, { "epoch": 1.2903225806451613, "grad_norm": 1.5985803604125977, "learning_rate": 0.00019484731182795698, "loss": 2.7353704833984374, "step": 600 }, { "epoch": 1.2903225806451613, "eval_loss": 2.4861812591552734, "eval_runtime": 61.6826, "eval_samples_per_second": 308.093, "eval_steps_per_second": 38.52, "step": 600 }, { "epoch": 1.3978494623655915, "grad_norm": 2.046145439147949, "learning_rate": 0.00019441720430107528, "loss": 2.464430084228516, "step": 650 }, { "epoch": 1.3978494623655915, "eval_loss": 2.0265886783599854, "eval_runtime": 61.2709, "eval_samples_per_second": 310.164, "eval_steps_per_second": 38.779, "step": 650 }, { "epoch": 1.5053763440860215, "grad_norm": 1.8674232959747314, "learning_rate": 0.00019398709677419354, "loss": 1.9112973022460937, "step": 700 }, { "epoch": 1.5053763440860215, "eval_loss": 1.3678908348083496, "eval_runtime": 62.2031, "eval_samples_per_second": 305.515, "eval_steps_per_second": 38.197, "step": 700 }, { "epoch": 1.6129032258064515, "grad_norm": 1.708408236503601, "learning_rate": 0.00019355698924731184, "loss": 1.4241523742675781, "step": 750 }, { "epoch": 1.6129032258064515, "eval_loss": 1.0675994157791138, "eval_runtime": 62.2, "eval_samples_per_second": 305.53, "eval_steps_per_second": 38.199, "step": 750 }, { "epoch": 1.7204301075268817, "grad_norm": 1.6592656373977661, "learning_rate": 0.00019312688172043013, "loss": 1.2252975463867188, "step": 800 }, { "epoch": 1.7204301075268817, "eval_loss": 0.9175282716751099, "eval_runtime": 61.3094, "eval_samples_per_second": 309.969, "eval_steps_per_second": 38.754, "step": 800 }, { "epoch": 1.827956989247312, "grad_norm": 1.2984247207641602, "learning_rate": 0.0001926967741935484, "loss": 1.0399230194091797, "step": 850 }, { "epoch": 1.827956989247312, "eval_loss": 0.8346064686775208, "eval_runtime": 61.1605, "eval_samples_per_second": 310.724, "eval_steps_per_second": 38.849, "step": 850 }, { "epoch": 1.935483870967742, "grad_norm": 1.1744712591171265, "learning_rate": 0.0001922666666666667, "loss": 0.9568134307861328, "step": 900 }, { "epoch": 1.935483870967742, "eval_loss": 0.7724924087524414, "eval_runtime": 62.2824, "eval_samples_per_second": 305.126, "eval_steps_per_second": 38.149, "step": 900 }, { "epoch": 2.043010752688172, "grad_norm": 1.2494049072265625, "learning_rate": 0.00019183655913978495, "loss": 0.8979853820800782, "step": 950 }, { "epoch": 2.043010752688172, "eval_loss": 0.7325491905212402, "eval_runtime": 62.8935, "eval_samples_per_second": 302.161, "eval_steps_per_second": 37.778, "step": 950 }, { "epoch": 2.150537634408602, "grad_norm": 1.0687495470046997, "learning_rate": 0.00019140645161290322, "loss": 0.8724540710449219, "step": 1000 }, { "epoch": 2.150537634408602, "eval_loss": 0.6943864822387695, "eval_runtime": 64.2005, "eval_samples_per_second": 296.01, "eval_steps_per_second": 37.009, "step": 1000 }, { "epoch": 2.258064516129032, "grad_norm": 0.9108296036720276, "learning_rate": 0.0001909763440860215, "loss": 0.8106794738769532, "step": 1050 }, { "epoch": 2.258064516129032, "eval_loss": 0.666123628616333, "eval_runtime": 60.9142, "eval_samples_per_second": 311.98, "eval_steps_per_second": 39.006, "step": 1050 }, { "epoch": 2.3655913978494625, "grad_norm": 0.8529163002967834, "learning_rate": 0.00019054623655913978, "loss": 0.7816014862060547, "step": 1100 }, { "epoch": 2.3655913978494625, "eval_loss": 0.6435992121696472, "eval_runtime": 61.9346, "eval_samples_per_second": 306.84, "eval_steps_per_second": 38.363, "step": 1100 }, { "epoch": 2.4731182795698925, "grad_norm": 0.9023746848106384, "learning_rate": 0.00019011612903225807, "loss": 0.7448858642578124, "step": 1150 }, { "epoch": 2.4731182795698925, "eval_loss": 0.6147477626800537, "eval_runtime": 60.7037, "eval_samples_per_second": 313.062, "eval_steps_per_second": 39.141, "step": 1150 }, { "epoch": 2.5806451612903225, "grad_norm": 0.7893891930580139, "learning_rate": 0.00018968602150537636, "loss": 0.7744358062744141, "step": 1200 }, { "epoch": 2.5806451612903225, "eval_loss": 0.6008749604225159, "eval_runtime": 62.0421, "eval_samples_per_second": 306.308, "eval_steps_per_second": 38.297, "step": 1200 }, { "epoch": 2.688172043010753, "grad_norm": 0.8543435335159302, "learning_rate": 0.00018925591397849463, "loss": 0.698813705444336, "step": 1250 }, { "epoch": 2.688172043010753, "eval_loss": 0.5843669176101685, "eval_runtime": 61.7236, "eval_samples_per_second": 307.889, "eval_steps_per_second": 38.494, "step": 1250 }, { "epoch": 2.795698924731183, "grad_norm": 0.862782895565033, "learning_rate": 0.00018882580645161292, "loss": 0.7231275939941406, "step": 1300 }, { "epoch": 2.795698924731183, "eval_loss": 0.560819149017334, "eval_runtime": 61.272, "eval_samples_per_second": 310.158, "eval_steps_per_second": 38.778, "step": 1300 }, { "epoch": 2.903225806451613, "grad_norm": 0.8126527667045593, "learning_rate": 0.0001883956989247312, "loss": 0.6607036590576172, "step": 1350 }, { "epoch": 2.903225806451613, "eval_loss": 0.5523199439048767, "eval_runtime": 61.41, "eval_samples_per_second": 309.461, "eval_steps_per_second": 38.691, "step": 1350 }, { "epoch": 3.010752688172043, "grad_norm": 0.8788714408874512, "learning_rate": 0.00018796559139784945, "loss": 0.658017349243164, "step": 1400 }, { "epoch": 3.010752688172043, "eval_loss": 0.5504087805747986, "eval_runtime": 61.2893, "eval_samples_per_second": 310.07, "eval_steps_per_second": 38.767, "step": 1400 }, { "epoch": 3.118279569892473, "grad_norm": 0.8354722857475281, "learning_rate": 0.00018753548387096775, "loss": 0.6500599670410157, "step": 1450 }, { "epoch": 3.118279569892473, "eval_loss": 0.5395110845565796, "eval_runtime": 60.5063, "eval_samples_per_second": 314.083, "eval_steps_per_second": 39.269, "step": 1450 }, { "epoch": 3.225806451612903, "grad_norm": 0.8122305870056152, "learning_rate": 0.000187105376344086, "loss": 0.6230792999267578, "step": 1500 }, { "epoch": 3.225806451612903, "eval_loss": 0.5187473297119141, "eval_runtime": 60.7322, "eval_samples_per_second": 312.915, "eval_steps_per_second": 39.123, "step": 1500 }, { "epoch": 3.3333333333333335, "grad_norm": 0.673494815826416, "learning_rate": 0.0001866752688172043, "loss": 0.6118016052246094, "step": 1550 }, { "epoch": 3.3333333333333335, "eval_loss": 0.5081239938735962, "eval_runtime": 60.5862, "eval_samples_per_second": 313.669, "eval_steps_per_second": 39.217, "step": 1550 }, { "epoch": 3.4408602150537635, "grad_norm": 0.8055212497711182, "learning_rate": 0.0001862451612903226, "loss": 0.6122843170166016, "step": 1600 }, { "epoch": 3.4408602150537635, "eval_loss": 0.49499744176864624, "eval_runtime": 60.6568, "eval_samples_per_second": 313.304, "eval_steps_per_second": 39.171, "step": 1600 }, { "epoch": 3.5483870967741935, "grad_norm": 0.7935542464256287, "learning_rate": 0.00018581505376344087, "loss": 0.5825344467163086, "step": 1650 }, { "epoch": 3.5483870967741935, "eval_loss": 0.48452192544937134, "eval_runtime": 60.5763, "eval_samples_per_second": 313.72, "eval_steps_per_second": 39.223, "step": 1650 }, { "epoch": 3.6559139784946235, "grad_norm": 0.6395400166511536, "learning_rate": 0.00018538494623655916, "loss": 0.5727723693847656, "step": 1700 }, { "epoch": 3.6559139784946235, "eval_loss": 0.4738766551017761, "eval_runtime": 60.5051, "eval_samples_per_second": 314.089, "eval_steps_per_second": 39.269, "step": 1700 }, { "epoch": 3.763440860215054, "grad_norm": 0.6544663906097412, "learning_rate": 0.00018495483870967742, "loss": 0.5858316421508789, "step": 1750 }, { "epoch": 3.763440860215054, "eval_loss": 0.4562221169471741, "eval_runtime": 60.4697, "eval_samples_per_second": 314.273, "eval_steps_per_second": 39.292, "step": 1750 }, { "epoch": 3.870967741935484, "grad_norm": 0.773256778717041, "learning_rate": 0.00018452473118279572, "loss": 0.5555976867675781, "step": 1800 }, { "epoch": 3.870967741935484, "eval_loss": 0.4462752342224121, "eval_runtime": 61.139, "eval_samples_per_second": 310.833, "eval_steps_per_second": 38.862, "step": 1800 }, { "epoch": 3.978494623655914, "grad_norm": 0.6679997444152832, "learning_rate": 0.00018409462365591398, "loss": 0.5079600143432618, "step": 1850 }, { "epoch": 3.978494623655914, "eval_loss": 0.43978169560432434, "eval_runtime": 60.5103, "eval_samples_per_second": 314.062, "eval_steps_per_second": 39.266, "step": 1850 }, { "epoch": 4.086021505376344, "grad_norm": 0.7930998206138611, "learning_rate": 0.00018366451612903225, "loss": 0.5580390548706055, "step": 1900 }, { "epoch": 4.086021505376344, "eval_loss": 0.4352206587791443, "eval_runtime": 60.8357, "eval_samples_per_second": 312.382, "eval_steps_per_second": 39.056, "step": 1900 }, { "epoch": 4.193548387096774, "grad_norm": 0.6607942581176758, "learning_rate": 0.00018323440860215054, "loss": 0.49173324584960937, "step": 1950 }, { "epoch": 4.193548387096774, "eval_loss": 0.4238659143447876, "eval_runtime": 60.9872, "eval_samples_per_second": 311.606, "eval_steps_per_second": 38.959, "step": 1950 }, { "epoch": 4.301075268817204, "grad_norm": 0.6287643909454346, "learning_rate": 0.00018280430107526884, "loss": 0.4687882232666016, "step": 2000 }, { "epoch": 4.301075268817204, "eval_loss": 0.4168907403945923, "eval_runtime": 61.005, "eval_samples_per_second": 311.515, "eval_steps_per_second": 38.948, "step": 2000 }, { "epoch": 4.408602150537634, "grad_norm": 0.6433095932006836, "learning_rate": 0.0001823741935483871, "loss": 0.4763982009887695, "step": 2050 }, { "epoch": 4.408602150537634, "eval_loss": 0.4120262861251831, "eval_runtime": 61.5507, "eval_samples_per_second": 308.753, "eval_steps_per_second": 38.602, "step": 2050 }, { "epoch": 4.516129032258064, "grad_norm": 0.76325523853302, "learning_rate": 0.0001819440860215054, "loss": 0.5169943237304687, "step": 2100 }, { "epoch": 4.516129032258064, "eval_loss": 0.40777090191841125, "eval_runtime": 61.9659, "eval_samples_per_second": 306.685, "eval_steps_per_second": 38.344, "step": 2100 }, { "epoch": 4.623655913978495, "grad_norm": 0.7534022331237793, "learning_rate": 0.00018151397849462366, "loss": 0.4840876770019531, "step": 2150 }, { "epoch": 4.623655913978495, "eval_loss": 0.396854966878891, "eval_runtime": 61.4429, "eval_samples_per_second": 309.295, "eval_steps_per_second": 38.67, "step": 2150 }, { "epoch": 4.731182795698925, "grad_norm": 0.688862144947052, "learning_rate": 0.00018108387096774195, "loss": 0.46516273498535154, "step": 2200 }, { "epoch": 4.731182795698925, "eval_loss": 0.38546594977378845, "eval_runtime": 60.8637, "eval_samples_per_second": 312.239, "eval_steps_per_second": 39.038, "step": 2200 }, { "epoch": 4.838709677419355, "grad_norm": 0.5328208208084106, "learning_rate": 0.00018065376344086022, "loss": 0.5028326034545898, "step": 2250 }, { "epoch": 4.838709677419355, "eval_loss": 0.37445569038391113, "eval_runtime": 61.5819, "eval_samples_per_second": 308.597, "eval_steps_per_second": 38.583, "step": 2250 }, { "epoch": 4.946236559139785, "grad_norm": 0.5857045650482178, "learning_rate": 0.00018022365591397848, "loss": 0.43645286560058594, "step": 2300 }, { "epoch": 4.946236559139785, "eval_loss": 0.3690737187862396, "eval_runtime": 61.4895, "eval_samples_per_second": 309.061, "eval_steps_per_second": 38.641, "step": 2300 }, { "epoch": 5.053763440860215, "grad_norm": 0.6344749331474304, "learning_rate": 0.00017979354838709678, "loss": 0.42147178649902345, "step": 2350 }, { "epoch": 5.053763440860215, "eval_loss": 0.3570445775985718, "eval_runtime": 62.1748, "eval_samples_per_second": 305.654, "eval_steps_per_second": 38.215, "step": 2350 }, { "epoch": 5.161290322580645, "grad_norm": 0.6610215306282043, "learning_rate": 0.00017936344086021507, "loss": 0.4157654571533203, "step": 2400 }, { "epoch": 5.161290322580645, "eval_loss": 0.3497065603733063, "eval_runtime": 61.6389, "eval_samples_per_second": 308.312, "eval_steps_per_second": 38.547, "step": 2400 }, { "epoch": 5.268817204301075, "grad_norm": 0.5334368348121643, "learning_rate": 0.00017893333333333336, "loss": 0.4012648391723633, "step": 2450 }, { "epoch": 5.268817204301075, "eval_loss": 0.33196908235549927, "eval_runtime": 64.4623, "eval_samples_per_second": 294.808, "eval_steps_per_second": 36.859, "step": 2450 }, { "epoch": 5.376344086021505, "grad_norm": 0.7559072971343994, "learning_rate": 0.00017850322580645163, "loss": 0.4343834686279297, "step": 2500 }, { "epoch": 5.376344086021505, "eval_loss": 0.31756916642189026, "eval_runtime": 64.0899, "eval_samples_per_second": 296.521, "eval_steps_per_second": 37.073, "step": 2500 }, { "epoch": 5.483870967741936, "grad_norm": 0.6970711946487427, "learning_rate": 0.0001780731182795699, "loss": 0.3609016799926758, "step": 2550 }, { "epoch": 5.483870967741936, "eval_loss": 0.3129482567310333, "eval_runtime": 64.2007, "eval_samples_per_second": 296.009, "eval_steps_per_second": 37.009, "step": 2550 }, { "epoch": 5.591397849462366, "grad_norm": 0.7393150329589844, "learning_rate": 0.0001776430107526882, "loss": 0.36085220336914064, "step": 2600 }, { "epoch": 5.591397849462366, "eval_loss": 0.29907363653182983, "eval_runtime": 64.2974, "eval_samples_per_second": 295.564, "eval_steps_per_second": 36.953, "step": 2600 }, { "epoch": 5.698924731182796, "grad_norm": 0.6760246157646179, "learning_rate": 0.00017721290322580645, "loss": 0.3354073715209961, "step": 2650 }, { "epoch": 5.698924731182796, "eval_loss": 0.28903692960739136, "eval_runtime": 64.2379, "eval_samples_per_second": 295.838, "eval_steps_per_second": 36.988, "step": 2650 }, { "epoch": 5.806451612903226, "grad_norm": 0.6342934370040894, "learning_rate": 0.00017678279569892472, "loss": 0.33487789154052733, "step": 2700 }, { "epoch": 5.806451612903226, "eval_loss": 0.2763662040233612, "eval_runtime": 63.0262, "eval_samples_per_second": 301.525, "eval_steps_per_second": 37.699, "step": 2700 }, { "epoch": 5.913978494623656, "grad_norm": 0.6288059949874878, "learning_rate": 0.00017635268817204301, "loss": 0.3166103744506836, "step": 2750 }, { "epoch": 5.913978494623656, "eval_loss": 0.27043381333351135, "eval_runtime": 63.0792, "eval_samples_per_second": 301.272, "eval_steps_per_second": 37.667, "step": 2750 }, { "epoch": 6.021505376344086, "grad_norm": 0.8228830695152283, "learning_rate": 0.0001759225806451613, "loss": 0.3166475486755371, "step": 2800 }, { "epoch": 6.021505376344086, "eval_loss": 0.26023828983306885, "eval_runtime": 64.4666, "eval_samples_per_second": 294.788, "eval_steps_per_second": 36.856, "step": 2800 }, { "epoch": 6.129032258064516, "grad_norm": 0.6261463165283203, "learning_rate": 0.0001754924731182796, "loss": 0.30168416976928714, "step": 2850 }, { "epoch": 6.129032258064516, "eval_loss": 0.2530518174171448, "eval_runtime": 63.8775, "eval_samples_per_second": 297.507, "eval_steps_per_second": 37.196, "step": 2850 }, { "epoch": 6.236559139784946, "grad_norm": 0.7265720367431641, "learning_rate": 0.00017506236559139787, "loss": 0.29341196060180663, "step": 2900 }, { "epoch": 6.236559139784946, "eval_loss": 0.24442243576049805, "eval_runtime": 63.2991, "eval_samples_per_second": 300.226, "eval_steps_per_second": 37.536, "step": 2900 }, { "epoch": 6.344086021505376, "grad_norm": 0.5499133467674255, "learning_rate": 0.00017463225806451613, "loss": 0.2850730323791504, "step": 2950 }, { "epoch": 6.344086021505376, "eval_loss": 0.237361341714859, "eval_runtime": 64.5725, "eval_samples_per_second": 294.305, "eval_steps_per_second": 36.796, "step": 2950 }, { "epoch": 6.451612903225806, "grad_norm": 0.7466527223587036, "learning_rate": 0.00017420215053763442, "loss": 0.2737441635131836, "step": 3000 }, { "epoch": 6.451612903225806, "eval_loss": 0.22867611050605774, "eval_runtime": 64.8912, "eval_samples_per_second": 292.86, "eval_steps_per_second": 36.615, "step": 3000 }, { "epoch": 6.559139784946236, "grad_norm": 0.605771005153656, "learning_rate": 0.0001737720430107527, "loss": 0.26982501983642576, "step": 3050 }, { "epoch": 6.559139784946236, "eval_loss": 0.22686000168323517, "eval_runtime": 64.8566, "eval_samples_per_second": 293.016, "eval_steps_per_second": 36.635, "step": 3050 }, { "epoch": 6.666666666666667, "grad_norm": 0.6927595138549805, "learning_rate": 0.00017334193548387096, "loss": 0.2592777633666992, "step": 3100 }, { "epoch": 6.666666666666667, "eval_loss": 0.22359216213226318, "eval_runtime": 64.9559, "eval_samples_per_second": 292.568, "eval_steps_per_second": 36.579, "step": 3100 }, { "epoch": 6.774193548387097, "grad_norm": 0.6070519685745239, "learning_rate": 0.00017291182795698925, "loss": 0.2539858436584473, "step": 3150 }, { "epoch": 6.774193548387097, "eval_loss": 0.22382962703704834, "eval_runtime": 64.9172, "eval_samples_per_second": 292.742, "eval_steps_per_second": 36.6, "step": 3150 }, { "epoch": 6.881720430107527, "grad_norm": 0.7206361889839172, "learning_rate": 0.00017248172043010754, "loss": 0.2550803184509277, "step": 3200 }, { "epoch": 6.881720430107527, "eval_loss": 0.22055239975452423, "eval_runtime": 65.5818, "eval_samples_per_second": 289.775, "eval_steps_per_second": 36.23, "step": 3200 }, { "epoch": 6.989247311827957, "grad_norm": 0.6855896711349487, "learning_rate": 0.00017205161290322584, "loss": 0.2432615852355957, "step": 3250 }, { "epoch": 6.989247311827957, "eval_loss": 0.21467819809913635, "eval_runtime": 66.2905, "eval_samples_per_second": 286.677, "eval_steps_per_second": 35.842, "step": 3250 }, { "epoch": 7.096774193548387, "grad_norm": 0.5612008571624756, "learning_rate": 0.0001716215053763441, "loss": 0.24562849044799806, "step": 3300 }, { "epoch": 7.096774193548387, "eval_loss": 0.21375121176242828, "eval_runtime": 66.0151, "eval_samples_per_second": 287.874, "eval_steps_per_second": 35.992, "step": 3300 }, { "epoch": 7.204301075268817, "grad_norm": 0.7433006763458252, "learning_rate": 0.00017119139784946237, "loss": 0.2393852424621582, "step": 3350 }, { "epoch": 7.204301075268817, "eval_loss": 0.20871323347091675, "eval_runtime": 61.9563, "eval_samples_per_second": 306.732, "eval_steps_per_second": 38.35, "step": 3350 }, { "epoch": 7.311827956989247, "grad_norm": 0.6491153836250305, "learning_rate": 0.00017076129032258066, "loss": 0.24959787368774414, "step": 3400 }, { "epoch": 7.311827956989247, "eval_loss": 0.21120016276836395, "eval_runtime": 60.6864, "eval_samples_per_second": 313.151, "eval_steps_per_second": 39.152, "step": 3400 }, { "epoch": 7.419354838709677, "grad_norm": 0.5620025992393494, "learning_rate": 0.00017033118279569893, "loss": 0.2320168685913086, "step": 3450 }, { "epoch": 7.419354838709677, "eval_loss": 0.20816229283809662, "eval_runtime": 61.036, "eval_samples_per_second": 311.357, "eval_steps_per_second": 38.928, "step": 3450 }, { "epoch": 7.526881720430108, "grad_norm": 0.6183444261550903, "learning_rate": 0.00016990107526881722, "loss": 0.2322225570678711, "step": 3500 }, { "epoch": 7.526881720430108, "eval_loss": 0.20497609674930573, "eval_runtime": 60.5328, "eval_samples_per_second": 313.946, "eval_steps_per_second": 39.251, "step": 3500 }, { "epoch": 7.634408602150538, "grad_norm": 0.5328448414802551, "learning_rate": 0.00016947096774193548, "loss": 0.23304037094116212, "step": 3550 }, { "epoch": 7.634408602150538, "eval_loss": 0.20321960747241974, "eval_runtime": 62.1711, "eval_samples_per_second": 305.672, "eval_steps_per_second": 38.217, "step": 3550 }, { "epoch": 7.741935483870968, "grad_norm": 0.5241938829421997, "learning_rate": 0.00016904086021505378, "loss": 0.22476686477661134, "step": 3600 }, { "epoch": 7.741935483870968, "eval_loss": 0.2034502625465393, "eval_runtime": 64.8022, "eval_samples_per_second": 293.262, "eval_steps_per_second": 36.665, "step": 3600 }, { "epoch": 7.849462365591398, "grad_norm": 0.5440294742584229, "learning_rate": 0.00016861075268817207, "loss": 0.227796630859375, "step": 3650 }, { "epoch": 7.849462365591398, "eval_loss": 0.20562465488910675, "eval_runtime": 65.1543, "eval_samples_per_second": 291.677, "eval_steps_per_second": 36.467, "step": 3650 }, { "epoch": 7.956989247311828, "grad_norm": 0.5037738680839539, "learning_rate": 0.00016818064516129034, "loss": 0.23125221252441405, "step": 3700 }, { "epoch": 7.956989247311828, "eval_loss": 0.20223356783390045, "eval_runtime": 65.5561, "eval_samples_per_second": 289.889, "eval_steps_per_second": 36.244, "step": 3700 }, { "epoch": 8.064516129032258, "grad_norm": 0.843550980091095, "learning_rate": 0.0001677505376344086, "loss": 0.2236369514465332, "step": 3750 }, { "epoch": 8.064516129032258, "eval_loss": 0.19716867804527283, "eval_runtime": 66.4534, "eval_samples_per_second": 285.975, "eval_steps_per_second": 35.754, "step": 3750 }, { "epoch": 8.172043010752688, "grad_norm": 0.5562386512756348, "learning_rate": 0.0001673204301075269, "loss": 0.22720510482788087, "step": 3800 }, { "epoch": 8.172043010752688, "eval_loss": 0.1974799931049347, "eval_runtime": 66.0022, "eval_samples_per_second": 287.93, "eval_steps_per_second": 35.999, "step": 3800 }, { "epoch": 8.279569892473118, "grad_norm": 0.5003981590270996, "learning_rate": 0.00016689032258064516, "loss": 0.22547555923461915, "step": 3850 }, { "epoch": 8.279569892473118, "eval_loss": 0.19821035861968994, "eval_runtime": 60.464, "eval_samples_per_second": 314.303, "eval_steps_per_second": 39.296, "step": 3850 }, { "epoch": 8.387096774193548, "grad_norm": 0.4629065692424774, "learning_rate": 0.00016646021505376345, "loss": 0.22113780975341796, "step": 3900 }, { "epoch": 8.387096774193548, "eval_loss": 0.1924905627965927, "eval_runtime": 60.595, "eval_samples_per_second": 313.623, "eval_steps_per_second": 39.211, "step": 3900 }, { "epoch": 8.494623655913978, "grad_norm": 0.5043092966079712, "learning_rate": 0.00016603010752688172, "loss": 0.21599315643310546, "step": 3950 }, { "epoch": 8.494623655913978, "eval_loss": 0.19553141295909882, "eval_runtime": 60.5, "eval_samples_per_second": 314.116, "eval_steps_per_second": 39.273, "step": 3950 }, { "epoch": 8.602150537634408, "grad_norm": 0.6413733959197998, "learning_rate": 0.0001656, "loss": 0.2173159408569336, "step": 4000 }, { "epoch": 8.602150537634408, "eval_loss": 0.19092191755771637, "eval_runtime": 60.5854, "eval_samples_per_second": 313.673, "eval_steps_per_second": 39.217, "step": 4000 } ], "logging_steps": 50, "max_steps": 23250, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.685471179194368e+16, "train_batch_size": 64, "trial_name": null, "trial_params": null }