MatText_30k_robocrys_rep_test / trainer_state.json
n0w0f's picture
Upload checkpoint
432a13c verified
{
"best_global_step": 4000,
"best_metric": 0.19092191755771637,
"best_model_checkpoint": "/home/flytekit/n0w0f/data/mattext_ckpt/results/2026-02-05/18-01-14/pretrain/checkpoints/robocrys_rep_test-pretrain/checkpoint-4000",
"epoch": 8.602150537634408,
"eval_steps": 50,
"global_step": 4000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.10752688172043011,
"grad_norm": 1.1888866424560547,
"learning_rate": 0.00019957849462365592,
"loss": 5.97920654296875,
"step": 50
},
{
"epoch": 0.10752688172043011,
"eval_loss": 4.124914646148682,
"eval_runtime": 60.5178,
"eval_samples_per_second": 314.023,
"eval_steps_per_second": 39.261,
"step": 50
},
{
"epoch": 0.21505376344086022,
"grad_norm": 0.9824994802474976,
"learning_rate": 0.00019914838709677422,
"loss": 3.916483154296875,
"step": 100
},
{
"epoch": 0.21505376344086022,
"eval_loss": 3.675534248352051,
"eval_runtime": 61.1234,
"eval_samples_per_second": 310.912,
"eval_steps_per_second": 38.872,
"step": 100
},
{
"epoch": 0.3225806451612903,
"grad_norm": 0.867065966129303,
"learning_rate": 0.00019871827956989248,
"loss": 3.620672302246094,
"step": 150
},
{
"epoch": 0.3225806451612903,
"eval_loss": 3.4746599197387695,
"eval_runtime": 61.4793,
"eval_samples_per_second": 309.112,
"eval_steps_per_second": 38.647,
"step": 150
},
{
"epoch": 0.43010752688172044,
"grad_norm": 1.192267894744873,
"learning_rate": 0.00019828817204301075,
"loss": 3.471976013183594,
"step": 200
},
{
"epoch": 0.43010752688172044,
"eval_loss": 3.353644371032715,
"eval_runtime": 60.5187,
"eval_samples_per_second": 314.019,
"eval_steps_per_second": 39.261,
"step": 200
},
{
"epoch": 0.5376344086021505,
"grad_norm": 1.0798981189727783,
"learning_rate": 0.00019785806451612904,
"loss": 3.360224609375,
"step": 250
},
{
"epoch": 0.5376344086021505,
"eval_loss": 3.247636079788208,
"eval_runtime": 61.527,
"eval_samples_per_second": 308.873,
"eval_steps_per_second": 38.617,
"step": 250
},
{
"epoch": 0.6451612903225806,
"grad_norm": 1.3051457405090332,
"learning_rate": 0.00019742795698924733,
"loss": 3.262052307128906,
"step": 300
},
{
"epoch": 0.6451612903225806,
"eval_loss": 3.1502654552459717,
"eval_runtime": 60.999,
"eval_samples_per_second": 311.546,
"eval_steps_per_second": 38.951,
"step": 300
},
{
"epoch": 0.7526881720430108,
"grad_norm": 1.1396135091781616,
"learning_rate": 0.0001969978494623656,
"loss": 3.225200500488281,
"step": 350
},
{
"epoch": 0.7526881720430108,
"eval_loss": 3.094292163848877,
"eval_runtime": 61.381,
"eval_samples_per_second": 309.607,
"eval_steps_per_second": 38.709,
"step": 350
},
{
"epoch": 0.8602150537634409,
"grad_norm": 1.0816289186477661,
"learning_rate": 0.0001965677419354839,
"loss": 3.1344537353515625,
"step": 400
},
{
"epoch": 0.8602150537634409,
"eval_loss": 3.0037944316864014,
"eval_runtime": 61.1417,
"eval_samples_per_second": 310.819,
"eval_steps_per_second": 38.861,
"step": 400
},
{
"epoch": 0.967741935483871,
"grad_norm": 1.220457673072815,
"learning_rate": 0.00019613763440860216,
"loss": 3.024658203125,
"step": 450
},
{
"epoch": 0.967741935483871,
"eval_loss": 2.9253640174865723,
"eval_runtime": 61.6823,
"eval_samples_per_second": 308.095,
"eval_steps_per_second": 38.52,
"step": 450
},
{
"epoch": 1.075268817204301,
"grad_norm": 1.18031644821167,
"learning_rate": 0.00019570752688172045,
"loss": 2.9539215087890627,
"step": 500
},
{
"epoch": 1.075268817204301,
"eval_loss": 2.827315092086792,
"eval_runtime": 64.027,
"eval_samples_per_second": 296.812,
"eval_steps_per_second": 37.109,
"step": 500
},
{
"epoch": 1.1827956989247312,
"grad_norm": 1.4481481313705444,
"learning_rate": 0.00019527741935483872,
"loss": 2.8536431884765623,
"step": 550
},
{
"epoch": 1.1827956989247312,
"eval_loss": 2.6743366718292236,
"eval_runtime": 60.9092,
"eval_samples_per_second": 312.005,
"eval_steps_per_second": 39.009,
"step": 550
},
{
"epoch": 1.2903225806451613,
"grad_norm": 1.5985803604125977,
"learning_rate": 0.00019484731182795698,
"loss": 2.7353704833984374,
"step": 600
},
{
"epoch": 1.2903225806451613,
"eval_loss": 2.4861812591552734,
"eval_runtime": 61.6826,
"eval_samples_per_second": 308.093,
"eval_steps_per_second": 38.52,
"step": 600
},
{
"epoch": 1.3978494623655915,
"grad_norm": 2.046145439147949,
"learning_rate": 0.00019441720430107528,
"loss": 2.464430084228516,
"step": 650
},
{
"epoch": 1.3978494623655915,
"eval_loss": 2.0265886783599854,
"eval_runtime": 61.2709,
"eval_samples_per_second": 310.164,
"eval_steps_per_second": 38.779,
"step": 650
},
{
"epoch": 1.5053763440860215,
"grad_norm": 1.8674232959747314,
"learning_rate": 0.00019398709677419354,
"loss": 1.9112973022460937,
"step": 700
},
{
"epoch": 1.5053763440860215,
"eval_loss": 1.3678908348083496,
"eval_runtime": 62.2031,
"eval_samples_per_second": 305.515,
"eval_steps_per_second": 38.197,
"step": 700
},
{
"epoch": 1.6129032258064515,
"grad_norm": 1.708408236503601,
"learning_rate": 0.00019355698924731184,
"loss": 1.4241523742675781,
"step": 750
},
{
"epoch": 1.6129032258064515,
"eval_loss": 1.0675994157791138,
"eval_runtime": 62.2,
"eval_samples_per_second": 305.53,
"eval_steps_per_second": 38.199,
"step": 750
},
{
"epoch": 1.7204301075268817,
"grad_norm": 1.6592656373977661,
"learning_rate": 0.00019312688172043013,
"loss": 1.2252975463867188,
"step": 800
},
{
"epoch": 1.7204301075268817,
"eval_loss": 0.9175282716751099,
"eval_runtime": 61.3094,
"eval_samples_per_second": 309.969,
"eval_steps_per_second": 38.754,
"step": 800
},
{
"epoch": 1.827956989247312,
"grad_norm": 1.2984247207641602,
"learning_rate": 0.0001926967741935484,
"loss": 1.0399230194091797,
"step": 850
},
{
"epoch": 1.827956989247312,
"eval_loss": 0.8346064686775208,
"eval_runtime": 61.1605,
"eval_samples_per_second": 310.724,
"eval_steps_per_second": 38.849,
"step": 850
},
{
"epoch": 1.935483870967742,
"grad_norm": 1.1744712591171265,
"learning_rate": 0.0001922666666666667,
"loss": 0.9568134307861328,
"step": 900
},
{
"epoch": 1.935483870967742,
"eval_loss": 0.7724924087524414,
"eval_runtime": 62.2824,
"eval_samples_per_second": 305.126,
"eval_steps_per_second": 38.149,
"step": 900
},
{
"epoch": 2.043010752688172,
"grad_norm": 1.2494049072265625,
"learning_rate": 0.00019183655913978495,
"loss": 0.8979853820800782,
"step": 950
},
{
"epoch": 2.043010752688172,
"eval_loss": 0.7325491905212402,
"eval_runtime": 62.8935,
"eval_samples_per_second": 302.161,
"eval_steps_per_second": 37.778,
"step": 950
},
{
"epoch": 2.150537634408602,
"grad_norm": 1.0687495470046997,
"learning_rate": 0.00019140645161290322,
"loss": 0.8724540710449219,
"step": 1000
},
{
"epoch": 2.150537634408602,
"eval_loss": 0.6943864822387695,
"eval_runtime": 64.2005,
"eval_samples_per_second": 296.01,
"eval_steps_per_second": 37.009,
"step": 1000
},
{
"epoch": 2.258064516129032,
"grad_norm": 0.9108296036720276,
"learning_rate": 0.0001909763440860215,
"loss": 0.8106794738769532,
"step": 1050
},
{
"epoch": 2.258064516129032,
"eval_loss": 0.666123628616333,
"eval_runtime": 60.9142,
"eval_samples_per_second": 311.98,
"eval_steps_per_second": 39.006,
"step": 1050
},
{
"epoch": 2.3655913978494625,
"grad_norm": 0.8529163002967834,
"learning_rate": 0.00019054623655913978,
"loss": 0.7816014862060547,
"step": 1100
},
{
"epoch": 2.3655913978494625,
"eval_loss": 0.6435992121696472,
"eval_runtime": 61.9346,
"eval_samples_per_second": 306.84,
"eval_steps_per_second": 38.363,
"step": 1100
},
{
"epoch": 2.4731182795698925,
"grad_norm": 0.9023746848106384,
"learning_rate": 0.00019011612903225807,
"loss": 0.7448858642578124,
"step": 1150
},
{
"epoch": 2.4731182795698925,
"eval_loss": 0.6147477626800537,
"eval_runtime": 60.7037,
"eval_samples_per_second": 313.062,
"eval_steps_per_second": 39.141,
"step": 1150
},
{
"epoch": 2.5806451612903225,
"grad_norm": 0.7893891930580139,
"learning_rate": 0.00018968602150537636,
"loss": 0.7744358062744141,
"step": 1200
},
{
"epoch": 2.5806451612903225,
"eval_loss": 0.6008749604225159,
"eval_runtime": 62.0421,
"eval_samples_per_second": 306.308,
"eval_steps_per_second": 38.297,
"step": 1200
},
{
"epoch": 2.688172043010753,
"grad_norm": 0.8543435335159302,
"learning_rate": 0.00018925591397849463,
"loss": 0.698813705444336,
"step": 1250
},
{
"epoch": 2.688172043010753,
"eval_loss": 0.5843669176101685,
"eval_runtime": 61.7236,
"eval_samples_per_second": 307.889,
"eval_steps_per_second": 38.494,
"step": 1250
},
{
"epoch": 2.795698924731183,
"grad_norm": 0.862782895565033,
"learning_rate": 0.00018882580645161292,
"loss": 0.7231275939941406,
"step": 1300
},
{
"epoch": 2.795698924731183,
"eval_loss": 0.560819149017334,
"eval_runtime": 61.272,
"eval_samples_per_second": 310.158,
"eval_steps_per_second": 38.778,
"step": 1300
},
{
"epoch": 2.903225806451613,
"grad_norm": 0.8126527667045593,
"learning_rate": 0.0001883956989247312,
"loss": 0.6607036590576172,
"step": 1350
},
{
"epoch": 2.903225806451613,
"eval_loss": 0.5523199439048767,
"eval_runtime": 61.41,
"eval_samples_per_second": 309.461,
"eval_steps_per_second": 38.691,
"step": 1350
},
{
"epoch": 3.010752688172043,
"grad_norm": 0.8788714408874512,
"learning_rate": 0.00018796559139784945,
"loss": 0.658017349243164,
"step": 1400
},
{
"epoch": 3.010752688172043,
"eval_loss": 0.5504087805747986,
"eval_runtime": 61.2893,
"eval_samples_per_second": 310.07,
"eval_steps_per_second": 38.767,
"step": 1400
},
{
"epoch": 3.118279569892473,
"grad_norm": 0.8354722857475281,
"learning_rate": 0.00018753548387096775,
"loss": 0.6500599670410157,
"step": 1450
},
{
"epoch": 3.118279569892473,
"eval_loss": 0.5395110845565796,
"eval_runtime": 60.5063,
"eval_samples_per_second": 314.083,
"eval_steps_per_second": 39.269,
"step": 1450
},
{
"epoch": 3.225806451612903,
"grad_norm": 0.8122305870056152,
"learning_rate": 0.000187105376344086,
"loss": 0.6230792999267578,
"step": 1500
},
{
"epoch": 3.225806451612903,
"eval_loss": 0.5187473297119141,
"eval_runtime": 60.7322,
"eval_samples_per_second": 312.915,
"eval_steps_per_second": 39.123,
"step": 1500
},
{
"epoch": 3.3333333333333335,
"grad_norm": 0.673494815826416,
"learning_rate": 0.0001866752688172043,
"loss": 0.6118016052246094,
"step": 1550
},
{
"epoch": 3.3333333333333335,
"eval_loss": 0.5081239938735962,
"eval_runtime": 60.5862,
"eval_samples_per_second": 313.669,
"eval_steps_per_second": 39.217,
"step": 1550
},
{
"epoch": 3.4408602150537635,
"grad_norm": 0.8055212497711182,
"learning_rate": 0.0001862451612903226,
"loss": 0.6122843170166016,
"step": 1600
},
{
"epoch": 3.4408602150537635,
"eval_loss": 0.49499744176864624,
"eval_runtime": 60.6568,
"eval_samples_per_second": 313.304,
"eval_steps_per_second": 39.171,
"step": 1600
},
{
"epoch": 3.5483870967741935,
"grad_norm": 0.7935542464256287,
"learning_rate": 0.00018581505376344087,
"loss": 0.5825344467163086,
"step": 1650
},
{
"epoch": 3.5483870967741935,
"eval_loss": 0.48452192544937134,
"eval_runtime": 60.5763,
"eval_samples_per_second": 313.72,
"eval_steps_per_second": 39.223,
"step": 1650
},
{
"epoch": 3.6559139784946235,
"grad_norm": 0.6395400166511536,
"learning_rate": 0.00018538494623655916,
"loss": 0.5727723693847656,
"step": 1700
},
{
"epoch": 3.6559139784946235,
"eval_loss": 0.4738766551017761,
"eval_runtime": 60.5051,
"eval_samples_per_second": 314.089,
"eval_steps_per_second": 39.269,
"step": 1700
},
{
"epoch": 3.763440860215054,
"grad_norm": 0.6544663906097412,
"learning_rate": 0.00018495483870967742,
"loss": 0.5858316421508789,
"step": 1750
},
{
"epoch": 3.763440860215054,
"eval_loss": 0.4562221169471741,
"eval_runtime": 60.4697,
"eval_samples_per_second": 314.273,
"eval_steps_per_second": 39.292,
"step": 1750
},
{
"epoch": 3.870967741935484,
"grad_norm": 0.773256778717041,
"learning_rate": 0.00018452473118279572,
"loss": 0.5555976867675781,
"step": 1800
},
{
"epoch": 3.870967741935484,
"eval_loss": 0.4462752342224121,
"eval_runtime": 61.139,
"eval_samples_per_second": 310.833,
"eval_steps_per_second": 38.862,
"step": 1800
},
{
"epoch": 3.978494623655914,
"grad_norm": 0.6679997444152832,
"learning_rate": 0.00018409462365591398,
"loss": 0.5079600143432618,
"step": 1850
},
{
"epoch": 3.978494623655914,
"eval_loss": 0.43978169560432434,
"eval_runtime": 60.5103,
"eval_samples_per_second": 314.062,
"eval_steps_per_second": 39.266,
"step": 1850
},
{
"epoch": 4.086021505376344,
"grad_norm": 0.7930998206138611,
"learning_rate": 0.00018366451612903225,
"loss": 0.5580390548706055,
"step": 1900
},
{
"epoch": 4.086021505376344,
"eval_loss": 0.4352206587791443,
"eval_runtime": 60.8357,
"eval_samples_per_second": 312.382,
"eval_steps_per_second": 39.056,
"step": 1900
},
{
"epoch": 4.193548387096774,
"grad_norm": 0.6607942581176758,
"learning_rate": 0.00018323440860215054,
"loss": 0.49173324584960937,
"step": 1950
},
{
"epoch": 4.193548387096774,
"eval_loss": 0.4238659143447876,
"eval_runtime": 60.9872,
"eval_samples_per_second": 311.606,
"eval_steps_per_second": 38.959,
"step": 1950
},
{
"epoch": 4.301075268817204,
"grad_norm": 0.6287643909454346,
"learning_rate": 0.00018280430107526884,
"loss": 0.4687882232666016,
"step": 2000
},
{
"epoch": 4.301075268817204,
"eval_loss": 0.4168907403945923,
"eval_runtime": 61.005,
"eval_samples_per_second": 311.515,
"eval_steps_per_second": 38.948,
"step": 2000
},
{
"epoch": 4.408602150537634,
"grad_norm": 0.6433095932006836,
"learning_rate": 0.0001823741935483871,
"loss": 0.4763982009887695,
"step": 2050
},
{
"epoch": 4.408602150537634,
"eval_loss": 0.4120262861251831,
"eval_runtime": 61.5507,
"eval_samples_per_second": 308.753,
"eval_steps_per_second": 38.602,
"step": 2050
},
{
"epoch": 4.516129032258064,
"grad_norm": 0.76325523853302,
"learning_rate": 0.0001819440860215054,
"loss": 0.5169943237304687,
"step": 2100
},
{
"epoch": 4.516129032258064,
"eval_loss": 0.40777090191841125,
"eval_runtime": 61.9659,
"eval_samples_per_second": 306.685,
"eval_steps_per_second": 38.344,
"step": 2100
},
{
"epoch": 4.623655913978495,
"grad_norm": 0.7534022331237793,
"learning_rate": 0.00018151397849462366,
"loss": 0.4840876770019531,
"step": 2150
},
{
"epoch": 4.623655913978495,
"eval_loss": 0.396854966878891,
"eval_runtime": 61.4429,
"eval_samples_per_second": 309.295,
"eval_steps_per_second": 38.67,
"step": 2150
},
{
"epoch": 4.731182795698925,
"grad_norm": 0.688862144947052,
"learning_rate": 0.00018108387096774195,
"loss": 0.46516273498535154,
"step": 2200
},
{
"epoch": 4.731182795698925,
"eval_loss": 0.38546594977378845,
"eval_runtime": 60.8637,
"eval_samples_per_second": 312.239,
"eval_steps_per_second": 39.038,
"step": 2200
},
{
"epoch": 4.838709677419355,
"grad_norm": 0.5328208208084106,
"learning_rate": 0.00018065376344086022,
"loss": 0.5028326034545898,
"step": 2250
},
{
"epoch": 4.838709677419355,
"eval_loss": 0.37445569038391113,
"eval_runtime": 61.5819,
"eval_samples_per_second": 308.597,
"eval_steps_per_second": 38.583,
"step": 2250
},
{
"epoch": 4.946236559139785,
"grad_norm": 0.5857045650482178,
"learning_rate": 0.00018022365591397848,
"loss": 0.43645286560058594,
"step": 2300
},
{
"epoch": 4.946236559139785,
"eval_loss": 0.3690737187862396,
"eval_runtime": 61.4895,
"eval_samples_per_second": 309.061,
"eval_steps_per_second": 38.641,
"step": 2300
},
{
"epoch": 5.053763440860215,
"grad_norm": 0.6344749331474304,
"learning_rate": 0.00017979354838709678,
"loss": 0.42147178649902345,
"step": 2350
},
{
"epoch": 5.053763440860215,
"eval_loss": 0.3570445775985718,
"eval_runtime": 62.1748,
"eval_samples_per_second": 305.654,
"eval_steps_per_second": 38.215,
"step": 2350
},
{
"epoch": 5.161290322580645,
"grad_norm": 0.6610215306282043,
"learning_rate": 0.00017936344086021507,
"loss": 0.4157654571533203,
"step": 2400
},
{
"epoch": 5.161290322580645,
"eval_loss": 0.3497065603733063,
"eval_runtime": 61.6389,
"eval_samples_per_second": 308.312,
"eval_steps_per_second": 38.547,
"step": 2400
},
{
"epoch": 5.268817204301075,
"grad_norm": 0.5334368348121643,
"learning_rate": 0.00017893333333333336,
"loss": 0.4012648391723633,
"step": 2450
},
{
"epoch": 5.268817204301075,
"eval_loss": 0.33196908235549927,
"eval_runtime": 64.4623,
"eval_samples_per_second": 294.808,
"eval_steps_per_second": 36.859,
"step": 2450
},
{
"epoch": 5.376344086021505,
"grad_norm": 0.7559072971343994,
"learning_rate": 0.00017850322580645163,
"loss": 0.4343834686279297,
"step": 2500
},
{
"epoch": 5.376344086021505,
"eval_loss": 0.31756916642189026,
"eval_runtime": 64.0899,
"eval_samples_per_second": 296.521,
"eval_steps_per_second": 37.073,
"step": 2500
},
{
"epoch": 5.483870967741936,
"grad_norm": 0.6970711946487427,
"learning_rate": 0.0001780731182795699,
"loss": 0.3609016799926758,
"step": 2550
},
{
"epoch": 5.483870967741936,
"eval_loss": 0.3129482567310333,
"eval_runtime": 64.2007,
"eval_samples_per_second": 296.009,
"eval_steps_per_second": 37.009,
"step": 2550
},
{
"epoch": 5.591397849462366,
"grad_norm": 0.7393150329589844,
"learning_rate": 0.0001776430107526882,
"loss": 0.36085220336914064,
"step": 2600
},
{
"epoch": 5.591397849462366,
"eval_loss": 0.29907363653182983,
"eval_runtime": 64.2974,
"eval_samples_per_second": 295.564,
"eval_steps_per_second": 36.953,
"step": 2600
},
{
"epoch": 5.698924731182796,
"grad_norm": 0.6760246157646179,
"learning_rate": 0.00017721290322580645,
"loss": 0.3354073715209961,
"step": 2650
},
{
"epoch": 5.698924731182796,
"eval_loss": 0.28903692960739136,
"eval_runtime": 64.2379,
"eval_samples_per_second": 295.838,
"eval_steps_per_second": 36.988,
"step": 2650
},
{
"epoch": 5.806451612903226,
"grad_norm": 0.6342934370040894,
"learning_rate": 0.00017678279569892472,
"loss": 0.33487789154052733,
"step": 2700
},
{
"epoch": 5.806451612903226,
"eval_loss": 0.2763662040233612,
"eval_runtime": 63.0262,
"eval_samples_per_second": 301.525,
"eval_steps_per_second": 37.699,
"step": 2700
},
{
"epoch": 5.913978494623656,
"grad_norm": 0.6288059949874878,
"learning_rate": 0.00017635268817204301,
"loss": 0.3166103744506836,
"step": 2750
},
{
"epoch": 5.913978494623656,
"eval_loss": 0.27043381333351135,
"eval_runtime": 63.0792,
"eval_samples_per_second": 301.272,
"eval_steps_per_second": 37.667,
"step": 2750
},
{
"epoch": 6.021505376344086,
"grad_norm": 0.8228830695152283,
"learning_rate": 0.0001759225806451613,
"loss": 0.3166475486755371,
"step": 2800
},
{
"epoch": 6.021505376344086,
"eval_loss": 0.26023828983306885,
"eval_runtime": 64.4666,
"eval_samples_per_second": 294.788,
"eval_steps_per_second": 36.856,
"step": 2800
},
{
"epoch": 6.129032258064516,
"grad_norm": 0.6261463165283203,
"learning_rate": 0.0001754924731182796,
"loss": 0.30168416976928714,
"step": 2850
},
{
"epoch": 6.129032258064516,
"eval_loss": 0.2530518174171448,
"eval_runtime": 63.8775,
"eval_samples_per_second": 297.507,
"eval_steps_per_second": 37.196,
"step": 2850
},
{
"epoch": 6.236559139784946,
"grad_norm": 0.7265720367431641,
"learning_rate": 0.00017506236559139787,
"loss": 0.29341196060180663,
"step": 2900
},
{
"epoch": 6.236559139784946,
"eval_loss": 0.24442243576049805,
"eval_runtime": 63.2991,
"eval_samples_per_second": 300.226,
"eval_steps_per_second": 37.536,
"step": 2900
},
{
"epoch": 6.344086021505376,
"grad_norm": 0.5499133467674255,
"learning_rate": 0.00017463225806451613,
"loss": 0.2850730323791504,
"step": 2950
},
{
"epoch": 6.344086021505376,
"eval_loss": 0.237361341714859,
"eval_runtime": 64.5725,
"eval_samples_per_second": 294.305,
"eval_steps_per_second": 36.796,
"step": 2950
},
{
"epoch": 6.451612903225806,
"grad_norm": 0.7466527223587036,
"learning_rate": 0.00017420215053763442,
"loss": 0.2737441635131836,
"step": 3000
},
{
"epoch": 6.451612903225806,
"eval_loss": 0.22867611050605774,
"eval_runtime": 64.8912,
"eval_samples_per_second": 292.86,
"eval_steps_per_second": 36.615,
"step": 3000
},
{
"epoch": 6.559139784946236,
"grad_norm": 0.605771005153656,
"learning_rate": 0.0001737720430107527,
"loss": 0.26982501983642576,
"step": 3050
},
{
"epoch": 6.559139784946236,
"eval_loss": 0.22686000168323517,
"eval_runtime": 64.8566,
"eval_samples_per_second": 293.016,
"eval_steps_per_second": 36.635,
"step": 3050
},
{
"epoch": 6.666666666666667,
"grad_norm": 0.6927595138549805,
"learning_rate": 0.00017334193548387096,
"loss": 0.2592777633666992,
"step": 3100
},
{
"epoch": 6.666666666666667,
"eval_loss": 0.22359216213226318,
"eval_runtime": 64.9559,
"eval_samples_per_second": 292.568,
"eval_steps_per_second": 36.579,
"step": 3100
},
{
"epoch": 6.774193548387097,
"grad_norm": 0.6070519685745239,
"learning_rate": 0.00017291182795698925,
"loss": 0.2539858436584473,
"step": 3150
},
{
"epoch": 6.774193548387097,
"eval_loss": 0.22382962703704834,
"eval_runtime": 64.9172,
"eval_samples_per_second": 292.742,
"eval_steps_per_second": 36.6,
"step": 3150
},
{
"epoch": 6.881720430107527,
"grad_norm": 0.7206361889839172,
"learning_rate": 0.00017248172043010754,
"loss": 0.2550803184509277,
"step": 3200
},
{
"epoch": 6.881720430107527,
"eval_loss": 0.22055239975452423,
"eval_runtime": 65.5818,
"eval_samples_per_second": 289.775,
"eval_steps_per_second": 36.23,
"step": 3200
},
{
"epoch": 6.989247311827957,
"grad_norm": 0.6855896711349487,
"learning_rate": 0.00017205161290322584,
"loss": 0.2432615852355957,
"step": 3250
},
{
"epoch": 6.989247311827957,
"eval_loss": 0.21467819809913635,
"eval_runtime": 66.2905,
"eval_samples_per_second": 286.677,
"eval_steps_per_second": 35.842,
"step": 3250
},
{
"epoch": 7.096774193548387,
"grad_norm": 0.5612008571624756,
"learning_rate": 0.0001716215053763441,
"loss": 0.24562849044799806,
"step": 3300
},
{
"epoch": 7.096774193548387,
"eval_loss": 0.21375121176242828,
"eval_runtime": 66.0151,
"eval_samples_per_second": 287.874,
"eval_steps_per_second": 35.992,
"step": 3300
},
{
"epoch": 7.204301075268817,
"grad_norm": 0.7433006763458252,
"learning_rate": 0.00017119139784946237,
"loss": 0.2393852424621582,
"step": 3350
},
{
"epoch": 7.204301075268817,
"eval_loss": 0.20871323347091675,
"eval_runtime": 61.9563,
"eval_samples_per_second": 306.732,
"eval_steps_per_second": 38.35,
"step": 3350
},
{
"epoch": 7.311827956989247,
"grad_norm": 0.6491153836250305,
"learning_rate": 0.00017076129032258066,
"loss": 0.24959787368774414,
"step": 3400
},
{
"epoch": 7.311827956989247,
"eval_loss": 0.21120016276836395,
"eval_runtime": 60.6864,
"eval_samples_per_second": 313.151,
"eval_steps_per_second": 39.152,
"step": 3400
},
{
"epoch": 7.419354838709677,
"grad_norm": 0.5620025992393494,
"learning_rate": 0.00017033118279569893,
"loss": 0.2320168685913086,
"step": 3450
},
{
"epoch": 7.419354838709677,
"eval_loss": 0.20816229283809662,
"eval_runtime": 61.036,
"eval_samples_per_second": 311.357,
"eval_steps_per_second": 38.928,
"step": 3450
},
{
"epoch": 7.526881720430108,
"grad_norm": 0.6183444261550903,
"learning_rate": 0.00016990107526881722,
"loss": 0.2322225570678711,
"step": 3500
},
{
"epoch": 7.526881720430108,
"eval_loss": 0.20497609674930573,
"eval_runtime": 60.5328,
"eval_samples_per_second": 313.946,
"eval_steps_per_second": 39.251,
"step": 3500
},
{
"epoch": 7.634408602150538,
"grad_norm": 0.5328448414802551,
"learning_rate": 0.00016947096774193548,
"loss": 0.23304037094116212,
"step": 3550
},
{
"epoch": 7.634408602150538,
"eval_loss": 0.20321960747241974,
"eval_runtime": 62.1711,
"eval_samples_per_second": 305.672,
"eval_steps_per_second": 38.217,
"step": 3550
},
{
"epoch": 7.741935483870968,
"grad_norm": 0.5241938829421997,
"learning_rate": 0.00016904086021505378,
"loss": 0.22476686477661134,
"step": 3600
},
{
"epoch": 7.741935483870968,
"eval_loss": 0.2034502625465393,
"eval_runtime": 64.8022,
"eval_samples_per_second": 293.262,
"eval_steps_per_second": 36.665,
"step": 3600
},
{
"epoch": 7.849462365591398,
"grad_norm": 0.5440294742584229,
"learning_rate": 0.00016861075268817207,
"loss": 0.227796630859375,
"step": 3650
},
{
"epoch": 7.849462365591398,
"eval_loss": 0.20562465488910675,
"eval_runtime": 65.1543,
"eval_samples_per_second": 291.677,
"eval_steps_per_second": 36.467,
"step": 3650
},
{
"epoch": 7.956989247311828,
"grad_norm": 0.5037738680839539,
"learning_rate": 0.00016818064516129034,
"loss": 0.23125221252441405,
"step": 3700
},
{
"epoch": 7.956989247311828,
"eval_loss": 0.20223356783390045,
"eval_runtime": 65.5561,
"eval_samples_per_second": 289.889,
"eval_steps_per_second": 36.244,
"step": 3700
},
{
"epoch": 8.064516129032258,
"grad_norm": 0.843550980091095,
"learning_rate": 0.0001677505376344086,
"loss": 0.2236369514465332,
"step": 3750
},
{
"epoch": 8.064516129032258,
"eval_loss": 0.19716867804527283,
"eval_runtime": 66.4534,
"eval_samples_per_second": 285.975,
"eval_steps_per_second": 35.754,
"step": 3750
},
{
"epoch": 8.172043010752688,
"grad_norm": 0.5562386512756348,
"learning_rate": 0.0001673204301075269,
"loss": 0.22720510482788087,
"step": 3800
},
{
"epoch": 8.172043010752688,
"eval_loss": 0.1974799931049347,
"eval_runtime": 66.0022,
"eval_samples_per_second": 287.93,
"eval_steps_per_second": 35.999,
"step": 3800
},
{
"epoch": 8.279569892473118,
"grad_norm": 0.5003981590270996,
"learning_rate": 0.00016689032258064516,
"loss": 0.22547555923461915,
"step": 3850
},
{
"epoch": 8.279569892473118,
"eval_loss": 0.19821035861968994,
"eval_runtime": 60.464,
"eval_samples_per_second": 314.303,
"eval_steps_per_second": 39.296,
"step": 3850
},
{
"epoch": 8.387096774193548,
"grad_norm": 0.4629065692424774,
"learning_rate": 0.00016646021505376345,
"loss": 0.22113780975341796,
"step": 3900
},
{
"epoch": 8.387096774193548,
"eval_loss": 0.1924905627965927,
"eval_runtime": 60.595,
"eval_samples_per_second": 313.623,
"eval_steps_per_second": 39.211,
"step": 3900
},
{
"epoch": 8.494623655913978,
"grad_norm": 0.5043092966079712,
"learning_rate": 0.00016603010752688172,
"loss": 0.21599315643310546,
"step": 3950
},
{
"epoch": 8.494623655913978,
"eval_loss": 0.19553141295909882,
"eval_runtime": 60.5,
"eval_samples_per_second": 314.116,
"eval_steps_per_second": 39.273,
"step": 3950
},
{
"epoch": 8.602150537634408,
"grad_norm": 0.6413733959197998,
"learning_rate": 0.0001656,
"loss": 0.2173159408569336,
"step": 4000
},
{
"epoch": 8.602150537634408,
"eval_loss": 0.19092191755771637,
"eval_runtime": 60.5854,
"eval_samples_per_second": 313.673,
"eval_steps_per_second": 39.217,
"step": 4000
}
],
"logging_steps": 50,
"max_steps": 23250,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.685471179194368e+16,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}