| { | |
| "best_metric": 3.8748562335968018, | |
| "best_model_checkpoint": "/kaggle/working/checkpoint-669", | |
| "epoch": 50.0, | |
| "eval_steps": 500, | |
| "global_step": 11150, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 3.875725507736206, | |
| "eval_mean_perplexity": 366.3703887939453, | |
| "eval_perplexities": [ | |
| 335.80645751953125, | |
| 377.47796630859375, | |
| 475.9695129394531, | |
| 276.7535705566406, | |
| 294.2838134765625, | |
| 264.73394775390625, | |
| 394.9538269042969, | |
| 325.2862854003906, | |
| 525.1426391601562, | |
| 393.2958679199219 | |
| ], | |
| "eval_runtime": 2.3638, | |
| "eval_samples_per_second": 4.231, | |
| "eval_steps_per_second": 0.846, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 3.877554416656494, | |
| "eval_mean_perplexity": 373.7104461669922, | |
| "eval_perplexities": [ | |
| 356.83929443359375, | |
| 425.78094482421875, | |
| 495.7661437988281, | |
| 287.1143798828125, | |
| 270.86737060546875, | |
| 256.76934814453125, | |
| 391.27044677734375, | |
| 345.6097412109375, | |
| 498.4945373535156, | |
| 408.5922546386719 | |
| ], | |
| "eval_runtime": 2.2583, | |
| "eval_samples_per_second": 4.428, | |
| "eval_steps_per_second": 0.886, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 3.8748562335968018, | |
| "eval_mean_perplexity": 378.68709869384764, | |
| "eval_perplexities": [ | |
| 404.7926940917969, | |
| 400.53558349609375, | |
| 487.69427490234375, | |
| 229.20298767089844, | |
| 278.294189453125, | |
| 259.6055908203125, | |
| 387.93963623046875, | |
| 365.1226501464844, | |
| 508.79669189453125, | |
| 464.8866882324219 | |
| ], | |
| "eval_runtime": 2.4538, | |
| "eval_samples_per_second": 4.075, | |
| "eval_steps_per_second": 0.815, | |
| "step": 669 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 3.889125108718872, | |
| "eval_mean_perplexity": 395.6389587402344, | |
| "eval_perplexities": [ | |
| 386.92559814453125, | |
| 444.32073974609375, | |
| 480.409912109375, | |
| 278.6746520996094, | |
| 279.8287353515625, | |
| 286.8155212402344, | |
| 426.2447204589844, | |
| 352.7798767089844, | |
| 545.115234375, | |
| 475.27459716796875 | |
| ], | |
| "eval_runtime": 2.2494, | |
| "eval_samples_per_second": 4.446, | |
| "eval_steps_per_second": 0.889, | |
| "step": 892 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 3.9018893241882324, | |
| "eval_mean_perplexity": 418.4004302978516, | |
| "eval_perplexities": [ | |
| 426.7615051269531, | |
| 474.9723815917969, | |
| 516.2012329101562, | |
| 285.6946105957031, | |
| 314.9617919921875, | |
| 305.4250793457031, | |
| 411.5309143066406, | |
| 412.8928527832031, | |
| 537.16943359375, | |
| 498.3945007324219 | |
| ], | |
| "eval_runtime": 2.2536, | |
| "eval_samples_per_second": 4.437, | |
| "eval_steps_per_second": 0.887, | |
| "step": 1115 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 3.9105305671691895, | |
| "eval_mean_perplexity": 423.3804229736328, | |
| "eval_perplexities": [ | |
| 421.5846862792969, | |
| 457.750244140625, | |
| 521.7881469726562, | |
| 293.4595642089844, | |
| 283.9613037109375, | |
| 287.46807861328125, | |
| 451.5904846191406, | |
| 458.28509521484375, | |
| 608.7252197265625, | |
| 449.19140625 | |
| ], | |
| "eval_runtime": 2.261, | |
| "eval_samples_per_second": 4.423, | |
| "eval_steps_per_second": 0.885, | |
| "step": 1338 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_loss": 3.910905122756958, | |
| "eval_mean_perplexity": 437.6560882568359, | |
| "eval_perplexities": [ | |
| 459.6704406738281, | |
| 471.1756591796875, | |
| 508.8789367675781, | |
| 278.22161865234375, | |
| 305.4996337890625, | |
| 335.0756530761719, | |
| 449.5406494140625, | |
| 402.190673828125, | |
| 618.786376953125, | |
| 547.521240234375 | |
| ], | |
| "eval_runtime": 2.4541, | |
| "eval_samples_per_second": 4.075, | |
| "eval_steps_per_second": 0.815, | |
| "step": 1561 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 3.9314892292022705, | |
| "eval_mean_perplexity": 451.49346618652345, | |
| "eval_perplexities": [ | |
| 432.6178283691406, | |
| 494.5645446777344, | |
| 518.4781494140625, | |
| 289.39727783203125, | |
| 317.63031005859375, | |
| 327.0579528808594, | |
| 491.4974670410156, | |
| 445.9523620605469, | |
| 667.2817993164062, | |
| 530.4569702148438 | |
| ], | |
| "eval_runtime": 3.1442, | |
| "eval_samples_per_second": 3.18, | |
| "eval_steps_per_second": 0.636, | |
| "step": 1784 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_loss": 3.9446353912353516, | |
| "eval_mean_perplexity": 451.08067626953124, | |
| "eval_perplexities": [ | |
| 452.6179504394531, | |
| 445.2970275878906, | |
| 576.3162841796875, | |
| 262.3009033203125, | |
| 326.6243591308594, | |
| 311.2583923339844, | |
| 440.389404296875, | |
| 448.5811767578125, | |
| 677.4688110351562, | |
| 569.9524536132812 | |
| ], | |
| "eval_runtime": 2.2555, | |
| "eval_samples_per_second": 4.434, | |
| "eval_steps_per_second": 0.887, | |
| "step": 2007 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_loss": 3.979356288909912, | |
| "eval_mean_perplexity": 466.0081390380859, | |
| "eval_perplexities": [ | |
| 479.5843505859375, | |
| 542.103759765625, | |
| 561.8331298828125, | |
| 283.5645751953125, | |
| 342.3445739746094, | |
| 315.0624694824219, | |
| 494.7655334472656, | |
| 397.63812255859375, | |
| 683.822265625, | |
| 559.3626098632812 | |
| ], | |
| "eval_runtime": 2.255, | |
| "eval_samples_per_second": 4.435, | |
| "eval_steps_per_second": 0.887, | |
| "step": 2230 | |
| }, | |
| { | |
| "epoch": 11.0, | |
| "eval_loss": 3.985069751739502, | |
| "eval_mean_perplexity": 471.23480529785155, | |
| "eval_perplexities": [ | |
| 442.65118408203125, | |
| 578.3666381835938, | |
| 530.2559204101562, | |
| 292.97174072265625, | |
| 325.9568786621094, | |
| 331.6105651855469, | |
| 466.1777648925781, | |
| 453.5052490234375, | |
| 693.37646484375, | |
| 597.4756469726562 | |
| ], | |
| "eval_runtime": 2.2568, | |
| "eval_samples_per_second": 4.431, | |
| "eval_steps_per_second": 0.886, | |
| "step": 2453 | |
| }, | |
| { | |
| "epoch": 12.0, | |
| "eval_loss": 4.011897087097168, | |
| "eval_mean_perplexity": 481.7092010498047, | |
| "eval_perplexities": [ | |
| 490.259765625, | |
| 590.09716796875, | |
| 524.1170043945312, | |
| 292.7325134277344, | |
| 362.09210205078125, | |
| 320.8348388671875, | |
| 483.432861328125, | |
| 492.00469970703125, | |
| 676.3094482421875, | |
| 585.2116088867188 | |
| ], | |
| "eval_runtime": 2.2714, | |
| "eval_samples_per_second": 4.402, | |
| "eval_steps_per_second": 0.88, | |
| "step": 2676 | |
| }, | |
| { | |
| "epoch": 13.0, | |
| "eval_loss": 4.028537750244141, | |
| "eval_mean_perplexity": 491.83470153808594, | |
| "eval_perplexities": [ | |
| 434.7430725097656, | |
| 579.150390625, | |
| 531.6643676757812, | |
| 323.02447509765625, | |
| 381.5349426269531, | |
| 329.8095397949219, | |
| 569.4808349609375, | |
| 454.2198486328125, | |
| 723.43798828125, | |
| 591.2815551757812 | |
| ], | |
| "eval_runtime": 2.268, | |
| "eval_samples_per_second": 4.409, | |
| "eval_steps_per_second": 0.882, | |
| "step": 2899 | |
| }, | |
| { | |
| "epoch": 14.0, | |
| "eval_loss": 4.046214580535889, | |
| "eval_mean_perplexity": 488.7431976318359, | |
| "eval_perplexities": [ | |
| 465.3315734863281, | |
| 577.7365112304688, | |
| 532.9083251953125, | |
| 298.2411193847656, | |
| 371.57415771484375, | |
| 335.2172546386719, | |
| 547.2592163085938, | |
| 480.532470703125, | |
| 732.7112426757812, | |
| 545.9201049804688 | |
| ], | |
| "eval_runtime": 2.2446, | |
| "eval_samples_per_second": 4.455, | |
| "eval_steps_per_second": 0.891, | |
| "step": 3122 | |
| }, | |
| { | |
| "epoch": 15.0, | |
| "eval_loss": 4.064830780029297, | |
| "eval_mean_perplexity": 493.6821044921875, | |
| "eval_perplexities": [ | |
| 508.1759948730469, | |
| 588.5740356445312, | |
| 494.7876892089844, | |
| 310.3844299316406, | |
| 357.8840026855469, | |
| 349.0634460449219, | |
| 574.1718139648438, | |
| 489.5400085449219, | |
| 730.5029907226562, | |
| 533.7366333007812 | |
| ], | |
| "eval_runtime": 2.2496, | |
| "eval_samples_per_second": 4.445, | |
| "eval_steps_per_second": 0.889, | |
| "step": 3345 | |
| }, | |
| { | |
| "epoch": 16.0, | |
| "eval_loss": 4.0865068435668945, | |
| "eval_mean_perplexity": 527.1822723388672, | |
| "eval_perplexities": [ | |
| 516.6371459960938, | |
| 603.1266479492188, | |
| 558.2211303710938, | |
| 320.877685546875, | |
| 440.86993408203125, | |
| 384.0972595214844, | |
| 586.449951171875, | |
| 441.53643798828125, | |
| 805.31201171875, | |
| 614.6945190429688 | |
| ], | |
| "eval_runtime": 2.2935, | |
| "eval_samples_per_second": 4.36, | |
| "eval_steps_per_second": 0.872, | |
| "step": 3568 | |
| }, | |
| { | |
| "epoch": 17.0, | |
| "eval_loss": 4.112179756164551, | |
| "eval_mean_perplexity": 517.5738647460937, | |
| "eval_perplexities": [ | |
| 484.16485595703125, | |
| 623.9596557617188, | |
| 544.3837890625, | |
| 327.7882995605469, | |
| 432.2279052734375, | |
| 325.2957458496094, | |
| 587.2503051757812, | |
| 500.35589599609375, | |
| 768.452392578125, | |
| 581.8598022460938 | |
| ], | |
| "eval_runtime": 2.6951, | |
| "eval_samples_per_second": 3.71, | |
| "eval_steps_per_second": 0.742, | |
| "step": 3791 | |
| }, | |
| { | |
| "epoch": 18.0, | |
| "eval_loss": 4.133630275726318, | |
| "eval_mean_perplexity": 532.7662231445313, | |
| "eval_perplexities": [ | |
| 459.6676025390625, | |
| 644.7689819335938, | |
| 559.6870727539062, | |
| 313.52886962890625, | |
| 407.5067443847656, | |
| 358.4698486328125, | |
| 635.401123046875, | |
| 511.8723449707031, | |
| 728.2614135742188, | |
| 708.4982299804688 | |
| ], | |
| "eval_runtime": 2.2702, | |
| "eval_samples_per_second": 4.405, | |
| "eval_steps_per_second": 0.881, | |
| "step": 4014 | |
| }, | |
| { | |
| "epoch": 19.0, | |
| "eval_loss": 4.14639949798584, | |
| "eval_mean_perplexity": 546.8367309570312, | |
| "eval_perplexities": [ | |
| 513.874267578125, | |
| 633.9168090820312, | |
| 623.620849609375, | |
| 317.8645324707031, | |
| 417.3320007324219, | |
| 351.769287109375, | |
| 557.8528442382812, | |
| 530.8079223632812, | |
| 834.164306640625, | |
| 687.1644897460938 | |
| ], | |
| "eval_runtime": 2.28, | |
| "eval_samples_per_second": 4.386, | |
| "eval_steps_per_second": 0.877, | |
| "step": 4237 | |
| }, | |
| { | |
| "epoch": 20.0, | |
| "eval_loss": 4.1683173179626465, | |
| "eval_mean_perplexity": 530.1545104980469, | |
| "eval_perplexities": [ | |
| 518.4563598632812, | |
| 616.3810424804688, | |
| 584.39404296875, | |
| 357.87530517578125, | |
| 424.8235778808594, | |
| 363.0649719238281, | |
| 572.78369140625, | |
| 489.98748779296875, | |
| 762.8850708007812, | |
| 610.8935546875 | |
| ], | |
| "eval_runtime": 2.5117, | |
| "eval_samples_per_second": 3.981, | |
| "eval_steps_per_second": 0.796, | |
| "step": 4460 | |
| }, | |
| { | |
| "epoch": 21.0, | |
| "eval_loss": 4.197434425354004, | |
| "eval_mean_perplexity": 552.9715087890625, | |
| "eval_perplexities": [ | |
| 496.9158630371094, | |
| 622.5189208984375, | |
| 620.3365478515625, | |
| 315.29119873046875, | |
| 508.8257751464844, | |
| 386.271728515625, | |
| 597.5241088867188, | |
| 540.0391845703125, | |
| 771.0007934570312, | |
| 670.990966796875 | |
| ], | |
| "eval_runtime": 2.2574, | |
| "eval_samples_per_second": 4.43, | |
| "eval_steps_per_second": 0.886, | |
| "step": 4683 | |
| }, | |
| { | |
| "epoch": 22.0, | |
| "eval_loss": 4.2334303855896, | |
| "eval_mean_perplexity": 591.004769897461, | |
| "eval_perplexities": [ | |
| 542.1947631835938, | |
| 699.9281616210938, | |
| 655.9802856445312, | |
| 351.9259948730469, | |
| 517.2227783203125, | |
| 364.0205078125, | |
| 738.9130859375, | |
| 537.7424926757812, | |
| 836.2189331054688, | |
| 665.9006958007812 | |
| ], | |
| "eval_runtime": 2.2478, | |
| "eval_samples_per_second": 4.449, | |
| "eval_steps_per_second": 0.89, | |
| "step": 4906 | |
| }, | |
| { | |
| "epoch": 22.42, | |
| "grad_norm": 1.9367769956588745, | |
| "learning_rate": 2.7578475336322873e-05, | |
| "loss": 3.112, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 23.0, | |
| "eval_loss": 4.230788707733154, | |
| "eval_mean_perplexity": 571.9811889648438, | |
| "eval_perplexities": [ | |
| 529.1312255859375, | |
| 677.629638671875, | |
| 639.4375, | |
| 351.9476623535156, | |
| 471.0734558105469, | |
| 388.02935791015625, | |
| 630.8019409179688, | |
| 550.748046875, | |
| 854.4290771484375, | |
| 626.583984375 | |
| ], | |
| "eval_runtime": 2.3322, | |
| "eval_samples_per_second": 4.288, | |
| "eval_steps_per_second": 0.858, | |
| "step": 5129 | |
| }, | |
| { | |
| "epoch": 24.0, | |
| "eval_loss": 4.264792442321777, | |
| "eval_mean_perplexity": 588.0643798828125, | |
| "eval_perplexities": [ | |
| 532.3706665039062, | |
| 650.5995483398438, | |
| 640.096435546875, | |
| 367.4383850097656, | |
| 510.7347106933594, | |
| 402.79925537109375, | |
| 740.1875610351562, | |
| 580.5010375976562, | |
| 837.0123901367188, | |
| 618.90380859375 | |
| ], | |
| "eval_runtime": 2.414, | |
| "eval_samples_per_second": 4.142, | |
| "eval_steps_per_second": 0.828, | |
| "step": 5352 | |
| }, | |
| { | |
| "epoch": 25.0, | |
| "eval_loss": 4.280625820159912, | |
| "eval_mean_perplexity": 568.3704132080078, | |
| "eval_perplexities": [ | |
| 493.6273193359375, | |
| 687.0193481445312, | |
| 617.8013305664062, | |
| 356.9241943359375, | |
| 462.1817321777344, | |
| 377.38800048828125, | |
| 662.7510375976562, | |
| 553.1047973632812, | |
| 769.9375, | |
| 702.9688720703125 | |
| ], | |
| "eval_runtime": 2.2617, | |
| "eval_samples_per_second": 4.421, | |
| "eval_steps_per_second": 0.884, | |
| "step": 5575 | |
| }, | |
| { | |
| "epoch": 26.0, | |
| "eval_loss": 4.299654006958008, | |
| "eval_mean_perplexity": 585.3176055908203, | |
| "eval_perplexities": [ | |
| 507.7140808105469, | |
| 719.7584228515625, | |
| 610.5079956054688, | |
| 355.23577880859375, | |
| 474.7140197753906, | |
| 343.5024719238281, | |
| 644.6328125, | |
| 599.3197631835938, | |
| 868.6974487304688, | |
| 729.09326171875 | |
| ], | |
| "eval_runtime": 2.2752, | |
| "eval_samples_per_second": 4.395, | |
| "eval_steps_per_second": 0.879, | |
| "step": 5798 | |
| }, | |
| { | |
| "epoch": 27.0, | |
| "eval_loss": 4.309383869171143, | |
| "eval_mean_perplexity": 572.4837829589844, | |
| "eval_perplexities": [ | |
| 532.3777465820312, | |
| 701.5357055664062, | |
| 574.2848510742188, | |
| 366.85614013671875, | |
| 481.5206298828125, | |
| 389.685546875, | |
| 579.2503662109375, | |
| 525.9729614257812, | |
| 906.0205078125, | |
| 667.3333740234375 | |
| ], | |
| "eval_runtime": 2.4436, | |
| "eval_samples_per_second": 4.092, | |
| "eval_steps_per_second": 0.818, | |
| "step": 6021 | |
| }, | |
| { | |
| "epoch": 28.0, | |
| "eval_loss": 4.338665962219238, | |
| "eval_mean_perplexity": 617.5483917236328, | |
| "eval_perplexities": [ | |
| 530.51416015625, | |
| 770.9551391601562, | |
| 616.0216674804688, | |
| 386.1516418457031, | |
| 501.05426025390625, | |
| 419.87841796875, | |
| 735.8825073242188, | |
| 594.3335571289062, | |
| 911.9882202148438, | |
| 708.704345703125 | |
| ], | |
| "eval_runtime": 2.2597, | |
| "eval_samples_per_second": 4.425, | |
| "eval_steps_per_second": 0.885, | |
| "step": 6244 | |
| }, | |
| { | |
| "epoch": 29.0, | |
| "eval_loss": 4.354172229766846, | |
| "eval_mean_perplexity": 609.77958984375, | |
| "eval_perplexities": [ | |
| 571.1878662109375, | |
| 744.3867797851562, | |
| 607.4262084960938, | |
| 374.7521057128906, | |
| 501.3298034667969, | |
| 401.453369140625, | |
| 706.0652465820312, | |
| 600.0166625976562, | |
| 898.824951171875, | |
| 692.3529052734375 | |
| ], | |
| "eval_runtime": 2.4521, | |
| "eval_samples_per_second": 4.078, | |
| "eval_steps_per_second": 0.816, | |
| "step": 6467 | |
| }, | |
| { | |
| "epoch": 30.0, | |
| "eval_loss": 4.371840000152588, | |
| "eval_mean_perplexity": 639.5644592285156, | |
| "eval_perplexities": [ | |
| 518.4427490234375, | |
| 768.5755615234375, | |
| 643.536376953125, | |
| 434.2029724121094, | |
| 553.5997924804688, | |
| 401.9341735839844, | |
| 738.0169677734375, | |
| 627.0206909179688, | |
| 923.0927734375, | |
| 787.2225341796875 | |
| ], | |
| "eval_runtime": 2.2759, | |
| "eval_samples_per_second": 4.394, | |
| "eval_steps_per_second": 0.879, | |
| "step": 6690 | |
| }, | |
| { | |
| "epoch": 31.0, | |
| "eval_loss": 4.397173881530762, | |
| "eval_mean_perplexity": 666.7775909423829, | |
| "eval_perplexities": [ | |
| 549.1520385742188, | |
| 851.6005249023438, | |
| 712.2999267578125, | |
| 395.8175048828125, | |
| 540.697021484375, | |
| 451.8854064941406, | |
| 723.3524780273438, | |
| 630.9887084960938, | |
| 1021.4713745117188, | |
| 790.5109252929688 | |
| ], | |
| "eval_runtime": 2.2555, | |
| "eval_samples_per_second": 4.434, | |
| "eval_steps_per_second": 0.887, | |
| "step": 6913 | |
| }, | |
| { | |
| "epoch": 32.0, | |
| "eval_loss": 4.415882110595703, | |
| "eval_mean_perplexity": 660.8353820800781, | |
| "eval_perplexities": [ | |
| 512.7852783203125, | |
| 785.50439453125, | |
| 668.9815063476562, | |
| 457.69830322265625, | |
| 539.0671997070312, | |
| 459.60736083984375, | |
| 751.9481201171875, | |
| 625.0054321289062, | |
| 999.4456787109375, | |
| 808.310546875 | |
| ], | |
| "eval_runtime": 2.2544, | |
| "eval_samples_per_second": 4.436, | |
| "eval_steps_per_second": 0.887, | |
| "step": 7136 | |
| }, | |
| { | |
| "epoch": 33.0, | |
| "eval_loss": 4.42371940612793, | |
| "eval_mean_perplexity": 662.7573822021484, | |
| "eval_perplexities": [ | |
| 559.4247436523438, | |
| 747.0921020507812, | |
| 678.9531860351562, | |
| 437.0193786621094, | |
| 544.0770874023438, | |
| 431.26568603515625, | |
| 756.7093505859375, | |
| 626.279052734375, | |
| 1022.3748168945312, | |
| 824.37841796875 | |
| ], | |
| "eval_runtime": 2.2901, | |
| "eval_samples_per_second": 4.367, | |
| "eval_steps_per_second": 0.873, | |
| "step": 7359 | |
| }, | |
| { | |
| "epoch": 34.0, | |
| "eval_loss": 4.434357166290283, | |
| "eval_mean_perplexity": 650.7052612304688, | |
| "eval_perplexities": [ | |
| 549.6817626953125, | |
| 811.8685302734375, | |
| 639.683349609375, | |
| 410.87249755859375, | |
| 510.8334045410156, | |
| 438.6333923339844, | |
| 779.5153198242188, | |
| 584.6727294921875, | |
| 1027.2081298828125, | |
| 754.08349609375 | |
| ], | |
| "eval_runtime": 2.3566, | |
| "eval_samples_per_second": 4.243, | |
| "eval_steps_per_second": 0.849, | |
| "step": 7582 | |
| }, | |
| { | |
| "epoch": 35.0, | |
| "eval_loss": 4.456236839294434, | |
| "eval_mean_perplexity": 683.213168334961, | |
| "eval_perplexities": [ | |
| 584.2586059570312, | |
| 837.6990966796875, | |
| 639.265869140625, | |
| 431.7611389160156, | |
| 558.5822143554688, | |
| 446.50067138671875, | |
| 781.8605346679688, | |
| 649.52392578125, | |
| 1031.22900390625, | |
| 871.4506225585938 | |
| ], | |
| "eval_runtime": 2.2758, | |
| "eval_samples_per_second": 4.394, | |
| "eval_steps_per_second": 0.879, | |
| "step": 7805 | |
| }, | |
| { | |
| "epoch": 36.0, | |
| "eval_loss": 4.476213455200195, | |
| "eval_mean_perplexity": 673.9735595703125, | |
| "eval_perplexities": [ | |
| 570.093505859375, | |
| 876.3085327148438, | |
| 637.6167602539062, | |
| 410.8495788574219, | |
| 565.6197509765625, | |
| 431.1340637207031, | |
| 784.1497192382812, | |
| 635.3842163085938, | |
| 1020.8198852539062, | |
| 807.7595825195312 | |
| ], | |
| "eval_runtime": 2.2666, | |
| "eval_samples_per_second": 4.412, | |
| "eval_steps_per_second": 0.882, | |
| "step": 8028 | |
| }, | |
| { | |
| "epoch": 37.0, | |
| "eval_loss": 4.485024452209473, | |
| "eval_mean_perplexity": 678.1103912353516, | |
| "eval_perplexities": [ | |
| 524.7376708984375, | |
| 897.5795288085938, | |
| 604.8717651367188, | |
| 421.68621826171875, | |
| 562.4842529296875, | |
| 447.8833312988281, | |
| 743.5353393554688, | |
| 641.0125122070312, | |
| 1038.24658203125, | |
| 899.0667114257812 | |
| ], | |
| "eval_runtime": 2.2714, | |
| "eval_samples_per_second": 4.403, | |
| "eval_steps_per_second": 0.881, | |
| "step": 8251 | |
| }, | |
| { | |
| "epoch": 38.0, | |
| "eval_loss": 4.500662326812744, | |
| "eval_mean_perplexity": 667.2204986572266, | |
| "eval_perplexities": [ | |
| 517.3828125, | |
| 849.9619140625, | |
| 632.2133178710938, | |
| 421.2346496582031, | |
| 562.2227783203125, | |
| 412.367919921875, | |
| 801.9597778320312, | |
| 634.914794921875, | |
| 955.3936767578125, | |
| 884.5533447265625 | |
| ], | |
| "eval_runtime": 2.2586, | |
| "eval_samples_per_second": 4.428, | |
| "eval_steps_per_second": 0.886, | |
| "step": 8474 | |
| }, | |
| { | |
| "epoch": 39.0, | |
| "eval_loss": 4.507510662078857, | |
| "eval_mean_perplexity": 674.6702362060547, | |
| "eval_perplexities": [ | |
| 523.6810913085938, | |
| 871.532958984375, | |
| 702.7888793945312, | |
| 412.113525390625, | |
| 559.32421875, | |
| 436.3467712402344, | |
| 741.5978393554688, | |
| 612.4104614257812, | |
| 969.3262329101562, | |
| 917.5803833007812 | |
| ], | |
| "eval_runtime": 2.2926, | |
| "eval_samples_per_second": 4.362, | |
| "eval_steps_per_second": 0.872, | |
| "step": 8697 | |
| }, | |
| { | |
| "epoch": 40.0, | |
| "eval_loss": 4.5184006690979, | |
| "eval_mean_perplexity": 690.1411804199219, | |
| "eval_perplexities": [ | |
| 533.463623046875, | |
| 880.5369262695312, | |
| 677.1242065429688, | |
| 468.8255920410156, | |
| 524.9594116210938, | |
| 481.3450012207031, | |
| 724.6446533203125, | |
| 644.5559692382812, | |
| 1083.451416015625, | |
| 882.5050048828125 | |
| ], | |
| "eval_runtime": 2.2507, | |
| "eval_samples_per_second": 4.443, | |
| "eval_steps_per_second": 0.889, | |
| "step": 8920 | |
| }, | |
| { | |
| "epoch": 41.0, | |
| "eval_loss": 4.528339385986328, | |
| "eval_mean_perplexity": 686.3480163574219, | |
| "eval_perplexities": [ | |
| 540.8236083984375, | |
| 941.376708984375, | |
| 641.9276733398438, | |
| 446.772216796875, | |
| 562.5877685546875, | |
| 455.73175048828125, | |
| 764.4960327148438, | |
| 647.960205078125, | |
| 983.826416015625, | |
| 877.977783203125 | |
| ], | |
| "eval_runtime": 2.2901, | |
| "eval_samples_per_second": 4.367, | |
| "eval_steps_per_second": 0.873, | |
| "step": 9143 | |
| }, | |
| { | |
| "epoch": 42.0, | |
| "eval_loss": 4.525776386260986, | |
| "eval_mean_perplexity": 692.94345703125, | |
| "eval_perplexities": [ | |
| 512.2537841796875, | |
| 996.9277954101562, | |
| 626.796142578125, | |
| 407.5407409667969, | |
| 572.6943969726562, | |
| 439.5016174316406, | |
| 776.8684692382812, | |
| 660.894775390625, | |
| 1015.8809204101562, | |
| 920.075927734375 | |
| ], | |
| "eval_runtime": 2.2958, | |
| "eval_samples_per_second": 4.356, | |
| "eval_steps_per_second": 0.871, | |
| "step": 9366 | |
| }, | |
| { | |
| "epoch": 43.0, | |
| "eval_loss": 4.5432844161987305, | |
| "eval_mean_perplexity": 698.305697631836, | |
| "eval_perplexities": [ | |
| 520.623779296875, | |
| 898.4200439453125, | |
| 635.4287719726562, | |
| 451.1359558105469, | |
| 590.0744018554688, | |
| 448.103759765625, | |
| 752.5787353515625, | |
| 682.3783569335938, | |
| 1070.27294921875, | |
| 934.0402221679688 | |
| ], | |
| "eval_runtime": 2.8416, | |
| "eval_samples_per_second": 3.519, | |
| "eval_steps_per_second": 0.704, | |
| "step": 9589 | |
| }, | |
| { | |
| "epoch": 44.0, | |
| "eval_loss": 4.546627998352051, | |
| "eval_mean_perplexity": 698.2709533691407, | |
| "eval_perplexities": [ | |
| 496.5508728027344, | |
| 900.9185180664062, | |
| 651.6212768554688, | |
| 429.4538879394531, | |
| 572.0066528320312, | |
| 429.689208984375, | |
| 786.3671264648438, | |
| 686.788818359375, | |
| 1078.3935546875, | |
| 950.9196166992188 | |
| ], | |
| "eval_runtime": 2.2997, | |
| "eval_samples_per_second": 4.348, | |
| "eval_steps_per_second": 0.87, | |
| "step": 9812 | |
| }, | |
| { | |
| "epoch": 44.84, | |
| "grad_norm": 2.2081024646759033, | |
| "learning_rate": 5.15695067264574e-06, | |
| "loss": 2.4732, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 45.0, | |
| "eval_loss": 4.5482940673828125, | |
| "eval_mean_perplexity": 683.0953948974609, | |
| "eval_perplexities": [ | |
| 513.7635498046875, | |
| 881.8916625976562, | |
| 650.1325073242188, | |
| 426.90740966796875, | |
| 583.7874145507812, | |
| 441.8877258300781, | |
| 793.0048828125, | |
| 660.0620727539062, | |
| 992.5838012695312, | |
| 886.9329223632812 | |
| ], | |
| "eval_runtime": 2.6736, | |
| "eval_samples_per_second": 3.74, | |
| "eval_steps_per_second": 0.748, | |
| "step": 10035 | |
| }, | |
| { | |
| "epoch": 46.0, | |
| "eval_loss": 4.556254863739014, | |
| "eval_mean_perplexity": 694.8246520996094, | |
| "eval_perplexities": [ | |
| 484.3752136230469, | |
| 864.9951171875, | |
| 721.4644775390625, | |
| 440.7280578613281, | |
| 583.1522216796875, | |
| 458.59246826171875, | |
| 795.0460815429688, | |
| 658.3065795898438, | |
| 1043.74267578125, | |
| 897.8436279296875 | |
| ], | |
| "eval_runtime": 2.6592, | |
| "eval_samples_per_second": 3.761, | |
| "eval_steps_per_second": 0.752, | |
| "step": 10258 | |
| }, | |
| { | |
| "epoch": 47.0, | |
| "eval_loss": 4.565934181213379, | |
| "eval_mean_perplexity": 697.1294586181641, | |
| "eval_perplexities": [ | |
| 521.9780883789062, | |
| 891.4306030273438, | |
| 694.7803344726562, | |
| 465.2270812988281, | |
| 585.0350341796875, | |
| 454.22308349609375, | |
| 784.2135620117188, | |
| 672.1557006835938, | |
| 1016.0956420898438, | |
| 886.1554565429688 | |
| ], | |
| "eval_runtime": 2.3829, | |
| "eval_samples_per_second": 4.197, | |
| "eval_steps_per_second": 0.839, | |
| "step": 10481 | |
| }, | |
| { | |
| "epoch": 48.0, | |
| "eval_loss": 4.569299221038818, | |
| "eval_mean_perplexity": 698.8598907470703, | |
| "eval_perplexities": [ | |
| 523.372802734375, | |
| 930.4688720703125, | |
| 683.875732421875, | |
| 451.5999755859375, | |
| 597.7974243164062, | |
| 447.0304870605469, | |
| 766.9913940429688, | |
| 671.3782348632812, | |
| 1039.4984130859375, | |
| 876.5855712890625 | |
| ], | |
| "eval_runtime": 2.2797, | |
| "eval_samples_per_second": 4.387, | |
| "eval_steps_per_second": 0.877, | |
| "step": 10704 | |
| }, | |
| { | |
| "epoch": 49.0, | |
| "eval_loss": 4.569689750671387, | |
| "eval_mean_perplexity": 694.5519165039062, | |
| "eval_perplexities": [ | |
| 508.9573059082031, | |
| 927.446533203125, | |
| 672.3489379882812, | |
| 473.5368347167969, | |
| 568.0377807617188, | |
| 454.00933837890625, | |
| 733.6597900390625, | |
| 695.4989624023438, | |
| 1047.673828125, | |
| 864.349853515625 | |
| ], | |
| "eval_runtime": 2.3973, | |
| "eval_samples_per_second": 4.171, | |
| "eval_steps_per_second": 0.834, | |
| "step": 10927 | |
| }, | |
| { | |
| "epoch": 50.0, | |
| "eval_loss": 4.569555759429932, | |
| "eval_mean_perplexity": 698.6035186767579, | |
| "eval_perplexities": [ | |
| 517.4149169921875, | |
| 924.535888671875, | |
| 704.73291015625, | |
| 465.9677429199219, | |
| 577.629150390625, | |
| 443.994140625, | |
| 770.1861572265625, | |
| 683.028076171875, | |
| 1017.7510375976562, | |
| 880.795166015625 | |
| ], | |
| "eval_runtime": 2.2771, | |
| "eval_samples_per_second": 4.391, | |
| "eval_steps_per_second": 0.878, | |
| "step": 11150 | |
| } | |
| ], | |
| "logging_steps": 5000, | |
| "max_steps": 11150, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 50, | |
| "save_steps": 500, | |
| "total_flos": 1.1355181056e+16, | |
| "train_batch_size": 10, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |