{ "best_metric": 2.6055867671966553, "best_model_checkpoint": "learning_source_20260316/protein_sequence/bert-output/protein_sequence-medium/checkpoint-33000", "epoch": 3413.940256045519, "eval_steps": 100, "global_step": 60000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 5.689900426742532, "grad_norm": 1.3084443807601929, "learning_rate": 3e-06, "loss": 3.0162, "step": 100 }, { "epoch": 5.689900426742532, "eval_loss": 2.7116549015045166, "eval_runtime": 24.0347, "eval_samples_per_second": 57.999, "eval_steps_per_second": 7.281, "step": 100 }, { "epoch": 11.379800853485063, "grad_norm": 1.1205779314041138, "learning_rate": 6e-06, "loss": 2.7064, "step": 200 }, { "epoch": 11.379800853485063, "eval_loss": 2.694298267364502, "eval_runtime": 24.0236, "eval_samples_per_second": 58.026, "eval_steps_per_second": 7.285, "step": 200 }, { "epoch": 17.069701280227594, "grad_norm": 0.6339058876037598, "learning_rate": 5.989966555183947e-06, "loss": 2.6845, "step": 300 }, { "epoch": 17.069701280227594, "eval_loss": 2.6842806339263916, "eval_runtime": 24.0267, "eval_samples_per_second": 58.019, "eval_steps_per_second": 7.284, "step": 300 }, { "epoch": 22.759601706970127, "grad_norm": 0.6396139860153198, "learning_rate": 5.979933110367893e-06, "loss": 2.6786, "step": 400 }, { "epoch": 22.759601706970127, "eval_loss": 2.6777021884918213, "eval_runtime": 23.6734, "eval_samples_per_second": 58.885, "eval_steps_per_second": 7.392, "step": 400 }, { "epoch": 28.44950213371266, "grad_norm": 0.4754956364631653, "learning_rate": 5.96989966555184e-06, "loss": 2.6752, "step": 500 }, { "epoch": 28.44950213371266, "eval_loss": 2.672980785369873, "eval_runtime": 24.0309, "eval_samples_per_second": 58.009, "eval_steps_per_second": 7.282, "step": 500 }, { "epoch": 34.13940256045519, "grad_norm": 0.5028958320617676, "learning_rate": 5.959866220735786e-06, "loss": 2.6735, "step": 600 }, { "epoch": 34.13940256045519, "eval_loss": 2.673126220703125, "eval_runtime": 23.6782, "eval_samples_per_second": 58.873, "eval_steps_per_second": 7.391, "step": 600 }, { "epoch": 39.82930298719772, "grad_norm": 0.6218613386154175, "learning_rate": 5.949832775919732e-06, "loss": 2.6725, "step": 700 }, { "epoch": 39.82930298719772, "eval_loss": 2.666964292526245, "eval_runtime": 24.0295, "eval_samples_per_second": 58.012, "eval_steps_per_second": 7.283, "step": 700 }, { "epoch": 45.519203413940254, "grad_norm": 0.5778615474700928, "learning_rate": 5.939799331103679e-06, "loss": 2.6709, "step": 800 }, { "epoch": 45.519203413940254, "eval_loss": 2.6644680500030518, "eval_runtime": 24.0251, "eval_samples_per_second": 58.023, "eval_steps_per_second": 7.284, "step": 800 }, { "epoch": 51.209103840682786, "grad_norm": 0.46084001660346985, "learning_rate": 5.929765886287626e-06, "loss": 2.6703, "step": 900 }, { "epoch": 51.209103840682786, "eval_loss": 2.6703662872314453, "eval_runtime": 24.0429, "eval_samples_per_second": 57.98, "eval_steps_per_second": 7.279, "step": 900 }, { "epoch": 56.89900426742532, "grad_norm": 0.3723374307155609, "learning_rate": 5.919732441471572e-06, "loss": 2.6698, "step": 1000 }, { "epoch": 56.89900426742532, "eval_loss": 2.6655778884887695, "eval_runtime": 24.0375, "eval_samples_per_second": 57.993, "eval_steps_per_second": 7.28, "step": 1000 }, { "epoch": 62.58890469416785, "grad_norm": 0.3526281714439392, "learning_rate": 5.9096989966555185e-06, "loss": 2.6685, "step": 1100 }, { "epoch": 62.58890469416785, "eval_loss": 2.6650660037994385, "eval_runtime": 23.6507, "eval_samples_per_second": 58.941, "eval_steps_per_second": 7.399, "step": 1100 }, { "epoch": 68.27880512091038, "grad_norm": 0.4236809313297272, "learning_rate": 5.899665551839465e-06, "loss": 2.6679, "step": 1200 }, { "epoch": 68.27880512091038, "eval_loss": 2.66703200340271, "eval_runtime": 23.992, "eval_samples_per_second": 58.103, "eval_steps_per_second": 7.294, "step": 1200 }, { "epoch": 73.96870554765292, "grad_norm": 0.36087411642074585, "learning_rate": 5.889632107023412e-06, "loss": 2.6681, "step": 1300 }, { "epoch": 73.96870554765292, "eval_loss": 2.661330461502075, "eval_runtime": 24.024, "eval_samples_per_second": 58.025, "eval_steps_per_second": 7.284, "step": 1300 }, { "epoch": 79.65860597439544, "grad_norm": 0.40879154205322266, "learning_rate": 5.879598662207358e-06, "loss": 2.6674, "step": 1400 }, { "epoch": 79.65860597439544, "eval_loss": 2.6664044857025146, "eval_runtime": 23.6778, "eval_samples_per_second": 58.874, "eval_steps_per_second": 7.391, "step": 1400 }, { "epoch": 85.34850640113798, "grad_norm": 0.3477596044540405, "learning_rate": 5.869565217391305e-06, "loss": 2.6676, "step": 1500 }, { "epoch": 85.34850640113798, "eval_loss": 2.6639652252197266, "eval_runtime": 24.0594, "eval_samples_per_second": 57.94, "eval_steps_per_second": 7.274, "step": 1500 }, { "epoch": 91.03840682788051, "grad_norm": 0.39076292514801025, "learning_rate": 5.8595317725752514e-06, "loss": 2.6678, "step": 1600 }, { "epoch": 91.03840682788051, "eval_loss": 2.665249824523926, "eval_runtime": 24.0245, "eval_samples_per_second": 58.024, "eval_steps_per_second": 7.284, "step": 1600 }, { "epoch": 96.72830725462305, "grad_norm": 0.4453221261501312, "learning_rate": 5.849498327759197e-06, "loss": 2.6672, "step": 1700 }, { "epoch": 96.72830725462305, "eval_loss": 2.6631760597229004, "eval_runtime": 24.0462, "eval_samples_per_second": 57.972, "eval_steps_per_second": 7.278, "step": 1700 }, { "epoch": 102.41820768136557, "grad_norm": 0.4177381694316864, "learning_rate": 5.839464882943144e-06, "loss": 2.6669, "step": 1800 }, { "epoch": 102.41820768136557, "eval_loss": 2.6606359481811523, "eval_runtime": 24.0304, "eval_samples_per_second": 58.01, "eval_steps_per_second": 7.282, "step": 1800 }, { "epoch": 108.10810810810811, "grad_norm": 0.29331672191619873, "learning_rate": 5.829431438127091e-06, "loss": 2.6663, "step": 1900 }, { "epoch": 108.10810810810811, "eval_loss": 2.662499189376831, "eval_runtime": 23.6649, "eval_samples_per_second": 58.906, "eval_steps_per_second": 7.395, "step": 1900 }, { "epoch": 113.79800853485064, "grad_norm": 0.25176671147346497, "learning_rate": 5.819397993311037e-06, "loss": 2.6661, "step": 2000 }, { "epoch": 113.79800853485064, "eval_loss": 2.662942409515381, "eval_runtime": 24.0202, "eval_samples_per_second": 58.034, "eval_steps_per_second": 7.286, "step": 2000 }, { "epoch": 119.48790896159318, "grad_norm": 0.34689632058143616, "learning_rate": 5.8093645484949836e-06, "loss": 2.666, "step": 2100 }, { "epoch": 119.48790896159318, "eval_loss": 2.6656811237335205, "eval_runtime": 24.0348, "eval_samples_per_second": 57.999, "eval_steps_per_second": 7.281, "step": 2100 }, { "epoch": 125.1778093883357, "grad_norm": 0.36021965742111206, "learning_rate": 5.79933110367893e-06, "loss": 2.6662, "step": 2200 }, { "epoch": 125.1778093883357, "eval_loss": 2.6633687019348145, "eval_runtime": 24.0228, "eval_samples_per_second": 58.028, "eval_steps_per_second": 7.285, "step": 2200 }, { "epoch": 130.86770981507823, "grad_norm": 0.35580527782440186, "learning_rate": 5.789297658862876e-06, "loss": 2.6659, "step": 2300 }, { "epoch": 130.86770981507823, "eval_loss": 2.6656627655029297, "eval_runtime": 24.0299, "eval_samples_per_second": 58.011, "eval_steps_per_second": 7.283, "step": 2300 }, { "epoch": 136.55761024182075, "grad_norm": 0.33932358026504517, "learning_rate": 5.779264214046823e-06, "loss": 2.666, "step": 2400 }, { "epoch": 136.55761024182075, "eval_loss": 2.662867784500122, "eval_runtime": 23.985, "eval_samples_per_second": 58.12, "eval_steps_per_second": 7.296, "step": 2400 }, { "epoch": 142.2475106685633, "grad_norm": 0.25131767988204956, "learning_rate": 5.76923076923077e-06, "loss": 2.6657, "step": 2500 }, { "epoch": 142.2475106685633, "eval_loss": 2.665088415145874, "eval_runtime": 23.9717, "eval_samples_per_second": 58.152, "eval_steps_per_second": 7.3, "step": 2500 }, { "epoch": 147.93741109530583, "grad_norm": 0.3472673296928406, "learning_rate": 5.759197324414716e-06, "loss": 2.6655, "step": 2600 }, { "epoch": 147.93741109530583, "eval_loss": 2.664132833480835, "eval_runtime": 23.99, "eval_samples_per_second": 58.107, "eval_steps_per_second": 7.295, "step": 2600 }, { "epoch": 153.62731152204836, "grad_norm": 0.36761701107025146, "learning_rate": 5.7491638795986624e-06, "loss": 2.666, "step": 2700 }, { "epoch": 153.62731152204836, "eval_loss": 2.6648921966552734, "eval_runtime": 23.9945, "eval_samples_per_second": 58.097, "eval_steps_per_second": 7.293, "step": 2700 }, { "epoch": 159.31721194879088, "grad_norm": 0.2883091866970062, "learning_rate": 5.739130434782609e-06, "loss": 2.666, "step": 2800 }, { "epoch": 159.31721194879088, "eval_loss": 2.6666293144226074, "eval_runtime": 23.6521, "eval_samples_per_second": 58.938, "eval_steps_per_second": 7.399, "step": 2800 }, { "epoch": 165.00711237553344, "grad_norm": 0.28055015206336975, "learning_rate": 5.729096989966555e-06, "loss": 2.6651, "step": 2900 }, { "epoch": 165.00711237553344, "eval_loss": 2.664830207824707, "eval_runtime": 23.9951, "eval_samples_per_second": 58.095, "eval_steps_per_second": 7.293, "step": 2900 }, { "epoch": 170.69701280227596, "grad_norm": 0.24984011054039001, "learning_rate": 5.719063545150502e-06, "loss": 2.6649, "step": 3000 }, { "epoch": 170.69701280227596, "eval_loss": 2.6661195755004883, "eval_runtime": 24.0098, "eval_samples_per_second": 58.06, "eval_steps_per_second": 7.289, "step": 3000 }, { "epoch": 176.3869132290185, "grad_norm": 0.3036094307899475, "learning_rate": 5.709030100334449e-06, "loss": 2.6647, "step": 3100 }, { "epoch": 176.3869132290185, "eval_loss": 2.663729667663574, "eval_runtime": 23.6531, "eval_samples_per_second": 58.935, "eval_steps_per_second": 7.399, "step": 3100 }, { "epoch": 182.07681365576101, "grad_norm": 0.35330143570899963, "learning_rate": 5.698996655518395e-06, "loss": 2.665, "step": 3200 }, { "epoch": 182.07681365576101, "eval_loss": 2.6598894596099854, "eval_runtime": 24.0054, "eval_samples_per_second": 58.07, "eval_steps_per_second": 7.29, "step": 3200 }, { "epoch": 187.76671408250357, "grad_norm": 0.27144384384155273, "learning_rate": 5.688963210702341e-06, "loss": 2.6646, "step": 3300 }, { "epoch": 187.76671408250357, "eval_loss": 2.6629762649536133, "eval_runtime": 23.991, "eval_samples_per_second": 58.105, "eval_steps_per_second": 7.294, "step": 3300 }, { "epoch": 193.4566145092461, "grad_norm": 0.16871358454227448, "learning_rate": 5.678929765886288e-06, "loss": 2.6644, "step": 3400 }, { "epoch": 193.4566145092461, "eval_loss": 2.6613662242889404, "eval_runtime": 23.9942, "eval_samples_per_second": 58.097, "eval_steps_per_second": 7.293, "step": 3400 }, { "epoch": 199.14651493598862, "grad_norm": 0.23622137308120728, "learning_rate": 5.668896321070235e-06, "loss": 2.6647, "step": 3500 }, { "epoch": 199.14651493598862, "eval_loss": 2.665132522583008, "eval_runtime": 23.9894, "eval_samples_per_second": 58.109, "eval_steps_per_second": 7.295, "step": 3500 }, { "epoch": 204.83641536273115, "grad_norm": 0.22713766992092133, "learning_rate": 5.658862876254181e-06, "loss": 2.665, "step": 3600 }, { "epoch": 204.83641536273115, "eval_loss": 2.6628506183624268, "eval_runtime": 24.0029, "eval_samples_per_second": 58.076, "eval_steps_per_second": 7.291, "step": 3600 }, { "epoch": 210.52631578947367, "grad_norm": 0.2724650502204895, "learning_rate": 5.6488294314381275e-06, "loss": 2.6645, "step": 3700 }, { "epoch": 210.52631578947367, "eval_loss": 2.662940740585327, "eval_runtime": 24.0035, "eval_samples_per_second": 58.075, "eval_steps_per_second": 7.291, "step": 3700 }, { "epoch": 216.21621621621622, "grad_norm": 0.2551960349082947, "learning_rate": 5.638795986622074e-06, "loss": 2.6652, "step": 3800 }, { "epoch": 216.21621621621622, "eval_loss": 2.66235089302063, "eval_runtime": 23.6518, "eval_samples_per_second": 58.939, "eval_steps_per_second": 7.399, "step": 3800 }, { "epoch": 221.90611664295875, "grad_norm": 0.23962050676345825, "learning_rate": 5.62876254180602e-06, "loss": 2.6647, "step": 3900 }, { "epoch": 221.90611664295875, "eval_loss": 2.659684896469116, "eval_runtime": 24.0133, "eval_samples_per_second": 58.051, "eval_steps_per_second": 7.288, "step": 3900 }, { "epoch": 227.59601706970128, "grad_norm": 0.21915219724178314, "learning_rate": 5.618729096989967e-06, "loss": 2.6648, "step": 4000 }, { "epoch": 227.59601706970128, "eval_loss": 2.6648192405700684, "eval_runtime": 24.0019, "eval_samples_per_second": 58.079, "eval_steps_per_second": 7.291, "step": 4000 }, { "epoch": 233.2859174964438, "grad_norm": 0.236517071723938, "learning_rate": 5.608695652173914e-06, "loss": 2.6644, "step": 4100 }, { "epoch": 233.2859174964438, "eval_loss": 2.664208173751831, "eval_runtime": 23.9969, "eval_samples_per_second": 58.091, "eval_steps_per_second": 7.293, "step": 4100 }, { "epoch": 238.97581792318636, "grad_norm": 0.2850896716117859, "learning_rate": 5.59866220735786e-06, "loss": 2.6645, "step": 4200 }, { "epoch": 238.97581792318636, "eval_loss": 2.6635079383850098, "eval_runtime": 24.0086, "eval_samples_per_second": 58.063, "eval_steps_per_second": 7.289, "step": 4200 }, { "epoch": 244.66571834992888, "grad_norm": 0.19473238289356232, "learning_rate": 5.588628762541806e-06, "loss": 2.664, "step": 4300 }, { "epoch": 244.66571834992888, "eval_loss": 2.6625959873199463, "eval_runtime": 23.986, "eval_samples_per_second": 58.117, "eval_steps_per_second": 7.296, "step": 4300 }, { "epoch": 250.3556187766714, "grad_norm": 0.3315108120441437, "learning_rate": 5.578595317725753e-06, "loss": 2.6641, "step": 4400 }, { "epoch": 250.3556187766714, "eval_loss": 2.6661853790283203, "eval_runtime": 23.993, "eval_samples_per_second": 58.1, "eval_steps_per_second": 7.294, "step": 4400 }, { "epoch": 256.04551920341396, "grad_norm": 0.1736942082643509, "learning_rate": 5.568561872909699e-06, "loss": 2.6646, "step": 4500 }, { "epoch": 256.04551920341396, "eval_loss": 2.662870168685913, "eval_runtime": 24.0136, "eval_samples_per_second": 58.05, "eval_steps_per_second": 7.288, "step": 4500 }, { "epoch": 261.73541963015646, "grad_norm": 0.28544047474861145, "learning_rate": 5.558528428093646e-06, "loss": 2.6643, "step": 4600 }, { "epoch": 261.73541963015646, "eval_loss": 2.664302349090576, "eval_runtime": 23.6666, "eval_samples_per_second": 58.902, "eval_steps_per_second": 7.394, "step": 4600 }, { "epoch": 267.425320056899, "grad_norm": 0.2484113723039627, "learning_rate": 5.548494983277593e-06, "loss": 2.6647, "step": 4700 }, { "epoch": 267.425320056899, "eval_loss": 2.6621506214141846, "eval_runtime": 24.002, "eval_samples_per_second": 58.078, "eval_steps_per_second": 7.291, "step": 4700 }, { "epoch": 273.1152204836415, "grad_norm": 0.2153732031583786, "learning_rate": 5.5384615384615385e-06, "loss": 2.6642, "step": 4800 }, { "epoch": 273.1152204836415, "eval_loss": 2.661278009414673, "eval_runtime": 23.9926, "eval_samples_per_second": 58.101, "eval_steps_per_second": 7.294, "step": 4800 }, { "epoch": 278.80512091038406, "grad_norm": 0.2783970832824707, "learning_rate": 5.528428093645485e-06, "loss": 2.6645, "step": 4900 }, { "epoch": 278.80512091038406, "eval_loss": 2.6649537086486816, "eval_runtime": 24.0036, "eval_samples_per_second": 58.075, "eval_steps_per_second": 7.291, "step": 4900 }, { "epoch": 284.4950213371266, "grad_norm": 0.19151923060417175, "learning_rate": 5.518394648829432e-06, "loss": 2.6639, "step": 5000 }, { "epoch": 284.4950213371266, "eval_loss": 2.6626217365264893, "eval_runtime": 24.0086, "eval_samples_per_second": 58.062, "eval_steps_per_second": 7.289, "step": 5000 }, { "epoch": 290.1849217638691, "grad_norm": 0.2797650992870331, "learning_rate": 5.508361204013378e-06, "loss": 2.6638, "step": 5100 }, { "epoch": 290.1849217638691, "eval_loss": 2.6611075401306152, "eval_runtime": 24.0032, "eval_samples_per_second": 58.076, "eval_steps_per_second": 7.291, "step": 5100 }, { "epoch": 295.87482219061167, "grad_norm": 0.20348067581653595, "learning_rate": 5.498327759197324e-06, "loss": 2.6641, "step": 5200 }, { "epoch": 295.87482219061167, "eval_loss": 2.6633517742156982, "eval_runtime": 23.9937, "eval_samples_per_second": 58.099, "eval_steps_per_second": 7.294, "step": 5200 }, { "epoch": 301.5647226173542, "grad_norm": 0.28533390164375305, "learning_rate": 5.488294314381271e-06, "loss": 2.6644, "step": 5300 }, { "epoch": 301.5647226173542, "eval_loss": 2.6602139472961426, "eval_runtime": 23.9892, "eval_samples_per_second": 58.109, "eval_steps_per_second": 7.295, "step": 5300 }, { "epoch": 307.2546230440967, "grad_norm": 0.20466282963752747, "learning_rate": 5.478260869565217e-06, "loss": 2.664, "step": 5400 }, { "epoch": 307.2546230440967, "eval_loss": 2.6633496284484863, "eval_runtime": 23.9996, "eval_samples_per_second": 58.084, "eval_steps_per_second": 7.292, "step": 5400 }, { "epoch": 312.9445234708393, "grad_norm": 0.2744331657886505, "learning_rate": 5.468227424749163e-06, "loss": 2.6642, "step": 5500 }, { "epoch": 312.9445234708393, "eval_loss": 2.66107439994812, "eval_runtime": 23.6506, "eval_samples_per_second": 58.941, "eval_steps_per_second": 7.399, "step": 5500 }, { "epoch": 318.63442389758177, "grad_norm": 0.1974910944700241, "learning_rate": 5.45819397993311e-06, "loss": 2.6639, "step": 5600 }, { "epoch": 318.63442389758177, "eval_loss": 2.6617164611816406, "eval_runtime": 23.9925, "eval_samples_per_second": 58.101, "eval_steps_per_second": 7.294, "step": 5600 }, { "epoch": 324.3243243243243, "grad_norm": 0.2324630171060562, "learning_rate": 5.448160535117057e-06, "loss": 2.6639, "step": 5700 }, { "epoch": 324.3243243243243, "eval_loss": 2.662553071975708, "eval_runtime": 23.9969, "eval_samples_per_second": 58.091, "eval_steps_per_second": 7.293, "step": 5700 }, { "epoch": 330.0142247510669, "grad_norm": 0.19881758093833923, "learning_rate": 5.438127090301003e-06, "loss": 2.6642, "step": 5800 }, { "epoch": 330.0142247510669, "eval_loss": 2.6611857414245605, "eval_runtime": 23.9913, "eval_samples_per_second": 58.104, "eval_steps_per_second": 7.294, "step": 5800 }, { "epoch": 335.7041251778094, "grad_norm": 0.2262159287929535, "learning_rate": 5.4280936454849495e-06, "loss": 2.664, "step": 5900 }, { "epoch": 335.7041251778094, "eval_loss": 2.663783073425293, "eval_runtime": 23.9991, "eval_samples_per_second": 58.086, "eval_steps_per_second": 7.292, "step": 5900 }, { "epoch": 341.3940256045519, "grad_norm": 0.19771187007427216, "learning_rate": 5.418060200668896e-06, "loss": 2.6638, "step": 6000 }, { "epoch": 341.3940256045519, "eval_loss": 2.6637065410614014, "eval_runtime": 23.9906, "eval_samples_per_second": 58.106, "eval_steps_per_second": 7.295, "step": 6000 }, { "epoch": 347.0839260312944, "grad_norm": 0.21087121963500977, "learning_rate": 5.408026755852843e-06, "loss": 2.664, "step": 6100 }, { "epoch": 347.0839260312944, "eval_loss": 2.6605749130249023, "eval_runtime": 23.9813, "eval_samples_per_second": 58.129, "eval_steps_per_second": 7.297, "step": 6100 }, { "epoch": 352.773826458037, "grad_norm": 0.16268645226955414, "learning_rate": 5.397993311036789e-06, "loss": 2.6642, "step": 6200 }, { "epoch": 352.773826458037, "eval_loss": 2.66109299659729, "eval_runtime": 23.9898, "eval_samples_per_second": 58.108, "eval_steps_per_second": 7.295, "step": 6200 }, { "epoch": 358.46372688477953, "grad_norm": 0.17065826058387756, "learning_rate": 5.387959866220736e-06, "loss": 2.6637, "step": 6300 }, { "epoch": 358.46372688477953, "eval_loss": 2.662609100341797, "eval_runtime": 23.6456, "eval_samples_per_second": 58.954, "eval_steps_per_second": 7.401, "step": 6300 }, { "epoch": 364.15362731152203, "grad_norm": 0.21785375475883484, "learning_rate": 5.3779264214046825e-06, "loss": 2.6637, "step": 6400 }, { "epoch": 364.15362731152203, "eval_loss": 2.6664772033691406, "eval_runtime": 24.0047, "eval_samples_per_second": 58.072, "eval_steps_per_second": 7.29, "step": 6400 }, { "epoch": 369.8435277382646, "grad_norm": 0.26267367601394653, "learning_rate": 5.367892976588628e-06, "loss": 2.6637, "step": 6500 }, { "epoch": 369.8435277382646, "eval_loss": 2.6627228260040283, "eval_runtime": 23.9957, "eval_samples_per_second": 58.094, "eval_steps_per_second": 7.293, "step": 6500 }, { "epoch": 375.53342816500714, "grad_norm": 0.21765349805355072, "learning_rate": 5.357859531772575e-06, "loss": 2.6636, "step": 6600 }, { "epoch": 375.53342816500714, "eval_loss": 2.663877487182617, "eval_runtime": 24.0007, "eval_samples_per_second": 58.082, "eval_steps_per_second": 7.291, "step": 6600 }, { "epoch": 381.22332859174963, "grad_norm": 0.25031423568725586, "learning_rate": 5.347826086956522e-06, "loss": 2.6632, "step": 6700 }, { "epoch": 381.22332859174963, "eval_loss": 2.659921884536743, "eval_runtime": 24.0127, "eval_samples_per_second": 58.053, "eval_steps_per_second": 7.288, "step": 6700 }, { "epoch": 386.9132290184922, "grad_norm": 0.19761057198047638, "learning_rate": 5.337792642140468e-06, "loss": 2.6643, "step": 6800 }, { "epoch": 386.9132290184922, "eval_loss": 2.6612935066223145, "eval_runtime": 24.0055, "eval_samples_per_second": 58.07, "eval_steps_per_second": 7.29, "step": 6800 }, { "epoch": 392.6031294452347, "grad_norm": 0.2031594067811966, "learning_rate": 5.327759197324415e-06, "loss": 2.6639, "step": 6900 }, { "epoch": 392.6031294452347, "eval_loss": 2.6605868339538574, "eval_runtime": 23.9939, "eval_samples_per_second": 58.098, "eval_steps_per_second": 7.294, "step": 6900 }, { "epoch": 398.29302987197724, "grad_norm": 0.23052236437797546, "learning_rate": 5.317725752508361e-06, "loss": 2.6636, "step": 7000 }, { "epoch": 398.29302987197724, "eval_loss": 2.6643214225769043, "eval_runtime": 23.9814, "eval_samples_per_second": 58.128, "eval_steps_per_second": 7.297, "step": 7000 }, { "epoch": 403.9829302987198, "grad_norm": 0.22435928881168365, "learning_rate": 5.307692307692307e-06, "loss": 2.6641, "step": 7100 }, { "epoch": 403.9829302987198, "eval_loss": 2.6603801250457764, "eval_runtime": 23.6345, "eval_samples_per_second": 58.982, "eval_steps_per_second": 7.404, "step": 7100 }, { "epoch": 409.6728307254623, "grad_norm": 0.25684019923210144, "learning_rate": 5.297658862876254e-06, "loss": 2.6629, "step": 7200 }, { "epoch": 409.6728307254623, "eval_loss": 2.661247968673706, "eval_runtime": 24.0023, "eval_samples_per_second": 58.078, "eval_steps_per_second": 7.291, "step": 7200 }, { "epoch": 415.36273115220484, "grad_norm": 0.1864226758480072, "learning_rate": 5.287625418060201e-06, "loss": 2.6635, "step": 7300 }, { "epoch": 415.36273115220484, "eval_loss": 2.6621150970458984, "eval_runtime": 23.632, "eval_samples_per_second": 58.988, "eval_steps_per_second": 7.405, "step": 7300 }, { "epoch": 421.05263157894734, "grad_norm": 0.2722451090812683, "learning_rate": 5.277591973244147e-06, "loss": 2.6632, "step": 7400 }, { "epoch": 421.05263157894734, "eval_loss": 2.6618592739105225, "eval_runtime": 24.0004, "eval_samples_per_second": 58.082, "eval_steps_per_second": 7.292, "step": 7400 }, { "epoch": 426.7425320056899, "grad_norm": 0.22736287117004395, "learning_rate": 5.2675585284280935e-06, "loss": 2.6634, "step": 7500 }, { "epoch": 426.7425320056899, "eval_loss": 2.664785385131836, "eval_runtime": 23.9908, "eval_samples_per_second": 58.106, "eval_steps_per_second": 7.294, "step": 7500 }, { "epoch": 432.43243243243245, "grad_norm": 0.2094627469778061, "learning_rate": 5.25752508361204e-06, "loss": 2.663, "step": 7600 }, { "epoch": 432.43243243243245, "eval_loss": 2.6618707180023193, "eval_runtime": 23.9741, "eval_samples_per_second": 58.146, "eval_steps_per_second": 7.3, "step": 7600 }, { "epoch": 438.12233285917495, "grad_norm": 0.19710348546504974, "learning_rate": 5.247491638795986e-06, "loss": 2.6633, "step": 7700 }, { "epoch": 438.12233285917495, "eval_loss": 2.660637617111206, "eval_runtime": 23.9987, "eval_samples_per_second": 58.087, "eval_steps_per_second": 7.292, "step": 7700 }, { "epoch": 443.8122332859175, "grad_norm": 0.23215074837207794, "learning_rate": 5.237458193979933e-06, "loss": 2.6635, "step": 7800 }, { "epoch": 443.8122332859175, "eval_loss": 2.6619646549224854, "eval_runtime": 24.0021, "eval_samples_per_second": 58.078, "eval_steps_per_second": 7.291, "step": 7800 }, { "epoch": 449.50213371266005, "grad_norm": 0.2374960482120514, "learning_rate": 5.22742474916388e-06, "loss": 2.6632, "step": 7900 }, { "epoch": 449.50213371266005, "eval_loss": 2.660618305206299, "eval_runtime": 23.9883, "eval_samples_per_second": 58.112, "eval_steps_per_second": 7.295, "step": 7900 }, { "epoch": 455.19203413940255, "grad_norm": 0.22597618401050568, "learning_rate": 5.2173913043478265e-06, "loss": 2.6628, "step": 8000 }, { "epoch": 455.19203413940255, "eval_loss": 2.6641225814819336, "eval_runtime": 23.9895, "eval_samples_per_second": 58.109, "eval_steps_per_second": 7.295, "step": 8000 }, { "epoch": 460.8819345661451, "grad_norm": 0.16886833310127258, "learning_rate": 5.207357859531772e-06, "loss": 2.6624, "step": 8100 }, { "epoch": 460.8819345661451, "eval_loss": 2.663078784942627, "eval_runtime": 23.999, "eval_samples_per_second": 58.086, "eval_steps_per_second": 7.292, "step": 8100 }, { "epoch": 466.5718349928876, "grad_norm": 0.23887428641319275, "learning_rate": 5.197324414715719e-06, "loss": 2.6627, "step": 8200 }, { "epoch": 466.5718349928876, "eval_loss": 2.661996841430664, "eval_runtime": 23.676, "eval_samples_per_second": 58.878, "eval_steps_per_second": 7.391, "step": 8200 }, { "epoch": 472.26173541963016, "grad_norm": 0.21171975135803223, "learning_rate": 5.187290969899666e-06, "loss": 2.6628, "step": 8300 }, { "epoch": 472.26173541963016, "eval_loss": 2.661268711090088, "eval_runtime": 24.009, "eval_samples_per_second": 58.062, "eval_steps_per_second": 7.289, "step": 8300 }, { "epoch": 477.9516358463727, "grad_norm": 0.2626218795776367, "learning_rate": 5.177257525083612e-06, "loss": 2.6627, "step": 8400 }, { "epoch": 477.9516358463727, "eval_loss": 2.6632938385009766, "eval_runtime": 23.9972, "eval_samples_per_second": 58.09, "eval_steps_per_second": 7.293, "step": 8400 }, { "epoch": 483.6415362731152, "grad_norm": 0.2506119906902313, "learning_rate": 5.167224080267559e-06, "loss": 2.6626, "step": 8500 }, { "epoch": 483.6415362731152, "eval_loss": 2.6621577739715576, "eval_runtime": 24.0009, "eval_samples_per_second": 58.081, "eval_steps_per_second": 7.291, "step": 8500 }, { "epoch": 489.33143669985776, "grad_norm": 0.20735953748226166, "learning_rate": 5.157190635451505e-06, "loss": 2.6629, "step": 8600 }, { "epoch": 489.33143669985776, "eval_loss": 2.6636762619018555, "eval_runtime": 23.9985, "eval_samples_per_second": 58.087, "eval_steps_per_second": 7.292, "step": 8600 }, { "epoch": 495.02133712660026, "grad_norm": 0.2238348424434662, "learning_rate": 5.147157190635451e-06, "loss": 2.6627, "step": 8700 }, { "epoch": 495.02133712660026, "eval_loss": 2.6605405807495117, "eval_runtime": 24.0031, "eval_samples_per_second": 58.076, "eval_steps_per_second": 7.291, "step": 8700 }, { "epoch": 500.7112375533428, "grad_norm": 0.25992342829704285, "learning_rate": 5.137123745819398e-06, "loss": 2.6627, "step": 8800 }, { "epoch": 500.7112375533428, "eval_loss": 2.6636483669281006, "eval_runtime": 23.9915, "eval_samples_per_second": 58.104, "eval_steps_per_second": 7.294, "step": 8800 }, { "epoch": 506.40113798008537, "grad_norm": 0.16477429866790771, "learning_rate": 5.127090301003345e-06, "loss": 2.6626, "step": 8900 }, { "epoch": 506.40113798008537, "eval_loss": 2.6633760929107666, "eval_runtime": 23.6486, "eval_samples_per_second": 58.946, "eval_steps_per_second": 7.4, "step": 8900 }, { "epoch": 512.0910384068279, "grad_norm": 0.1675286740064621, "learning_rate": 5.117056856187291e-06, "loss": 2.6618, "step": 9000 }, { "epoch": 512.0910384068279, "eval_loss": 2.6642019748687744, "eval_runtime": 23.9989, "eval_samples_per_second": 58.086, "eval_steps_per_second": 7.292, "step": 9000 }, { "epoch": 517.7809388335704, "grad_norm": 0.26264622807502747, "learning_rate": 5.1070234113712375e-06, "loss": 2.6613, "step": 9100 }, { "epoch": 517.7809388335704, "eval_loss": 2.6624467372894287, "eval_runtime": 23.9956, "eval_samples_per_second": 58.094, "eval_steps_per_second": 7.293, "step": 9100 }, { "epoch": 523.4708392603129, "grad_norm": 0.20508848130702972, "learning_rate": 5.096989966555184e-06, "loss": 2.6592, "step": 9200 }, { "epoch": 523.4708392603129, "eval_loss": 2.6613006591796875, "eval_runtime": 24.0077, "eval_samples_per_second": 58.065, "eval_steps_per_second": 7.289, "step": 9200 }, { "epoch": 529.1607396870555, "grad_norm": 0.20402471721172333, "learning_rate": 5.08695652173913e-06, "loss": 2.6572, "step": 9300 }, { "epoch": 529.1607396870555, "eval_loss": 2.6541850566864014, "eval_runtime": 23.9913, "eval_samples_per_second": 58.105, "eval_steps_per_second": 7.294, "step": 9300 }, { "epoch": 534.850640113798, "grad_norm": 0.2935788929462433, "learning_rate": 5.076923076923077e-06, "loss": 2.6539, "step": 9400 }, { "epoch": 534.850640113798, "eval_loss": 2.6466681957244873, "eval_runtime": 24.0031, "eval_samples_per_second": 58.076, "eval_steps_per_second": 7.291, "step": 9400 }, { "epoch": 540.5405405405405, "grad_norm": 0.22674228250980377, "learning_rate": 5.066889632107024e-06, "loss": 2.6516, "step": 9500 }, { "epoch": 540.5405405405405, "eval_loss": 2.643643379211426, "eval_runtime": 23.9831, "eval_samples_per_second": 58.124, "eval_steps_per_second": 7.297, "step": 9500 }, { "epoch": 546.230440967283, "grad_norm": 0.20724594593048096, "learning_rate": 5.05685618729097e-06, "loss": 2.6499, "step": 9600 }, { "epoch": 546.230440967283, "eval_loss": 2.6470532417297363, "eval_runtime": 23.9913, "eval_samples_per_second": 58.104, "eval_steps_per_second": 7.294, "step": 9600 }, { "epoch": 551.9203413940256, "grad_norm": 0.22264239192008972, "learning_rate": 5.046822742474916e-06, "loss": 2.6484, "step": 9700 }, { "epoch": 551.9203413940256, "eval_loss": 2.6430468559265137, "eval_runtime": 23.6338, "eval_samples_per_second": 58.983, "eval_steps_per_second": 7.405, "step": 9700 }, { "epoch": 557.6102418207681, "grad_norm": 0.2465650737285614, "learning_rate": 5.036789297658863e-06, "loss": 2.6469, "step": 9800 }, { "epoch": 557.6102418207681, "eval_loss": 2.641869068145752, "eval_runtime": 23.9904, "eval_samples_per_second": 58.107, "eval_steps_per_second": 7.295, "step": 9800 }, { "epoch": 563.3001422475106, "grad_norm": 0.22635580599308014, "learning_rate": 5.02675585284281e-06, "loss": 2.6458, "step": 9900 }, { "epoch": 563.3001422475106, "eval_loss": 2.6428213119506836, "eval_runtime": 23.9892, "eval_samples_per_second": 58.11, "eval_steps_per_second": 7.295, "step": 9900 }, { "epoch": 568.9900426742532, "grad_norm": 0.26153308153152466, "learning_rate": 5.016722408026756e-06, "loss": 2.6446, "step": 10000 }, { "epoch": 568.9900426742532, "eval_loss": 2.639491081237793, "eval_runtime": 23.9942, "eval_samples_per_second": 58.097, "eval_steps_per_second": 7.293, "step": 10000 }, { "epoch": 574.6799431009957, "grad_norm": 0.21078501641750336, "learning_rate": 5.0066889632107026e-06, "loss": 2.6441, "step": 10100 }, { "epoch": 574.6799431009957, "eval_loss": 2.641256809234619, "eval_runtime": 23.6406, "eval_samples_per_second": 58.966, "eval_steps_per_second": 7.403, "step": 10100 }, { "epoch": 580.3698435277382, "grad_norm": 0.24298006296157837, "learning_rate": 4.996655518394649e-06, "loss": 2.6428, "step": 10200 }, { "epoch": 580.3698435277382, "eval_loss": 2.635876417160034, "eval_runtime": 23.9845, "eval_samples_per_second": 58.121, "eval_steps_per_second": 7.296, "step": 10200 }, { "epoch": 586.0597439544808, "grad_norm": 0.24525369703769684, "learning_rate": 4.986622073578595e-06, "loss": 2.6422, "step": 10300 }, { "epoch": 586.0597439544808, "eval_loss": 2.637178421020508, "eval_runtime": 23.9878, "eval_samples_per_second": 58.113, "eval_steps_per_second": 7.295, "step": 10300 }, { "epoch": 591.7496443812233, "grad_norm": 0.20690536499023438, "learning_rate": 4.976588628762542e-06, "loss": 2.6415, "step": 10400 }, { "epoch": 591.7496443812233, "eval_loss": 2.6367316246032715, "eval_runtime": 24.0107, "eval_samples_per_second": 58.057, "eval_steps_per_second": 7.288, "step": 10400 }, { "epoch": 597.4395448079658, "grad_norm": 0.2113800048828125, "learning_rate": 4.966555183946489e-06, "loss": 2.6403, "step": 10500 }, { "epoch": 597.4395448079658, "eval_loss": 2.6348278522491455, "eval_runtime": 23.9887, "eval_samples_per_second": 58.111, "eval_steps_per_second": 7.295, "step": 10500 }, { "epoch": 603.1294452347084, "grad_norm": 0.261877179145813, "learning_rate": 4.956521739130435e-06, "loss": 2.6402, "step": 10600 }, { "epoch": 603.1294452347084, "eval_loss": 2.636289358139038, "eval_runtime": 23.9838, "eval_samples_per_second": 58.123, "eval_steps_per_second": 7.297, "step": 10600 }, { "epoch": 608.8193456614509, "grad_norm": 0.19519160687923431, "learning_rate": 4.9464882943143815e-06, "loss": 2.6397, "step": 10700 }, { "epoch": 608.8193456614509, "eval_loss": 2.6352500915527344, "eval_runtime": 23.9822, "eval_samples_per_second": 58.126, "eval_steps_per_second": 7.297, "step": 10700 }, { "epoch": 614.5092460881934, "grad_norm": 0.1956864595413208, "learning_rate": 4.936454849498328e-06, "loss": 2.6389, "step": 10800 }, { "epoch": 614.5092460881934, "eval_loss": 2.6334805488586426, "eval_runtime": 23.9978, "eval_samples_per_second": 58.089, "eval_steps_per_second": 7.292, "step": 10800 }, { "epoch": 620.1991465149359, "grad_norm": 0.23185935616493225, "learning_rate": 4.926421404682274e-06, "loss": 2.6382, "step": 10900 }, { "epoch": 620.1991465149359, "eval_loss": 2.6361021995544434, "eval_runtime": 23.6451, "eval_samples_per_second": 58.955, "eval_steps_per_second": 7.401, "step": 10900 }, { "epoch": 625.8890469416785, "grad_norm": 0.20759572088718414, "learning_rate": 4.916387959866221e-06, "loss": 2.6382, "step": 11000 }, { "epoch": 625.8890469416785, "eval_loss": 2.634660243988037, "eval_runtime": 23.9847, "eval_samples_per_second": 58.12, "eval_steps_per_second": 7.296, "step": 11000 }, { "epoch": 631.578947368421, "grad_norm": 0.24066907167434692, "learning_rate": 4.906354515050168e-06, "loss": 2.6372, "step": 11100 }, { "epoch": 631.578947368421, "eval_loss": 2.6299962997436523, "eval_runtime": 23.979, "eval_samples_per_second": 58.134, "eval_steps_per_second": 7.298, "step": 11100 }, { "epoch": 637.2688477951635, "grad_norm": 0.23204158246517181, "learning_rate": 4.8963210702341136e-06, "loss": 2.637, "step": 11200 }, { "epoch": 637.2688477951635, "eval_loss": 2.631197690963745, "eval_runtime": 23.6362, "eval_samples_per_second": 58.977, "eval_steps_per_second": 7.404, "step": 11200 }, { "epoch": 642.9587482219061, "grad_norm": 0.21934348344802856, "learning_rate": 4.88628762541806e-06, "loss": 2.6363, "step": 11300 }, { "epoch": 642.9587482219061, "eval_loss": 2.629666566848755, "eval_runtime": 23.9798, "eval_samples_per_second": 58.132, "eval_steps_per_second": 7.298, "step": 11300 }, { "epoch": 648.6486486486486, "grad_norm": 0.18657149374485016, "learning_rate": 4.876254180602007e-06, "loss": 2.6358, "step": 11400 }, { "epoch": 648.6486486486486, "eval_loss": 2.632611036300659, "eval_runtime": 23.9657, "eval_samples_per_second": 58.166, "eval_steps_per_second": 7.302, "step": 11400 }, { "epoch": 654.3385490753911, "grad_norm": 0.2166602909564972, "learning_rate": 4.866220735785953e-06, "loss": 2.6355, "step": 11500 }, { "epoch": 654.3385490753911, "eval_loss": 2.6293506622314453, "eval_runtime": 23.6252, "eval_samples_per_second": 59.005, "eval_steps_per_second": 7.407, "step": 11500 }, { "epoch": 660.0284495021338, "grad_norm": 0.20717021822929382, "learning_rate": 4.8561872909699e-06, "loss": 2.6356, "step": 11600 }, { "epoch": 660.0284495021338, "eval_loss": 2.6316463947296143, "eval_runtime": 23.968, "eval_samples_per_second": 58.161, "eval_steps_per_second": 7.301, "step": 11600 }, { "epoch": 665.7183499288763, "grad_norm": 0.22796860337257385, "learning_rate": 4.8461538461538465e-06, "loss": 2.6347, "step": 11700 }, { "epoch": 665.7183499288763, "eval_loss": 2.6268954277038574, "eval_runtime": 23.9849, "eval_samples_per_second": 58.12, "eval_steps_per_second": 7.296, "step": 11700 }, { "epoch": 671.4082503556187, "grad_norm": 0.20367011427879333, "learning_rate": 4.8361204013377925e-06, "loss": 2.6345, "step": 11800 }, { "epoch": 671.4082503556187, "eval_loss": 2.63277530670166, "eval_runtime": 23.9971, "eval_samples_per_second": 58.09, "eval_steps_per_second": 7.293, "step": 11800 }, { "epoch": 677.0981507823614, "grad_norm": 0.186284601688385, "learning_rate": 4.826086956521739e-06, "loss": 2.6344, "step": 11900 }, { "epoch": 677.0981507823614, "eval_loss": 2.6296372413635254, "eval_runtime": 23.6255, "eval_samples_per_second": 59.004, "eval_steps_per_second": 7.407, "step": 11900 }, { "epoch": 682.7880512091039, "grad_norm": 0.16128043830394745, "learning_rate": 4.816053511705686e-06, "loss": 2.634, "step": 12000 }, { "epoch": 682.7880512091039, "eval_loss": 2.6282401084899902, "eval_runtime": 23.9698, "eval_samples_per_second": 58.157, "eval_steps_per_second": 7.301, "step": 12000 }, { "epoch": 688.4779516358464, "grad_norm": 0.21663859486579895, "learning_rate": 4.806020066889633e-06, "loss": 2.6336, "step": 12100 }, { "epoch": 688.4779516358464, "eval_loss": 2.62992525100708, "eval_runtime": 23.9831, "eval_samples_per_second": 58.124, "eval_steps_per_second": 7.297, "step": 12100 }, { "epoch": 694.1678520625888, "grad_norm": 0.15192851424217224, "learning_rate": 4.795986622073579e-06, "loss": 2.6328, "step": 12200 }, { "epoch": 694.1678520625888, "eval_loss": 2.628678321838379, "eval_runtime": 23.9973, "eval_samples_per_second": 58.09, "eval_steps_per_second": 7.292, "step": 12200 }, { "epoch": 699.8577524893315, "grad_norm": 0.22168278694152832, "learning_rate": 4.785953177257525e-06, "loss": 2.6323, "step": 12300 }, { "epoch": 699.8577524893315, "eval_loss": 2.6261205673217773, "eval_runtime": 24.0543, "eval_samples_per_second": 57.952, "eval_steps_per_second": 7.275, "step": 12300 }, { "epoch": 705.547652916074, "grad_norm": 0.1755346953868866, "learning_rate": 4.775919732441472e-06, "loss": 2.6324, "step": 12400 }, { "epoch": 705.547652916074, "eval_loss": 2.6293275356292725, "eval_runtime": 23.6313, "eval_samples_per_second": 58.99, "eval_steps_per_second": 7.405, "step": 12400 }, { "epoch": 711.2375533428165, "grad_norm": 0.21894784271717072, "learning_rate": 4.765886287625418e-06, "loss": 2.6321, "step": 12500 }, { "epoch": 711.2375533428165, "eval_loss": 2.627187967300415, "eval_runtime": 23.9772, "eval_samples_per_second": 58.139, "eval_steps_per_second": 7.299, "step": 12500 }, { "epoch": 716.9274537695591, "grad_norm": 0.1610960066318512, "learning_rate": 4.755852842809365e-06, "loss": 2.6319, "step": 12600 }, { "epoch": 716.9274537695591, "eval_loss": 2.6290180683135986, "eval_runtime": 23.9949, "eval_samples_per_second": 58.096, "eval_steps_per_second": 7.293, "step": 12600 }, { "epoch": 722.6173541963016, "grad_norm": 0.24491451680660248, "learning_rate": 4.745819397993312e-06, "loss": 2.6317, "step": 12700 }, { "epoch": 722.6173541963016, "eval_loss": 2.6274964809417725, "eval_runtime": 23.9713, "eval_samples_per_second": 58.153, "eval_steps_per_second": 7.3, "step": 12700 }, { "epoch": 728.3072546230441, "grad_norm": 0.16983364522457123, "learning_rate": 4.7357859531772575e-06, "loss": 2.6318, "step": 12800 }, { "epoch": 728.3072546230441, "eval_loss": 2.6273210048675537, "eval_runtime": 23.978, "eval_samples_per_second": 58.137, "eval_steps_per_second": 7.298, "step": 12800 }, { "epoch": 733.9971550497867, "grad_norm": 0.17345738410949707, "learning_rate": 4.725752508361204e-06, "loss": 2.6311, "step": 12900 }, { "epoch": 733.9971550497867, "eval_loss": 2.6259231567382812, "eval_runtime": 24.0133, "eval_samples_per_second": 58.051, "eval_steps_per_second": 7.288, "step": 12900 }, { "epoch": 739.6870554765292, "grad_norm": 0.21643972396850586, "learning_rate": 4.715719063545151e-06, "loss": 2.631, "step": 13000 }, { "epoch": 739.6870554765292, "eval_loss": 2.6242847442626953, "eval_runtime": 23.975, "eval_samples_per_second": 58.144, "eval_steps_per_second": 7.299, "step": 13000 }, { "epoch": 745.3769559032717, "grad_norm": 0.1840481460094452, "learning_rate": 4.705685618729097e-06, "loss": 2.6308, "step": 13100 }, { "epoch": 745.3769559032717, "eval_loss": 2.6283822059631348, "eval_runtime": 24.0095, "eval_samples_per_second": 58.06, "eval_steps_per_second": 7.289, "step": 13100 }, { "epoch": 751.0668563300143, "grad_norm": 0.23446989059448242, "learning_rate": 4.695652173913044e-06, "loss": 2.6304, "step": 13200 }, { "epoch": 751.0668563300143, "eval_loss": 2.629518985748291, "eval_runtime": 23.978, "eval_samples_per_second": 58.137, "eval_steps_per_second": 7.298, "step": 13200 }, { "epoch": 756.7567567567568, "grad_norm": 0.19253458082675934, "learning_rate": 4.6856187290969905e-06, "loss": 2.6298, "step": 13300 }, { "epoch": 756.7567567567568, "eval_loss": 2.624861001968384, "eval_runtime": 23.9689, "eval_samples_per_second": 58.159, "eval_steps_per_second": 7.301, "step": 13300 }, { "epoch": 762.4466571834993, "grad_norm": 0.18916001915931702, "learning_rate": 4.675585284280936e-06, "loss": 2.6295, "step": 13400 }, { "epoch": 762.4466571834993, "eval_loss": 2.6245834827423096, "eval_runtime": 23.6275, "eval_samples_per_second": 58.999, "eval_steps_per_second": 7.407, "step": 13400 }, { "epoch": 768.1365576102418, "grad_norm": 0.18140235543251038, "learning_rate": 4.665551839464883e-06, "loss": 2.6294, "step": 13500 }, { "epoch": 768.1365576102418, "eval_loss": 2.6280317306518555, "eval_runtime": 23.9728, "eval_samples_per_second": 58.149, "eval_steps_per_second": 7.3, "step": 13500 }, { "epoch": 773.8264580369844, "grad_norm": 0.2032419890165329, "learning_rate": 4.65551839464883e-06, "loss": 2.6296, "step": 13600 }, { "epoch": 773.8264580369844, "eval_loss": 2.625180721282959, "eval_runtime": 23.9783, "eval_samples_per_second": 58.136, "eval_steps_per_second": 7.298, "step": 13600 }, { "epoch": 779.5163584637269, "grad_norm": 0.19860903918743134, "learning_rate": 4.645484949832776e-06, "loss": 2.6285, "step": 13700 }, { "epoch": 779.5163584637269, "eval_loss": 2.6238112449645996, "eval_runtime": 23.6456, "eval_samples_per_second": 58.954, "eval_steps_per_second": 7.401, "step": 13700 }, { "epoch": 785.2062588904694, "grad_norm": 0.19440245628356934, "learning_rate": 4.635451505016723e-06, "loss": 2.6288, "step": 13800 }, { "epoch": 785.2062588904694, "eval_loss": 2.625662088394165, "eval_runtime": 23.9906, "eval_samples_per_second": 58.106, "eval_steps_per_second": 7.295, "step": 13800 }, { "epoch": 790.896159317212, "grad_norm": 0.2409134954214096, "learning_rate": 4.625418060200669e-06, "loss": 2.6283, "step": 13900 }, { "epoch": 790.896159317212, "eval_loss": 2.626107931137085, "eval_runtime": 23.9912, "eval_samples_per_second": 58.105, "eval_steps_per_second": 7.294, "step": 13900 }, { "epoch": 796.5860597439545, "grad_norm": 0.19188816845417023, "learning_rate": 4.615384615384616e-06, "loss": 2.6282, "step": 14000 }, { "epoch": 796.5860597439545, "eval_loss": 2.6254701614379883, "eval_runtime": 23.969, "eval_samples_per_second": 58.158, "eval_steps_per_second": 7.301, "step": 14000 }, { "epoch": 802.275960170697, "grad_norm": 0.2168819010257721, "learning_rate": 4.605351170568562e-06, "loss": 2.6277, "step": 14100 }, { "epoch": 802.275960170697, "eval_loss": 2.6247129440307617, "eval_runtime": 23.9775, "eval_samples_per_second": 58.138, "eval_steps_per_second": 7.299, "step": 14100 }, { "epoch": 807.9658605974396, "grad_norm": 0.21990340948104858, "learning_rate": 4.595317725752509e-06, "loss": 2.6282, "step": 14200 }, { "epoch": 807.9658605974396, "eval_loss": 2.6232125759124756, "eval_runtime": 23.9967, "eval_samples_per_second": 58.091, "eval_steps_per_second": 7.293, "step": 14200 }, { "epoch": 813.6557610241821, "grad_norm": 0.21468599140644073, "learning_rate": 4.585284280936456e-06, "loss": 2.6275, "step": 14300 }, { "epoch": 813.6557610241821, "eval_loss": 2.622816324234009, "eval_runtime": 23.9966, "eval_samples_per_second": 58.092, "eval_steps_per_second": 7.293, "step": 14300 }, { "epoch": 819.3456614509246, "grad_norm": 0.18678580224514008, "learning_rate": 4.5752508361204015e-06, "loss": 2.6269, "step": 14400 }, { "epoch": 819.3456614509246, "eval_loss": 2.6258599758148193, "eval_runtime": 23.9926, "eval_samples_per_second": 58.101, "eval_steps_per_second": 7.294, "step": 14400 }, { "epoch": 825.0355618776672, "grad_norm": 0.18838590383529663, "learning_rate": 4.565217391304348e-06, "loss": 2.6267, "step": 14500 }, { "epoch": 825.0355618776672, "eval_loss": 2.6210570335388184, "eval_runtime": 23.9725, "eval_samples_per_second": 58.15, "eval_steps_per_second": 7.3, "step": 14500 }, { "epoch": 830.7254623044097, "grad_norm": 0.19508691132068634, "learning_rate": 4.555183946488295e-06, "loss": 2.6266, "step": 14600 }, { "epoch": 830.7254623044097, "eval_loss": 2.6242544651031494, "eval_runtime": 23.6209, "eval_samples_per_second": 59.016, "eval_steps_per_second": 7.409, "step": 14600 }, { "epoch": 836.4153627311522, "grad_norm": 0.16247296333312988, "learning_rate": 4.545150501672241e-06, "loss": 2.6269, "step": 14700 }, { "epoch": 836.4153627311522, "eval_loss": 2.622783899307251, "eval_runtime": 24.0053, "eval_samples_per_second": 58.071, "eval_steps_per_second": 7.29, "step": 14700 }, { "epoch": 842.1052631578947, "grad_norm": 0.16786298155784607, "learning_rate": 4.535117056856188e-06, "loss": 2.6265, "step": 14800 }, { "epoch": 842.1052631578947, "eval_loss": 2.6206467151641846, "eval_runtime": 23.9727, "eval_samples_per_second": 58.15, "eval_steps_per_second": 7.3, "step": 14800 }, { "epoch": 847.7951635846373, "grad_norm": 0.16687853634357452, "learning_rate": 4.5250836120401345e-06, "loss": 2.6264, "step": 14900 }, { "epoch": 847.7951635846373, "eval_loss": 2.622387409210205, "eval_runtime": 23.9866, "eval_samples_per_second": 58.116, "eval_steps_per_second": 7.296, "step": 14900 }, { "epoch": 853.4850640113798, "grad_norm": 0.16502691805362701, "learning_rate": 4.51505016722408e-06, "loss": 2.6256, "step": 15000 }, { "epoch": 853.4850640113798, "eval_loss": 2.6233725547790527, "eval_runtime": 23.9809, "eval_samples_per_second": 58.13, "eval_steps_per_second": 7.297, "step": 15000 }, { "epoch": 859.1749644381223, "grad_norm": 0.1729804426431656, "learning_rate": 4.505016722408027e-06, "loss": 2.6261, "step": 15100 }, { "epoch": 859.1749644381223, "eval_loss": 2.6231296062469482, "eval_runtime": 23.6331, "eval_samples_per_second": 58.985, "eval_steps_per_second": 7.405, "step": 15100 }, { "epoch": 864.8648648648649, "grad_norm": 0.1754722148180008, "learning_rate": 4.494983277591973e-06, "loss": 2.6255, "step": 15200 }, { "epoch": 864.8648648648649, "eval_loss": 2.6251132488250732, "eval_runtime": 23.9803, "eval_samples_per_second": 58.131, "eval_steps_per_second": 7.298, "step": 15200 }, { "epoch": 870.5547652916074, "grad_norm": 0.18603281676769257, "learning_rate": 4.48494983277592e-06, "loss": 2.6254, "step": 15300 }, { "epoch": 870.5547652916074, "eval_loss": 2.62117600440979, "eval_runtime": 23.9772, "eval_samples_per_second": 58.138, "eval_steps_per_second": 7.299, "step": 15300 }, { "epoch": 876.2446657183499, "grad_norm": 0.1606762558221817, "learning_rate": 4.474916387959866e-06, "loss": 2.6254, "step": 15400 }, { "epoch": 876.2446657183499, "eval_loss": 2.6225478649139404, "eval_runtime": 23.9822, "eval_samples_per_second": 58.126, "eval_steps_per_second": 7.297, "step": 15400 }, { "epoch": 881.9345661450925, "grad_norm": 0.2024741768836975, "learning_rate": 4.4648829431438125e-06, "loss": 2.6253, "step": 15500 }, { "epoch": 881.9345661450925, "eval_loss": 2.6233015060424805, "eval_runtime": 23.9634, "eval_samples_per_second": 58.172, "eval_steps_per_second": 7.303, "step": 15500 }, { "epoch": 887.624466571835, "grad_norm": 0.15303322672843933, "learning_rate": 4.454849498327759e-06, "loss": 2.625, "step": 15600 }, { "epoch": 887.624466571835, "eval_loss": 2.6242151260375977, "eval_runtime": 23.9778, "eval_samples_per_second": 58.137, "eval_steps_per_second": 7.298, "step": 15600 }, { "epoch": 893.3143669985775, "grad_norm": 0.1664353907108307, "learning_rate": 4.444816053511705e-06, "loss": 2.6249, "step": 15700 }, { "epoch": 893.3143669985775, "eval_loss": 2.6248691082000732, "eval_runtime": 23.6226, "eval_samples_per_second": 59.011, "eval_steps_per_second": 7.408, "step": 15700 }, { "epoch": 899.0042674253201, "grad_norm": 0.19236522912979126, "learning_rate": 4.434782608695652e-06, "loss": 2.6246, "step": 15800 }, { "epoch": 899.0042674253201, "eval_loss": 2.621826410293579, "eval_runtime": 23.9772, "eval_samples_per_second": 58.139, "eval_steps_per_second": 7.299, "step": 15800 }, { "epoch": 904.6941678520626, "grad_norm": 0.18655799329280853, "learning_rate": 4.424749163879599e-06, "loss": 2.6245, "step": 15900 }, { "epoch": 904.6941678520626, "eval_loss": 2.6243951320648193, "eval_runtime": 23.976, "eval_samples_per_second": 58.141, "eval_steps_per_second": 7.299, "step": 15900 }, { "epoch": 910.3840682788051, "grad_norm": 0.1743532419204712, "learning_rate": 4.414715719063545e-06, "loss": 2.6241, "step": 16000 }, { "epoch": 910.3840682788051, "eval_loss": 2.6209657192230225, "eval_runtime": 23.9767, "eval_samples_per_second": 58.14, "eval_steps_per_second": 7.299, "step": 16000 }, { "epoch": 916.0739687055476, "grad_norm": 0.1673664003610611, "learning_rate": 4.404682274247491e-06, "loss": 2.6244, "step": 16100 }, { "epoch": 916.0739687055476, "eval_loss": 2.619206190109253, "eval_runtime": 23.9866, "eval_samples_per_second": 58.116, "eval_steps_per_second": 7.296, "step": 16100 }, { "epoch": 921.7638691322902, "grad_norm": 0.19697995483875275, "learning_rate": 4.394648829431438e-06, "loss": 2.6236, "step": 16200 }, { "epoch": 921.7638691322902, "eval_loss": 2.625276565551758, "eval_runtime": 23.6215, "eval_samples_per_second": 59.014, "eval_steps_per_second": 7.409, "step": 16200 }, { "epoch": 927.4537695590327, "grad_norm": 0.21309681236743927, "learning_rate": 4.384615384615384e-06, "loss": 2.6238, "step": 16300 }, { "epoch": 927.4537695590327, "eval_loss": 2.6223998069763184, "eval_runtime": 23.9648, "eval_samples_per_second": 58.169, "eval_steps_per_second": 7.302, "step": 16300 }, { "epoch": 933.1436699857752, "grad_norm": 0.18199092149734497, "learning_rate": 4.374581939799331e-06, "loss": 2.6235, "step": 16400 }, { "epoch": 933.1436699857752, "eval_loss": 2.621734380722046, "eval_runtime": 24.0006, "eval_samples_per_second": 58.082, "eval_steps_per_second": 7.291, "step": 16400 }, { "epoch": 938.8335704125178, "grad_norm": 0.1346639096736908, "learning_rate": 4.364548494983278e-06, "loss": 2.6229, "step": 16500 }, { "epoch": 938.8335704125178, "eval_loss": 2.6209397315979004, "eval_runtime": 23.9903, "eval_samples_per_second": 58.107, "eval_steps_per_second": 7.295, "step": 16500 }, { "epoch": 944.5234708392603, "grad_norm": 0.172411248087883, "learning_rate": 4.354515050167224e-06, "loss": 2.6234, "step": 16600 }, { "epoch": 944.5234708392603, "eval_loss": 2.622432231903076, "eval_runtime": 23.6313, "eval_samples_per_second": 58.99, "eval_steps_per_second": 7.405, "step": 16600 }, { "epoch": 950.2133712660028, "grad_norm": 0.18165941536426544, "learning_rate": 4.34448160535117e-06, "loss": 2.6233, "step": 16700 }, { "epoch": 950.2133712660028, "eval_loss": 2.622363567352295, "eval_runtime": 23.9675, "eval_samples_per_second": 58.162, "eval_steps_per_second": 7.302, "step": 16700 }, { "epoch": 955.9032716927454, "grad_norm": 0.1533946692943573, "learning_rate": 4.334448160535117e-06, "loss": 2.6231, "step": 16800 }, { "epoch": 955.9032716927454, "eval_loss": 2.622138738632202, "eval_runtime": 23.984, "eval_samples_per_second": 58.122, "eval_steps_per_second": 7.297, "step": 16800 }, { "epoch": 961.5931721194879, "grad_norm": 0.16410088539123535, "learning_rate": 4.324414715719064e-06, "loss": 2.623, "step": 16900 }, { "epoch": 961.5931721194879, "eval_loss": 2.6216089725494385, "eval_runtime": 22.7206, "eval_samples_per_second": 61.354, "eval_steps_per_second": 7.702, "step": 16900 }, { "epoch": 967.2830725462304, "grad_norm": 0.1414816975593567, "learning_rate": 4.31438127090301e-06, "loss": 2.6231, "step": 17000 }, { "epoch": 967.2830725462304, "eval_loss": 2.620095729827881, "eval_runtime": 23.9864, "eval_samples_per_second": 58.116, "eval_steps_per_second": 7.296, "step": 17000 }, { "epoch": 972.972972972973, "grad_norm": 0.1808498054742813, "learning_rate": 4.3043478260869565e-06, "loss": 2.6226, "step": 17100 }, { "epoch": 972.972972972973, "eval_loss": 2.620466947555542, "eval_runtime": 23.6354, "eval_samples_per_second": 58.979, "eval_steps_per_second": 7.404, "step": 17100 }, { "epoch": 978.6628733997155, "grad_norm": 0.18205687403678894, "learning_rate": 4.294314381270903e-06, "loss": 2.6222, "step": 17200 }, { "epoch": 978.6628733997155, "eval_loss": 2.622086763381958, "eval_runtime": 23.9861, "eval_samples_per_second": 58.117, "eval_steps_per_second": 7.296, "step": 17200 }, { "epoch": 984.352773826458, "grad_norm": 0.16486555337905884, "learning_rate": 4.284280936454849e-06, "loss": 2.6226, "step": 17300 }, { "epoch": 984.352773826458, "eval_loss": 2.6206889152526855, "eval_runtime": 23.9904, "eval_samples_per_second": 58.107, "eval_steps_per_second": 7.295, "step": 17300 }, { "epoch": 990.0426742532005, "grad_norm": 0.17627450823783875, "learning_rate": 4.274247491638796e-06, "loss": 2.6221, "step": 17400 }, { "epoch": 990.0426742532005, "eval_loss": 2.6199347972869873, "eval_runtime": 24.0002, "eval_samples_per_second": 58.083, "eval_steps_per_second": 7.292, "step": 17400 }, { "epoch": 995.7325746799431, "grad_norm": 0.179404616355896, "learning_rate": 4.264214046822743e-06, "loss": 2.622, "step": 17500 }, { "epoch": 995.7325746799431, "eval_loss": 2.6224658489227295, "eval_runtime": 23.9811, "eval_samples_per_second": 58.129, "eval_steps_per_second": 7.297, "step": 17500 }, { "epoch": 1001.4224751066856, "grad_norm": 0.1570415049791336, "learning_rate": 4.254180602006689e-06, "loss": 2.6221, "step": 17600 }, { "epoch": 1001.4224751066856, "eval_loss": 2.6203651428222656, "eval_runtime": 23.9819, "eval_samples_per_second": 58.127, "eval_steps_per_second": 7.297, "step": 17600 }, { "epoch": 1007.1123755334281, "grad_norm": 0.166086807847023, "learning_rate": 4.244147157190635e-06, "loss": 2.6216, "step": 17700 }, { "epoch": 1007.1123755334281, "eval_loss": 2.619248390197754, "eval_runtime": 23.9859, "eval_samples_per_second": 58.118, "eval_steps_per_second": 7.296, "step": 17700 }, { "epoch": 1012.8022759601707, "grad_norm": 0.20893247425556183, "learning_rate": 4.234113712374582e-06, "loss": 2.6218, "step": 17800 }, { "epoch": 1012.8022759601707, "eval_loss": 2.6189231872558594, "eval_runtime": 23.9924, "eval_samples_per_second": 58.102, "eval_steps_per_second": 7.294, "step": 17800 }, { "epoch": 1018.4921763869132, "grad_norm": 0.1779673546552658, "learning_rate": 4.224080267558528e-06, "loss": 2.6213, "step": 17900 }, { "epoch": 1018.4921763869132, "eval_loss": 2.6191885471343994, "eval_runtime": 23.6149, "eval_samples_per_second": 59.031, "eval_steps_per_second": 7.411, "step": 17900 }, { "epoch": 1024.1820768136558, "grad_norm": 0.1736188530921936, "learning_rate": 4.214046822742475e-06, "loss": 2.621, "step": 18000 }, { "epoch": 1024.1820768136558, "eval_loss": 2.619549036026001, "eval_runtime": 23.9787, "eval_samples_per_second": 58.135, "eval_steps_per_second": 7.298, "step": 18000 }, { "epoch": 1029.8719772403983, "grad_norm": 0.17679837346076965, "learning_rate": 4.2040133779264216e-06, "loss": 2.6215, "step": 18100 }, { "epoch": 1029.8719772403983, "eval_loss": 2.620149612426758, "eval_runtime": 23.9841, "eval_samples_per_second": 58.122, "eval_steps_per_second": 7.296, "step": 18100 }, { "epoch": 1035.5618776671408, "grad_norm": 0.18855205178260803, "learning_rate": 4.1939799331103675e-06, "loss": 2.6209, "step": 18200 }, { "epoch": 1035.5618776671408, "eval_loss": 2.620770215988159, "eval_runtime": 23.9826, "eval_samples_per_second": 58.126, "eval_steps_per_second": 7.297, "step": 18200 }, { "epoch": 1041.2517780938833, "grad_norm": 0.15188181400299072, "learning_rate": 4.183946488294314e-06, "loss": 2.6209, "step": 18300 }, { "epoch": 1041.2517780938833, "eval_loss": 2.6219208240509033, "eval_runtime": 23.99, "eval_samples_per_second": 58.108, "eval_steps_per_second": 7.295, "step": 18300 }, { "epoch": 1046.9416785206258, "grad_norm": 0.19299472868442535, "learning_rate": 4.173913043478261e-06, "loss": 2.6207, "step": 18400 }, { "epoch": 1046.9416785206258, "eval_loss": 2.6227285861968994, "eval_runtime": 23.9808, "eval_samples_per_second": 58.13, "eval_steps_per_second": 7.298, "step": 18400 }, { "epoch": 1052.6315789473683, "grad_norm": 0.19672277569770813, "learning_rate": 4.163879598662208e-06, "loss": 2.6208, "step": 18500 }, { "epoch": 1052.6315789473683, "eval_loss": 2.6191093921661377, "eval_runtime": 23.979, "eval_samples_per_second": 58.134, "eval_steps_per_second": 7.298, "step": 18500 }, { "epoch": 1058.321479374111, "grad_norm": 0.15439458191394806, "learning_rate": 4.153846153846154e-06, "loss": 2.6209, "step": 18600 }, { "epoch": 1058.321479374111, "eval_loss": 2.6170802116394043, "eval_runtime": 23.6298, "eval_samples_per_second": 58.993, "eval_steps_per_second": 7.406, "step": 18600 }, { "epoch": 1064.0113798008535, "grad_norm": 0.16907408833503723, "learning_rate": 4.1438127090301005e-06, "loss": 2.6204, "step": 18700 }, { "epoch": 1064.0113798008535, "eval_loss": 2.616074323654175, "eval_runtime": 23.9815, "eval_samples_per_second": 58.128, "eval_steps_per_second": 7.297, "step": 18700 }, { "epoch": 1069.701280227596, "grad_norm": 0.15719492733478546, "learning_rate": 4.133779264214047e-06, "loss": 2.6202, "step": 18800 }, { "epoch": 1069.701280227596, "eval_loss": 2.6210148334503174, "eval_runtime": 23.9903, "eval_samples_per_second": 58.107, "eval_steps_per_second": 7.295, "step": 18800 }, { "epoch": 1075.3911806543385, "grad_norm": 0.17413724958896637, "learning_rate": 4.123745819397993e-06, "loss": 2.6198, "step": 18900 }, { "epoch": 1075.3911806543385, "eval_loss": 2.6156530380249023, "eval_runtime": 23.991, "eval_samples_per_second": 58.105, "eval_steps_per_second": 7.294, "step": 18900 }, { "epoch": 1081.081081081081, "grad_norm": 0.2030758559703827, "learning_rate": 4.11371237458194e-06, "loss": 2.6196, "step": 19000 }, { "epoch": 1081.081081081081, "eval_loss": 2.619901180267334, "eval_runtime": 23.9766, "eval_samples_per_second": 58.14, "eval_steps_per_second": 7.299, "step": 19000 }, { "epoch": 1086.7709815078235, "grad_norm": 0.1614978164434433, "learning_rate": 4.103678929765887e-06, "loss": 2.6191, "step": 19100 }, { "epoch": 1086.7709815078235, "eval_loss": 2.6212069988250732, "eval_runtime": 23.9883, "eval_samples_per_second": 58.112, "eval_steps_per_second": 7.295, "step": 19100 }, { "epoch": 1092.460881934566, "grad_norm": 0.17999444901943207, "learning_rate": 4.0936454849498326e-06, "loss": 2.6194, "step": 19200 }, { "epoch": 1092.460881934566, "eval_loss": 2.6198325157165527, "eval_runtime": 23.9861, "eval_samples_per_second": 58.117, "eval_steps_per_second": 7.296, "step": 19200 }, { "epoch": 1098.1507823613088, "grad_norm": 0.18121473491191864, "learning_rate": 4.083612040133779e-06, "loss": 2.6197, "step": 19300 }, { "epoch": 1098.1507823613088, "eval_loss": 2.6189935207366943, "eval_runtime": 23.9907, "eval_samples_per_second": 58.106, "eval_steps_per_second": 7.295, "step": 19300 }, { "epoch": 1103.8406827880513, "grad_norm": 0.21219174563884735, "learning_rate": 4.073578595317726e-06, "loss": 2.6195, "step": 19400 }, { "epoch": 1103.8406827880513, "eval_loss": 2.620710611343384, "eval_runtime": 21.6561, "eval_samples_per_second": 64.37, "eval_steps_per_second": 8.081, "step": 19400 }, { "epoch": 1109.5305832147938, "grad_norm": 0.17006872594356537, "learning_rate": 4.063545150501672e-06, "loss": 2.6191, "step": 19500 }, { "epoch": 1109.5305832147938, "eval_loss": 2.6186723709106445, "eval_runtime": 23.9787, "eval_samples_per_second": 58.135, "eval_steps_per_second": 7.298, "step": 19500 }, { "epoch": 1115.2204836415362, "grad_norm": 0.18167956173419952, "learning_rate": 4.053511705685619e-06, "loss": 2.6184, "step": 19600 }, { "epoch": 1115.2204836415362, "eval_loss": 2.6187093257904053, "eval_runtime": 23.6119, "eval_samples_per_second": 59.038, "eval_steps_per_second": 7.412, "step": 19600 }, { "epoch": 1120.9103840682787, "grad_norm": 0.17147672176361084, "learning_rate": 4.0434782608695655e-06, "loss": 2.6191, "step": 19700 }, { "epoch": 1120.9103840682787, "eval_loss": 2.6219475269317627, "eval_runtime": 23.9735, "eval_samples_per_second": 58.147, "eval_steps_per_second": 7.3, "step": 19700 }, { "epoch": 1126.6002844950212, "grad_norm": 0.1981355994939804, "learning_rate": 4.0334448160535115e-06, "loss": 2.619, "step": 19800 }, { "epoch": 1126.6002844950212, "eval_loss": 2.61908221244812, "eval_runtime": 23.9658, "eval_samples_per_second": 58.166, "eval_steps_per_second": 7.302, "step": 19800 }, { "epoch": 1132.290184921764, "grad_norm": 0.17601875960826874, "learning_rate": 4.023411371237458e-06, "loss": 2.6185, "step": 19900 }, { "epoch": 1132.290184921764, "eval_loss": 2.6192004680633545, "eval_runtime": 23.9779, "eval_samples_per_second": 58.137, "eval_steps_per_second": 7.298, "step": 19900 }, { "epoch": 1137.9800853485065, "grad_norm": 0.16543589532375336, "learning_rate": 4.013377926421405e-06, "loss": 2.6186, "step": 20000 }, { "epoch": 1137.9800853485065, "eval_loss": 2.617563247680664, "eval_runtime": 23.9718, "eval_samples_per_second": 58.152, "eval_steps_per_second": 7.3, "step": 20000 }, { "epoch": 1143.669985775249, "grad_norm": 0.1984323263168335, "learning_rate": 4.003344481605351e-06, "loss": 2.6177, "step": 20100 }, { "epoch": 1143.669985775249, "eval_loss": 2.6192288398742676, "eval_runtime": 23.959, "eval_samples_per_second": 58.183, "eval_steps_per_second": 7.304, "step": 20100 }, { "epoch": 1149.3598862019915, "grad_norm": 0.1759239286184311, "learning_rate": 3.993311036789298e-06, "loss": 2.618, "step": 20200 }, { "epoch": 1149.3598862019915, "eval_loss": 2.6187233924865723, "eval_runtime": 23.9823, "eval_samples_per_second": 58.126, "eval_steps_per_second": 7.297, "step": 20200 }, { "epoch": 1155.049786628734, "grad_norm": 0.174777090549469, "learning_rate": 3.9832775919732444e-06, "loss": 2.618, "step": 20300 }, { "epoch": 1155.049786628734, "eval_loss": 2.6186254024505615, "eval_runtime": 23.6342, "eval_samples_per_second": 58.982, "eval_steps_per_second": 7.405, "step": 20300 }, { "epoch": 1160.7396870554765, "grad_norm": 0.16251406073570251, "learning_rate": 3.97324414715719e-06, "loss": 2.6176, "step": 20400 }, { "epoch": 1160.7396870554765, "eval_loss": 2.617729425430298, "eval_runtime": 23.9981, "eval_samples_per_second": 58.088, "eval_steps_per_second": 7.292, "step": 20400 }, { "epoch": 1166.429587482219, "grad_norm": 0.14838360249996185, "learning_rate": 3.963210702341137e-06, "loss": 2.6181, "step": 20500 }, { "epoch": 1166.429587482219, "eval_loss": 2.6153714656829834, "eval_runtime": 23.9768, "eval_samples_per_second": 58.139, "eval_steps_per_second": 7.299, "step": 20500 }, { "epoch": 1172.1194879089617, "grad_norm": 0.1753985583782196, "learning_rate": 3.953177257525084e-06, "loss": 2.6179, "step": 20600 }, { "epoch": 1172.1194879089617, "eval_loss": 2.618314743041992, "eval_runtime": 23.9673, "eval_samples_per_second": 58.163, "eval_steps_per_second": 7.302, "step": 20600 }, { "epoch": 1177.8093883357042, "grad_norm": 0.18267270922660828, "learning_rate": 3.943143812709031e-06, "loss": 2.6176, "step": 20700 }, { "epoch": 1177.8093883357042, "eval_loss": 2.6154963970184326, "eval_runtime": 23.9756, "eval_samples_per_second": 58.142, "eval_steps_per_second": 7.299, "step": 20700 }, { "epoch": 1183.4992887624467, "grad_norm": 0.16119986772537231, "learning_rate": 3.9331103678929765e-06, "loss": 2.6176, "step": 20800 }, { "epoch": 1183.4992887624467, "eval_loss": 2.6163604259490967, "eval_runtime": 23.9848, "eval_samples_per_second": 58.12, "eval_steps_per_second": 7.296, "step": 20800 }, { "epoch": 1189.1891891891892, "grad_norm": 0.1592591255903244, "learning_rate": 3.923076923076923e-06, "loss": 2.6166, "step": 20900 }, { "epoch": 1189.1891891891892, "eval_loss": 2.617856979370117, "eval_runtime": 23.6131, "eval_samples_per_second": 59.035, "eval_steps_per_second": 7.411, "step": 20900 }, { "epoch": 1194.8790896159317, "grad_norm": 0.16023056209087372, "learning_rate": 3.91304347826087e-06, "loss": 2.6173, "step": 21000 }, { "epoch": 1194.8790896159317, "eval_loss": 2.61501407623291, "eval_runtime": 23.9703, "eval_samples_per_second": 58.155, "eval_steps_per_second": 7.301, "step": 21000 }, { "epoch": 1200.5689900426742, "grad_norm": 0.1646001935005188, "learning_rate": 3.903010033444816e-06, "loss": 2.6172, "step": 21100 }, { "epoch": 1200.5689900426742, "eval_loss": 2.6198084354400635, "eval_runtime": 23.9829, "eval_samples_per_second": 58.125, "eval_steps_per_second": 7.297, "step": 21100 }, { "epoch": 1206.2588904694169, "grad_norm": 0.1908738762140274, "learning_rate": 3.892976588628763e-06, "loss": 2.6164, "step": 21200 }, { "epoch": 1206.2588904694169, "eval_loss": 2.6190242767333984, "eval_runtime": 23.976, "eval_samples_per_second": 58.141, "eval_steps_per_second": 7.299, "step": 21200 }, { "epoch": 1211.9487908961594, "grad_norm": 0.16407649219036102, "learning_rate": 3.8829431438127095e-06, "loss": 2.6163, "step": 21300 }, { "epoch": 1211.9487908961594, "eval_loss": 2.6163718700408936, "eval_runtime": 23.986, "eval_samples_per_second": 58.117, "eval_steps_per_second": 7.296, "step": 21300 }, { "epoch": 1217.6386913229019, "grad_norm": 0.16749687492847443, "learning_rate": 3.8729096989966554e-06, "loss": 2.6163, "step": 21400 }, { "epoch": 1217.6386913229019, "eval_loss": 2.6182687282562256, "eval_runtime": 23.9719, "eval_samples_per_second": 58.151, "eval_steps_per_second": 7.3, "step": 21400 }, { "epoch": 1223.3285917496444, "grad_norm": 0.1730409562587738, "learning_rate": 3.862876254180602e-06, "loss": 2.617, "step": 21500 }, { "epoch": 1223.3285917496444, "eval_loss": 2.619847059249878, "eval_runtime": 23.6171, "eval_samples_per_second": 59.025, "eval_steps_per_second": 7.41, "step": 21500 }, { "epoch": 1229.0184921763869, "grad_norm": 0.17030537128448486, "learning_rate": 3.852842809364549e-06, "loss": 2.6168, "step": 21600 }, { "epoch": 1229.0184921763869, "eval_loss": 2.6159706115722656, "eval_runtime": 23.9563, "eval_samples_per_second": 58.189, "eval_steps_per_second": 7.305, "step": 21600 }, { "epoch": 1234.7083926031294, "grad_norm": 0.15996751189231873, "learning_rate": 3.842809364548495e-06, "loss": 2.6159, "step": 21700 }, { "epoch": 1234.7083926031294, "eval_loss": 2.6217665672302246, "eval_runtime": 23.6191, "eval_samples_per_second": 59.02, "eval_steps_per_second": 7.409, "step": 21700 }, { "epoch": 1240.3982930298719, "grad_norm": 0.18178878724575043, "learning_rate": 3.832775919732442e-06, "loss": 2.6154, "step": 21800 }, { "epoch": 1240.3982930298719, "eval_loss": 2.6181161403656006, "eval_runtime": 23.9747, "eval_samples_per_second": 58.145, "eval_steps_per_second": 7.299, "step": 21800 }, { "epoch": 1246.0881934566146, "grad_norm": 0.1834726333618164, "learning_rate": 3.822742474916388e-06, "loss": 2.6158, "step": 21900 }, { "epoch": 1246.0881934566146, "eval_loss": 2.6183881759643555, "eval_runtime": 23.9644, "eval_samples_per_second": 58.17, "eval_steps_per_second": 7.302, "step": 21900 }, { "epoch": 1251.778093883357, "grad_norm": 0.18116818368434906, "learning_rate": 3.8127090301003347e-06, "loss": 2.6151, "step": 22000 }, { "epoch": 1251.778093883357, "eval_loss": 2.6145451068878174, "eval_runtime": 23.9726, "eval_samples_per_second": 58.15, "eval_steps_per_second": 7.3, "step": 22000 }, { "epoch": 1257.4679943100996, "grad_norm": 0.17791162431240082, "learning_rate": 3.802675585284281e-06, "loss": 2.6155, "step": 22100 }, { "epoch": 1257.4679943100996, "eval_loss": 2.615226984024048, "eval_runtime": 23.9896, "eval_samples_per_second": 58.109, "eval_steps_per_second": 7.295, "step": 22100 }, { "epoch": 1263.157894736842, "grad_norm": 0.18372969329357147, "learning_rate": 3.792642140468228e-06, "loss": 2.6156, "step": 22200 }, { "epoch": 1263.157894736842, "eval_loss": 2.6180429458618164, "eval_runtime": 23.9809, "eval_samples_per_second": 58.13, "eval_steps_per_second": 7.297, "step": 22200 }, { "epoch": 1268.8477951635846, "grad_norm": 0.1649881899356842, "learning_rate": 3.782608695652174e-06, "loss": 2.6154, "step": 22300 }, { "epoch": 1268.8477951635846, "eval_loss": 2.618147850036621, "eval_runtime": 23.9875, "eval_samples_per_second": 58.114, "eval_steps_per_second": 7.295, "step": 22300 }, { "epoch": 1274.537695590327, "grad_norm": 0.185517817735672, "learning_rate": 3.7725752508361205e-06, "loss": 2.6147, "step": 22400 }, { "epoch": 1274.537695590327, "eval_loss": 2.614933729171753, "eval_runtime": 23.98, "eval_samples_per_second": 58.132, "eval_steps_per_second": 7.298, "step": 22400 }, { "epoch": 1280.2275960170698, "grad_norm": 0.18541164696216583, "learning_rate": 3.7625418060200673e-06, "loss": 2.615, "step": 22500 }, { "epoch": 1280.2275960170698, "eval_loss": 2.614748477935791, "eval_runtime": 23.9792, "eval_samples_per_second": 58.134, "eval_steps_per_second": 7.298, "step": 22500 }, { "epoch": 1285.9174964438123, "grad_norm": 0.1989043653011322, "learning_rate": 3.7525083612040136e-06, "loss": 2.6149, "step": 22600 }, { "epoch": 1285.9174964438123, "eval_loss": 2.6172001361846924, "eval_runtime": 23.6321, "eval_samples_per_second": 58.988, "eval_steps_per_second": 7.405, "step": 22600 }, { "epoch": 1291.6073968705548, "grad_norm": 0.18920746445655823, "learning_rate": 3.74247491638796e-06, "loss": 2.6152, "step": 22700 }, { "epoch": 1291.6073968705548, "eval_loss": 2.613987445831299, "eval_runtime": 23.9748, "eval_samples_per_second": 58.144, "eval_steps_per_second": 7.299, "step": 22700 }, { "epoch": 1297.2972972972973, "grad_norm": 0.1757652908563614, "learning_rate": 3.7324414715719067e-06, "loss": 2.6146, "step": 22800 }, { "epoch": 1297.2972972972973, "eval_loss": 2.6179261207580566, "eval_runtime": 23.9716, "eval_samples_per_second": 58.152, "eval_steps_per_second": 7.3, "step": 22800 }, { "epoch": 1302.9871977240398, "grad_norm": 0.21378381550312042, "learning_rate": 3.722408026755853e-06, "loss": 2.6139, "step": 22900 }, { "epoch": 1302.9871977240398, "eval_loss": 2.6163623332977295, "eval_runtime": 23.9779, "eval_samples_per_second": 58.137, "eval_steps_per_second": 7.298, "step": 22900 }, { "epoch": 1308.6770981507823, "grad_norm": 0.2038387954235077, "learning_rate": 3.7123745819398e-06, "loss": 2.6142, "step": 23000 }, { "epoch": 1308.6770981507823, "eval_loss": 2.615541934967041, "eval_runtime": 23.9618, "eval_samples_per_second": 58.176, "eval_steps_per_second": 7.303, "step": 23000 }, { "epoch": 1314.3669985775248, "grad_norm": 0.13637392222881317, "learning_rate": 3.702341137123746e-06, "loss": 2.6141, "step": 23100 }, { "epoch": 1314.3669985775248, "eval_loss": 2.6169161796569824, "eval_runtime": 23.9822, "eval_samples_per_second": 58.126, "eval_steps_per_second": 7.297, "step": 23100 }, { "epoch": 1320.0568990042675, "grad_norm": 0.18610434234142303, "learning_rate": 3.6923076923076925e-06, "loss": 2.614, "step": 23200 }, { "epoch": 1320.0568990042675, "eval_loss": 2.6124112606048584, "eval_runtime": 23.9616, "eval_samples_per_second": 58.176, "eval_steps_per_second": 7.303, "step": 23200 }, { "epoch": 1325.74679943101, "grad_norm": 0.17579494416713715, "learning_rate": 3.6822742474916393e-06, "loss": 2.6138, "step": 23300 }, { "epoch": 1325.74679943101, "eval_loss": 2.615485429763794, "eval_runtime": 23.9777, "eval_samples_per_second": 58.137, "eval_steps_per_second": 7.298, "step": 23300 }, { "epoch": 1331.4366998577525, "grad_norm": 0.1542961448431015, "learning_rate": 3.6722408026755856e-06, "loss": 2.6135, "step": 23400 }, { "epoch": 1331.4366998577525, "eval_loss": 2.6187291145324707, "eval_runtime": 23.6277, "eval_samples_per_second": 58.999, "eval_steps_per_second": 7.407, "step": 23400 }, { "epoch": 1337.126600284495, "grad_norm": 0.17543186247348785, "learning_rate": 3.662207357859532e-06, "loss": 2.6138, "step": 23500 }, { "epoch": 1337.126600284495, "eval_loss": 2.614956855773926, "eval_runtime": 23.9548, "eval_samples_per_second": 58.193, "eval_steps_per_second": 7.305, "step": 23500 }, { "epoch": 1342.8165007112375, "grad_norm": 0.16438443958759308, "learning_rate": 3.6521739130434787e-06, "loss": 2.614, "step": 23600 }, { "epoch": 1342.8165007112375, "eval_loss": 2.615394353866577, "eval_runtime": 23.9594, "eval_samples_per_second": 58.182, "eval_steps_per_second": 7.304, "step": 23600 }, { "epoch": 1348.50640113798, "grad_norm": 0.16566355526447296, "learning_rate": 3.642140468227425e-06, "loss": 2.6133, "step": 23700 }, { "epoch": 1348.50640113798, "eval_loss": 2.6109862327575684, "eval_runtime": 23.973, "eval_samples_per_second": 58.149, "eval_steps_per_second": 7.3, "step": 23700 }, { "epoch": 1354.1963015647227, "grad_norm": 0.19186396896839142, "learning_rate": 3.6321070234113714e-06, "loss": 2.6132, "step": 23800 }, { "epoch": 1354.1963015647227, "eval_loss": 2.61553692817688, "eval_runtime": 23.9667, "eval_samples_per_second": 58.164, "eval_steps_per_second": 7.302, "step": 23800 }, { "epoch": 1359.8862019914652, "grad_norm": 0.1499451845884323, "learning_rate": 3.622073578595318e-06, "loss": 2.6129, "step": 23900 }, { "epoch": 1359.8862019914652, "eval_loss": 2.6135051250457764, "eval_runtime": 23.9621, "eval_samples_per_second": 58.175, "eval_steps_per_second": 7.303, "step": 23900 }, { "epoch": 1365.5761024182077, "grad_norm": 0.17722682654857635, "learning_rate": 3.6120401337792645e-06, "loss": 2.6132, "step": 24000 }, { "epoch": 1365.5761024182077, "eval_loss": 2.6139628887176514, "eval_runtime": 23.966, "eval_samples_per_second": 58.166, "eval_steps_per_second": 7.302, "step": 24000 }, { "epoch": 1371.2660028449502, "grad_norm": 0.17891627550125122, "learning_rate": 3.6020066889632112e-06, "loss": 2.6127, "step": 24100 }, { "epoch": 1371.2660028449502, "eval_loss": 2.616461992263794, "eval_runtime": 23.6223, "eval_samples_per_second": 59.012, "eval_steps_per_second": 7.408, "step": 24100 }, { "epoch": 1376.9559032716927, "grad_norm": 0.1613658368587494, "learning_rate": 3.5919732441471576e-06, "loss": 2.6129, "step": 24200 }, { "epoch": 1376.9559032716927, "eval_loss": 2.6160008907318115, "eval_runtime": 23.9933, "eval_samples_per_second": 58.1, "eval_steps_per_second": 7.294, "step": 24200 }, { "epoch": 1382.6458036984352, "grad_norm": 0.16486230492591858, "learning_rate": 3.581939799331104e-06, "loss": 2.6126, "step": 24300 }, { "epoch": 1382.6458036984352, "eval_loss": 2.617274761199951, "eval_runtime": 23.9711, "eval_samples_per_second": 58.153, "eval_steps_per_second": 7.3, "step": 24300 }, { "epoch": 1388.3357041251777, "grad_norm": 0.17561286687850952, "learning_rate": 3.5719063545150507e-06, "loss": 2.6121, "step": 24400 }, { "epoch": 1388.3357041251777, "eval_loss": 2.615642786026001, "eval_runtime": 23.9873, "eval_samples_per_second": 58.114, "eval_steps_per_second": 7.296, "step": 24400 }, { "epoch": 1394.0256045519204, "grad_norm": 0.2206220030784607, "learning_rate": 3.561872909698997e-06, "loss": 2.6121, "step": 24500 }, { "epoch": 1394.0256045519204, "eval_loss": 2.6155807971954346, "eval_runtime": 24.072, "eval_samples_per_second": 57.91, "eval_steps_per_second": 7.27, "step": 24500 }, { "epoch": 1399.715504978663, "grad_norm": 0.17727740108966827, "learning_rate": 3.5518394648829434e-06, "loss": 2.6116, "step": 24600 }, { "epoch": 1399.715504978663, "eval_loss": 2.6150245666503906, "eval_runtime": 23.9542, "eval_samples_per_second": 58.194, "eval_steps_per_second": 7.306, "step": 24600 }, { "epoch": 1405.4054054054054, "grad_norm": 0.23213492333889008, "learning_rate": 3.54180602006689e-06, "loss": 2.6122, "step": 24700 }, { "epoch": 1405.4054054054054, "eval_loss": 2.6127638816833496, "eval_runtime": 23.9798, "eval_samples_per_second": 58.132, "eval_steps_per_second": 7.298, "step": 24700 }, { "epoch": 1411.095305832148, "grad_norm": 0.19061516225337982, "learning_rate": 3.5317725752508365e-06, "loss": 2.6114, "step": 24800 }, { "epoch": 1411.095305832148, "eval_loss": 2.6152596473693848, "eval_runtime": 23.9762, "eval_samples_per_second": 58.141, "eval_steps_per_second": 7.299, "step": 24800 }, { "epoch": 1416.7852062588904, "grad_norm": 0.21451649069786072, "learning_rate": 3.521739130434783e-06, "loss": 2.6117, "step": 24900 }, { "epoch": 1416.7852062588904, "eval_loss": 2.616278886795044, "eval_runtime": 23.6278, "eval_samples_per_second": 58.998, "eval_steps_per_second": 7.407, "step": 24900 }, { "epoch": 1422.475106685633, "grad_norm": 0.19627918303012848, "learning_rate": 3.5117056856187296e-06, "loss": 2.6115, "step": 25000 }, { "epoch": 1422.475106685633, "eval_loss": 2.6146152019500732, "eval_runtime": 23.9995, "eval_samples_per_second": 58.084, "eval_steps_per_second": 7.292, "step": 25000 }, { "epoch": 1428.1650071123756, "grad_norm": 0.1765471249818802, "learning_rate": 3.501672240802676e-06, "loss": 2.6117, "step": 25100 }, { "epoch": 1428.1650071123756, "eval_loss": 2.612539768218994, "eval_runtime": 23.6206, "eval_samples_per_second": 59.016, "eval_steps_per_second": 7.409, "step": 25100 }, { "epoch": 1433.8549075391181, "grad_norm": 0.1898675262928009, "learning_rate": 3.491638795986622e-06, "loss": 2.6114, "step": 25200 }, { "epoch": 1433.8549075391181, "eval_loss": 2.6151158809661865, "eval_runtime": 23.9639, "eval_samples_per_second": 58.171, "eval_steps_per_second": 7.303, "step": 25200 }, { "epoch": 1439.5448079658606, "grad_norm": 0.16476163268089294, "learning_rate": 3.481605351170568e-06, "loss": 2.6113, "step": 25300 }, { "epoch": 1439.5448079658606, "eval_loss": 2.6140201091766357, "eval_runtime": 23.9566, "eval_samples_per_second": 58.188, "eval_steps_per_second": 7.305, "step": 25300 }, { "epoch": 1445.2347083926031, "grad_norm": 0.18662914633750916, "learning_rate": 3.471571906354515e-06, "loss": 2.6105, "step": 25400 }, { "epoch": 1445.2347083926031, "eval_loss": 2.615382194519043, "eval_runtime": 23.9571, "eval_samples_per_second": 58.187, "eval_steps_per_second": 7.305, "step": 25400 }, { "epoch": 1450.9246088193456, "grad_norm": 0.19092458486557007, "learning_rate": 3.4615384615384613e-06, "loss": 2.6109, "step": 25500 }, { "epoch": 1450.9246088193456, "eval_loss": 2.6135177612304688, "eval_runtime": 23.9762, "eval_samples_per_second": 58.141, "eval_steps_per_second": 7.299, "step": 25500 }, { "epoch": 1456.6145092460881, "grad_norm": 0.1783183068037033, "learning_rate": 3.4515050167224076e-06, "loss": 2.6106, "step": 25600 }, { "epoch": 1456.6145092460881, "eval_loss": 2.611865282058716, "eval_runtime": 23.9618, "eval_samples_per_second": 58.176, "eval_steps_per_second": 7.303, "step": 25600 }, { "epoch": 1462.3044096728306, "grad_norm": 0.19193953275680542, "learning_rate": 3.4414715719063544e-06, "loss": 2.6107, "step": 25700 }, { "epoch": 1462.3044096728306, "eval_loss": 2.6173393726348877, "eval_runtime": 23.9925, "eval_samples_per_second": 58.101, "eval_steps_per_second": 7.294, "step": 25700 }, { "epoch": 1467.9943100995733, "grad_norm": 0.16849613189697266, "learning_rate": 3.4314381270903007e-06, "loss": 2.6109, "step": 25800 }, { "epoch": 1467.9943100995733, "eval_loss": 2.612614631652832, "eval_runtime": 23.9774, "eval_samples_per_second": 58.138, "eval_steps_per_second": 7.299, "step": 25800 }, { "epoch": 1473.6842105263158, "grad_norm": 0.19681937992572784, "learning_rate": 3.4214046822742475e-06, "loss": 2.6099, "step": 25900 }, { "epoch": 1473.6842105263158, "eval_loss": 2.6156296730041504, "eval_runtime": 23.621, "eval_samples_per_second": 59.015, "eval_steps_per_second": 7.409, "step": 25900 }, { "epoch": 1479.3741109530583, "grad_norm": 0.18242304027080536, "learning_rate": 3.411371237458194e-06, "loss": 2.6102, "step": 26000 }, { "epoch": 1479.3741109530583, "eval_loss": 2.613642454147339, "eval_runtime": 23.9764, "eval_samples_per_second": 58.14, "eval_steps_per_second": 7.299, "step": 26000 }, { "epoch": 1485.0640113798008, "grad_norm": 0.16466517746448517, "learning_rate": 3.40133779264214e-06, "loss": 2.6104, "step": 26100 }, { "epoch": 1485.0640113798008, "eval_loss": 2.613408327102661, "eval_runtime": 23.9744, "eval_samples_per_second": 58.145, "eval_steps_per_second": 7.299, "step": 26100 }, { "epoch": 1490.7539118065433, "grad_norm": 0.19841539859771729, "learning_rate": 3.391304347826087e-06, "loss": 2.6099, "step": 26200 }, { "epoch": 1490.7539118065433, "eval_loss": 2.6156742572784424, "eval_runtime": 23.9736, "eval_samples_per_second": 58.147, "eval_steps_per_second": 7.3, "step": 26200 }, { "epoch": 1496.4438122332858, "grad_norm": 0.18455518782138824, "learning_rate": 3.3812709030100333e-06, "loss": 2.61, "step": 26300 }, { "epoch": 1496.4438122332858, "eval_loss": 2.6147565841674805, "eval_runtime": 23.9688, "eval_samples_per_second": 58.159, "eval_steps_per_second": 7.301, "step": 26300 }, { "epoch": 1502.1337126600286, "grad_norm": 0.1784888654947281, "learning_rate": 3.3712374581939796e-06, "loss": 2.6098, "step": 26400 }, { "epoch": 1502.1337126600286, "eval_loss": 2.614964723587036, "eval_runtime": 23.6319, "eval_samples_per_second": 58.988, "eval_steps_per_second": 7.405, "step": 26400 }, { "epoch": 1507.823613086771, "grad_norm": 0.17167048156261444, "learning_rate": 3.3612040133779264e-06, "loss": 2.6094, "step": 26500 }, { "epoch": 1507.823613086771, "eval_loss": 2.6146252155303955, "eval_runtime": 23.9509, "eval_samples_per_second": 58.202, "eval_steps_per_second": 7.307, "step": 26500 }, { "epoch": 1513.5135135135135, "grad_norm": 0.17414234578609467, "learning_rate": 3.3511705685618727e-06, "loss": 2.6097, "step": 26600 }, { "epoch": 1513.5135135135135, "eval_loss": 2.6143181324005127, "eval_runtime": 23.9631, "eval_samples_per_second": 58.173, "eval_steps_per_second": 7.303, "step": 26600 }, { "epoch": 1519.203413940256, "grad_norm": 0.18100158870220184, "learning_rate": 3.3411371237458195e-06, "loss": 2.6096, "step": 26700 }, { "epoch": 1519.203413940256, "eval_loss": 2.6159329414367676, "eval_runtime": 23.9773, "eval_samples_per_second": 58.138, "eval_steps_per_second": 7.299, "step": 26700 }, { "epoch": 1524.8933143669985, "grad_norm": 0.16772860288619995, "learning_rate": 3.331103678929766e-06, "loss": 2.6094, "step": 26800 }, { "epoch": 1524.8933143669985, "eval_loss": 2.6139376163482666, "eval_runtime": 23.9804, "eval_samples_per_second": 58.131, "eval_steps_per_second": 7.298, "step": 26800 }, { "epoch": 1530.583214793741, "grad_norm": 0.17105814814567566, "learning_rate": 3.321070234113712e-06, "loss": 2.6095, "step": 26900 }, { "epoch": 1530.583214793741, "eval_loss": 2.617861747741699, "eval_runtime": 23.9713, "eval_samples_per_second": 58.153, "eval_steps_per_second": 7.3, "step": 26900 }, { "epoch": 1536.2731152204835, "grad_norm": 0.16764415800571442, "learning_rate": 3.311036789297659e-06, "loss": 2.6093, "step": 27000 }, { "epoch": 1536.2731152204835, "eval_loss": 2.617765188217163, "eval_runtime": 23.9839, "eval_samples_per_second": 58.122, "eval_steps_per_second": 7.297, "step": 27000 }, { "epoch": 1541.9630156472263, "grad_norm": 0.18489839136600494, "learning_rate": 3.3010033444816052e-06, "loss": 2.6094, "step": 27100 }, { "epoch": 1541.9630156472263, "eval_loss": 2.614982843399048, "eval_runtime": 23.9739, "eval_samples_per_second": 58.147, "eval_steps_per_second": 7.3, "step": 27100 }, { "epoch": 1547.6529160739688, "grad_norm": 0.18775135278701782, "learning_rate": 3.2909698996655516e-06, "loss": 2.6086, "step": 27200 }, { "epoch": 1547.6529160739688, "eval_loss": 2.6160266399383545, "eval_runtime": 23.9808, "eval_samples_per_second": 58.13, "eval_steps_per_second": 7.297, "step": 27200 }, { "epoch": 1553.3428165007113, "grad_norm": 0.16984674334526062, "learning_rate": 3.2809364548494983e-06, "loss": 2.6085, "step": 27300 }, { "epoch": 1553.3428165007113, "eval_loss": 2.615583896636963, "eval_runtime": 23.6298, "eval_samples_per_second": 58.993, "eval_steps_per_second": 7.406, "step": 27300 }, { "epoch": 1559.0327169274537, "grad_norm": 0.16006973385810852, "learning_rate": 3.2709030100334447e-06, "loss": 2.6083, "step": 27400 }, { "epoch": 1559.0327169274537, "eval_loss": 2.616664409637451, "eval_runtime": 23.9836, "eval_samples_per_second": 58.123, "eval_steps_per_second": 7.297, "step": 27400 }, { "epoch": 1564.7226173541962, "grad_norm": 0.2134248912334442, "learning_rate": 3.260869565217391e-06, "loss": 2.6087, "step": 27500 }, { "epoch": 1564.7226173541962, "eval_loss": 2.6122682094573975, "eval_runtime": 23.9759, "eval_samples_per_second": 58.142, "eval_steps_per_second": 7.299, "step": 27500 }, { "epoch": 1570.4125177809387, "grad_norm": 0.18280279636383057, "learning_rate": 3.2508361204013378e-06, "loss": 2.6084, "step": 27600 }, { "epoch": 1570.4125177809387, "eval_loss": 2.61324143409729, "eval_runtime": 23.9718, "eval_samples_per_second": 58.152, "eval_steps_per_second": 7.3, "step": 27600 }, { "epoch": 1576.1024182076815, "grad_norm": 0.1901465803384781, "learning_rate": 3.240802675585284e-06, "loss": 2.6078, "step": 27700 }, { "epoch": 1576.1024182076815, "eval_loss": 2.6125519275665283, "eval_runtime": 23.9757, "eval_samples_per_second": 58.142, "eval_steps_per_second": 7.299, "step": 27700 }, { "epoch": 1581.792318634424, "grad_norm": 0.18895883858203888, "learning_rate": 3.230769230769231e-06, "loss": 2.608, "step": 27800 }, { "epoch": 1581.792318634424, "eval_loss": 2.6131763458251953, "eval_runtime": 23.9778, "eval_samples_per_second": 58.137, "eval_steps_per_second": 7.298, "step": 27800 }, { "epoch": 1587.4822190611665, "grad_norm": 0.17906545102596283, "learning_rate": 3.2207357859531772e-06, "loss": 2.608, "step": 27900 }, { "epoch": 1587.4822190611665, "eval_loss": 2.613011598587036, "eval_runtime": 23.9936, "eval_samples_per_second": 58.099, "eval_steps_per_second": 7.294, "step": 27900 }, { "epoch": 1593.172119487909, "grad_norm": 0.16846366226673126, "learning_rate": 3.2107023411371236e-06, "loss": 2.6079, "step": 28000 }, { "epoch": 1593.172119487909, "eval_loss": 2.6157352924346924, "eval_runtime": 23.9922, "eval_samples_per_second": 58.102, "eval_steps_per_second": 7.294, "step": 28000 }, { "epoch": 1598.8620199146515, "grad_norm": 0.1816137433052063, "learning_rate": 3.2006688963210703e-06, "loss": 2.6076, "step": 28100 }, { "epoch": 1598.8620199146515, "eval_loss": 2.6187076568603516, "eval_runtime": 23.9742, "eval_samples_per_second": 58.146, "eval_steps_per_second": 7.3, "step": 28100 }, { "epoch": 1604.551920341394, "grad_norm": 0.19645364582538605, "learning_rate": 3.1906354515050167e-06, "loss": 2.6076, "step": 28200 }, { "epoch": 1604.551920341394, "eval_loss": 2.6162264347076416, "eval_runtime": 23.6332, "eval_samples_per_second": 58.985, "eval_steps_per_second": 7.405, "step": 28200 }, { "epoch": 1610.2418207681364, "grad_norm": 0.16714365780353546, "learning_rate": 3.180602006688963e-06, "loss": 2.6079, "step": 28300 }, { "epoch": 1610.2418207681364, "eval_loss": 2.6153204441070557, "eval_runtime": 23.9767, "eval_samples_per_second": 58.14, "eval_steps_per_second": 7.299, "step": 28300 }, { "epoch": 1615.9317211948792, "grad_norm": 0.21319107711315155, "learning_rate": 3.1705685618729098e-06, "loss": 2.6079, "step": 28400 }, { "epoch": 1615.9317211948792, "eval_loss": 2.6141254901885986, "eval_runtime": 23.9806, "eval_samples_per_second": 58.13, "eval_steps_per_second": 7.298, "step": 28400 }, { "epoch": 1621.6216216216217, "grad_norm": 0.18718905746936798, "learning_rate": 3.160535117056856e-06, "loss": 2.6072, "step": 28500 }, { "epoch": 1621.6216216216217, "eval_loss": 2.6125545501708984, "eval_runtime": 23.9842, "eval_samples_per_second": 58.122, "eval_steps_per_second": 7.296, "step": 28500 }, { "epoch": 1627.3115220483642, "grad_norm": 0.15945301949977875, "learning_rate": 3.1505016722408024e-06, "loss": 2.6068, "step": 28600 }, { "epoch": 1627.3115220483642, "eval_loss": 2.6133553981781006, "eval_runtime": 23.9727, "eval_samples_per_second": 58.149, "eval_steps_per_second": 7.3, "step": 28600 }, { "epoch": 1633.0014224751067, "grad_norm": 0.17413286864757538, "learning_rate": 3.140468227424749e-06, "loss": 2.6067, "step": 28700 }, { "epoch": 1633.0014224751067, "eval_loss": 2.6167733669281006, "eval_runtime": 23.9797, "eval_samples_per_second": 58.132, "eval_steps_per_second": 7.298, "step": 28700 }, { "epoch": 1638.6913229018492, "grad_norm": 0.19335277378559113, "learning_rate": 3.1304347826086955e-06, "loss": 2.6071, "step": 28800 }, { "epoch": 1638.6913229018492, "eval_loss": 2.6125223636627197, "eval_runtime": 23.9933, "eval_samples_per_second": 58.1, "eval_steps_per_second": 7.294, "step": 28800 }, { "epoch": 1644.3812233285917, "grad_norm": 0.20571838319301605, "learning_rate": 3.1204013377926423e-06, "loss": 2.6066, "step": 28900 }, { "epoch": 1644.3812233285917, "eval_loss": 2.615057945251465, "eval_runtime": 23.9707, "eval_samples_per_second": 58.154, "eval_steps_per_second": 7.301, "step": 28900 }, { "epoch": 1650.0711237553344, "grad_norm": 0.1820344775915146, "learning_rate": 3.1103678929765886e-06, "loss": 2.6065, "step": 29000 }, { "epoch": 1650.0711237553344, "eval_loss": 2.610342025756836, "eval_runtime": 23.6162, "eval_samples_per_second": 59.027, "eval_steps_per_second": 7.41, "step": 29000 }, { "epoch": 1655.7610241820769, "grad_norm": 0.19376899302005768, "learning_rate": 3.100334448160535e-06, "loss": 2.6061, "step": 29100 }, { "epoch": 1655.7610241820769, "eval_loss": 2.6109142303466797, "eval_runtime": 23.9605, "eval_samples_per_second": 58.179, "eval_steps_per_second": 7.304, "step": 29100 }, { "epoch": 1661.4509246088194, "grad_norm": 0.17542988061904907, "learning_rate": 3.0903010033444818e-06, "loss": 2.6069, "step": 29200 }, { "epoch": 1661.4509246088194, "eval_loss": 2.613027572631836, "eval_runtime": 23.9699, "eval_samples_per_second": 58.156, "eval_steps_per_second": 7.301, "step": 29200 }, { "epoch": 1667.1408250355619, "grad_norm": 0.20546042919158936, "learning_rate": 3.080267558528428e-06, "loss": 2.6058, "step": 29300 }, { "epoch": 1667.1408250355619, "eval_loss": 2.6125547885894775, "eval_runtime": 24.0161, "eval_samples_per_second": 58.044, "eval_steps_per_second": 7.287, "step": 29300 }, { "epoch": 1672.8307254623044, "grad_norm": 0.18211744725704193, "learning_rate": 3.0702341137123744e-06, "loss": 2.6056, "step": 29400 }, { "epoch": 1672.8307254623044, "eval_loss": 2.6160383224487305, "eval_runtime": 23.9708, "eval_samples_per_second": 58.154, "eval_steps_per_second": 7.301, "step": 29400 }, { "epoch": 1678.5206258890469, "grad_norm": 0.20983801782131195, "learning_rate": 3.060200668896321e-06, "loss": 2.6061, "step": 29500 }, { "epoch": 1678.5206258890469, "eval_loss": 2.61398983001709, "eval_runtime": 23.9666, "eval_samples_per_second": 58.164, "eval_steps_per_second": 7.302, "step": 29500 }, { "epoch": 1684.2105263157894, "grad_norm": 0.20594222843647003, "learning_rate": 3.0501672240802675e-06, "loss": 2.606, "step": 29600 }, { "epoch": 1684.2105263157894, "eval_loss": 2.6141202449798584, "eval_runtime": 23.9797, "eval_samples_per_second": 58.132, "eval_steps_per_second": 7.298, "step": 29600 }, { "epoch": 1689.900426742532, "grad_norm": 0.19307450950145721, "learning_rate": 3.0401337792642143e-06, "loss": 2.6061, "step": 29700 }, { "epoch": 1689.900426742532, "eval_loss": 2.614793062210083, "eval_runtime": 23.6149, "eval_samples_per_second": 59.03, "eval_steps_per_second": 7.411, "step": 29700 }, { "epoch": 1695.5903271692746, "grad_norm": 0.23280702531337738, "learning_rate": 3.0301003344481606e-06, "loss": 2.6057, "step": 29800 }, { "epoch": 1695.5903271692746, "eval_loss": 2.6146321296691895, "eval_runtime": 23.9761, "eval_samples_per_second": 58.141, "eval_steps_per_second": 7.299, "step": 29800 }, { "epoch": 1701.280227596017, "grad_norm": 0.18242600560188293, "learning_rate": 3.020066889632107e-06, "loss": 2.6054, "step": 29900 }, { "epoch": 1701.280227596017, "eval_loss": 2.618605136871338, "eval_runtime": 23.9512, "eval_samples_per_second": 58.202, "eval_steps_per_second": 7.307, "step": 29900 }, { "epoch": 1706.9701280227596, "grad_norm": 0.18484260141849518, "learning_rate": 3.0100334448160537e-06, "loss": 2.6055, "step": 30000 }, { "epoch": 1706.9701280227596, "eval_loss": 2.613018274307251, "eval_runtime": 23.9772, "eval_samples_per_second": 58.138, "eval_steps_per_second": 7.299, "step": 30000 }, { "epoch": 1712.660028449502, "grad_norm": 0.19655726850032806, "learning_rate": 3e-06, "loss": 2.6048, "step": 30100 }, { "epoch": 1712.660028449502, "eval_loss": 2.615434408187866, "eval_runtime": 23.978, "eval_samples_per_second": 58.137, "eval_steps_per_second": 7.298, "step": 30100 }, { "epoch": 1718.3499288762446, "grad_norm": 0.21742597222328186, "learning_rate": 2.9899665551839464e-06, "loss": 2.6059, "step": 30200 }, { "epoch": 1718.3499288762446, "eval_loss": 2.6120095252990723, "eval_runtime": 23.9631, "eval_samples_per_second": 58.173, "eval_steps_per_second": 7.303, "step": 30200 }, { "epoch": 1724.0398293029873, "grad_norm": 0.22138410806655884, "learning_rate": 2.979933110367893e-06, "loss": 2.6048, "step": 30300 }, { "epoch": 1724.0398293029873, "eval_loss": 2.613083600997925, "eval_runtime": 23.9653, "eval_samples_per_second": 58.167, "eval_steps_per_second": 7.302, "step": 30300 }, { "epoch": 1729.7297297297298, "grad_norm": 0.1965727061033249, "learning_rate": 2.9698996655518395e-06, "loss": 2.6046, "step": 30400 }, { "epoch": 1729.7297297297298, "eval_loss": 2.6135008335113525, "eval_runtime": 23.9769, "eval_samples_per_second": 58.139, "eval_steps_per_second": 7.299, "step": 30400 }, { "epoch": 1735.4196301564723, "grad_norm": 0.1746743619441986, "learning_rate": 2.959866220735786e-06, "loss": 2.6047, "step": 30500 }, { "epoch": 1735.4196301564723, "eval_loss": 2.615384578704834, "eval_runtime": 23.6157, "eval_samples_per_second": 59.029, "eval_steps_per_second": 7.41, "step": 30500 }, { "epoch": 1741.1095305832148, "grad_norm": 0.18516981601715088, "learning_rate": 2.9498327759197326e-06, "loss": 2.6041, "step": 30600 }, { "epoch": 1741.1095305832148, "eval_loss": 2.61470103263855, "eval_runtime": 23.9702, "eval_samples_per_second": 58.156, "eval_steps_per_second": 7.301, "step": 30600 }, { "epoch": 1746.7994310099573, "grad_norm": 0.18617106974124908, "learning_rate": 2.939799331103679e-06, "loss": 2.604, "step": 30700 }, { "epoch": 1746.7994310099573, "eval_loss": 2.60992169380188, "eval_runtime": 23.9721, "eval_samples_per_second": 58.151, "eval_steps_per_second": 7.3, "step": 30700 }, { "epoch": 1752.4893314366998, "grad_norm": 0.1985541135072708, "learning_rate": 2.9297658862876257e-06, "loss": 2.6044, "step": 30800 }, { "epoch": 1752.4893314366998, "eval_loss": 2.610335350036621, "eval_runtime": 23.6291, "eval_samples_per_second": 58.995, "eval_steps_per_second": 7.406, "step": 30800 }, { "epoch": 1758.1792318634423, "grad_norm": 0.181779146194458, "learning_rate": 2.919732441471572e-06, "loss": 2.6038, "step": 30900 }, { "epoch": 1758.1792318634423, "eval_loss": 2.6195976734161377, "eval_runtime": 23.9725, "eval_samples_per_second": 58.15, "eval_steps_per_second": 7.3, "step": 30900 }, { "epoch": 1763.869132290185, "grad_norm": 0.21061445772647858, "learning_rate": 2.9096989966555184e-06, "loss": 2.6035, "step": 31000 }, { "epoch": 1763.869132290185, "eval_loss": 2.6109161376953125, "eval_runtime": 23.9629, "eval_samples_per_second": 58.173, "eval_steps_per_second": 7.303, "step": 31000 }, { "epoch": 1769.5590327169275, "grad_norm": 0.17267170548439026, "learning_rate": 2.899665551839465e-06, "loss": 2.6034, "step": 31100 }, { "epoch": 1769.5590327169275, "eval_loss": 2.6139283180236816, "eval_runtime": 23.9656, "eval_samples_per_second": 58.167, "eval_steps_per_second": 7.302, "step": 31100 }, { "epoch": 1775.24893314367, "grad_norm": 0.19237692654132843, "learning_rate": 2.8896321070234115e-06, "loss": 2.6033, "step": 31200 }, { "epoch": 1775.24893314367, "eval_loss": 2.6118404865264893, "eval_runtime": 23.964, "eval_samples_per_second": 58.17, "eval_steps_per_second": 7.303, "step": 31200 }, { "epoch": 1780.9388335704125, "grad_norm": 0.17256025969982147, "learning_rate": 2.879598662207358e-06, "loss": 2.6033, "step": 31300 }, { "epoch": 1780.9388335704125, "eval_loss": 2.6103038787841797, "eval_runtime": 23.6141, "eval_samples_per_second": 59.032, "eval_steps_per_second": 7.411, "step": 31300 }, { "epoch": 1786.628733997155, "grad_norm": 0.19163106381893158, "learning_rate": 2.8695652173913046e-06, "loss": 2.6031, "step": 31400 }, { "epoch": 1786.628733997155, "eval_loss": 2.611341953277588, "eval_runtime": 23.9739, "eval_samples_per_second": 58.147, "eval_steps_per_second": 7.3, "step": 31400 }, { "epoch": 1792.3186344238975, "grad_norm": 0.18413276970386505, "learning_rate": 2.859531772575251e-06, "loss": 2.6027, "step": 31500 }, { "epoch": 1792.3186344238975, "eval_loss": 2.6132729053497314, "eval_runtime": 23.9559, "eval_samples_per_second": 58.19, "eval_steps_per_second": 7.305, "step": 31500 }, { "epoch": 1798.0085348506402, "grad_norm": 0.20607994496822357, "learning_rate": 2.8494983277591977e-06, "loss": 2.6024, "step": 31600 }, { "epoch": 1798.0085348506402, "eval_loss": 2.610116958618164, "eval_runtime": 23.9884, "eval_samples_per_second": 58.112, "eval_steps_per_second": 7.295, "step": 31600 }, { "epoch": 1803.6984352773827, "grad_norm": 0.19761328399181366, "learning_rate": 2.839464882943144e-06, "loss": 2.6026, "step": 31700 }, { "epoch": 1803.6984352773827, "eval_loss": 2.612351179122925, "eval_runtime": 23.9662, "eval_samples_per_second": 58.165, "eval_steps_per_second": 7.302, "step": 31700 }, { "epoch": 1809.3883357041252, "grad_norm": 0.20598149299621582, "learning_rate": 2.8294314381270904e-06, "loss": 2.6021, "step": 31800 }, { "epoch": 1809.3883357041252, "eval_loss": 2.610358238220215, "eval_runtime": 23.9737, "eval_samples_per_second": 58.147, "eval_steps_per_second": 7.3, "step": 31800 }, { "epoch": 1815.0782361308677, "grad_norm": 0.18823176622390747, "learning_rate": 2.819397993311037e-06, "loss": 2.6019, "step": 31900 }, { "epoch": 1815.0782361308677, "eval_loss": 2.6109814643859863, "eval_runtime": 23.9697, "eval_samples_per_second": 58.157, "eval_steps_per_second": 7.301, "step": 31900 }, { "epoch": 1820.7681365576102, "grad_norm": 0.1910698264837265, "learning_rate": 2.8093645484949835e-06, "loss": 2.6015, "step": 32000 }, { "epoch": 1820.7681365576102, "eval_loss": 2.6069202423095703, "eval_runtime": 23.6158, "eval_samples_per_second": 59.028, "eval_steps_per_second": 7.41, "step": 32000 }, { "epoch": 1826.4580369843527, "grad_norm": 0.18724623322486877, "learning_rate": 2.79933110367893e-06, "loss": 2.6009, "step": 32100 }, { "epoch": 1826.4580369843527, "eval_loss": 2.610833168029785, "eval_runtime": 23.9646, "eval_samples_per_second": 58.169, "eval_steps_per_second": 7.302, "step": 32100 }, { "epoch": 1832.1479374110952, "grad_norm": 0.20118850469589233, "learning_rate": 2.7892976588628766e-06, "loss": 2.6008, "step": 32200 }, { "epoch": 1832.1479374110952, "eval_loss": 2.611131191253662, "eval_runtime": 23.9699, "eval_samples_per_second": 58.156, "eval_steps_per_second": 7.301, "step": 32200 }, { "epoch": 1837.837837837838, "grad_norm": 0.18612216413021088, "learning_rate": 2.779264214046823e-06, "loss": 2.6003, "step": 32300 }, { "epoch": 1837.837837837838, "eval_loss": 2.6071786880493164, "eval_runtime": 23.6255, "eval_samples_per_second": 59.004, "eval_steps_per_second": 7.407, "step": 32300 }, { "epoch": 1843.5277382645804, "grad_norm": 0.22030840814113617, "learning_rate": 2.7692307692307693e-06, "loss": 2.6006, "step": 32400 }, { "epoch": 1843.5277382645804, "eval_loss": 2.611001968383789, "eval_runtime": 23.9793, "eval_samples_per_second": 58.133, "eval_steps_per_second": 7.298, "step": 32400 }, { "epoch": 1849.217638691323, "grad_norm": 0.1993936002254486, "learning_rate": 2.759197324414716e-06, "loss": 2.5998, "step": 32500 }, { "epoch": 1849.217638691323, "eval_loss": 2.610430955886841, "eval_runtime": 23.9674, "eval_samples_per_second": 58.162, "eval_steps_per_second": 7.302, "step": 32500 }, { "epoch": 1854.9075391180654, "grad_norm": 0.20003485679626465, "learning_rate": 2.749163879598662e-06, "loss": 2.5992, "step": 32600 }, { "epoch": 1854.9075391180654, "eval_loss": 2.608316421508789, "eval_runtime": 23.6265, "eval_samples_per_second": 59.002, "eval_steps_per_second": 7.407, "step": 32600 }, { "epoch": 1860.597439544808, "grad_norm": 0.191485196352005, "learning_rate": 2.7391304347826087e-06, "loss": 2.5994, "step": 32700 }, { "epoch": 1860.597439544808, "eval_loss": 2.606895923614502, "eval_runtime": 23.9721, "eval_samples_per_second": 58.151, "eval_steps_per_second": 7.3, "step": 32700 }, { "epoch": 1866.2873399715504, "grad_norm": 0.22254496812820435, "learning_rate": 2.729096989966555e-06, "loss": 2.5987, "step": 32800 }, { "epoch": 1866.2873399715504, "eval_loss": 2.609882116317749, "eval_runtime": 23.6229, "eval_samples_per_second": 59.01, "eval_steps_per_second": 7.408, "step": 32800 }, { "epoch": 1871.9772403982931, "grad_norm": 0.21001747250556946, "learning_rate": 2.7190635451505014e-06, "loss": 2.5986, "step": 32900 }, { "epoch": 1871.9772403982931, "eval_loss": 2.609361410140991, "eval_runtime": 23.9824, "eval_samples_per_second": 58.126, "eval_steps_per_second": 7.297, "step": 32900 }, { "epoch": 1877.6671408250356, "grad_norm": 0.2182328701019287, "learning_rate": 2.709030100334448e-06, "loss": 2.5982, "step": 33000 }, { "epoch": 1877.6671408250356, "eval_loss": 2.6055867671966553, "eval_runtime": 23.9808, "eval_samples_per_second": 58.13, "eval_steps_per_second": 7.297, "step": 33000 }, { "epoch": 1883.3570412517781, "grad_norm": 0.2228955626487732, "learning_rate": 2.6989966555183945e-06, "loss": 2.5981, "step": 33100 }, { "epoch": 1883.3570412517781, "eval_loss": 2.6094181537628174, "eval_runtime": 23.9937, "eval_samples_per_second": 58.099, "eval_steps_per_second": 7.294, "step": 33100 }, { "epoch": 1889.0469416785206, "grad_norm": 0.20408956706523895, "learning_rate": 2.6889632107023413e-06, "loss": 2.5976, "step": 33200 }, { "epoch": 1889.0469416785206, "eval_loss": 2.610013484954834, "eval_runtime": 23.9779, "eval_samples_per_second": 58.137, "eval_steps_per_second": 7.298, "step": 33200 }, { "epoch": 1894.7368421052631, "grad_norm": 0.18041720986366272, "learning_rate": 2.6789297658862876e-06, "loss": 2.5966, "step": 33300 }, { "epoch": 1894.7368421052631, "eval_loss": 2.6073315143585205, "eval_runtime": 23.9719, "eval_samples_per_second": 58.152, "eval_steps_per_second": 7.3, "step": 33300 }, { "epoch": 1900.4267425320056, "grad_norm": 0.21475084125995636, "learning_rate": 2.668896321070234e-06, "loss": 2.5971, "step": 33400 }, { "epoch": 1900.4267425320056, "eval_loss": 2.6072559356689453, "eval_runtime": 23.9644, "eval_samples_per_second": 58.17, "eval_steps_per_second": 7.303, "step": 33400 }, { "epoch": 1906.1166429587481, "grad_norm": 0.20976944267749786, "learning_rate": 2.6588628762541807e-06, "loss": 2.5975, "step": 33500 }, { "epoch": 1906.1166429587481, "eval_loss": 2.610696315765381, "eval_runtime": 23.9698, "eval_samples_per_second": 58.157, "eval_steps_per_second": 7.301, "step": 33500 }, { "epoch": 1911.8065433854908, "grad_norm": 0.22089052200317383, "learning_rate": 2.648829431438127e-06, "loss": 2.5967, "step": 33600 }, { "epoch": 1911.8065433854908, "eval_loss": 2.610421895980835, "eval_runtime": 23.9697, "eval_samples_per_second": 58.157, "eval_steps_per_second": 7.301, "step": 33600 }, { "epoch": 1917.4964438122333, "grad_norm": 0.2476435750722885, "learning_rate": 2.6387959866220734e-06, "loss": 2.5967, "step": 33700 }, { "epoch": 1917.4964438122333, "eval_loss": 2.609027862548828, "eval_runtime": 23.9693, "eval_samples_per_second": 58.158, "eval_steps_per_second": 7.301, "step": 33700 }, { "epoch": 1923.1863442389758, "grad_norm": 0.2400488555431366, "learning_rate": 2.62876254180602e-06, "loss": 2.5963, "step": 33800 }, { "epoch": 1923.1863442389758, "eval_loss": 2.606323480606079, "eval_runtime": 23.9793, "eval_samples_per_second": 58.133, "eval_steps_per_second": 7.298, "step": 33800 }, { "epoch": 1928.8762446657183, "grad_norm": 0.22055935859680176, "learning_rate": 2.6187290969899665e-06, "loss": 2.596, "step": 33900 }, { "epoch": 1928.8762446657183, "eval_loss": 2.6059587001800537, "eval_runtime": 23.9784, "eval_samples_per_second": 58.136, "eval_steps_per_second": 7.298, "step": 33900 }, { "epoch": 1934.5661450924608, "grad_norm": 0.19678466022014618, "learning_rate": 2.6086956521739132e-06, "loss": 2.5958, "step": 34000 }, { "epoch": 1934.5661450924608, "eval_loss": 2.607278347015381, "eval_runtime": 23.9722, "eval_samples_per_second": 58.151, "eval_steps_per_second": 7.3, "step": 34000 }, { "epoch": 1940.2560455192033, "grad_norm": 0.19066280126571655, "learning_rate": 2.5986622073578596e-06, "loss": 2.596, "step": 34100 }, { "epoch": 1940.2560455192033, "eval_loss": 2.6062145233154297, "eval_runtime": 23.9754, "eval_samples_per_second": 58.143, "eval_steps_per_second": 7.299, "step": 34100 }, { "epoch": 1945.945945945946, "grad_norm": 0.18578113615512848, "learning_rate": 2.588628762541806e-06, "loss": 2.5956, "step": 34200 }, { "epoch": 1945.945945945946, "eval_loss": 2.6075074672698975, "eval_runtime": 23.9661, "eval_samples_per_second": 58.165, "eval_steps_per_second": 7.302, "step": 34200 }, { "epoch": 1951.6358463726885, "grad_norm": 0.1975042223930359, "learning_rate": 2.5785953177257527e-06, "loss": 2.595, "step": 34300 }, { "epoch": 1951.6358463726885, "eval_loss": 2.6064653396606445, "eval_runtime": 23.9808, "eval_samples_per_second": 58.13, "eval_steps_per_second": 7.298, "step": 34300 }, { "epoch": 1957.325746799431, "grad_norm": 0.19423623383045197, "learning_rate": 2.568561872909699e-06, "loss": 2.595, "step": 34400 }, { "epoch": 1957.325746799431, "eval_loss": 2.611752510070801, "eval_runtime": 23.6289, "eval_samples_per_second": 58.996, "eval_steps_per_second": 7.406, "step": 34400 }, { "epoch": 1963.0156472261735, "grad_norm": 0.2097640186548233, "learning_rate": 2.5585284280936454e-06, "loss": 2.595, "step": 34500 }, { "epoch": 1963.0156472261735, "eval_loss": 2.6093528270721436, "eval_runtime": 23.9769, "eval_samples_per_second": 58.139, "eval_steps_per_second": 7.299, "step": 34500 }, { "epoch": 1968.705547652916, "grad_norm": 0.21185003221035004, "learning_rate": 2.548494983277592e-06, "loss": 2.5948, "step": 34600 }, { "epoch": 1968.705547652916, "eval_loss": 2.608447551727295, "eval_runtime": 23.9828, "eval_samples_per_second": 58.125, "eval_steps_per_second": 7.297, "step": 34600 }, { "epoch": 1974.3954480796585, "grad_norm": 0.20163875818252563, "learning_rate": 2.5384615384615385e-06, "loss": 2.595, "step": 34700 }, { "epoch": 1974.3954480796585, "eval_loss": 2.6056010723114014, "eval_runtime": 23.9675, "eval_samples_per_second": 58.162, "eval_steps_per_second": 7.302, "step": 34700 }, { "epoch": 1980.085348506401, "grad_norm": 0.2156960666179657, "learning_rate": 2.528428093645485e-06, "loss": 2.5939, "step": 34800 }, { "epoch": 1980.085348506401, "eval_loss": 2.608295440673828, "eval_runtime": 23.9789, "eval_samples_per_second": 58.135, "eval_steps_per_second": 7.298, "step": 34800 }, { "epoch": 1985.7752489331438, "grad_norm": 0.21288277208805084, "learning_rate": 2.5183946488294316e-06, "loss": 2.5941, "step": 34900 }, { "epoch": 1985.7752489331438, "eval_loss": 2.6072516441345215, "eval_runtime": 23.9694, "eval_samples_per_second": 58.157, "eval_steps_per_second": 7.301, "step": 34900 }, { "epoch": 1991.4651493598863, "grad_norm": 0.19131049513816833, "learning_rate": 2.508361204013378e-06, "loss": 2.5936, "step": 35000 }, { "epoch": 1991.4651493598863, "eval_loss": 2.607457160949707, "eval_runtime": 23.9676, "eval_samples_per_second": 58.162, "eval_steps_per_second": 7.302, "step": 35000 }, { "epoch": 1997.1550497866288, "grad_norm": 0.20999549329280853, "learning_rate": 2.4983277591973247e-06, "loss": 2.5942, "step": 35100 }, { "epoch": 1997.1550497866288, "eval_loss": 2.6048200130462646, "eval_runtime": 23.9476, "eval_samples_per_second": 58.211, "eval_steps_per_second": 7.308, "step": 35100 }, { "epoch": 2002.8449502133712, "grad_norm": 0.22608843445777893, "learning_rate": 2.488294314381271e-06, "loss": 2.5936, "step": 35200 }, { "epoch": 2002.8449502133712, "eval_loss": 2.6086690425872803, "eval_runtime": 23.9613, "eval_samples_per_second": 58.177, "eval_steps_per_second": 7.303, "step": 35200 }, { "epoch": 2008.5348506401137, "grad_norm": 0.21249240636825562, "learning_rate": 2.4782608695652173e-06, "loss": 2.5937, "step": 35300 }, { "epoch": 2008.5348506401137, "eval_loss": 2.609616994857788, "eval_runtime": 23.984, "eval_samples_per_second": 58.122, "eval_steps_per_second": 7.297, "step": 35300 }, { "epoch": 2014.2247510668562, "grad_norm": 0.18126103281974792, "learning_rate": 2.468227424749164e-06, "loss": 2.593, "step": 35400 }, { "epoch": 2014.2247510668562, "eval_loss": 2.607325792312622, "eval_runtime": 23.6173, "eval_samples_per_second": 59.025, "eval_steps_per_second": 7.41, "step": 35400 }, { "epoch": 2019.914651493599, "grad_norm": 0.21532674133777618, "learning_rate": 2.4581939799331104e-06, "loss": 2.593, "step": 35500 }, { "epoch": 2019.914651493599, "eval_loss": 2.605360269546509, "eval_runtime": 23.9754, "eval_samples_per_second": 58.143, "eval_steps_per_second": 7.299, "step": 35500 }, { "epoch": 2025.6045519203415, "grad_norm": 0.22612471878528595, "learning_rate": 2.4481605351170568e-06, "loss": 2.5933, "step": 35600 }, { "epoch": 2025.6045519203415, "eval_loss": 2.607989549636841, "eval_runtime": 23.988, "eval_samples_per_second": 58.112, "eval_steps_per_second": 7.295, "step": 35600 }, { "epoch": 2031.294452347084, "grad_norm": 0.20216602087020874, "learning_rate": 2.4381270903010035e-06, "loss": 2.5932, "step": 35700 }, { "epoch": 2031.294452347084, "eval_loss": 2.606175184249878, "eval_runtime": 23.9791, "eval_samples_per_second": 58.134, "eval_steps_per_second": 7.298, "step": 35700 }, { "epoch": 2036.9843527738265, "grad_norm": 0.19835437834262848, "learning_rate": 2.42809364548495e-06, "loss": 2.5925, "step": 35800 }, { "epoch": 2036.9843527738265, "eval_loss": 2.607192277908325, "eval_runtime": 23.9678, "eval_samples_per_second": 58.161, "eval_steps_per_second": 7.301, "step": 35800 }, { "epoch": 2042.674253200569, "grad_norm": 0.24524998664855957, "learning_rate": 2.4180602006688962e-06, "loss": 2.5933, "step": 35900 }, { "epoch": 2042.674253200569, "eval_loss": 2.606074094772339, "eval_runtime": 23.9751, "eval_samples_per_second": 58.144, "eval_steps_per_second": 7.299, "step": 35900 }, { "epoch": 2048.3641536273117, "grad_norm": 0.2061767429113388, "learning_rate": 2.408026755852843e-06, "loss": 2.5923, "step": 36000 }, { "epoch": 2048.3641536273117, "eval_loss": 2.606078624725342, "eval_runtime": 23.9619, "eval_samples_per_second": 58.176, "eval_steps_per_second": 7.303, "step": 36000 }, { "epoch": 2054.054054054054, "grad_norm": 0.2126512974500656, "learning_rate": 2.3979933110367893e-06, "loss": 2.5923, "step": 36100 }, { "epoch": 2054.054054054054, "eval_loss": 2.6044681072235107, "eval_runtime": 23.9651, "eval_samples_per_second": 58.168, "eval_steps_per_second": 7.302, "step": 36100 }, { "epoch": 2059.7439544807967, "grad_norm": 0.19873598217964172, "learning_rate": 2.387959866220736e-06, "loss": 2.5917, "step": 36200 }, { "epoch": 2059.7439544807967, "eval_loss": 2.6049787998199463, "eval_runtime": 23.9985, "eval_samples_per_second": 58.087, "eval_steps_per_second": 7.292, "step": 36200 }, { "epoch": 2065.433854907539, "grad_norm": 0.22740136086940765, "learning_rate": 2.3779264214046824e-06, "loss": 2.592, "step": 36300 }, { "epoch": 2065.433854907539, "eval_loss": 2.6051523685455322, "eval_runtime": 23.9778, "eval_samples_per_second": 58.137, "eval_steps_per_second": 7.298, "step": 36300 }, { "epoch": 2071.1237553342817, "grad_norm": 0.23417142033576965, "learning_rate": 2.3678929765886288e-06, "loss": 2.5919, "step": 36400 }, { "epoch": 2071.1237553342817, "eval_loss": 2.6095163822174072, "eval_runtime": 23.9947, "eval_samples_per_second": 58.096, "eval_steps_per_second": 7.293, "step": 36400 }, { "epoch": 2076.813655761024, "grad_norm": 0.201192244887352, "learning_rate": 2.3578595317725755e-06, "loss": 2.5914, "step": 36500 }, { "epoch": 2076.813655761024, "eval_loss": 2.6050267219543457, "eval_runtime": 23.6283, "eval_samples_per_second": 58.997, "eval_steps_per_second": 7.406, "step": 36500 }, { "epoch": 2082.5035561877667, "grad_norm": 0.21296341717243195, "learning_rate": 2.347826086956522e-06, "loss": 2.592, "step": 36600 }, { "epoch": 2082.5035561877667, "eval_loss": 2.6075668334960938, "eval_runtime": 23.9712, "eval_samples_per_second": 58.153, "eval_steps_per_second": 7.3, "step": 36600 }, { "epoch": 2088.193456614509, "grad_norm": 0.21053853631019592, "learning_rate": 2.337792642140468e-06, "loss": 2.5917, "step": 36700 }, { "epoch": 2088.193456614509, "eval_loss": 2.606757879257202, "eval_runtime": 23.9742, "eval_samples_per_second": 58.146, "eval_steps_per_second": 7.3, "step": 36700 }, { "epoch": 2093.8833570412517, "grad_norm": 0.1819269359111786, "learning_rate": 2.327759197324415e-06, "loss": 2.5913, "step": 36800 }, { "epoch": 2093.8833570412517, "eval_loss": 2.606837272644043, "eval_runtime": 23.9719, "eval_samples_per_second": 58.151, "eval_steps_per_second": 7.3, "step": 36800 }, { "epoch": 2099.573257467994, "grad_norm": 0.2274492383003235, "learning_rate": 2.3177257525083613e-06, "loss": 2.5913, "step": 36900 }, { "epoch": 2099.573257467994, "eval_loss": 2.607267379760742, "eval_runtime": 23.9826, "eval_samples_per_second": 58.125, "eval_steps_per_second": 7.297, "step": 36900 }, { "epoch": 2105.2631578947367, "grad_norm": 0.2047816663980484, "learning_rate": 2.307692307692308e-06, "loss": 2.5913, "step": 37000 }, { "epoch": 2105.2631578947367, "eval_loss": 2.605915069580078, "eval_runtime": 23.9786, "eval_samples_per_second": 58.135, "eval_steps_per_second": 7.298, "step": 37000 }, { "epoch": 2110.953058321479, "grad_norm": 0.22065645456314087, "learning_rate": 2.2976588628762544e-06, "loss": 2.5911, "step": 37100 }, { "epoch": 2110.953058321479, "eval_loss": 2.6056876182556152, "eval_runtime": 23.9857, "eval_samples_per_second": 58.118, "eval_steps_per_second": 7.296, "step": 37100 }, { "epoch": 2116.642958748222, "grad_norm": 0.23279725015163422, "learning_rate": 2.2876254180602008e-06, "loss": 2.5909, "step": 37200 }, { "epoch": 2116.642958748222, "eval_loss": 2.6096079349517822, "eval_runtime": 23.9666, "eval_samples_per_second": 58.164, "eval_steps_per_second": 7.302, "step": 37200 }, { "epoch": 2122.3328591749646, "grad_norm": 0.20520757138729095, "learning_rate": 2.2775919732441475e-06, "loss": 2.5913, "step": 37300 }, { "epoch": 2122.3328591749646, "eval_loss": 2.605616569519043, "eval_runtime": 23.9741, "eval_samples_per_second": 58.146, "eval_steps_per_second": 7.3, "step": 37300 }, { "epoch": 2128.022759601707, "grad_norm": 0.21737107634544373, "learning_rate": 2.267558528428094e-06, "loss": 2.5909, "step": 37400 }, { "epoch": 2128.022759601707, "eval_loss": 2.6048943996429443, "eval_runtime": 23.6318, "eval_samples_per_second": 58.988, "eval_steps_per_second": 7.405, "step": 37400 }, { "epoch": 2133.7126600284496, "grad_norm": 0.20358791947364807, "learning_rate": 2.25752508361204e-06, "loss": 2.5909, "step": 37500 }, { "epoch": 2133.7126600284496, "eval_loss": 2.6061489582061768, "eval_runtime": 23.9684, "eval_samples_per_second": 58.16, "eval_steps_per_second": 7.301, "step": 37500 }, { "epoch": 2139.402560455192, "grad_norm": 0.22121722996234894, "learning_rate": 2.2474916387959865e-06, "loss": 2.5907, "step": 37600 }, { "epoch": 2139.402560455192, "eval_loss": 2.6052770614624023, "eval_runtime": 23.9516, "eval_samples_per_second": 58.201, "eval_steps_per_second": 7.306, "step": 37600 }, { "epoch": 2145.0924608819346, "grad_norm": 0.22327056527137756, "learning_rate": 2.237458193979933e-06, "loss": 2.59, "step": 37700 }, { "epoch": 2145.0924608819346, "eval_loss": 2.609347343444824, "eval_runtime": 23.9602, "eval_samples_per_second": 58.18, "eval_steps_per_second": 7.304, "step": 37700 }, { "epoch": 2150.782361308677, "grad_norm": 0.2154139280319214, "learning_rate": 2.2274247491638796e-06, "loss": 2.5906, "step": 37800 }, { "epoch": 2150.782361308677, "eval_loss": 2.6085565090179443, "eval_runtime": 23.9701, "eval_samples_per_second": 58.156, "eval_steps_per_second": 7.301, "step": 37800 }, { "epoch": 2156.4722617354196, "grad_norm": 0.2098981738090515, "learning_rate": 2.217391304347826e-06, "loss": 2.5904, "step": 37900 }, { "epoch": 2156.4722617354196, "eval_loss": 2.6086554527282715, "eval_runtime": 23.9713, "eval_samples_per_second": 58.153, "eval_steps_per_second": 7.3, "step": 37900 }, { "epoch": 2162.162162162162, "grad_norm": 0.23115159571170807, "learning_rate": 2.2073578595317723e-06, "loss": 2.5904, "step": 38000 }, { "epoch": 2162.162162162162, "eval_loss": 2.6087803840637207, "eval_runtime": 23.9789, "eval_samples_per_second": 58.135, "eval_steps_per_second": 7.298, "step": 38000 }, { "epoch": 2167.8520625889046, "grad_norm": 0.21658512949943542, "learning_rate": 2.197324414715719e-06, "loss": 2.59, "step": 38100 }, { "epoch": 2167.8520625889046, "eval_loss": 2.6069300174713135, "eval_runtime": 23.9659, "eval_samples_per_second": 58.166, "eval_steps_per_second": 7.302, "step": 38100 }, { "epoch": 2173.541963015647, "grad_norm": 0.20112384855747223, "learning_rate": 2.1872909698996654e-06, "loss": 2.5901, "step": 38200 }, { "epoch": 2173.541963015647, "eval_loss": 2.6072750091552734, "eval_runtime": 23.6174, "eval_samples_per_second": 59.024, "eval_steps_per_second": 7.41, "step": 38200 }, { "epoch": 2179.2318634423896, "grad_norm": 0.21559318900108337, "learning_rate": 2.177257525083612e-06, "loss": 2.5899, "step": 38300 }, { "epoch": 2179.2318634423896, "eval_loss": 2.609740734100342, "eval_runtime": 23.9734, "eval_samples_per_second": 58.148, "eval_steps_per_second": 7.3, "step": 38300 }, { "epoch": 2184.921763869132, "grad_norm": 0.2399979680776596, "learning_rate": 2.1672240802675585e-06, "loss": 2.5898, "step": 38400 }, { "epoch": 2184.921763869132, "eval_loss": 2.6082592010498047, "eval_runtime": 23.9721, "eval_samples_per_second": 58.151, "eval_steps_per_second": 7.3, "step": 38400 }, { "epoch": 2190.611664295875, "grad_norm": 0.20176702737808228, "learning_rate": 2.157190635451505e-06, "loss": 2.5898, "step": 38500 }, { "epoch": 2190.611664295875, "eval_loss": 2.609528064727783, "eval_runtime": 23.9695, "eval_samples_per_second": 58.157, "eval_steps_per_second": 7.301, "step": 38500 }, { "epoch": 2196.3015647226175, "grad_norm": 0.2130006104707718, "learning_rate": 2.1471571906354516e-06, "loss": 2.5896, "step": 38600 }, { "epoch": 2196.3015647226175, "eval_loss": 2.6067283153533936, "eval_runtime": 23.9785, "eval_samples_per_second": 58.135, "eval_steps_per_second": 7.298, "step": 38600 }, { "epoch": 2201.99146514936, "grad_norm": 0.19360195100307465, "learning_rate": 2.137123745819398e-06, "loss": 2.5897, "step": 38700 }, { "epoch": 2201.99146514936, "eval_loss": 2.6070244312286377, "eval_runtime": 23.9734, "eval_samples_per_second": 58.148, "eval_steps_per_second": 7.3, "step": 38700 }, { "epoch": 2207.6813655761025, "grad_norm": 0.21760191023349762, "learning_rate": 2.1270903010033443e-06, "loss": 2.589, "step": 38800 }, { "epoch": 2207.6813655761025, "eval_loss": 2.6102993488311768, "eval_runtime": 23.9663, "eval_samples_per_second": 58.165, "eval_steps_per_second": 7.302, "step": 38800 }, { "epoch": 2213.371266002845, "grad_norm": 0.21763825416564941, "learning_rate": 2.117056856187291e-06, "loss": 2.5895, "step": 38900 }, { "epoch": 2213.371266002845, "eval_loss": 2.6087794303894043, "eval_runtime": 23.9613, "eval_samples_per_second": 58.177, "eval_steps_per_second": 7.303, "step": 38900 }, { "epoch": 2219.0611664295875, "grad_norm": 0.2202463299036026, "learning_rate": 2.1070234113712374e-06, "loss": 2.5887, "step": 39000 }, { "epoch": 2219.0611664295875, "eval_loss": 2.6068382263183594, "eval_runtime": 23.9826, "eval_samples_per_second": 58.125, "eval_steps_per_second": 7.297, "step": 39000 }, { "epoch": 2224.75106685633, "grad_norm": 0.21476009488105774, "learning_rate": 2.0969899665551837e-06, "loss": 2.5889, "step": 39100 }, { "epoch": 2224.75106685633, "eval_loss": 2.608288049697876, "eval_runtime": 23.6185, "eval_samples_per_second": 59.021, "eval_steps_per_second": 7.409, "step": 39100 }, { "epoch": 2230.4409672830725, "grad_norm": 0.24493220448493958, "learning_rate": 2.0869565217391305e-06, "loss": 2.589, "step": 39200 }, { "epoch": 2230.4409672830725, "eval_loss": 2.6087050437927246, "eval_runtime": 23.9767, "eval_samples_per_second": 58.14, "eval_steps_per_second": 7.299, "step": 39200 }, { "epoch": 2236.130867709815, "grad_norm": 0.20519843697547913, "learning_rate": 2.076923076923077e-06, "loss": 2.5888, "step": 39300 }, { "epoch": 2236.130867709815, "eval_loss": 2.6077113151550293, "eval_runtime": 23.9716, "eval_samples_per_second": 58.152, "eval_steps_per_second": 7.3, "step": 39300 }, { "epoch": 2241.8207681365575, "grad_norm": 0.21633167564868927, "learning_rate": 2.0668896321070236e-06, "loss": 2.5889, "step": 39400 }, { "epoch": 2241.8207681365575, "eval_loss": 2.606517791748047, "eval_runtime": 23.9694, "eval_samples_per_second": 58.158, "eval_steps_per_second": 7.301, "step": 39400 }, { "epoch": 2247.5106685633, "grad_norm": 0.21099114418029785, "learning_rate": 2.05685618729097e-06, "loss": 2.5889, "step": 39500 }, { "epoch": 2247.5106685633, "eval_loss": 2.6069552898406982, "eval_runtime": 23.9796, "eval_samples_per_second": 58.133, "eval_steps_per_second": 7.298, "step": 39500 }, { "epoch": 2253.2005689900425, "grad_norm": 0.21585485339164734, "learning_rate": 2.0468227424749163e-06, "loss": 2.5885, "step": 39600 }, { "epoch": 2253.2005689900425, "eval_loss": 2.6081788539886475, "eval_runtime": 23.9947, "eval_samples_per_second": 58.096, "eval_steps_per_second": 7.293, "step": 39600 }, { "epoch": 2258.890469416785, "grad_norm": 0.22519052028656006, "learning_rate": 2.036789297658863e-06, "loss": 2.5887, "step": 39700 }, { "epoch": 2258.890469416785, "eval_loss": 2.607969284057617, "eval_runtime": 23.9681, "eval_samples_per_second": 58.161, "eval_steps_per_second": 7.301, "step": 39700 }, { "epoch": 2264.580369843528, "grad_norm": 0.22348664700984955, "learning_rate": 2.0267558528428094e-06, "loss": 2.5885, "step": 39800 }, { "epoch": 2264.580369843528, "eval_loss": 2.605234146118164, "eval_runtime": 23.9762, "eval_samples_per_second": 58.141, "eval_steps_per_second": 7.299, "step": 39800 }, { "epoch": 2270.2702702702704, "grad_norm": 0.22468328475952148, "learning_rate": 2.0167224080267557e-06, "loss": 2.5879, "step": 39900 }, { "epoch": 2270.2702702702704, "eval_loss": 2.6086230278015137, "eval_runtime": 23.9804, "eval_samples_per_second": 58.131, "eval_steps_per_second": 7.298, "step": 39900 }, { "epoch": 2275.960170697013, "grad_norm": 0.22100846469402313, "learning_rate": 2.0066889632107025e-06, "loss": 2.5881, "step": 40000 }, { "epoch": 2275.960170697013, "eval_loss": 2.6064453125, "eval_runtime": 23.619, "eval_samples_per_second": 59.02, "eval_steps_per_second": 7.409, "step": 40000 }, { "epoch": 2281.6500711237554, "grad_norm": 0.22934798896312714, "learning_rate": 1.996655518394649e-06, "loss": 2.588, "step": 40100 }, { "epoch": 2281.6500711237554, "eval_loss": 2.609809398651123, "eval_runtime": 23.9742, "eval_samples_per_second": 58.146, "eval_steps_per_second": 7.3, "step": 40100 }, { "epoch": 2287.339971550498, "grad_norm": 0.22776305675506592, "learning_rate": 1.986622073578595e-06, "loss": 2.5879, "step": 40200 }, { "epoch": 2287.339971550498, "eval_loss": 2.610957622528076, "eval_runtime": 23.9692, "eval_samples_per_second": 58.158, "eval_steps_per_second": 7.301, "step": 40200 }, { "epoch": 2293.0298719772404, "grad_norm": 0.24109528958797455, "learning_rate": 1.976588628762542e-06, "loss": 2.5873, "step": 40300 }, { "epoch": 2293.0298719772404, "eval_loss": 2.6117520332336426, "eval_runtime": 23.9749, "eval_samples_per_second": 58.144, "eval_steps_per_second": 7.299, "step": 40300 }, { "epoch": 2298.719772403983, "grad_norm": 0.20761537551879883, "learning_rate": 1.9665551839464883e-06, "loss": 2.5882, "step": 40400 }, { "epoch": 2298.719772403983, "eval_loss": 2.6076319217681885, "eval_runtime": 23.9837, "eval_samples_per_second": 58.123, "eval_steps_per_second": 7.297, "step": 40400 }, { "epoch": 2304.4096728307254, "grad_norm": 0.2197142243385315, "learning_rate": 1.956521739130435e-06, "loss": 2.5876, "step": 40500 }, { "epoch": 2304.4096728307254, "eval_loss": 2.6088201999664307, "eval_runtime": 23.9737, "eval_samples_per_second": 58.147, "eval_steps_per_second": 7.3, "step": 40500 }, { "epoch": 2310.099573257468, "grad_norm": 0.21660037338733673, "learning_rate": 1.9464882943143814e-06, "loss": 2.5877, "step": 40600 }, { "epoch": 2310.099573257468, "eval_loss": 2.6094954013824463, "eval_runtime": 23.9699, "eval_samples_per_second": 58.156, "eval_steps_per_second": 7.301, "step": 40600 }, { "epoch": 2315.7894736842104, "grad_norm": 0.2162291556596756, "learning_rate": 1.9364548494983277e-06, "loss": 2.5874, "step": 40700 }, { "epoch": 2315.7894736842104, "eval_loss": 2.6068737506866455, "eval_runtime": 23.9747, "eval_samples_per_second": 58.145, "eval_steps_per_second": 7.299, "step": 40700 }, { "epoch": 2321.479374110953, "grad_norm": 0.20265834033489227, "learning_rate": 1.9264214046822745e-06, "loss": 2.5874, "step": 40800 }, { "epoch": 2321.479374110953, "eval_loss": 2.6073391437530518, "eval_runtime": 23.634, "eval_samples_per_second": 58.983, "eval_steps_per_second": 7.405, "step": 40800 }, { "epoch": 2327.1692745376954, "grad_norm": 0.23362219333648682, "learning_rate": 1.916387959866221e-06, "loss": 2.5869, "step": 40900 }, { "epoch": 2327.1692745376954, "eval_loss": 2.6071267127990723, "eval_runtime": 23.9719, "eval_samples_per_second": 58.152, "eval_steps_per_second": 7.3, "step": 40900 }, { "epoch": 2332.859174964438, "grad_norm": 0.2258542776107788, "learning_rate": 1.9063545150501674e-06, "loss": 2.587, "step": 41000 }, { "epoch": 2332.859174964438, "eval_loss": 2.608017683029175, "eval_runtime": 23.9627, "eval_samples_per_second": 58.174, "eval_steps_per_second": 7.303, "step": 41000 }, { "epoch": 2338.549075391181, "grad_norm": 0.2063855081796646, "learning_rate": 1.896321070234114e-06, "loss": 2.5865, "step": 41100 }, { "epoch": 2338.549075391181, "eval_loss": 2.6086549758911133, "eval_runtime": 23.9754, "eval_samples_per_second": 58.143, "eval_steps_per_second": 7.299, "step": 41100 }, { "epoch": 2344.2389758179233, "grad_norm": 0.23593726754188538, "learning_rate": 1.8862876254180603e-06, "loss": 2.5868, "step": 41200 }, { "epoch": 2344.2389758179233, "eval_loss": 2.6107077598571777, "eval_runtime": 23.9719, "eval_samples_per_second": 58.151, "eval_steps_per_second": 7.3, "step": 41200 }, { "epoch": 2349.928876244666, "grad_norm": 0.21962851285934448, "learning_rate": 1.8762541806020068e-06, "loss": 2.5868, "step": 41300 }, { "epoch": 2349.928876244666, "eval_loss": 2.6100499629974365, "eval_runtime": 23.9651, "eval_samples_per_second": 58.168, "eval_steps_per_second": 7.302, "step": 41300 }, { "epoch": 2355.6187766714083, "grad_norm": 0.21013060212135315, "learning_rate": 1.8662207357859534e-06, "loss": 2.5868, "step": 41400 }, { "epoch": 2355.6187766714083, "eval_loss": 2.611445426940918, "eval_runtime": 23.9689, "eval_samples_per_second": 58.159, "eval_steps_per_second": 7.301, "step": 41400 }, { "epoch": 2361.308677098151, "grad_norm": 0.21894079446792603, "learning_rate": 1.8561872909699e-06, "loss": 2.5867, "step": 41500 }, { "epoch": 2361.308677098151, "eval_loss": 2.607694149017334, "eval_runtime": 23.9659, "eval_samples_per_second": 58.166, "eval_steps_per_second": 7.302, "step": 41500 }, { "epoch": 2366.9985775248933, "grad_norm": 0.23075686395168304, "learning_rate": 1.8461538461538462e-06, "loss": 2.5868, "step": 41600 }, { "epoch": 2366.9985775248933, "eval_loss": 2.6095731258392334, "eval_runtime": 23.955, "eval_samples_per_second": 58.192, "eval_steps_per_second": 7.305, "step": 41600 }, { "epoch": 2372.688477951636, "grad_norm": 0.20675238966941833, "learning_rate": 1.8361204013377928e-06, "loss": 2.5861, "step": 41700 }, { "epoch": 2372.688477951636, "eval_loss": 2.609567880630493, "eval_runtime": 23.6274, "eval_samples_per_second": 58.999, "eval_steps_per_second": 7.407, "step": 41700 }, { "epoch": 2378.3783783783783, "grad_norm": 0.22343020141124725, "learning_rate": 1.8260869565217394e-06, "loss": 2.5866, "step": 41800 }, { "epoch": 2378.3783783783783, "eval_loss": 2.6095831394195557, "eval_runtime": 23.9726, "eval_samples_per_second": 58.15, "eval_steps_per_second": 7.3, "step": 41800 }, { "epoch": 2384.068278805121, "grad_norm": 0.21456314623355865, "learning_rate": 1.8160535117056857e-06, "loss": 2.5865, "step": 41900 }, { "epoch": 2384.068278805121, "eval_loss": 2.608867883682251, "eval_runtime": 23.9641, "eval_samples_per_second": 58.17, "eval_steps_per_second": 7.303, "step": 41900 }, { "epoch": 2389.7581792318633, "grad_norm": 0.21167542040348053, "learning_rate": 1.8060200668896322e-06, "loss": 2.5861, "step": 42000 }, { "epoch": 2389.7581792318633, "eval_loss": 2.6074702739715576, "eval_runtime": 23.9714, "eval_samples_per_second": 58.153, "eval_steps_per_second": 7.3, "step": 42000 }, { "epoch": 2395.448079658606, "grad_norm": 0.2520188093185425, "learning_rate": 1.7959866220735788e-06, "loss": 2.5857, "step": 42100 }, { "epoch": 2395.448079658606, "eval_loss": 2.606618881225586, "eval_runtime": 23.9735, "eval_samples_per_second": 58.148, "eval_steps_per_second": 7.3, "step": 42100 }, { "epoch": 2401.1379800853483, "grad_norm": 0.2366916537284851, "learning_rate": 1.7859531772575253e-06, "loss": 2.5864, "step": 42200 }, { "epoch": 2401.1379800853483, "eval_loss": 2.6078577041625977, "eval_runtime": 23.9739, "eval_samples_per_second": 58.147, "eval_steps_per_second": 7.3, "step": 42200 }, { "epoch": 2406.827880512091, "grad_norm": 0.22600212693214417, "learning_rate": 1.7759197324414717e-06, "loss": 2.5863, "step": 42300 }, { "epoch": 2406.827880512091, "eval_loss": 2.6072797775268555, "eval_runtime": 23.9618, "eval_samples_per_second": 58.176, "eval_steps_per_second": 7.303, "step": 42300 }, { "epoch": 2412.5177809388338, "grad_norm": 0.2162897288799286, "learning_rate": 1.7658862876254182e-06, "loss": 2.5856, "step": 42400 }, { "epoch": 2412.5177809388338, "eval_loss": 2.6077022552490234, "eval_runtime": 23.6377, "eval_samples_per_second": 58.973, "eval_steps_per_second": 7.403, "step": 42400 }, { "epoch": 2418.2076813655763, "grad_norm": 0.22395272552967072, "learning_rate": 1.7558528428093648e-06, "loss": 2.5855, "step": 42500 }, { "epoch": 2418.2076813655763, "eval_loss": 2.6070289611816406, "eval_runtime": 23.9804, "eval_samples_per_second": 58.131, "eval_steps_per_second": 7.298, "step": 42500 }, { "epoch": 2423.8975817923188, "grad_norm": 0.21771951019763947, "learning_rate": 1.745819397993311e-06, "loss": 2.5856, "step": 42600 }, { "epoch": 2423.8975817923188, "eval_loss": 2.6128346920013428, "eval_runtime": 23.6285, "eval_samples_per_second": 58.996, "eval_steps_per_second": 7.406, "step": 42600 }, { "epoch": 2429.5874822190613, "grad_norm": 0.23614706099033356, "learning_rate": 1.7357859531772575e-06, "loss": 2.5856, "step": 42700 }, { "epoch": 2429.5874822190613, "eval_loss": 2.611271858215332, "eval_runtime": 23.9785, "eval_samples_per_second": 58.135, "eval_steps_per_second": 7.298, "step": 42700 }, { "epoch": 2435.2773826458038, "grad_norm": 0.22254329919815063, "learning_rate": 1.7257525083612038e-06, "loss": 2.5861, "step": 42800 }, { "epoch": 2435.2773826458038, "eval_loss": 2.609328031539917, "eval_runtime": 23.9834, "eval_samples_per_second": 58.124, "eval_steps_per_second": 7.297, "step": 42800 }, { "epoch": 2440.9672830725463, "grad_norm": 0.24362125992774963, "learning_rate": 1.7157190635451504e-06, "loss": 2.585, "step": 42900 }, { "epoch": 2440.9672830725463, "eval_loss": 2.610111713409424, "eval_runtime": 23.9838, "eval_samples_per_second": 58.123, "eval_steps_per_second": 7.297, "step": 42900 }, { "epoch": 2446.6571834992887, "grad_norm": 0.20748943090438843, "learning_rate": 1.705685618729097e-06, "loss": 2.5854, "step": 43000 }, { "epoch": 2446.6571834992887, "eval_loss": 2.6088545322418213, "eval_runtime": 23.9817, "eval_samples_per_second": 58.128, "eval_steps_per_second": 7.297, "step": 43000 }, { "epoch": 2452.3470839260312, "grad_norm": 0.231778085231781, "learning_rate": 1.6956521739130435e-06, "loss": 2.5848, "step": 43100 }, { "epoch": 2452.3470839260312, "eval_loss": 2.609499454498291, "eval_runtime": 23.9762, "eval_samples_per_second": 58.141, "eval_steps_per_second": 7.299, "step": 43100 }, { "epoch": 2458.0369843527737, "grad_norm": 0.2319919317960739, "learning_rate": 1.6856187290969898e-06, "loss": 2.5854, "step": 43200 }, { "epoch": 2458.0369843527737, "eval_loss": 2.6097819805145264, "eval_runtime": 23.9664, "eval_samples_per_second": 58.165, "eval_steps_per_second": 7.302, "step": 43200 }, { "epoch": 2463.7268847795162, "grad_norm": 0.212693452835083, "learning_rate": 1.6755852842809363e-06, "loss": 2.5846, "step": 43300 }, { "epoch": 2463.7268847795162, "eval_loss": 2.6089398860931396, "eval_runtime": 23.9713, "eval_samples_per_second": 58.153, "eval_steps_per_second": 7.3, "step": 43300 }, { "epoch": 2469.4167852062587, "grad_norm": 0.2074272632598877, "learning_rate": 1.665551839464883e-06, "loss": 2.5852, "step": 43400 }, { "epoch": 2469.4167852062587, "eval_loss": 2.6096198558807373, "eval_runtime": 23.9727, "eval_samples_per_second": 58.15, "eval_steps_per_second": 7.3, "step": 43400 }, { "epoch": 2475.1066856330012, "grad_norm": 0.23323705792427063, "learning_rate": 1.6555183946488294e-06, "loss": 2.5847, "step": 43500 }, { "epoch": 2475.1066856330012, "eval_loss": 2.607158899307251, "eval_runtime": 23.8136, "eval_samples_per_second": 58.538, "eval_steps_per_second": 7.349, "step": 43500 }, { "epoch": 2480.7965860597437, "grad_norm": 0.24258296191692352, "learning_rate": 1.6454849498327758e-06, "loss": 2.585, "step": 43600 }, { "epoch": 2480.7965860597437, "eval_loss": 2.6117851734161377, "eval_runtime": 23.8061, "eval_samples_per_second": 58.556, "eval_steps_per_second": 7.351, "step": 43600 }, { "epoch": 2486.4864864864867, "grad_norm": 0.23719272017478943, "learning_rate": 1.6354515050167223e-06, "loss": 2.585, "step": 43700 }, { "epoch": 2486.4864864864867, "eval_loss": 2.610424518585205, "eval_runtime": 23.9999, "eval_samples_per_second": 58.083, "eval_steps_per_second": 7.292, "step": 43700 }, { "epoch": 2492.176386913229, "grad_norm": 0.2383095622062683, "learning_rate": 1.6254180602006689e-06, "loss": 2.5847, "step": 43800 }, { "epoch": 2492.176386913229, "eval_loss": 2.6086204051971436, "eval_runtime": 23.9767, "eval_samples_per_second": 58.14, "eval_steps_per_second": 7.299, "step": 43800 }, { "epoch": 2497.8662873399717, "grad_norm": 0.23796670138835907, "learning_rate": 1.6153846153846154e-06, "loss": 2.5843, "step": 43900 }, { "epoch": 2497.8662873399717, "eval_loss": 2.607516050338745, "eval_runtime": 23.9736, "eval_samples_per_second": 58.147, "eval_steps_per_second": 7.3, "step": 43900 }, { "epoch": 2503.556187766714, "grad_norm": 0.2234671711921692, "learning_rate": 1.6053511705685618e-06, "loss": 2.5846, "step": 44000 }, { "epoch": 2503.556187766714, "eval_loss": 2.6071882247924805, "eval_runtime": 23.9746, "eval_samples_per_second": 58.145, "eval_steps_per_second": 7.299, "step": 44000 }, { "epoch": 2509.2460881934567, "grad_norm": 0.2136596143245697, "learning_rate": 1.5953177257525083e-06, "loss": 2.5847, "step": 44100 }, { "epoch": 2509.2460881934567, "eval_loss": 2.6061131954193115, "eval_runtime": 23.6146, "eval_samples_per_second": 59.031, "eval_steps_per_second": 7.411, "step": 44100 }, { "epoch": 2514.935988620199, "grad_norm": 0.21045100688934326, "learning_rate": 1.5852842809364549e-06, "loss": 2.5841, "step": 44200 }, { "epoch": 2514.935988620199, "eval_loss": 2.6131772994995117, "eval_runtime": 23.9763, "eval_samples_per_second": 58.141, "eval_steps_per_second": 7.299, "step": 44200 }, { "epoch": 2520.6258890469417, "grad_norm": 0.2306642085313797, "learning_rate": 1.5752508361204012e-06, "loss": 2.5843, "step": 44300 }, { "epoch": 2520.6258890469417, "eval_loss": 2.613348960876465, "eval_runtime": 23.6254, "eval_samples_per_second": 59.004, "eval_steps_per_second": 7.407, "step": 44300 }, { "epoch": 2526.315789473684, "grad_norm": 0.22865846753120422, "learning_rate": 1.5652173913043478e-06, "loss": 2.5839, "step": 44400 }, { "epoch": 2526.315789473684, "eval_loss": 2.608555793762207, "eval_runtime": 23.9869, "eval_samples_per_second": 58.115, "eval_steps_per_second": 7.296, "step": 44400 }, { "epoch": 2532.0056899004267, "grad_norm": 0.23196746408939362, "learning_rate": 1.5551839464882943e-06, "loss": 2.5837, "step": 44500 }, { "epoch": 2532.0056899004267, "eval_loss": 2.6074063777923584, "eval_runtime": 23.9961, "eval_samples_per_second": 58.093, "eval_steps_per_second": 7.293, "step": 44500 }, { "epoch": 2537.695590327169, "grad_norm": 0.22961702942848206, "learning_rate": 1.5451505016722409e-06, "loss": 2.5843, "step": 44600 }, { "epoch": 2537.695590327169, "eval_loss": 2.6085050106048584, "eval_runtime": 23.6299, "eval_samples_per_second": 58.993, "eval_steps_per_second": 7.406, "step": 44600 }, { "epoch": 2543.3854907539117, "grad_norm": 0.23799484968185425, "learning_rate": 1.5351170568561872e-06, "loss": 2.584, "step": 44700 }, { "epoch": 2543.3854907539117, "eval_loss": 2.610715866088867, "eval_runtime": 23.9802, "eval_samples_per_second": 58.131, "eval_steps_per_second": 7.298, "step": 44700 }, { "epoch": 2549.075391180654, "grad_norm": 0.21905191242694855, "learning_rate": 1.5250836120401338e-06, "loss": 2.5835, "step": 44800 }, { "epoch": 2549.075391180654, "eval_loss": 2.6101200580596924, "eval_runtime": 23.9821, "eval_samples_per_second": 58.127, "eval_steps_per_second": 7.297, "step": 44800 }, { "epoch": 2554.7652916073966, "grad_norm": 0.21092022955417633, "learning_rate": 1.5150501672240803e-06, "loss": 2.5836, "step": 44900 }, { "epoch": 2554.7652916073966, "eval_loss": 2.6060028076171875, "eval_runtime": 23.9759, "eval_samples_per_second": 58.142, "eval_steps_per_second": 7.299, "step": 44900 }, { "epoch": 2560.4551920341396, "grad_norm": 0.25125664472579956, "learning_rate": 1.5050167224080269e-06, "loss": 2.5832, "step": 45000 }, { "epoch": 2560.4551920341396, "eval_loss": 2.612673759460449, "eval_runtime": 23.9727, "eval_samples_per_second": 58.149, "eval_steps_per_second": 7.3, "step": 45000 }, { "epoch": 2566.145092460882, "grad_norm": 0.23938561975955963, "learning_rate": 1.4949832775919732e-06, "loss": 2.5833, "step": 45100 }, { "epoch": 2566.145092460882, "eval_loss": 2.61169171333313, "eval_runtime": 23.9657, "eval_samples_per_second": 58.167, "eval_steps_per_second": 7.302, "step": 45100 }, { "epoch": 2571.8349928876246, "grad_norm": 0.23332080245018005, "learning_rate": 1.4849498327759198e-06, "loss": 2.5836, "step": 45200 }, { "epoch": 2571.8349928876246, "eval_loss": 2.6075634956359863, "eval_runtime": 23.9719, "eval_samples_per_second": 58.151, "eval_steps_per_second": 7.3, "step": 45200 }, { "epoch": 2577.524893314367, "grad_norm": 0.2493467926979065, "learning_rate": 1.4749163879598663e-06, "loss": 2.5833, "step": 45300 }, { "epoch": 2577.524893314367, "eval_loss": 2.6123712062835693, "eval_runtime": 23.9809, "eval_samples_per_second": 58.13, "eval_steps_per_second": 7.297, "step": 45300 }, { "epoch": 2583.2147937411096, "grad_norm": 0.23774316906929016, "learning_rate": 1.4648829431438129e-06, "loss": 2.5832, "step": 45400 }, { "epoch": 2583.2147937411096, "eval_loss": 2.6104633808135986, "eval_runtime": 23.6171, "eval_samples_per_second": 59.025, "eval_steps_per_second": 7.41, "step": 45400 }, { "epoch": 2588.904694167852, "grad_norm": 0.2435203492641449, "learning_rate": 1.4548494983277592e-06, "loss": 2.5829, "step": 45500 }, { "epoch": 2588.904694167852, "eval_loss": 2.6109774112701416, "eval_runtime": 23.9777, "eval_samples_per_second": 58.137, "eval_steps_per_second": 7.298, "step": 45500 }, { "epoch": 2594.5945945945946, "grad_norm": 0.22922101616859436, "learning_rate": 1.4448160535117058e-06, "loss": 2.5835, "step": 45600 }, { "epoch": 2594.5945945945946, "eval_loss": 2.6112453937530518, "eval_runtime": 23.972, "eval_samples_per_second": 58.151, "eval_steps_per_second": 7.3, "step": 45600 }, { "epoch": 2600.284495021337, "grad_norm": 0.23543110489845276, "learning_rate": 1.4347826086956523e-06, "loss": 2.5828, "step": 45700 }, { "epoch": 2600.284495021337, "eval_loss": 2.6099853515625, "eval_runtime": 23.9851, "eval_samples_per_second": 58.119, "eval_steps_per_second": 7.296, "step": 45700 }, { "epoch": 2605.9743954480796, "grad_norm": 0.2309664934873581, "learning_rate": 1.4247491638795989e-06, "loss": 2.5826, "step": 45800 }, { "epoch": 2605.9743954480796, "eval_loss": 2.6090052127838135, "eval_runtime": 23.9855, "eval_samples_per_second": 58.119, "eval_steps_per_second": 7.296, "step": 45800 }, { "epoch": 2611.664295874822, "grad_norm": 0.22243796288967133, "learning_rate": 1.4147157190635452e-06, "loss": 2.5829, "step": 45900 }, { "epoch": 2611.664295874822, "eval_loss": 2.610006809234619, "eval_runtime": 23.6294, "eval_samples_per_second": 58.994, "eval_steps_per_second": 7.406, "step": 45900 }, { "epoch": 2617.3541963015646, "grad_norm": 0.23384037613868713, "learning_rate": 1.4046822742474917e-06, "loss": 2.5823, "step": 46000 }, { "epoch": 2617.3541963015646, "eval_loss": 2.6132850646972656, "eval_runtime": 23.9678, "eval_samples_per_second": 58.161, "eval_steps_per_second": 7.301, "step": 46000 }, { "epoch": 2623.044096728307, "grad_norm": 0.23333188891410828, "learning_rate": 1.3946488294314383e-06, "loss": 2.5827, "step": 46100 }, { "epoch": 2623.044096728307, "eval_loss": 2.612513780593872, "eval_runtime": 23.6358, "eval_samples_per_second": 58.978, "eval_steps_per_second": 7.404, "step": 46100 }, { "epoch": 2628.7339971550496, "grad_norm": 0.23102706670761108, "learning_rate": 1.3846153846153846e-06, "loss": 2.5824, "step": 46200 }, { "epoch": 2628.7339971550496, "eval_loss": 2.610734701156616, "eval_runtime": 23.9924, "eval_samples_per_second": 58.102, "eval_steps_per_second": 7.294, "step": 46200 }, { "epoch": 2634.4238975817925, "grad_norm": 0.2388777732849121, "learning_rate": 1.374581939799331e-06, "loss": 2.5828, "step": 46300 }, { "epoch": 2634.4238975817925, "eval_loss": 2.6136250495910645, "eval_runtime": 23.6235, "eval_samples_per_second": 59.009, "eval_steps_per_second": 7.408, "step": 46300 }, { "epoch": 2640.113798008535, "grad_norm": 0.24917742609977722, "learning_rate": 1.3645484949832775e-06, "loss": 2.5827, "step": 46400 }, { "epoch": 2640.113798008535, "eval_loss": 2.6128790378570557, "eval_runtime": 23.9692, "eval_samples_per_second": 58.158, "eval_steps_per_second": 7.301, "step": 46400 }, { "epoch": 2645.8036984352775, "grad_norm": 0.2471284121274948, "learning_rate": 1.354515050167224e-06, "loss": 2.5821, "step": 46500 }, { "epoch": 2645.8036984352775, "eval_loss": 2.612271785736084, "eval_runtime": 23.9748, "eval_samples_per_second": 58.144, "eval_steps_per_second": 7.299, "step": 46500 }, { "epoch": 2651.49359886202, "grad_norm": 0.24588705599308014, "learning_rate": 1.3444816053511706e-06, "loss": 2.5824, "step": 46600 }, { "epoch": 2651.49359886202, "eval_loss": 2.6146724224090576, "eval_runtime": 23.9734, "eval_samples_per_second": 58.148, "eval_steps_per_second": 7.3, "step": 46600 }, { "epoch": 2657.1834992887625, "grad_norm": 0.22074657678604126, "learning_rate": 1.334448160535117e-06, "loss": 2.5821, "step": 46700 }, { "epoch": 2657.1834992887625, "eval_loss": 2.612962484359741, "eval_runtime": 23.6303, "eval_samples_per_second": 58.992, "eval_steps_per_second": 7.406, "step": 46700 }, { "epoch": 2662.873399715505, "grad_norm": 0.23231124877929688, "learning_rate": 1.3244147157190635e-06, "loss": 2.5826, "step": 46800 }, { "epoch": 2662.873399715505, "eval_loss": 2.6139097213745117, "eval_runtime": 23.9752, "eval_samples_per_second": 58.143, "eval_steps_per_second": 7.299, "step": 46800 }, { "epoch": 2668.5633001422475, "grad_norm": 0.24543775618076324, "learning_rate": 1.31438127090301e-06, "loss": 2.5814, "step": 46900 }, { "epoch": 2668.5633001422475, "eval_loss": 2.6093571186065674, "eval_runtime": 23.9769, "eval_samples_per_second": 58.139, "eval_steps_per_second": 7.299, "step": 46900 }, { "epoch": 2674.25320056899, "grad_norm": 0.24010956287384033, "learning_rate": 1.3043478260869566e-06, "loss": 2.5825, "step": 47000 }, { "epoch": 2674.25320056899, "eval_loss": 2.610830307006836, "eval_runtime": 23.973, "eval_samples_per_second": 58.149, "eval_steps_per_second": 7.3, "step": 47000 }, { "epoch": 2679.9431009957325, "grad_norm": 0.25266411900520325, "learning_rate": 1.294314381270903e-06, "loss": 2.5823, "step": 47100 }, { "epoch": 2679.9431009957325, "eval_loss": 2.613356113433838, "eval_runtime": 24.8418, "eval_samples_per_second": 56.115, "eval_steps_per_second": 7.045, "step": 47100 }, { "epoch": 2685.633001422475, "grad_norm": 0.2359105348587036, "learning_rate": 1.2842809364548495e-06, "loss": 2.5815, "step": 47200 }, { "epoch": 2685.633001422475, "eval_loss": 2.6079251766204834, "eval_runtime": 23.9697, "eval_samples_per_second": 58.157, "eval_steps_per_second": 7.301, "step": 47200 }, { "epoch": 2691.3229018492175, "grad_norm": 0.23083586990833282, "learning_rate": 1.274247491638796e-06, "loss": 2.5813, "step": 47300 }, { "epoch": 2691.3229018492175, "eval_loss": 2.6135170459747314, "eval_runtime": 23.6299, "eval_samples_per_second": 58.993, "eval_steps_per_second": 7.406, "step": 47300 }, { "epoch": 2697.01280227596, "grad_norm": 0.2380247265100479, "learning_rate": 1.2642140468227424e-06, "loss": 2.5818, "step": 47400 }, { "epoch": 2697.01280227596, "eval_loss": 2.6122384071350098, "eval_runtime": 23.9739, "eval_samples_per_second": 58.147, "eval_steps_per_second": 7.3, "step": 47400 }, { "epoch": 2702.7027027027025, "grad_norm": 0.24715080857276917, "learning_rate": 1.254180602006689e-06, "loss": 2.5817, "step": 47500 }, { "epoch": 2702.7027027027025, "eval_loss": 2.6100640296936035, "eval_runtime": 23.6355, "eval_samples_per_second": 58.979, "eval_steps_per_second": 7.404, "step": 47500 }, { "epoch": 2708.3926031294454, "grad_norm": 0.24939079582691193, "learning_rate": 1.2441471571906355e-06, "loss": 2.5819, "step": 47600 }, { "epoch": 2708.3926031294454, "eval_loss": 2.611419439315796, "eval_runtime": 23.9748, "eval_samples_per_second": 58.144, "eval_steps_per_second": 7.299, "step": 47600 }, { "epoch": 2714.082503556188, "grad_norm": 0.22941380739212036, "learning_rate": 1.234113712374582e-06, "loss": 2.5809, "step": 47700 }, { "epoch": 2714.082503556188, "eval_loss": 2.612551689147949, "eval_runtime": 23.9661, "eval_samples_per_second": 58.165, "eval_steps_per_second": 7.302, "step": 47700 }, { "epoch": 2719.7724039829304, "grad_norm": 0.25405701994895935, "learning_rate": 1.2240802675585284e-06, "loss": 2.5817, "step": 47800 }, { "epoch": 2719.7724039829304, "eval_loss": 2.6119425296783447, "eval_runtime": 23.9765, "eval_samples_per_second": 58.14, "eval_steps_per_second": 7.299, "step": 47800 }, { "epoch": 2725.462304409673, "grad_norm": 0.22298355400562286, "learning_rate": 1.214046822742475e-06, "loss": 2.581, "step": 47900 }, { "epoch": 2725.462304409673, "eval_loss": 2.6139628887176514, "eval_runtime": 23.9951, "eval_samples_per_second": 58.095, "eval_steps_per_second": 7.293, "step": 47900 }, { "epoch": 2731.1522048364154, "grad_norm": 0.2541176676750183, "learning_rate": 1.2040133779264215e-06, "loss": 2.5816, "step": 48000 }, { "epoch": 2731.1522048364154, "eval_loss": 2.611189365386963, "eval_runtime": 23.9729, "eval_samples_per_second": 58.149, "eval_steps_per_second": 7.3, "step": 48000 }, { "epoch": 2736.842105263158, "grad_norm": 0.2229371964931488, "learning_rate": 1.193979933110368e-06, "loss": 2.5807, "step": 48100 }, { "epoch": 2736.842105263158, "eval_loss": 2.611900568008423, "eval_runtime": 23.9711, "eval_samples_per_second": 58.153, "eval_steps_per_second": 7.3, "step": 48100 }, { "epoch": 2742.5320056899004, "grad_norm": 0.23724424839019775, "learning_rate": 1.1839464882943144e-06, "loss": 2.5813, "step": 48200 }, { "epoch": 2742.5320056899004, "eval_loss": 2.612854242324829, "eval_runtime": 23.9618, "eval_samples_per_second": 58.176, "eval_steps_per_second": 7.303, "step": 48200 }, { "epoch": 2748.221906116643, "grad_norm": 0.2304125279188156, "learning_rate": 1.173913043478261e-06, "loss": 2.5809, "step": 48300 }, { "epoch": 2748.221906116643, "eval_loss": 2.6128289699554443, "eval_runtime": 23.6362, "eval_samples_per_second": 58.977, "eval_steps_per_second": 7.404, "step": 48300 }, { "epoch": 2753.9118065433854, "grad_norm": 0.23649877309799194, "learning_rate": 1.1638795986622075e-06, "loss": 2.5809, "step": 48400 }, { "epoch": 2753.9118065433854, "eval_loss": 2.610621929168701, "eval_runtime": 23.9852, "eval_samples_per_second": 58.119, "eval_steps_per_second": 7.296, "step": 48400 }, { "epoch": 2759.601706970128, "grad_norm": 0.22622932493686676, "learning_rate": 1.153846153846154e-06, "loss": 2.5812, "step": 48500 }, { "epoch": 2759.601706970128, "eval_loss": 2.6125988960266113, "eval_runtime": 23.9907, "eval_samples_per_second": 58.106, "eval_steps_per_second": 7.294, "step": 48500 }, { "epoch": 2765.2916073968704, "grad_norm": 0.22394676506519318, "learning_rate": 1.1438127090301004e-06, "loss": 2.5811, "step": 48600 }, { "epoch": 2765.2916073968704, "eval_loss": 2.6074447631835938, "eval_runtime": 23.9898, "eval_samples_per_second": 58.108, "eval_steps_per_second": 7.295, "step": 48600 }, { "epoch": 2770.981507823613, "grad_norm": 0.22825615108013153, "learning_rate": 1.133779264214047e-06, "loss": 2.5807, "step": 48700 }, { "epoch": 2770.981507823613, "eval_loss": 2.611135721206665, "eval_runtime": 23.9665, "eval_samples_per_second": 58.164, "eval_steps_per_second": 7.302, "step": 48700 }, { "epoch": 2776.6714082503554, "grad_norm": 0.21429681777954102, "learning_rate": 1.1237458193979933e-06, "loss": 2.5806, "step": 48800 }, { "epoch": 2776.6714082503554, "eval_loss": 2.6128909587860107, "eval_runtime": 23.9654, "eval_samples_per_second": 58.167, "eval_steps_per_second": 7.302, "step": 48800 }, { "epoch": 2782.3613086770984, "grad_norm": 0.24192963540554047, "learning_rate": 1.1137123745819398e-06, "loss": 2.5808, "step": 48900 }, { "epoch": 2782.3613086770984, "eval_loss": 2.6103410720825195, "eval_runtime": 23.9656, "eval_samples_per_second": 58.167, "eval_steps_per_second": 7.302, "step": 48900 }, { "epoch": 2788.051209103841, "grad_norm": 0.23475612699985504, "learning_rate": 1.1036789297658862e-06, "loss": 2.5808, "step": 49000 }, { "epoch": 2788.051209103841, "eval_loss": 2.6121463775634766, "eval_runtime": 23.9729, "eval_samples_per_second": 58.149, "eval_steps_per_second": 7.3, "step": 49000 }, { "epoch": 2793.7411095305833, "grad_norm": 0.2512466609477997, "learning_rate": 1.0936454849498327e-06, "loss": 2.5807, "step": 49100 }, { "epoch": 2793.7411095305833, "eval_loss": 2.6137309074401855, "eval_runtime": 23.6272, "eval_samples_per_second": 59.0, "eval_steps_per_second": 7.407, "step": 49100 }, { "epoch": 2799.431009957326, "grad_norm": 0.23058511316776276, "learning_rate": 1.0836120401337793e-06, "loss": 2.5804, "step": 49200 }, { "epoch": 2799.431009957326, "eval_loss": 2.6124165058135986, "eval_runtime": 23.9709, "eval_samples_per_second": 58.154, "eval_steps_per_second": 7.301, "step": 49200 }, { "epoch": 2805.1209103840683, "grad_norm": 0.22364000976085663, "learning_rate": 1.0735785953177258e-06, "loss": 2.5806, "step": 49300 }, { "epoch": 2805.1209103840683, "eval_loss": 2.6106984615325928, "eval_runtime": 23.9772, "eval_samples_per_second": 58.139, "eval_steps_per_second": 7.299, "step": 49300 }, { "epoch": 2810.810810810811, "grad_norm": 0.2526606321334839, "learning_rate": 1.0635451505016722e-06, "loss": 2.5804, "step": 49400 }, { "epoch": 2810.810810810811, "eval_loss": 2.610975742340088, "eval_runtime": 23.9837, "eval_samples_per_second": 58.123, "eval_steps_per_second": 7.297, "step": 49400 }, { "epoch": 2816.5007112375533, "grad_norm": 0.2499561458826065, "learning_rate": 1.0535117056856187e-06, "loss": 2.58, "step": 49500 }, { "epoch": 2816.5007112375533, "eval_loss": 2.6134748458862305, "eval_runtime": 23.9785, "eval_samples_per_second": 58.135, "eval_steps_per_second": 7.298, "step": 49500 }, { "epoch": 2822.190611664296, "grad_norm": 0.24504046142101288, "learning_rate": 1.0434782608695653e-06, "loss": 2.5807, "step": 49600 }, { "epoch": 2822.190611664296, "eval_loss": 2.612489700317383, "eval_runtime": 23.9682, "eval_samples_per_second": 58.16, "eval_steps_per_second": 7.301, "step": 49600 }, { "epoch": 2827.8805120910383, "grad_norm": 0.271673321723938, "learning_rate": 1.0334448160535118e-06, "loss": 2.5806, "step": 49700 }, { "epoch": 2827.8805120910383, "eval_loss": 2.609774112701416, "eval_runtime": 23.9659, "eval_samples_per_second": 58.166, "eval_steps_per_second": 7.302, "step": 49700 }, { "epoch": 2833.570412517781, "grad_norm": 0.2435864955186844, "learning_rate": 1.0234113712374581e-06, "loss": 2.58, "step": 49800 }, { "epoch": 2833.570412517781, "eval_loss": 2.6116602420806885, "eval_runtime": 23.9611, "eval_samples_per_second": 58.178, "eval_steps_per_second": 7.304, "step": 49800 }, { "epoch": 2839.2603129445233, "grad_norm": 0.24582748115062714, "learning_rate": 1.0133779264214047e-06, "loss": 2.5801, "step": 49900 }, { "epoch": 2839.2603129445233, "eval_loss": 2.605586528778076, "eval_runtime": 23.6223, "eval_samples_per_second": 59.012, "eval_steps_per_second": 7.408, "step": 49900 }, { "epoch": 2844.950213371266, "grad_norm": 0.24231892824172974, "learning_rate": 1.0033444816053512e-06, "loss": 2.5797, "step": 50000 }, { "epoch": 2844.950213371266, "eval_loss": 2.6107475757598877, "eval_runtime": 23.9738, "eval_samples_per_second": 58.147, "eval_steps_per_second": 7.3, "step": 50000 }, { "epoch": 2850.6401137980083, "grad_norm": 0.23913002014160156, "learning_rate": 9.933110367892976e-07, "loss": 2.5803, "step": 50100 }, { "epoch": 2850.6401137980083, "eval_loss": 2.611083745956421, "eval_runtime": 23.9703, "eval_samples_per_second": 58.155, "eval_steps_per_second": 7.301, "step": 50100 }, { "epoch": 2856.3300142247513, "grad_norm": 0.23877963423728943, "learning_rate": 9.832775919732441e-07, "loss": 2.5797, "step": 50200 }, { "epoch": 2856.3300142247513, "eval_loss": 2.6153957843780518, "eval_runtime": 23.9825, "eval_samples_per_second": 58.126, "eval_steps_per_second": 7.297, "step": 50200 }, { "epoch": 2862.0199146514938, "grad_norm": 0.23219284415245056, "learning_rate": 9.732441471571907e-07, "loss": 2.5798, "step": 50300 }, { "epoch": 2862.0199146514938, "eval_loss": 2.6109278202056885, "eval_runtime": 23.9766, "eval_samples_per_second": 58.14, "eval_steps_per_second": 7.299, "step": 50300 }, { "epoch": 2867.7098150782363, "grad_norm": 0.23990660905838013, "learning_rate": 9.632107023411372e-07, "loss": 2.5797, "step": 50400 }, { "epoch": 2867.7098150782363, "eval_loss": 2.6120529174804688, "eval_runtime": 23.9635, "eval_samples_per_second": 58.172, "eval_steps_per_second": 7.303, "step": 50400 }, { "epoch": 2873.3997155049788, "grad_norm": 0.22840148210525513, "learning_rate": 9.531772575250837e-07, "loss": 2.5799, "step": 50500 }, { "epoch": 2873.3997155049788, "eval_loss": 2.6159815788269043, "eval_runtime": 23.9613, "eval_samples_per_second": 58.177, "eval_steps_per_second": 7.303, "step": 50500 }, { "epoch": 2879.0896159317213, "grad_norm": 0.24212676286697388, "learning_rate": 9.431438127090301e-07, "loss": 2.5798, "step": 50600 }, { "epoch": 2879.0896159317213, "eval_loss": 2.609698534011841, "eval_runtime": 23.966, "eval_samples_per_second": 58.166, "eval_steps_per_second": 7.302, "step": 50600 }, { "epoch": 2884.7795163584638, "grad_norm": 0.2604888081550598, "learning_rate": 9.331103678929767e-07, "loss": 2.5795, "step": 50700 }, { "epoch": 2884.7795163584638, "eval_loss": 2.6143059730529785, "eval_runtime": 23.616, "eval_samples_per_second": 59.028, "eval_steps_per_second": 7.41, "step": 50700 }, { "epoch": 2890.4694167852062, "grad_norm": 0.2513519823551178, "learning_rate": 9.230769230769231e-07, "loss": 2.5798, "step": 50800 }, { "epoch": 2890.4694167852062, "eval_loss": 2.6114535331726074, "eval_runtime": 23.9617, "eval_samples_per_second": 58.176, "eval_steps_per_second": 7.303, "step": 50800 }, { "epoch": 2896.1593172119487, "grad_norm": 0.262923926115036, "learning_rate": 9.130434782608697e-07, "loss": 2.5795, "step": 50900 }, { "epoch": 2896.1593172119487, "eval_loss": 2.613948106765747, "eval_runtime": 23.9726, "eval_samples_per_second": 58.15, "eval_steps_per_second": 7.3, "step": 50900 }, { "epoch": 2901.8492176386912, "grad_norm": 0.2399134784936905, "learning_rate": 9.030100334448161e-07, "loss": 2.5796, "step": 51000 }, { "epoch": 2901.8492176386912, "eval_loss": 2.612149477005005, "eval_runtime": 23.9766, "eval_samples_per_second": 58.14, "eval_steps_per_second": 7.299, "step": 51000 }, { "epoch": 2907.5391180654337, "grad_norm": 0.23589570820331573, "learning_rate": 8.929765886287627e-07, "loss": 2.5803, "step": 51100 }, { "epoch": 2907.5391180654337, "eval_loss": 2.612802743911743, "eval_runtime": 23.9879, "eval_samples_per_second": 58.113, "eval_steps_per_second": 7.295, "step": 51100 }, { "epoch": 2913.2290184921762, "grad_norm": 0.24552448093891144, "learning_rate": 8.829431438127091e-07, "loss": 2.5795, "step": 51200 }, { "epoch": 2913.2290184921762, "eval_loss": 2.61102294921875, "eval_runtime": 23.9819, "eval_samples_per_second": 58.127, "eval_steps_per_second": 7.297, "step": 51200 }, { "epoch": 2918.9189189189187, "grad_norm": 0.23780512809753418, "learning_rate": 8.729096989966555e-07, "loss": 2.5793, "step": 51300 }, { "epoch": 2918.9189189189187, "eval_loss": 2.6139285564422607, "eval_runtime": 23.9687, "eval_samples_per_second": 58.159, "eval_steps_per_second": 7.301, "step": 51300 }, { "epoch": 2924.6088193456612, "grad_norm": 0.25004133582115173, "learning_rate": 8.628762541806019e-07, "loss": 2.5791, "step": 51400 }, { "epoch": 2924.6088193456612, "eval_loss": 2.6126387119293213, "eval_runtime": 24.012, "eval_samples_per_second": 58.054, "eval_steps_per_second": 7.288, "step": 51400 }, { "epoch": 2930.298719772404, "grad_norm": 0.24633045494556427, "learning_rate": 8.528428093645485e-07, "loss": 2.5793, "step": 51500 }, { "epoch": 2930.298719772404, "eval_loss": 2.6124863624572754, "eval_runtime": 23.9703, "eval_samples_per_second": 58.155, "eval_steps_per_second": 7.301, "step": 51500 }, { "epoch": 2935.9886201991467, "grad_norm": 0.22113491594791412, "learning_rate": 8.428093645484949e-07, "loss": 2.5787, "step": 51600 }, { "epoch": 2935.9886201991467, "eval_loss": 2.611276626586914, "eval_runtime": 23.6181, "eval_samples_per_second": 59.023, "eval_steps_per_second": 7.41, "step": 51600 }, { "epoch": 2941.678520625889, "grad_norm": 0.23446623980998993, "learning_rate": 8.327759197324414e-07, "loss": 2.5789, "step": 51700 }, { "epoch": 2941.678520625889, "eval_loss": 2.6144800186157227, "eval_runtime": 23.9766, "eval_samples_per_second": 58.14, "eval_steps_per_second": 7.299, "step": 51700 }, { "epoch": 2947.3684210526317, "grad_norm": 0.24706819653511047, "learning_rate": 8.227424749163879e-07, "loss": 2.5792, "step": 51800 }, { "epoch": 2947.3684210526317, "eval_loss": 2.613771915435791, "eval_runtime": 23.9663, "eval_samples_per_second": 58.165, "eval_steps_per_second": 7.302, "step": 51800 }, { "epoch": 2953.058321479374, "grad_norm": 0.2382979393005371, "learning_rate": 8.127090301003344e-07, "loss": 2.5792, "step": 51900 }, { "epoch": 2953.058321479374, "eval_loss": 2.61126446723938, "eval_runtime": 23.9672, "eval_samples_per_second": 58.163, "eval_steps_per_second": 7.302, "step": 51900 }, { "epoch": 2958.7482219061167, "grad_norm": 0.22167593240737915, "learning_rate": 8.026755852842809e-07, "loss": 2.5789, "step": 52000 }, { "epoch": 2958.7482219061167, "eval_loss": 2.612398147583008, "eval_runtime": 23.9669, "eval_samples_per_second": 58.164, "eval_steps_per_second": 7.302, "step": 52000 }, { "epoch": 2964.438122332859, "grad_norm": 0.24954278767108917, "learning_rate": 7.926421404682274e-07, "loss": 2.5789, "step": 52100 }, { "epoch": 2964.438122332859, "eval_loss": 2.6127331256866455, "eval_runtime": 23.9665, "eval_samples_per_second": 58.164, "eval_steps_per_second": 7.302, "step": 52100 }, { "epoch": 2970.1280227596017, "grad_norm": 0.25174450874328613, "learning_rate": 7.826086956521739e-07, "loss": 2.5787, "step": 52200 }, { "epoch": 2970.1280227596017, "eval_loss": 2.612732410430908, "eval_runtime": 23.6271, "eval_samples_per_second": 59.0, "eval_steps_per_second": 7.407, "step": 52200 }, { "epoch": 2975.817923186344, "grad_norm": 0.2445630580186844, "learning_rate": 7.725752508361204e-07, "loss": 2.579, "step": 52300 }, { "epoch": 2975.817923186344, "eval_loss": 2.613999366760254, "eval_runtime": 23.9666, "eval_samples_per_second": 58.164, "eval_steps_per_second": 7.302, "step": 52300 }, { "epoch": 2981.5078236130867, "grad_norm": 0.23932389914989471, "learning_rate": 7.625418060200669e-07, "loss": 2.5786, "step": 52400 }, { "epoch": 2981.5078236130867, "eval_loss": 2.6142683029174805, "eval_runtime": 23.9605, "eval_samples_per_second": 58.179, "eval_steps_per_second": 7.304, "step": 52400 }, { "epoch": 2987.197724039829, "grad_norm": 0.23177169263362885, "learning_rate": 7.525083612040134e-07, "loss": 2.5787, "step": 52500 }, { "epoch": 2987.197724039829, "eval_loss": 2.6153082847595215, "eval_runtime": 23.9736, "eval_samples_per_second": 58.147, "eval_steps_per_second": 7.3, "step": 52500 }, { "epoch": 2992.8876244665717, "grad_norm": 0.23721282184123993, "learning_rate": 7.424749163879599e-07, "loss": 2.5785, "step": 52600 }, { "epoch": 2992.8876244665717, "eval_loss": 2.6143527030944824, "eval_runtime": 23.9808, "eval_samples_per_second": 58.13, "eval_steps_per_second": 7.297, "step": 52600 }, { "epoch": 2998.577524893314, "grad_norm": 0.2384282797574997, "learning_rate": 7.324414715719064e-07, "loss": 2.5785, "step": 52700 }, { "epoch": 2998.577524893314, "eval_loss": 2.611684560775757, "eval_runtime": 23.9684, "eval_samples_per_second": 58.16, "eval_steps_per_second": 7.301, "step": 52700 }, { "epoch": 3004.267425320057, "grad_norm": 0.21971900761127472, "learning_rate": 7.224080267558529e-07, "loss": 2.5786, "step": 52800 }, { "epoch": 3004.267425320057, "eval_loss": 2.6113009452819824, "eval_runtime": 23.9803, "eval_samples_per_second": 58.131, "eval_steps_per_second": 7.298, "step": 52800 }, { "epoch": 3009.9573257467996, "grad_norm": 0.2397443652153015, "learning_rate": 7.123745819397994e-07, "loss": 2.5787, "step": 52900 }, { "epoch": 3009.9573257467996, "eval_loss": 2.6127891540527344, "eval_runtime": 23.9984, "eval_samples_per_second": 58.087, "eval_steps_per_second": 7.292, "step": 52900 }, { "epoch": 3015.647226173542, "grad_norm": 0.2291790097951889, "learning_rate": 7.023411371237459e-07, "loss": 2.5782, "step": 53000 }, { "epoch": 3015.647226173542, "eval_loss": 2.615396499633789, "eval_runtime": 23.9773, "eval_samples_per_second": 58.138, "eval_steps_per_second": 7.299, "step": 53000 }, { "epoch": 3021.3371266002846, "grad_norm": 0.22614707052707672, "learning_rate": 6.923076923076923e-07, "loss": 2.5786, "step": 53100 }, { "epoch": 3021.3371266002846, "eval_loss": 2.6115903854370117, "eval_runtime": 23.6261, "eval_samples_per_second": 59.003, "eval_steps_per_second": 7.407, "step": 53100 }, { "epoch": 3027.027027027027, "grad_norm": 0.25179383158683777, "learning_rate": 6.822742474916388e-07, "loss": 2.5789, "step": 53200 }, { "epoch": 3027.027027027027, "eval_loss": 2.6113967895507812, "eval_runtime": 23.9771, "eval_samples_per_second": 58.139, "eval_steps_per_second": 7.299, "step": 53200 }, { "epoch": 3032.7169274537696, "grad_norm": 0.24571330845355988, "learning_rate": 6.722408026755853e-07, "loss": 2.579, "step": 53300 }, { "epoch": 3032.7169274537696, "eval_loss": 2.6136531829833984, "eval_runtime": 23.6309, "eval_samples_per_second": 58.991, "eval_steps_per_second": 7.406, "step": 53300 }, { "epoch": 3038.406827880512, "grad_norm": 0.22767402231693268, "learning_rate": 6.622073578595318e-07, "loss": 2.5783, "step": 53400 }, { "epoch": 3038.406827880512, "eval_loss": 2.6177024841308594, "eval_runtime": 23.9884, "eval_samples_per_second": 58.111, "eval_steps_per_second": 7.295, "step": 53400 }, { "epoch": 3044.0967283072546, "grad_norm": 0.23231668770313263, "learning_rate": 6.521739130434783e-07, "loss": 2.5787, "step": 53500 }, { "epoch": 3044.0967283072546, "eval_loss": 2.611980438232422, "eval_runtime": 23.6137, "eval_samples_per_second": 59.033, "eval_steps_per_second": 7.411, "step": 53500 }, { "epoch": 3049.786628733997, "grad_norm": 0.23049120604991913, "learning_rate": 6.421404682274248e-07, "loss": 2.5779, "step": 53600 }, { "epoch": 3049.786628733997, "eval_loss": 2.61087965965271, "eval_runtime": 23.9786, "eval_samples_per_second": 58.135, "eval_steps_per_second": 7.298, "step": 53600 }, { "epoch": 3055.4765291607396, "grad_norm": 0.23373426496982574, "learning_rate": 6.321070234113712e-07, "loss": 2.5782, "step": 53700 }, { "epoch": 3055.4765291607396, "eval_loss": 2.612997531890869, "eval_runtime": 23.6252, "eval_samples_per_second": 59.005, "eval_steps_per_second": 7.407, "step": 53700 }, { "epoch": 3061.166429587482, "grad_norm": 0.25036904215812683, "learning_rate": 6.220735785953178e-07, "loss": 2.5782, "step": 53800 }, { "epoch": 3061.166429587482, "eval_loss": 2.614743709564209, "eval_runtime": 23.9821, "eval_samples_per_second": 58.127, "eval_steps_per_second": 7.297, "step": 53800 }, { "epoch": 3066.8563300142246, "grad_norm": 0.2329493761062622, "learning_rate": 6.120401337792642e-07, "loss": 2.5777, "step": 53900 }, { "epoch": 3066.8563300142246, "eval_loss": 2.60845947265625, "eval_runtime": 23.6181, "eval_samples_per_second": 59.023, "eval_steps_per_second": 7.41, "step": 53900 }, { "epoch": 3072.546230440967, "grad_norm": 0.24762089550495148, "learning_rate": 6.020066889632107e-07, "loss": 2.5781, "step": 54000 }, { "epoch": 3072.546230440967, "eval_loss": 2.6142795085906982, "eval_runtime": 23.9734, "eval_samples_per_second": 58.148, "eval_steps_per_second": 7.3, "step": 54000 }, { "epoch": 3078.23613086771, "grad_norm": 0.26611801981925964, "learning_rate": 5.919732441471572e-07, "loss": 2.5779, "step": 54100 }, { "epoch": 3078.23613086771, "eval_loss": 2.6147961616516113, "eval_runtime": 23.9768, "eval_samples_per_second": 58.139, "eval_steps_per_second": 7.299, "step": 54100 }, { "epoch": 3083.9260312944525, "grad_norm": 0.23262718319892883, "learning_rate": 5.819397993311037e-07, "loss": 2.5779, "step": 54200 }, { "epoch": 3083.9260312944525, "eval_loss": 2.6128289699554443, "eval_runtime": 23.9893, "eval_samples_per_second": 58.109, "eval_steps_per_second": 7.295, "step": 54200 }, { "epoch": 3089.615931721195, "grad_norm": 0.23977543413639069, "learning_rate": 5.719063545150502e-07, "loss": 2.5781, "step": 54300 }, { "epoch": 3089.615931721195, "eval_loss": 2.6133010387420654, "eval_runtime": 23.9739, "eval_samples_per_second": 58.147, "eval_steps_per_second": 7.3, "step": 54300 }, { "epoch": 3095.3058321479375, "grad_norm": 0.24166710674762726, "learning_rate": 5.618729096989966e-07, "loss": 2.5779, "step": 54400 }, { "epoch": 3095.3058321479375, "eval_loss": 2.6149790287017822, "eval_runtime": 23.96, "eval_samples_per_second": 58.18, "eval_steps_per_second": 7.304, "step": 54400 }, { "epoch": 3100.99573257468, "grad_norm": 0.23214000463485718, "learning_rate": 5.518394648829431e-07, "loss": 2.5779, "step": 54500 }, { "epoch": 3100.99573257468, "eval_loss": 2.612640142440796, "eval_runtime": 23.9716, "eval_samples_per_second": 58.152, "eval_steps_per_second": 7.3, "step": 54500 }, { "epoch": 3106.6856330014225, "grad_norm": 0.23323415219783783, "learning_rate": 5.418060200668896e-07, "loss": 2.5775, "step": 54600 }, { "epoch": 3106.6856330014225, "eval_loss": 2.61309552192688, "eval_runtime": 23.9769, "eval_samples_per_second": 58.139, "eval_steps_per_second": 7.299, "step": 54600 }, { "epoch": 3112.375533428165, "grad_norm": 0.255034863948822, "learning_rate": 5.317725752508361e-07, "loss": 2.5782, "step": 54700 }, { "epoch": 3112.375533428165, "eval_loss": 2.6171648502349854, "eval_runtime": 23.9725, "eval_samples_per_second": 58.15, "eval_steps_per_second": 7.3, "step": 54700 }, { "epoch": 3118.0654338549075, "grad_norm": 0.239244744181633, "learning_rate": 5.217391304347826e-07, "loss": 2.5779, "step": 54800 }, { "epoch": 3118.0654338549075, "eval_loss": 2.612152576446533, "eval_runtime": 23.6301, "eval_samples_per_second": 58.993, "eval_steps_per_second": 7.406, "step": 54800 }, { "epoch": 3123.75533428165, "grad_norm": 0.22807510197162628, "learning_rate": 5.117056856187291e-07, "loss": 2.5778, "step": 54900 }, { "epoch": 3123.75533428165, "eval_loss": 2.615468978881836, "eval_runtime": 23.9816, "eval_samples_per_second": 58.128, "eval_steps_per_second": 7.297, "step": 54900 }, { "epoch": 3129.4452347083925, "grad_norm": 0.23386307060718536, "learning_rate": 5.016722408026756e-07, "loss": 2.5774, "step": 55000 }, { "epoch": 3129.4452347083925, "eval_loss": 2.6134111881256104, "eval_runtime": 23.9585, "eval_samples_per_second": 58.184, "eval_steps_per_second": 7.304, "step": 55000 }, { "epoch": 3135.135135135135, "grad_norm": 0.23969079554080963, "learning_rate": 4.916387959866221e-07, "loss": 2.5776, "step": 55100 }, { "epoch": 3135.135135135135, "eval_loss": 2.6106112003326416, "eval_runtime": 23.9709, "eval_samples_per_second": 58.154, "eval_steps_per_second": 7.301, "step": 55100 }, { "epoch": 3140.8250355618775, "grad_norm": 0.24286451935768127, "learning_rate": 4.816053511705686e-07, "loss": 2.5779, "step": 55200 }, { "epoch": 3140.8250355618775, "eval_loss": 2.6160521507263184, "eval_runtime": 23.9565, "eval_samples_per_second": 58.189, "eval_steps_per_second": 7.305, "step": 55200 }, { "epoch": 3146.51493598862, "grad_norm": 0.24260209500789642, "learning_rate": 4.7157190635451506e-07, "loss": 2.5778, "step": 55300 }, { "epoch": 3146.51493598862, "eval_loss": 2.615705728530884, "eval_runtime": 23.9728, "eval_samples_per_second": 58.149, "eval_steps_per_second": 7.3, "step": 55300 }, { "epoch": 3152.204836415363, "grad_norm": 0.2503577768802643, "learning_rate": 4.6153846153846156e-07, "loss": 2.5775, "step": 55400 }, { "epoch": 3152.204836415363, "eval_loss": 2.6147685050964355, "eval_runtime": 23.9704, "eval_samples_per_second": 58.155, "eval_steps_per_second": 7.301, "step": 55400 }, { "epoch": 3157.8947368421054, "grad_norm": 0.23583941161632538, "learning_rate": 4.5150501672240806e-07, "loss": 2.5775, "step": 55500 }, { "epoch": 3157.8947368421054, "eval_loss": 2.6131534576416016, "eval_runtime": 23.9699, "eval_samples_per_second": 58.156, "eval_steps_per_second": 7.301, "step": 55500 }, { "epoch": 3163.584637268848, "grad_norm": 0.23154014348983765, "learning_rate": 4.4147157190635456e-07, "loss": 2.5769, "step": 55600 }, { "epoch": 3163.584637268848, "eval_loss": 2.612272024154663, "eval_runtime": 23.9775, "eval_samples_per_second": 58.138, "eval_steps_per_second": 7.299, "step": 55600 }, { "epoch": 3169.2745376955904, "grad_norm": 0.2477724254131317, "learning_rate": 4.3143812709030095e-07, "loss": 2.5778, "step": 55700 }, { "epoch": 3169.2745376955904, "eval_loss": 2.613452196121216, "eval_runtime": 23.6176, "eval_samples_per_second": 59.024, "eval_steps_per_second": 7.41, "step": 55700 }, { "epoch": 3174.964438122333, "grad_norm": 0.23132053017616272, "learning_rate": 4.2140468227424745e-07, "loss": 2.5776, "step": 55800 }, { "epoch": 3174.964438122333, "eval_loss": 2.6152663230895996, "eval_runtime": 23.978, "eval_samples_per_second": 58.137, "eval_steps_per_second": 7.298, "step": 55800 }, { "epoch": 3180.6543385490754, "grad_norm": 0.2514456510543823, "learning_rate": 4.1137123745819395e-07, "loss": 2.5773, "step": 55900 }, { "epoch": 3180.6543385490754, "eval_loss": 2.6148664951324463, "eval_runtime": 23.969, "eval_samples_per_second": 58.158, "eval_steps_per_second": 7.301, "step": 55900 }, { "epoch": 3186.344238975818, "grad_norm": 0.2294733226299286, "learning_rate": 4.0133779264214045e-07, "loss": 2.5773, "step": 56000 }, { "epoch": 3186.344238975818, "eval_loss": 2.6135528087615967, "eval_runtime": 23.9804, "eval_samples_per_second": 58.131, "eval_steps_per_second": 7.298, "step": 56000 }, { "epoch": 3192.0341394025604, "grad_norm": 0.23970623314380646, "learning_rate": 3.9130434782608694e-07, "loss": 2.5773, "step": 56100 }, { "epoch": 3192.0341394025604, "eval_loss": 2.61356258392334, "eval_runtime": 23.9871, "eval_samples_per_second": 58.115, "eval_steps_per_second": 7.296, "step": 56100 }, { "epoch": 3197.724039829303, "grad_norm": 0.23367737233638763, "learning_rate": 3.8127090301003344e-07, "loss": 2.5772, "step": 56200 }, { "epoch": 3197.724039829303, "eval_loss": 2.614643096923828, "eval_runtime": 23.9787, "eval_samples_per_second": 58.135, "eval_steps_per_second": 7.298, "step": 56200 }, { "epoch": 3203.4139402560454, "grad_norm": 0.23269738256931305, "learning_rate": 3.7123745819397994e-07, "loss": 2.577, "step": 56300 }, { "epoch": 3203.4139402560454, "eval_loss": 2.6141436100006104, "eval_runtime": 23.9773, "eval_samples_per_second": 58.138, "eval_steps_per_second": 7.299, "step": 56300 }, { "epoch": 3209.103840682788, "grad_norm": 0.2536345422267914, "learning_rate": 3.6120401337792644e-07, "loss": 2.5768, "step": 56400 }, { "epoch": 3209.103840682788, "eval_loss": 2.612579345703125, "eval_runtime": 24.012, "eval_samples_per_second": 58.054, "eval_steps_per_second": 7.288, "step": 56400 }, { "epoch": 3214.7937411095304, "grad_norm": 0.23528337478637695, "learning_rate": 3.5117056856187294e-07, "loss": 2.5769, "step": 56500 }, { "epoch": 3214.7937411095304, "eval_loss": 2.6140823364257812, "eval_runtime": 23.6308, "eval_samples_per_second": 58.991, "eval_steps_per_second": 7.406, "step": 56500 }, { "epoch": 3220.483641536273, "grad_norm": 0.22182530164718628, "learning_rate": 3.411371237458194e-07, "loss": 2.5777, "step": 56600 }, { "epoch": 3220.483641536273, "eval_loss": 2.60969614982605, "eval_runtime": 23.9743, "eval_samples_per_second": 58.146, "eval_steps_per_second": 7.299, "step": 56600 }, { "epoch": 3226.173541963016, "grad_norm": 0.23210017383098602, "learning_rate": 3.311036789297659e-07, "loss": 2.5778, "step": 56700 }, { "epoch": 3226.173541963016, "eval_loss": 2.614919900894165, "eval_runtime": 23.9777, "eval_samples_per_second": 58.137, "eval_steps_per_second": 7.298, "step": 56700 }, { "epoch": 3231.8634423897583, "grad_norm": 0.2339479774236679, "learning_rate": 3.210702341137124e-07, "loss": 2.5773, "step": 56800 }, { "epoch": 3231.8634423897583, "eval_loss": 2.615126848220825, "eval_runtime": 23.9735, "eval_samples_per_second": 58.148, "eval_steps_per_second": 7.3, "step": 56800 }, { "epoch": 3237.553342816501, "grad_norm": 0.2549735903739929, "learning_rate": 3.110367892976589e-07, "loss": 2.5771, "step": 56900 }, { "epoch": 3237.553342816501, "eval_loss": 2.6175296306610107, "eval_runtime": 23.9758, "eval_samples_per_second": 58.142, "eval_steps_per_second": 7.299, "step": 56900 }, { "epoch": 3243.2432432432433, "grad_norm": 0.2367645800113678, "learning_rate": 3.010033444816054e-07, "loss": 2.5775, "step": 57000 }, { "epoch": 3243.2432432432433, "eval_loss": 2.6152868270874023, "eval_runtime": 23.6141, "eval_samples_per_second": 59.033, "eval_steps_per_second": 7.411, "step": 57000 }, { "epoch": 3248.933143669986, "grad_norm": 0.24510601162910461, "learning_rate": 2.9096989966555187e-07, "loss": 2.5769, "step": 57100 }, { "epoch": 3248.933143669986, "eval_loss": 2.6133997440338135, "eval_runtime": 23.9936, "eval_samples_per_second": 58.099, "eval_steps_per_second": 7.294, "step": 57100 }, { "epoch": 3254.6230440967283, "grad_norm": 0.2624337673187256, "learning_rate": 2.809364548494983e-07, "loss": 2.5772, "step": 57200 }, { "epoch": 3254.6230440967283, "eval_loss": 2.611377716064453, "eval_runtime": 23.9805, "eval_samples_per_second": 58.131, "eval_steps_per_second": 7.298, "step": 57200 }, { "epoch": 3260.312944523471, "grad_norm": 0.23755531013011932, "learning_rate": 2.709030100334448e-07, "loss": 2.5768, "step": 57300 }, { "epoch": 3260.312944523471, "eval_loss": 2.6142404079437256, "eval_runtime": 23.989, "eval_samples_per_second": 58.11, "eval_steps_per_second": 7.295, "step": 57300 }, { "epoch": 3266.0028449502133, "grad_norm": 0.22622860968112946, "learning_rate": 2.608695652173913e-07, "loss": 2.5774, "step": 57400 }, { "epoch": 3266.0028449502133, "eval_loss": 2.6112027168273926, "eval_runtime": 23.9649, "eval_samples_per_second": 58.168, "eval_steps_per_second": 7.302, "step": 57400 }, { "epoch": 3271.692745376956, "grad_norm": 0.26149579882621765, "learning_rate": 2.508361204013378e-07, "loss": 2.5769, "step": 57500 }, { "epoch": 3271.692745376956, "eval_loss": 2.6112406253814697, "eval_runtime": 23.975, "eval_samples_per_second": 58.144, "eval_steps_per_second": 7.299, "step": 57500 }, { "epoch": 3277.3826458036983, "grad_norm": 0.22187606990337372, "learning_rate": 2.408026755852843e-07, "loss": 2.5772, "step": 57600 }, { "epoch": 3277.3826458036983, "eval_loss": 2.6137592792510986, "eval_runtime": 23.6148, "eval_samples_per_second": 59.031, "eval_steps_per_second": 7.411, "step": 57600 }, { "epoch": 3283.072546230441, "grad_norm": 0.22784946858882904, "learning_rate": 2.3076923076923078e-07, "loss": 2.5763, "step": 57700 }, { "epoch": 3283.072546230441, "eval_loss": 2.6143863201141357, "eval_runtime": 23.975, "eval_samples_per_second": 58.144, "eval_steps_per_second": 7.299, "step": 57700 }, { "epoch": 3288.7624466571833, "grad_norm": 0.23659248650074005, "learning_rate": 2.2073578595317728e-07, "loss": 2.5768, "step": 57800 }, { "epoch": 3288.7624466571833, "eval_loss": 2.6137397289276123, "eval_runtime": 23.9772, "eval_samples_per_second": 58.139, "eval_steps_per_second": 7.299, "step": 57800 }, { "epoch": 3294.452347083926, "grad_norm": 0.2437194287776947, "learning_rate": 2.1070234113712372e-07, "loss": 2.5764, "step": 57900 }, { "epoch": 3294.452347083926, "eval_loss": 2.6149351596832275, "eval_runtime": 23.9751, "eval_samples_per_second": 58.144, "eval_steps_per_second": 7.299, "step": 57900 }, { "epoch": 3300.1422475106688, "grad_norm": 0.22918538749217987, "learning_rate": 2.0066889632107022e-07, "loss": 2.5768, "step": 58000 }, { "epoch": 3300.1422475106688, "eval_loss": 2.6140780448913574, "eval_runtime": 23.9791, "eval_samples_per_second": 58.134, "eval_steps_per_second": 7.298, "step": 58000 }, { "epoch": 3305.8321479374113, "grad_norm": 0.2346070110797882, "learning_rate": 1.9063545150501672e-07, "loss": 2.5773, "step": 58100 }, { "epoch": 3305.8321479374113, "eval_loss": 2.616719961166382, "eval_runtime": 23.9911, "eval_samples_per_second": 58.105, "eval_steps_per_second": 7.294, "step": 58100 }, { "epoch": 3311.5220483641538, "grad_norm": 0.23859845101833344, "learning_rate": 1.8060200668896322e-07, "loss": 2.5774, "step": 58200 }, { "epoch": 3311.5220483641538, "eval_loss": 2.618227481842041, "eval_runtime": 23.6148, "eval_samples_per_second": 59.031, "eval_steps_per_second": 7.411, "step": 58200 }, { "epoch": 3317.2119487908963, "grad_norm": 0.2350337654352188, "learning_rate": 1.705685618729097e-07, "loss": 2.5776, "step": 58300 }, { "epoch": 3317.2119487908963, "eval_loss": 2.611265182495117, "eval_runtime": 23.9884, "eval_samples_per_second": 58.111, "eval_steps_per_second": 7.295, "step": 58300 }, { "epoch": 3322.9018492176388, "grad_norm": 0.24340157210826874, "learning_rate": 1.605351170568562e-07, "loss": 2.5765, "step": 58400 }, { "epoch": 3322.9018492176388, "eval_loss": 2.6160683631896973, "eval_runtime": 23.9833, "eval_samples_per_second": 58.124, "eval_steps_per_second": 7.297, "step": 58400 }, { "epoch": 3328.5917496443813, "grad_norm": 0.23944565653800964, "learning_rate": 1.505016722408027e-07, "loss": 2.577, "step": 58500 }, { "epoch": 3328.5917496443813, "eval_loss": 2.6151676177978516, "eval_runtime": 23.9765, "eval_samples_per_second": 58.14, "eval_steps_per_second": 7.299, "step": 58500 }, { "epoch": 3334.2816500711237, "grad_norm": 0.2317434698343277, "learning_rate": 1.4046822742474916e-07, "loss": 2.5769, "step": 58600 }, { "epoch": 3334.2816500711237, "eval_loss": 2.612539768218994, "eval_runtime": 23.9857, "eval_samples_per_second": 58.118, "eval_steps_per_second": 7.296, "step": 58600 }, { "epoch": 3339.9715504978662, "grad_norm": 0.2317090928554535, "learning_rate": 1.3043478260869566e-07, "loss": 2.5766, "step": 58700 }, { "epoch": 3339.9715504978662, "eval_loss": 2.614988088607788, "eval_runtime": 23.9839, "eval_samples_per_second": 58.122, "eval_steps_per_second": 7.297, "step": 58700 }, { "epoch": 3345.6614509246087, "grad_norm": 0.2286725789308548, "learning_rate": 1.2040133779264215e-07, "loss": 2.5771, "step": 58800 }, { "epoch": 3345.6614509246087, "eval_loss": 2.610363245010376, "eval_runtime": 23.9703, "eval_samples_per_second": 58.155, "eval_steps_per_second": 7.301, "step": 58800 }, { "epoch": 3351.3513513513512, "grad_norm": 0.23261764645576477, "learning_rate": 1.1036789297658864e-07, "loss": 2.5773, "step": 58900 }, { "epoch": 3351.3513513513512, "eval_loss": 2.612881660461426, "eval_runtime": 23.9746, "eval_samples_per_second": 58.145, "eval_steps_per_second": 7.299, "step": 58900 }, { "epoch": 3357.0412517780937, "grad_norm": 0.24025121331214905, "learning_rate": 1.0033444816053511e-07, "loss": 2.5771, "step": 59000 }, { "epoch": 3357.0412517780937, "eval_loss": 2.616802453994751, "eval_runtime": 23.9791, "eval_samples_per_second": 58.134, "eval_steps_per_second": 7.298, "step": 59000 }, { "epoch": 3362.7311522048362, "grad_norm": 0.22746782004833221, "learning_rate": 9.030100334448161e-08, "loss": 2.5771, "step": 59100 }, { "epoch": 3362.7311522048362, "eval_loss": 2.6187052726745605, "eval_runtime": 23.6285, "eval_samples_per_second": 58.996, "eval_steps_per_second": 7.406, "step": 59100 }, { "epoch": 3368.4210526315787, "grad_norm": 0.22770369052886963, "learning_rate": 8.02675585284281e-08, "loss": 2.5765, "step": 59200 }, { "epoch": 3368.4210526315787, "eval_loss": 2.6143691539764404, "eval_runtime": 23.9914, "eval_samples_per_second": 58.104, "eval_steps_per_second": 7.294, "step": 59200 }, { "epoch": 3374.1109530583217, "grad_norm": 0.2320239543914795, "learning_rate": 7.023411371237458e-08, "loss": 2.5766, "step": 59300 }, { "epoch": 3374.1109530583217, "eval_loss": 2.610212802886963, "eval_runtime": 23.971, "eval_samples_per_second": 58.153, "eval_steps_per_second": 7.3, "step": 59300 }, { "epoch": 3379.800853485064, "grad_norm": 0.24047939479351044, "learning_rate": 6.020066889632108e-08, "loss": 2.5766, "step": 59400 }, { "epoch": 3379.800853485064, "eval_loss": 2.6162428855895996, "eval_runtime": 23.9874, "eval_samples_per_second": 58.114, "eval_steps_per_second": 7.296, "step": 59400 }, { "epoch": 3385.4907539118067, "grad_norm": 0.2457292228937149, "learning_rate": 5.0167224080267556e-08, "loss": 2.5767, "step": 59500 }, { "epoch": 3385.4907539118067, "eval_loss": 2.612614870071411, "eval_runtime": 23.9722, "eval_samples_per_second": 58.151, "eval_steps_per_second": 7.3, "step": 59500 }, { "epoch": 3391.180654338549, "grad_norm": 0.24467065930366516, "learning_rate": 4.013377926421405e-08, "loss": 2.5765, "step": 59600 }, { "epoch": 3391.180654338549, "eval_loss": 2.616481065750122, "eval_runtime": 23.9732, "eval_samples_per_second": 58.148, "eval_steps_per_second": 7.3, "step": 59600 }, { "epoch": 3396.8705547652917, "grad_norm": 0.2387106865644455, "learning_rate": 3.010033444816054e-08, "loss": 2.5763, "step": 59700 }, { "epoch": 3396.8705547652917, "eval_loss": 2.6150505542755127, "eval_runtime": 23.97, "eval_samples_per_second": 58.156, "eval_steps_per_second": 7.301, "step": 59700 }, { "epoch": 3402.560455192034, "grad_norm": 0.24101048707962036, "learning_rate": 2.0066889632107024e-08, "loss": 2.5769, "step": 59800 }, { "epoch": 3402.560455192034, "eval_loss": 2.6154425144195557, "eval_runtime": 23.9679, "eval_samples_per_second": 58.161, "eval_steps_per_second": 7.301, "step": 59800 }, { "epoch": 3408.2503556187767, "grad_norm": 0.25155118107795715, "learning_rate": 1.0033444816053512e-08, "loss": 2.5763, "step": 59900 }, { "epoch": 3408.2503556187767, "eval_loss": 2.6153502464294434, "eval_runtime": 23.9778, "eval_samples_per_second": 58.137, "eval_steps_per_second": 7.298, "step": 59900 }, { "epoch": 3413.940256045519, "grad_norm": 0.23192672431468964, "learning_rate": 0.0, "loss": 2.5765, "step": 60000 }, { "epoch": 3413.940256045519, "eval_loss": 2.6164731979370117, "eval_runtime": 23.6169, "eval_samples_per_second": 59.026, "eval_steps_per_second": 7.41, "step": 60000 } ], "logging_steps": 100, "max_steps": 60000, "num_input_tokens_seen": 0, "num_train_epochs": 3530, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.1572336017408e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }