{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.991040318566451, "eval_steps": 10000, "global_step": 500000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "learning_rate": 9.83937751004016e-06, "loss": 1.3677, "step": 10000 }, { "epoch": 0.04, "eval_loss": 0.8635309934616089, "eval_runtime": 127.2968, "eval_samples_per_second": 1932.492, "eval_steps_per_second": 20.134, "step": 10000 }, { "epoch": 0.08, "learning_rate": 9.638674698795182e-06, "loss": 0.7723, "step": 20000 }, { "epoch": 0.08, "eval_loss": 0.6709557175636292, "eval_runtime": 127.3309, "eval_samples_per_second": 1931.974, "eval_steps_per_second": 20.129, "step": 20000 }, { "epoch": 0.12, "learning_rate": 9.437951807228917e-06, "loss": 0.6635, "step": 30000 }, { "epoch": 0.12, "eval_loss": 0.6188660264015198, "eval_runtime": 127.3716, "eval_samples_per_second": 1931.357, "eval_steps_per_second": 20.122, "step": 30000 }, { "epoch": 0.16, "learning_rate": 9.237248995983937e-06, "loss": 0.62, "step": 40000 }, { "epoch": 0.16, "eval_loss": 0.5857706069946289, "eval_runtime": 128.3689, "eval_samples_per_second": 1916.353, "eval_steps_per_second": 19.966, "step": 40000 }, { "epoch": 0.2, "learning_rate": 9.036465863453816e-06, "loss": 0.5933, "step": 50000 }, { "epoch": 0.2, "eval_loss": 0.5672558546066284, "eval_runtime": 128.5129, "eval_samples_per_second": 1914.205, "eval_steps_per_second": 19.944, "step": 50000 }, { "epoch": 0.24, "learning_rate": 8.835763052208836e-06, "loss": 0.5755, "step": 60000 }, { "epoch": 0.24, "eval_loss": 0.5512102842330933, "eval_runtime": 127.3167, "eval_samples_per_second": 1932.19, "eval_steps_per_second": 20.131, "step": 60000 }, { "epoch": 0.28, "learning_rate": 8.63504016064257e-06, "loss": 0.5622, "step": 70000 }, { "epoch": 0.28, "eval_loss": 0.5416693687438965, "eval_runtime": 127.5393, "eval_samples_per_second": 1928.817, "eval_steps_per_second": 20.096, "step": 70000 }, { "epoch": 0.32, "learning_rate": 8.434337349397592e-06, "loss": 0.5522, "step": 80000 }, { "epoch": 0.32, "eval_loss": 0.5347551107406616, "eval_runtime": 127.3132, "eval_samples_per_second": 1932.243, "eval_steps_per_second": 20.131, "step": 80000 }, { "epoch": 0.36, "learning_rate": 8.23363453815261e-06, "loss": 0.5437, "step": 90000 }, { "epoch": 0.36, "eval_loss": 0.5291008353233337, "eval_runtime": 127.3464, "eval_samples_per_second": 1931.739, "eval_steps_per_second": 20.126, "step": 90000 }, { "epoch": 0.4, "learning_rate": 8.032931726907631e-06, "loss": 0.5365, "step": 100000 }, { "epoch": 0.4, "eval_loss": 0.5225369334220886, "eval_runtime": 127.5532, "eval_samples_per_second": 1928.608, "eval_steps_per_second": 20.094, "step": 100000 }, { "epoch": 0.44, "learning_rate": 7.832228915662651e-06, "loss": 0.5309, "step": 110000 }, { "epoch": 0.44, "eval_loss": 0.5173077583312988, "eval_runtime": 127.5475, "eval_samples_per_second": 1928.693, "eval_steps_per_second": 20.094, "step": 110000 }, { "epoch": 0.48, "learning_rate": 7.63152610441767e-06, "loss": 0.5252, "step": 120000 }, { "epoch": 0.48, "eval_loss": 0.513536274433136, "eval_runtime": 130.7886, "eval_samples_per_second": 1880.897, "eval_steps_per_second": 19.597, "step": 120000 }, { "epoch": 0.52, "learning_rate": 7.430823293172691e-06, "loss": 0.5204, "step": 130000 }, { "epoch": 0.52, "eval_loss": 0.5111202001571655, "eval_runtime": 127.6816, "eval_samples_per_second": 1926.668, "eval_steps_per_second": 20.073, "step": 130000 }, { "epoch": 0.56, "learning_rate": 7.230100401606426e-06, "loss": 0.518, "step": 140000 }, { "epoch": 0.56, "eval_loss": 0.5084987282752991, "eval_runtime": 128.2814, "eval_samples_per_second": 1917.659, "eval_steps_per_second": 19.98, "step": 140000 }, { "epoch": 0.6, "learning_rate": 7.029397590361447e-06, "loss": 0.5135, "step": 150000 }, { "epoch": 0.6, "eval_loss": 0.5029130578041077, "eval_runtime": 130.8657, "eval_samples_per_second": 1879.79, "eval_steps_per_second": 19.585, "step": 150000 }, { "epoch": 0.64, "learning_rate": 6.828694779116466e-06, "loss": 0.5101, "step": 160000 }, { "epoch": 0.64, "eval_loss": 0.5005983710289001, "eval_runtime": 128.9427, "eval_samples_per_second": 1907.824, "eval_steps_per_second": 19.877, "step": 160000 }, { "epoch": 0.68, "learning_rate": 6.627991967871487e-06, "loss": 0.5065, "step": 170000 }, { "epoch": 0.68, "eval_loss": 0.4987814128398895, "eval_runtime": 127.8532, "eval_samples_per_second": 1924.081, "eval_steps_per_second": 20.046, "step": 170000 }, { "epoch": 0.72, "learning_rate": 6.427289156626506e-06, "loss": 0.5052, "step": 180000 }, { "epoch": 0.72, "eval_loss": 0.49448052048683167, "eval_runtime": 128.0108, "eval_samples_per_second": 1921.713, "eval_steps_per_second": 20.022, "step": 180000 }, { "epoch": 0.76, "learning_rate": 6.226586345381527e-06, "loss": 0.5025, "step": 190000 }, { "epoch": 0.76, "eval_loss": 0.49206921458244324, "eval_runtime": 127.1399, "eval_samples_per_second": 1934.877, "eval_steps_per_second": 20.159, "step": 190000 }, { "epoch": 0.8, "learning_rate": 6.025883534136546e-06, "loss": 0.4998, "step": 200000 }, { "epoch": 0.8, "eval_loss": 0.4929586946964264, "eval_runtime": 127.6742, "eval_samples_per_second": 1926.779, "eval_steps_per_second": 20.075, "step": 200000 }, { "epoch": 0.84, "learning_rate": 5.825180722891567e-06, "loss": 0.4982, "step": 210000 }, { "epoch": 0.84, "eval_loss": 0.48860839009284973, "eval_runtime": 127.3878, "eval_samples_per_second": 1931.112, "eval_steps_per_second": 20.12, "step": 210000 }, { "epoch": 0.88, "learning_rate": 5.6244578313253014e-06, "loss": 0.4969, "step": 220000 }, { "epoch": 0.88, "eval_loss": 0.4888823628425598, "eval_runtime": 127.2533, "eval_samples_per_second": 1933.152, "eval_steps_per_second": 20.141, "step": 220000 }, { "epoch": 0.92, "learning_rate": 5.423755020080321e-06, "loss": 0.495, "step": 230000 }, { "epoch": 0.92, "eval_loss": 0.4841912090778351, "eval_runtime": 126.968, "eval_samples_per_second": 1937.496, "eval_steps_per_second": 20.186, "step": 230000 }, { "epoch": 0.96, "learning_rate": 5.223052208835342e-06, "loss": 0.4927, "step": 240000 }, { "epoch": 0.96, "eval_loss": 0.4853549897670746, "eval_runtime": 127.4501, "eval_samples_per_second": 1930.167, "eval_steps_per_second": 20.11, "step": 240000 }, { "epoch": 1.0, "learning_rate": 5.022349397590361e-06, "loss": 0.4914, "step": 250000 }, { "epoch": 1.0, "eval_loss": 0.4826248586177826, "eval_runtime": 127.3161, "eval_samples_per_second": 1932.198, "eval_steps_per_second": 20.131, "step": 250000 }, { "epoch": 1.04, "learning_rate": 4.821646586345382e-06, "loss": 0.4902, "step": 260000 }, { "epoch": 1.04, "eval_loss": 0.48145654797554016, "eval_runtime": 127.4143, "eval_samples_per_second": 1930.709, "eval_steps_per_second": 20.115, "step": 260000 }, { "epoch": 1.08, "learning_rate": 4.620943775100402e-06, "loss": 0.4894, "step": 270000 }, { "epoch": 1.08, "eval_loss": 0.47896286845207214, "eval_runtime": 127.419, "eval_samples_per_second": 1930.638, "eval_steps_per_second": 20.115, "step": 270000 }, { "epoch": 1.11, "learning_rate": 4.420240963855422e-06, "loss": 0.4881, "step": 280000 }, { "epoch": 1.11, "eval_loss": 0.48297473788261414, "eval_runtime": 127.4472, "eval_samples_per_second": 1930.211, "eval_steps_per_second": 20.11, "step": 280000 }, { "epoch": 1.15, "learning_rate": 4.219538152610443e-06, "loss": 0.487, "step": 290000 }, { "epoch": 1.15, "eval_loss": 0.47816893458366394, "eval_runtime": 127.1599, "eval_samples_per_second": 1934.573, "eval_steps_per_second": 20.156, "step": 290000 }, { "epoch": 1.19, "learning_rate": 4.018835341365462e-06, "loss": 0.4859, "step": 300000 }, { "epoch": 1.19, "eval_loss": 0.4779074192047119, "eval_runtime": 127.96, "eval_samples_per_second": 1922.476, "eval_steps_per_second": 20.03, "step": 300000 }, { "epoch": 1.23, "learning_rate": 3.818132530120483e-06, "loss": 0.4845, "step": 310000 }, { "epoch": 1.23, "eval_loss": 0.47683581709861755, "eval_runtime": 127.9081, "eval_samples_per_second": 1923.256, "eval_steps_per_second": 20.038, "step": 310000 }, { "epoch": 1.27, "learning_rate": 3.6174297188755025e-06, "loss": 0.4835, "step": 320000 }, { "epoch": 1.27, "eval_loss": 0.4755454957485199, "eval_runtime": 127.7316, "eval_samples_per_second": 1925.914, "eval_steps_per_second": 20.066, "step": 320000 }, { "epoch": 1.31, "learning_rate": 3.4167068273092375e-06, "loss": 0.483, "step": 330000 }, { "epoch": 1.31, "eval_loss": 0.4744352400302887, "eval_runtime": 128.197, "eval_samples_per_second": 1918.921, "eval_steps_per_second": 19.993, "step": 330000 }, { "epoch": 1.35, "learning_rate": 3.2160040160642576e-06, "loss": 0.4819, "step": 340000 }, { "epoch": 1.35, "eval_loss": 0.4745638966560364, "eval_runtime": 128.0584, "eval_samples_per_second": 1920.999, "eval_steps_per_second": 20.014, "step": 340000 }, { "epoch": 1.39, "learning_rate": 3.0153012048192777e-06, "loss": 0.481, "step": 350000 }, { "epoch": 1.39, "eval_loss": 0.4744107723236084, "eval_runtime": 128.1739, "eval_samples_per_second": 1919.268, "eval_steps_per_second": 19.996, "step": 350000 }, { "epoch": 1.43, "learning_rate": 2.8145983935742978e-06, "loss": 0.481, "step": 360000 }, { "epoch": 1.43, "eval_loss": 0.472385436296463, "eval_runtime": 128.2659, "eval_samples_per_second": 1917.89, "eval_steps_per_second": 19.982, "step": 360000 }, { "epoch": 1.47, "learning_rate": 2.613895582329318e-06, "loss": 0.4799, "step": 370000 }, { "epoch": 1.47, "eval_loss": 0.4733026921749115, "eval_runtime": 127.698, "eval_samples_per_second": 1926.419, "eval_steps_per_second": 20.071, "step": 370000 }, { "epoch": 1.51, "learning_rate": 2.4131927710843376e-06, "loss": 0.4795, "step": 380000 }, { "epoch": 1.51, "eval_loss": 0.4719351530075073, "eval_runtime": 128.2337, "eval_samples_per_second": 1918.372, "eval_steps_per_second": 19.987, "step": 380000 }, { "epoch": 1.55, "learning_rate": 2.2124899598393577e-06, "loss": 0.4784, "step": 390000 }, { "epoch": 1.55, "eval_loss": 0.4699419438838959, "eval_runtime": 127.9847, "eval_samples_per_second": 1922.105, "eval_steps_per_second": 20.026, "step": 390000 }, { "epoch": 1.59, "learning_rate": 2.0117871485943778e-06, "loss": 0.4785, "step": 400000 }, { "epoch": 1.59, "eval_loss": 0.4711839556694031, "eval_runtime": 127.89, "eval_samples_per_second": 1923.527, "eval_steps_per_second": 20.041, "step": 400000 }, { "epoch": 1.63, "learning_rate": 1.8110843373493979e-06, "loss": 0.4777, "step": 410000 }, { "epoch": 1.63, "eval_loss": 0.46987083554267883, "eval_runtime": 128.85, "eval_samples_per_second": 1909.197, "eval_steps_per_second": 19.891, "step": 410000 }, { "epoch": 1.67, "learning_rate": 1.6103614457831327e-06, "loss": 0.477, "step": 420000 }, { "epoch": 1.67, "eval_loss": 0.46960577368736267, "eval_runtime": 130.3922, "eval_samples_per_second": 1886.616, "eval_steps_per_second": 19.656, "step": 420000 }, { "epoch": 1.71, "learning_rate": 1.4096586345381528e-06, "loss": 0.4771, "step": 430000 }, { "epoch": 1.71, "eval_loss": 0.47003933787345886, "eval_runtime": 129.4605, "eval_samples_per_second": 1900.193, "eval_steps_per_second": 19.798, "step": 430000 }, { "epoch": 1.75, "learning_rate": 1.2089558232931729e-06, "loss": 0.4766, "step": 440000 }, { "epoch": 1.75, "eval_loss": 0.47017282247543335, "eval_runtime": 129.4902, "eval_samples_per_second": 1899.758, "eval_steps_per_second": 19.793, "step": 440000 }, { "epoch": 1.79, "learning_rate": 1.008253012048193e-06, "loss": 0.476, "step": 450000 }, { "epoch": 1.79, "eval_loss": 0.46954795718193054, "eval_runtime": 129.5407, "eval_samples_per_second": 1899.017, "eval_steps_per_second": 19.785, "step": 450000 }, { "epoch": 1.83, "learning_rate": 8.07550200803213e-07, "loss": 0.4757, "step": 460000 }, { "epoch": 1.83, "eval_loss": 0.4694086015224457, "eval_runtime": 129.2469, "eval_samples_per_second": 1903.334, "eval_steps_per_second": 19.83, "step": 460000 }, { "epoch": 1.87, "learning_rate": 6.068273092369479e-07, "loss": 0.4758, "step": 470000 }, { "epoch": 1.87, "eval_loss": 0.4685874581336975, "eval_runtime": 129.0023, "eval_samples_per_second": 1906.943, "eval_steps_per_second": 19.868, "step": 470000 }, { "epoch": 1.91, "learning_rate": 4.061244979919679e-07, "loss": 0.4754, "step": 480000 }, { "epoch": 1.91, "eval_loss": 0.46817249059677124, "eval_runtime": 130.8406, "eval_samples_per_second": 1880.15, "eval_steps_per_second": 19.589, "step": 480000 }, { "epoch": 1.95, "learning_rate": 2.0542168674698798e-07, "loss": 0.475, "step": 490000 }, { "epoch": 1.95, "eval_loss": 0.4691283404827118, "eval_runtime": 129.0096, "eval_samples_per_second": 1906.836, "eval_steps_per_second": 19.867, "step": 490000 }, { "epoch": 1.99, "learning_rate": 4.718875502008032e-09, "loss": 0.4756, "step": 500000 }, { "epoch": 1.99, "eval_loss": 0.46795061230659485, "eval_runtime": 129.0896, "eval_samples_per_second": 1905.653, "eval_steps_per_second": 19.854, "step": 500000 } ], "logging_steps": 10000, "max_steps": 500000, "num_train_epochs": 2, "save_steps": 10000, "total_flos": 8.290835482935528e+17, "trial_name": null, "trial_params": null }