| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.991040318566451, | |
| "eval_steps": 10000, | |
| "global_step": 500000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.04, | |
| "learning_rate": 9.83937751004016e-06, | |
| "loss": 1.3677, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "eval_loss": 0.8635309934616089, | |
| "eval_runtime": 127.2968, | |
| "eval_samples_per_second": 1932.492, | |
| "eval_steps_per_second": 20.134, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "learning_rate": 9.638674698795182e-06, | |
| "loss": 0.7723, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "eval_loss": 0.6709557175636292, | |
| "eval_runtime": 127.3309, | |
| "eval_samples_per_second": 1931.974, | |
| "eval_steps_per_second": 20.129, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "learning_rate": 9.437951807228917e-06, | |
| "loss": 0.6635, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "eval_loss": 0.6188660264015198, | |
| "eval_runtime": 127.3716, | |
| "eval_samples_per_second": 1931.357, | |
| "eval_steps_per_second": 20.122, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "learning_rate": 9.237248995983937e-06, | |
| "loss": 0.62, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_loss": 0.5857706069946289, | |
| "eval_runtime": 128.3689, | |
| "eval_samples_per_second": 1916.353, | |
| "eval_steps_per_second": 19.966, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "learning_rate": 9.036465863453816e-06, | |
| "loss": 0.5933, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "eval_loss": 0.5672558546066284, | |
| "eval_runtime": 128.5129, | |
| "eval_samples_per_second": 1914.205, | |
| "eval_steps_per_second": 19.944, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "learning_rate": 8.835763052208836e-06, | |
| "loss": 0.5755, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "eval_loss": 0.5512102842330933, | |
| "eval_runtime": 127.3167, | |
| "eval_samples_per_second": 1932.19, | |
| "eval_steps_per_second": 20.131, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "learning_rate": 8.63504016064257e-06, | |
| "loss": 0.5622, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "eval_loss": 0.5416693687438965, | |
| "eval_runtime": 127.5393, | |
| "eval_samples_per_second": 1928.817, | |
| "eval_steps_per_second": 20.096, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "learning_rate": 8.434337349397592e-06, | |
| "loss": 0.5522, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_loss": 0.5347551107406616, | |
| "eval_runtime": 127.3132, | |
| "eval_samples_per_second": 1932.243, | |
| "eval_steps_per_second": 20.131, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "learning_rate": 8.23363453815261e-06, | |
| "loss": 0.5437, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "eval_loss": 0.5291008353233337, | |
| "eval_runtime": 127.3464, | |
| "eval_samples_per_second": 1931.739, | |
| "eval_steps_per_second": 20.126, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "learning_rate": 8.032931726907631e-06, | |
| "loss": 0.5365, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "eval_loss": 0.5225369334220886, | |
| "eval_runtime": 127.5532, | |
| "eval_samples_per_second": 1928.608, | |
| "eval_steps_per_second": 20.094, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "learning_rate": 7.832228915662651e-06, | |
| "loss": 0.5309, | |
| "step": 110000 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "eval_loss": 0.5173077583312988, | |
| "eval_runtime": 127.5475, | |
| "eval_samples_per_second": 1928.693, | |
| "eval_steps_per_second": 20.094, | |
| "step": 110000 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "learning_rate": 7.63152610441767e-06, | |
| "loss": 0.5252, | |
| "step": 120000 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_loss": 0.513536274433136, | |
| "eval_runtime": 130.7886, | |
| "eval_samples_per_second": 1880.897, | |
| "eval_steps_per_second": 19.597, | |
| "step": 120000 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "learning_rate": 7.430823293172691e-06, | |
| "loss": 0.5204, | |
| "step": 130000 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "eval_loss": 0.5111202001571655, | |
| "eval_runtime": 127.6816, | |
| "eval_samples_per_second": 1926.668, | |
| "eval_steps_per_second": 20.073, | |
| "step": 130000 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "learning_rate": 7.230100401606426e-06, | |
| "loss": 0.518, | |
| "step": 140000 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "eval_loss": 0.5084987282752991, | |
| "eval_runtime": 128.2814, | |
| "eval_samples_per_second": 1917.659, | |
| "eval_steps_per_second": 19.98, | |
| "step": 140000 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "learning_rate": 7.029397590361447e-06, | |
| "loss": 0.5135, | |
| "step": 150000 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "eval_loss": 0.5029130578041077, | |
| "eval_runtime": 130.8657, | |
| "eval_samples_per_second": 1879.79, | |
| "eval_steps_per_second": 19.585, | |
| "step": 150000 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "learning_rate": 6.828694779116466e-06, | |
| "loss": 0.5101, | |
| "step": 160000 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_loss": 0.5005983710289001, | |
| "eval_runtime": 128.9427, | |
| "eval_samples_per_second": 1907.824, | |
| "eval_steps_per_second": 19.877, | |
| "step": 160000 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "learning_rate": 6.627991967871487e-06, | |
| "loss": 0.5065, | |
| "step": 170000 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "eval_loss": 0.4987814128398895, | |
| "eval_runtime": 127.8532, | |
| "eval_samples_per_second": 1924.081, | |
| "eval_steps_per_second": 20.046, | |
| "step": 170000 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "learning_rate": 6.427289156626506e-06, | |
| "loss": 0.5052, | |
| "step": 180000 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "eval_loss": 0.49448052048683167, | |
| "eval_runtime": 128.0108, | |
| "eval_samples_per_second": 1921.713, | |
| "eval_steps_per_second": 20.022, | |
| "step": 180000 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "learning_rate": 6.226586345381527e-06, | |
| "loss": 0.5025, | |
| "step": 190000 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "eval_loss": 0.49206921458244324, | |
| "eval_runtime": 127.1399, | |
| "eval_samples_per_second": 1934.877, | |
| "eval_steps_per_second": 20.159, | |
| "step": 190000 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "learning_rate": 6.025883534136546e-06, | |
| "loss": 0.4998, | |
| "step": 200000 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_loss": 0.4929586946964264, | |
| "eval_runtime": 127.6742, | |
| "eval_samples_per_second": 1926.779, | |
| "eval_steps_per_second": 20.075, | |
| "step": 200000 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "learning_rate": 5.825180722891567e-06, | |
| "loss": 0.4982, | |
| "step": 210000 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "eval_loss": 0.48860839009284973, | |
| "eval_runtime": 127.3878, | |
| "eval_samples_per_second": 1931.112, | |
| "eval_steps_per_second": 20.12, | |
| "step": 210000 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "learning_rate": 5.6244578313253014e-06, | |
| "loss": 0.4969, | |
| "step": 220000 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "eval_loss": 0.4888823628425598, | |
| "eval_runtime": 127.2533, | |
| "eval_samples_per_second": 1933.152, | |
| "eval_steps_per_second": 20.141, | |
| "step": 220000 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "learning_rate": 5.423755020080321e-06, | |
| "loss": 0.495, | |
| "step": 230000 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "eval_loss": 0.4841912090778351, | |
| "eval_runtime": 126.968, | |
| "eval_samples_per_second": 1937.496, | |
| "eval_steps_per_second": 20.186, | |
| "step": 230000 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "learning_rate": 5.223052208835342e-06, | |
| "loss": 0.4927, | |
| "step": 240000 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_loss": 0.4853549897670746, | |
| "eval_runtime": 127.4501, | |
| "eval_samples_per_second": 1930.167, | |
| "eval_steps_per_second": 20.11, | |
| "step": 240000 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "learning_rate": 5.022349397590361e-06, | |
| "loss": 0.4914, | |
| "step": 250000 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.4826248586177826, | |
| "eval_runtime": 127.3161, | |
| "eval_samples_per_second": 1932.198, | |
| "eval_steps_per_second": 20.131, | |
| "step": 250000 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "learning_rate": 4.821646586345382e-06, | |
| "loss": 0.4902, | |
| "step": 260000 | |
| }, | |
| { | |
| "epoch": 1.04, | |
| "eval_loss": 0.48145654797554016, | |
| "eval_runtime": 127.4143, | |
| "eval_samples_per_second": 1930.709, | |
| "eval_steps_per_second": 20.115, | |
| "step": 260000 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "learning_rate": 4.620943775100402e-06, | |
| "loss": 0.4894, | |
| "step": 270000 | |
| }, | |
| { | |
| "epoch": 1.08, | |
| "eval_loss": 0.47896286845207214, | |
| "eval_runtime": 127.419, | |
| "eval_samples_per_second": 1930.638, | |
| "eval_steps_per_second": 20.115, | |
| "step": 270000 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "learning_rate": 4.420240963855422e-06, | |
| "loss": 0.4881, | |
| "step": 280000 | |
| }, | |
| { | |
| "epoch": 1.11, | |
| "eval_loss": 0.48297473788261414, | |
| "eval_runtime": 127.4472, | |
| "eval_samples_per_second": 1930.211, | |
| "eval_steps_per_second": 20.11, | |
| "step": 280000 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "learning_rate": 4.219538152610443e-06, | |
| "loss": 0.487, | |
| "step": 290000 | |
| }, | |
| { | |
| "epoch": 1.15, | |
| "eval_loss": 0.47816893458366394, | |
| "eval_runtime": 127.1599, | |
| "eval_samples_per_second": 1934.573, | |
| "eval_steps_per_second": 20.156, | |
| "step": 290000 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "learning_rate": 4.018835341365462e-06, | |
| "loss": 0.4859, | |
| "step": 300000 | |
| }, | |
| { | |
| "epoch": 1.19, | |
| "eval_loss": 0.4779074192047119, | |
| "eval_runtime": 127.96, | |
| "eval_samples_per_second": 1922.476, | |
| "eval_steps_per_second": 20.03, | |
| "step": 300000 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "learning_rate": 3.818132530120483e-06, | |
| "loss": 0.4845, | |
| "step": 310000 | |
| }, | |
| { | |
| "epoch": 1.23, | |
| "eval_loss": 0.47683581709861755, | |
| "eval_runtime": 127.9081, | |
| "eval_samples_per_second": 1923.256, | |
| "eval_steps_per_second": 20.038, | |
| "step": 310000 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "learning_rate": 3.6174297188755025e-06, | |
| "loss": 0.4835, | |
| "step": 320000 | |
| }, | |
| { | |
| "epoch": 1.27, | |
| "eval_loss": 0.4755454957485199, | |
| "eval_runtime": 127.7316, | |
| "eval_samples_per_second": 1925.914, | |
| "eval_steps_per_second": 20.066, | |
| "step": 320000 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "learning_rate": 3.4167068273092375e-06, | |
| "loss": 0.483, | |
| "step": 330000 | |
| }, | |
| { | |
| "epoch": 1.31, | |
| "eval_loss": 0.4744352400302887, | |
| "eval_runtime": 128.197, | |
| "eval_samples_per_second": 1918.921, | |
| "eval_steps_per_second": 19.993, | |
| "step": 330000 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "learning_rate": 3.2160040160642576e-06, | |
| "loss": 0.4819, | |
| "step": 340000 | |
| }, | |
| { | |
| "epoch": 1.35, | |
| "eval_loss": 0.4745638966560364, | |
| "eval_runtime": 128.0584, | |
| "eval_samples_per_second": 1920.999, | |
| "eval_steps_per_second": 20.014, | |
| "step": 340000 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "learning_rate": 3.0153012048192777e-06, | |
| "loss": 0.481, | |
| "step": 350000 | |
| }, | |
| { | |
| "epoch": 1.39, | |
| "eval_loss": 0.4744107723236084, | |
| "eval_runtime": 128.1739, | |
| "eval_samples_per_second": 1919.268, | |
| "eval_steps_per_second": 19.996, | |
| "step": 350000 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "learning_rate": 2.8145983935742978e-06, | |
| "loss": 0.481, | |
| "step": 360000 | |
| }, | |
| { | |
| "epoch": 1.43, | |
| "eval_loss": 0.472385436296463, | |
| "eval_runtime": 128.2659, | |
| "eval_samples_per_second": 1917.89, | |
| "eval_steps_per_second": 19.982, | |
| "step": 360000 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "learning_rate": 2.613895582329318e-06, | |
| "loss": 0.4799, | |
| "step": 370000 | |
| }, | |
| { | |
| "epoch": 1.47, | |
| "eval_loss": 0.4733026921749115, | |
| "eval_runtime": 127.698, | |
| "eval_samples_per_second": 1926.419, | |
| "eval_steps_per_second": 20.071, | |
| "step": 370000 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "learning_rate": 2.4131927710843376e-06, | |
| "loss": 0.4795, | |
| "step": 380000 | |
| }, | |
| { | |
| "epoch": 1.51, | |
| "eval_loss": 0.4719351530075073, | |
| "eval_runtime": 128.2337, | |
| "eval_samples_per_second": 1918.372, | |
| "eval_steps_per_second": 19.987, | |
| "step": 380000 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "learning_rate": 2.2124899598393577e-06, | |
| "loss": 0.4784, | |
| "step": 390000 | |
| }, | |
| { | |
| "epoch": 1.55, | |
| "eval_loss": 0.4699419438838959, | |
| "eval_runtime": 127.9847, | |
| "eval_samples_per_second": 1922.105, | |
| "eval_steps_per_second": 20.026, | |
| "step": 390000 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "learning_rate": 2.0117871485943778e-06, | |
| "loss": 0.4785, | |
| "step": 400000 | |
| }, | |
| { | |
| "epoch": 1.59, | |
| "eval_loss": 0.4711839556694031, | |
| "eval_runtime": 127.89, | |
| "eval_samples_per_second": 1923.527, | |
| "eval_steps_per_second": 20.041, | |
| "step": 400000 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "learning_rate": 1.8110843373493979e-06, | |
| "loss": 0.4777, | |
| "step": 410000 | |
| }, | |
| { | |
| "epoch": 1.63, | |
| "eval_loss": 0.46987083554267883, | |
| "eval_runtime": 128.85, | |
| "eval_samples_per_second": 1909.197, | |
| "eval_steps_per_second": 19.891, | |
| "step": 410000 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "learning_rate": 1.6103614457831327e-06, | |
| "loss": 0.477, | |
| "step": 420000 | |
| }, | |
| { | |
| "epoch": 1.67, | |
| "eval_loss": 0.46960577368736267, | |
| "eval_runtime": 130.3922, | |
| "eval_samples_per_second": 1886.616, | |
| "eval_steps_per_second": 19.656, | |
| "step": 420000 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "learning_rate": 1.4096586345381528e-06, | |
| "loss": 0.4771, | |
| "step": 430000 | |
| }, | |
| { | |
| "epoch": 1.71, | |
| "eval_loss": 0.47003933787345886, | |
| "eval_runtime": 129.4605, | |
| "eval_samples_per_second": 1900.193, | |
| "eval_steps_per_second": 19.798, | |
| "step": 430000 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "learning_rate": 1.2089558232931729e-06, | |
| "loss": 0.4766, | |
| "step": 440000 | |
| }, | |
| { | |
| "epoch": 1.75, | |
| "eval_loss": 0.47017282247543335, | |
| "eval_runtime": 129.4902, | |
| "eval_samples_per_second": 1899.758, | |
| "eval_steps_per_second": 19.793, | |
| "step": 440000 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "learning_rate": 1.008253012048193e-06, | |
| "loss": 0.476, | |
| "step": 450000 | |
| }, | |
| { | |
| "epoch": 1.79, | |
| "eval_loss": 0.46954795718193054, | |
| "eval_runtime": 129.5407, | |
| "eval_samples_per_second": 1899.017, | |
| "eval_steps_per_second": 19.785, | |
| "step": 450000 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "learning_rate": 8.07550200803213e-07, | |
| "loss": 0.4757, | |
| "step": 460000 | |
| }, | |
| { | |
| "epoch": 1.83, | |
| "eval_loss": 0.4694086015224457, | |
| "eval_runtime": 129.2469, | |
| "eval_samples_per_second": 1903.334, | |
| "eval_steps_per_second": 19.83, | |
| "step": 460000 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "learning_rate": 6.068273092369479e-07, | |
| "loss": 0.4758, | |
| "step": 470000 | |
| }, | |
| { | |
| "epoch": 1.87, | |
| "eval_loss": 0.4685874581336975, | |
| "eval_runtime": 129.0023, | |
| "eval_samples_per_second": 1906.943, | |
| "eval_steps_per_second": 19.868, | |
| "step": 470000 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "learning_rate": 4.061244979919679e-07, | |
| "loss": 0.4754, | |
| "step": 480000 | |
| }, | |
| { | |
| "epoch": 1.91, | |
| "eval_loss": 0.46817249059677124, | |
| "eval_runtime": 130.8406, | |
| "eval_samples_per_second": 1880.15, | |
| "eval_steps_per_second": 19.589, | |
| "step": 480000 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "learning_rate": 2.0542168674698798e-07, | |
| "loss": 0.475, | |
| "step": 490000 | |
| }, | |
| { | |
| "epoch": 1.95, | |
| "eval_loss": 0.4691283404827118, | |
| "eval_runtime": 129.0096, | |
| "eval_samples_per_second": 1906.836, | |
| "eval_steps_per_second": 19.867, | |
| "step": 490000 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "learning_rate": 4.718875502008032e-09, | |
| "loss": 0.4756, | |
| "step": 500000 | |
| }, | |
| { | |
| "epoch": 1.99, | |
| "eval_loss": 0.46795061230659485, | |
| "eval_runtime": 129.0896, | |
| "eval_samples_per_second": 1905.653, | |
| "eval_steps_per_second": 19.854, | |
| "step": 500000 | |
| } | |
| ], | |
| "logging_steps": 10000, | |
| "max_steps": 500000, | |
| "num_train_epochs": 2, | |
| "save_steps": 10000, | |
| "total_flos": 8.290835482935528e+17, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |