{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.994350282485876, "eval_steps": 500, "global_step": 795, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03766478342749529, "grad_norm": 2.263125496637036, "learning_rate": 5e-06, "loss": 0.8213, "step": 10 }, { "epoch": 0.07532956685499058, "grad_norm": 2.189120038314998, "learning_rate": 5e-06, "loss": 0.725, "step": 20 }, { "epoch": 0.11299435028248588, "grad_norm": 2.7377229620997587, "learning_rate": 5e-06, "loss": 0.6984, "step": 30 }, { "epoch": 0.15065913370998116, "grad_norm": 1.6393030705097076, "learning_rate": 5e-06, "loss": 0.6949, "step": 40 }, { "epoch": 0.18832391713747645, "grad_norm": 1.0398712245575565, "learning_rate": 5e-06, "loss": 0.6683, "step": 50 }, { "epoch": 0.22598870056497175, "grad_norm": 0.8808957646364496, "learning_rate": 5e-06, "loss": 0.6657, "step": 60 }, { "epoch": 0.263653483992467, "grad_norm": 0.7566447407586742, "learning_rate": 5e-06, "loss": 0.6496, "step": 70 }, { "epoch": 0.3013182674199623, "grad_norm": 0.7117741359385918, "learning_rate": 5e-06, "loss": 0.6458, "step": 80 }, { "epoch": 0.3389830508474576, "grad_norm": 0.7784154878802825, "learning_rate": 5e-06, "loss": 0.643, "step": 90 }, { "epoch": 0.3766478342749529, "grad_norm": 1.2764054406718297, "learning_rate": 5e-06, "loss": 0.634, "step": 100 }, { "epoch": 0.4143126177024482, "grad_norm": 0.7389123630080362, "learning_rate": 5e-06, "loss": 0.642, "step": 110 }, { "epoch": 0.4519774011299435, "grad_norm": 0.5996098331338064, "learning_rate": 5e-06, "loss": 0.6213, "step": 120 }, { "epoch": 0.4896421845574388, "grad_norm": 1.1501035575220573, "learning_rate": 5e-06, "loss": 0.6249, "step": 130 }, { "epoch": 0.527306967984934, "grad_norm": 0.6904388049559987, "learning_rate": 5e-06, "loss": 0.6243, "step": 140 }, { "epoch": 0.5649717514124294, "grad_norm": 0.783351095580665, "learning_rate": 5e-06, "loss": 0.621, "step": 150 }, { "epoch": 0.6026365348399246, "grad_norm": 0.5318412680049267, "learning_rate": 5e-06, "loss": 0.6296, "step": 160 }, { "epoch": 0.64030131826742, "grad_norm": 1.1247908870238332, "learning_rate": 5e-06, "loss": 0.6244, "step": 170 }, { "epoch": 0.6779661016949152, "grad_norm": 0.5383437573904913, "learning_rate": 5e-06, "loss": 0.6174, "step": 180 }, { "epoch": 0.7156308851224106, "grad_norm": 2.5733659531838198, "learning_rate": 5e-06, "loss": 0.6193, "step": 190 }, { "epoch": 0.7532956685499058, "grad_norm": 0.7831306502565981, "learning_rate": 5e-06, "loss": 0.6127, "step": 200 }, { "epoch": 0.7909604519774012, "grad_norm": 0.6934442696862589, "learning_rate": 5e-06, "loss": 0.6244, "step": 210 }, { "epoch": 0.8286252354048964, "grad_norm": 0.6677867481758228, "learning_rate": 5e-06, "loss": 0.613, "step": 220 }, { "epoch": 0.8662900188323918, "grad_norm": 0.4859701739274024, "learning_rate": 5e-06, "loss": 0.6101, "step": 230 }, { "epoch": 0.903954802259887, "grad_norm": 0.8722337210188531, "learning_rate": 5e-06, "loss": 0.606, "step": 240 }, { "epoch": 0.9416195856873822, "grad_norm": 1.9266628990003756, "learning_rate": 5e-06, "loss": 0.6019, "step": 250 }, { "epoch": 0.9792843691148776, "grad_norm": 1.056076074715482, "learning_rate": 5e-06, "loss": 0.6112, "step": 260 }, { "epoch": 0.9981167608286252, "eval_loss": 0.605501651763916, "eval_runtime": 91.0085, "eval_samples_per_second": 78.586, "eval_steps_per_second": 0.615, "step": 265 }, { "epoch": 1.0169491525423728, "grad_norm": 0.8391047454772584, "learning_rate": 5e-06, "loss": 0.5915, "step": 270 }, { "epoch": 1.054613935969868, "grad_norm": 0.834651626730102, "learning_rate": 5e-06, "loss": 0.5646, "step": 280 }, { "epoch": 1.0922787193973635, "grad_norm": 0.7450681825170591, "learning_rate": 5e-06, "loss": 0.5518, "step": 290 }, { "epoch": 1.1299435028248588, "grad_norm": 0.590980176111281, "learning_rate": 5e-06, "loss": 0.5601, "step": 300 }, { "epoch": 1.167608286252354, "grad_norm": 0.9385625642802127, "learning_rate": 5e-06, "loss": 0.552, "step": 310 }, { "epoch": 1.2052730696798493, "grad_norm": 0.8126168794552087, "learning_rate": 5e-06, "loss": 0.558, "step": 320 }, { "epoch": 1.2429378531073447, "grad_norm": 0.677905810554928, "learning_rate": 5e-06, "loss": 0.565, "step": 330 }, { "epoch": 1.28060263653484, "grad_norm": 0.6544626057539239, "learning_rate": 5e-06, "loss": 0.5582, "step": 340 }, { "epoch": 1.3182674199623352, "grad_norm": 0.8524924080405836, "learning_rate": 5e-06, "loss": 0.5602, "step": 350 }, { "epoch": 1.3559322033898304, "grad_norm": 0.4907188308076832, "learning_rate": 5e-06, "loss": 0.5607, "step": 360 }, { "epoch": 1.3935969868173257, "grad_norm": 0.53907446375581, "learning_rate": 5e-06, "loss": 0.5547, "step": 370 }, { "epoch": 1.4312617702448212, "grad_norm": 0.5927028384991923, "learning_rate": 5e-06, "loss": 0.5541, "step": 380 }, { "epoch": 1.4689265536723164, "grad_norm": 0.7128973727870778, "learning_rate": 5e-06, "loss": 0.5528, "step": 390 }, { "epoch": 1.5065913370998116, "grad_norm": 0.49840825439685243, "learning_rate": 5e-06, "loss": 0.5668, "step": 400 }, { "epoch": 1.544256120527307, "grad_norm": 0.5370743335720791, "learning_rate": 5e-06, "loss": 0.5575, "step": 410 }, { "epoch": 1.5819209039548023, "grad_norm": 0.6150871895812915, "learning_rate": 5e-06, "loss": 0.5597, "step": 420 }, { "epoch": 1.6195856873822976, "grad_norm": 0.563194743905304, "learning_rate": 5e-06, "loss": 0.5592, "step": 430 }, { "epoch": 1.6572504708097928, "grad_norm": 0.5119581124907059, "learning_rate": 5e-06, "loss": 0.5621, "step": 440 }, { "epoch": 1.694915254237288, "grad_norm": 0.5352254655513019, "learning_rate": 5e-06, "loss": 0.5541, "step": 450 }, { "epoch": 1.7325800376647833, "grad_norm": 0.6077433771903062, "learning_rate": 5e-06, "loss": 0.5563, "step": 460 }, { "epoch": 1.7702448210922788, "grad_norm": 0.562877694142977, "learning_rate": 5e-06, "loss": 0.555, "step": 470 }, { "epoch": 1.807909604519774, "grad_norm": 0.5453089094350608, "learning_rate": 5e-06, "loss": 0.5465, "step": 480 }, { "epoch": 1.8455743879472695, "grad_norm": 0.5709862620082578, "learning_rate": 5e-06, "loss": 0.5592, "step": 490 }, { "epoch": 1.8832391713747647, "grad_norm": 0.49785144147435545, "learning_rate": 5e-06, "loss": 0.5563, "step": 500 }, { "epoch": 1.92090395480226, "grad_norm": 0.48543855573710365, "learning_rate": 5e-06, "loss": 0.5552, "step": 510 }, { "epoch": 1.9585687382297552, "grad_norm": 0.5180932799655572, "learning_rate": 5e-06, "loss": 0.5571, "step": 520 }, { "epoch": 1.9962335216572504, "grad_norm": 0.5674984350650156, "learning_rate": 5e-06, "loss": 0.5554, "step": 530 }, { "epoch": 2.0, "eval_loss": 0.5974339842796326, "eval_runtime": 92.2503, "eval_samples_per_second": 77.528, "eval_steps_per_second": 0.607, "step": 531 }, { "epoch": 2.0338983050847457, "grad_norm": 0.6380443072327275, "learning_rate": 5e-06, "loss": 0.5074, "step": 540 }, { "epoch": 2.071563088512241, "grad_norm": 0.7526012751703193, "learning_rate": 5e-06, "loss": 0.5056, "step": 550 }, { "epoch": 2.109227871939736, "grad_norm": 0.601125683400543, "learning_rate": 5e-06, "loss": 0.5081, "step": 560 }, { "epoch": 2.146892655367232, "grad_norm": 0.5412801866050161, "learning_rate": 5e-06, "loss": 0.4964, "step": 570 }, { "epoch": 2.184557438794727, "grad_norm": 0.6605525778778812, "learning_rate": 5e-06, "loss": 0.4924, "step": 580 }, { "epoch": 2.2222222222222223, "grad_norm": 0.5634126387252626, "learning_rate": 5e-06, "loss": 0.5017, "step": 590 }, { "epoch": 2.2598870056497176, "grad_norm": 0.5612826370434433, "learning_rate": 5e-06, "loss": 0.507, "step": 600 }, { "epoch": 2.297551789077213, "grad_norm": 0.5863149934883163, "learning_rate": 5e-06, "loss": 0.4966, "step": 610 }, { "epoch": 2.335216572504708, "grad_norm": 0.5234770461125302, "learning_rate": 5e-06, "loss": 0.504, "step": 620 }, { "epoch": 2.3728813559322033, "grad_norm": 0.6459395940002383, "learning_rate": 5e-06, "loss": 0.5026, "step": 630 }, { "epoch": 2.4105461393596985, "grad_norm": 0.6027956338487243, "learning_rate": 5e-06, "loss": 0.5025, "step": 640 }, { "epoch": 2.4482109227871938, "grad_norm": 0.5328974338222766, "learning_rate": 5e-06, "loss": 0.5003, "step": 650 }, { "epoch": 2.4858757062146895, "grad_norm": 0.6107575449426592, "learning_rate": 5e-06, "loss": 0.5009, "step": 660 }, { "epoch": 2.5235404896421847, "grad_norm": 0.6193028412595688, "learning_rate": 5e-06, "loss": 0.5068, "step": 670 }, { "epoch": 2.56120527306968, "grad_norm": 0.5313172697707192, "learning_rate": 5e-06, "loss": 0.5087, "step": 680 }, { "epoch": 2.598870056497175, "grad_norm": 0.6705815338360445, "learning_rate": 5e-06, "loss": 0.5072, "step": 690 }, { "epoch": 2.6365348399246704, "grad_norm": 0.5631108090258757, "learning_rate": 5e-06, "loss": 0.5053, "step": 700 }, { "epoch": 2.6741996233521657, "grad_norm": 0.6409277069423337, "learning_rate": 5e-06, "loss": 0.503, "step": 710 }, { "epoch": 2.711864406779661, "grad_norm": 0.5852444630897177, "learning_rate": 5e-06, "loss": 0.5099, "step": 720 }, { "epoch": 2.7495291902071566, "grad_norm": 0.6554053610190018, "learning_rate": 5e-06, "loss": 0.5149, "step": 730 }, { "epoch": 2.7871939736346514, "grad_norm": 0.6563071365261379, "learning_rate": 5e-06, "loss": 0.5018, "step": 740 }, { "epoch": 2.824858757062147, "grad_norm": 0.5582449045429995, "learning_rate": 5e-06, "loss": 0.5103, "step": 750 }, { "epoch": 2.8625235404896423, "grad_norm": 0.5062040173398443, "learning_rate": 5e-06, "loss": 0.5063, "step": 760 }, { "epoch": 2.9001883239171375, "grad_norm": 0.6071759917390698, "learning_rate": 5e-06, "loss": 0.5003, "step": 770 }, { "epoch": 2.937853107344633, "grad_norm": 0.5606403524855348, "learning_rate": 5e-06, "loss": 0.512, "step": 780 }, { "epoch": 2.975517890772128, "grad_norm": 0.6859712101741441, "learning_rate": 5e-06, "loss": 0.5025, "step": 790 }, { "epoch": 2.994350282485876, "eval_loss": 0.6022372245788574, "eval_runtime": 89.8413, "eval_samples_per_second": 79.607, "eval_steps_per_second": 0.623, "step": 795 }, { "epoch": 2.994350282485876, "step": 795, "total_flos": 1331235850813440.0, "train_loss": 0.5684360762062313, "train_runtime": 14109.8059, "train_samples_per_second": 28.892, "train_steps_per_second": 0.056 } ], "logging_steps": 10, "max_steps": 795, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1331235850813440.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }