{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994107248084856, "eval_steps": 500, "global_step": 1272, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.019642506383814574, "grad_norm": 2.737959623336792, "learning_rate": 3.90625e-05, "loss": 1.8852, "step": 25 }, { "epoch": 0.03928501276762915, "grad_norm": 0.8758559823036194, "learning_rate": 7.8125e-05, "loss": 1.4536, "step": 50 }, { "epoch": 0.05892751915144372, "grad_norm": 0.5361883640289307, "learning_rate": 0.00011718750000000001, "loss": 1.16, "step": 75 }, { "epoch": 0.0785700255352583, "grad_norm": 0.5839149951934814, "learning_rate": 0.00015625, "loss": 1.01, "step": 100 }, { "epoch": 0.09821253191907288, "grad_norm": 0.6101933121681213, "learning_rate": 0.0001953125, "loss": 0.9627, "step": 125 }, { "epoch": 0.11785503830288745, "grad_norm": 0.6417807936668396, "learning_rate": 0.00019615384615384615, "loss": 0.9149, "step": 150 }, { "epoch": 0.13749754468670203, "grad_norm": 0.549029529094696, "learning_rate": 0.0001917832167832168, "loss": 0.8981, "step": 175 }, { "epoch": 0.1571400510705166, "grad_norm": 0.6425251364707947, "learning_rate": 0.00018741258741258743, "loss": 0.8879, "step": 200 }, { "epoch": 0.17678255745433116, "grad_norm": 0.5528634786605835, "learning_rate": 0.00018304195804195805, "loss": 0.8691, "step": 225 }, { "epoch": 0.19642506383814576, "grad_norm": 0.6609376668930054, "learning_rate": 0.00017867132867132866, "loss": 0.8617, "step": 250 }, { "epoch": 0.21606757022196033, "grad_norm": 0.5458253622055054, "learning_rate": 0.0001743006993006993, "loss": 0.8572, "step": 275 }, { "epoch": 0.2357100766057749, "grad_norm": 0.6044121384620667, "learning_rate": 0.00016993006993006995, "loss": 0.8494, "step": 300 }, { "epoch": 0.25535258298958946, "grad_norm": 0.5752493739128113, "learning_rate": 0.00016555944055944056, "loss": 0.8381, "step": 325 }, { "epoch": 0.27499508937340406, "grad_norm": 0.5365332961082458, "learning_rate": 0.0001611888111888112, "loss": 0.8516, "step": 350 }, { "epoch": 0.2946375957572186, "grad_norm": 0.7016746997833252, "learning_rate": 0.00015681818181818182, "loss": 0.8359, "step": 375 }, { "epoch": 0.3142801021410332, "grad_norm": 0.6072686910629272, "learning_rate": 0.00015244755244755244, "loss": 0.8178, "step": 400 }, { "epoch": 0.3339226085248478, "grad_norm": 0.5570734739303589, "learning_rate": 0.00014807692307692308, "loss": 0.8127, "step": 425 }, { "epoch": 0.3535651149086623, "grad_norm": 0.5914424657821655, "learning_rate": 0.00014370629370629372, "loss": 0.8128, "step": 450 }, { "epoch": 0.3732076212924769, "grad_norm": 0.5375176072120667, "learning_rate": 0.00013933566433566434, "loss": 0.7828, "step": 475 }, { "epoch": 0.3928501276762915, "grad_norm": 0.5489270091056824, "learning_rate": 0.00013496503496503496, "loss": 0.8109, "step": 500 }, { "epoch": 0.41249263406010606, "grad_norm": 0.5411733984947205, "learning_rate": 0.0001305944055944056, "loss": 0.7862, "step": 525 }, { "epoch": 0.43213514044392065, "grad_norm": 0.5604883432388306, "learning_rate": 0.00012622377622377624, "loss": 0.8028, "step": 550 }, { "epoch": 0.45177764682773525, "grad_norm": 0.6268212199211121, "learning_rate": 0.00012185314685314686, "loss": 0.7969, "step": 575 }, { "epoch": 0.4714201532115498, "grad_norm": 0.5777909755706787, "learning_rate": 0.00011748251748251749, "loss": 0.7803, "step": 600 }, { "epoch": 0.4910626595953644, "grad_norm": 0.5517834424972534, "learning_rate": 0.0001131118881118881, "loss": 0.8052, "step": 625 }, { "epoch": 0.5107051659791789, "grad_norm": 0.5613248944282532, "learning_rate": 0.00010874125874125876, "loss": 0.7731, "step": 650 }, { "epoch": 0.5303476723629935, "grad_norm": 0.5555421113967896, "learning_rate": 0.00010437062937062938, "loss": 0.7959, "step": 675 }, { "epoch": 0.5499901787468081, "grad_norm": 0.5249913334846497, "learning_rate": 0.0001, "loss": 0.8082, "step": 700 }, { "epoch": 0.5696326851306227, "grad_norm": 0.578350841999054, "learning_rate": 9.562937062937063e-05, "loss": 0.7996, "step": 725 }, { "epoch": 0.5892751915144372, "grad_norm": 0.5972084403038025, "learning_rate": 9.125874125874126e-05, "loss": 0.7892, "step": 750 }, { "epoch": 0.6089176978982518, "grad_norm": 0.5550151467323303, "learning_rate": 8.688811188811189e-05, "loss": 0.7544, "step": 775 }, { "epoch": 0.6285602042820664, "grad_norm": 0.5595849752426147, "learning_rate": 8.251748251748252e-05, "loss": 0.7917, "step": 800 }, { "epoch": 0.648202710665881, "grad_norm": 0.5400447249412537, "learning_rate": 7.814685314685315e-05, "loss": 0.7429, "step": 825 }, { "epoch": 0.6678452170496956, "grad_norm": 0.5469474196434021, "learning_rate": 7.377622377622378e-05, "loss": 0.7858, "step": 850 }, { "epoch": 0.6874877234335102, "grad_norm": 0.5074354410171509, "learning_rate": 6.940559440559441e-05, "loss": 0.7378, "step": 875 }, { "epoch": 0.7071302298173247, "grad_norm": 0.5348958373069763, "learning_rate": 6.503496503496504e-05, "loss": 0.7742, "step": 900 }, { "epoch": 0.7267727362011392, "grad_norm": 0.5498335957527161, "learning_rate": 6.066433566433567e-05, "loss": 0.7922, "step": 925 }, { "epoch": 0.7464152425849538, "grad_norm": 0.5797409415245056, "learning_rate": 5.629370629370629e-05, "loss": 0.7567, "step": 950 }, { "epoch": 0.7660577489687684, "grad_norm": 0.5608484745025635, "learning_rate": 5.192307692307693e-05, "loss": 0.7533, "step": 975 }, { "epoch": 0.785700255352583, "grad_norm": 0.5730789303779602, "learning_rate": 4.755244755244756e-05, "loss": 0.7608, "step": 1000 }, { "epoch": 0.8053427617363975, "grad_norm": 0.5161120295524597, "learning_rate": 4.318181818181819e-05, "loss": 0.7644, "step": 1025 }, { "epoch": 0.8249852681202121, "grad_norm": 0.6298760175704956, "learning_rate": 3.8811188811188816e-05, "loss": 0.7678, "step": 1050 }, { "epoch": 0.8446277745040267, "grad_norm": 0.559695839881897, "learning_rate": 3.4440559440559445e-05, "loss": 0.7627, "step": 1075 }, { "epoch": 0.8642702808878413, "grad_norm": 0.5945947170257568, "learning_rate": 3.0069930069930068e-05, "loss": 0.7767, "step": 1100 }, { "epoch": 0.8839127872716559, "grad_norm": 0.5842404365539551, "learning_rate": 2.5699300699300697e-05, "loss": 0.7752, "step": 1125 }, { "epoch": 0.9035552936554705, "grad_norm": 0.5409468412399292, "learning_rate": 2.132867132867133e-05, "loss": 0.7667, "step": 1150 }, { "epoch": 0.923197800039285, "grad_norm": 0.6497332453727722, "learning_rate": 1.695804195804196e-05, "loss": 0.7817, "step": 1175 }, { "epoch": 0.9428403064230996, "grad_norm": 0.5824007987976074, "learning_rate": 1.2587412587412589e-05, "loss": 0.7951, "step": 1200 }, { "epoch": 0.9624828128069142, "grad_norm": 0.6233786940574646, "learning_rate": 8.216783216783217e-06, "loss": 0.7926, "step": 1225 }, { "epoch": 0.9821253191907288, "grad_norm": 0.5785284042358398, "learning_rate": 3.846153846153847e-06, "loss": 0.7632, "step": 1250 } ], "logging_steps": 25, "max_steps": 1272, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 1.2402531158196224e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }