{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9994107248084856,
  "eval_steps": 500,
  "global_step": 1272,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.019642506383814574,
      "grad_norm": 2.737959623336792,
      "learning_rate": 3.90625e-05,
      "loss": 1.8852,
      "step": 25
    },
    {
      "epoch": 0.03928501276762915,
      "grad_norm": 0.8758559823036194,
      "learning_rate": 7.8125e-05,
      "loss": 1.4536,
      "step": 50
    },
    {
      "epoch": 0.05892751915144372,
      "grad_norm": 0.5361883640289307,
      "learning_rate": 0.00011718750000000001,
      "loss": 1.16,
      "step": 75
    },
    {
      "epoch": 0.0785700255352583,
      "grad_norm": 0.5839149951934814,
      "learning_rate": 0.00015625,
      "loss": 1.01,
      "step": 100
    },
    {
      "epoch": 0.09821253191907288,
      "grad_norm": 0.6101933121681213,
      "learning_rate": 0.0001953125,
      "loss": 0.9627,
      "step": 125
    },
    {
      "epoch": 0.11785503830288745,
      "grad_norm": 0.6417807936668396,
      "learning_rate": 0.00019615384615384615,
      "loss": 0.9149,
      "step": 150
    },
    {
      "epoch": 0.13749754468670203,
      "grad_norm": 0.549029529094696,
      "learning_rate": 0.0001917832167832168,
      "loss": 0.8981,
      "step": 175
    },
    {
      "epoch": 0.1571400510705166,
      "grad_norm": 0.6425251364707947,
      "learning_rate": 0.00018741258741258743,
      "loss": 0.8879,
      "step": 200
    },
    {
      "epoch": 0.17678255745433116,
      "grad_norm": 0.5528634786605835,
      "learning_rate": 0.00018304195804195805,
      "loss": 0.8691,
      "step": 225
    },
    {
      "epoch": 0.19642506383814576,
      "grad_norm": 0.6609376668930054,
      "learning_rate": 0.00017867132867132866,
      "loss": 0.8617,
      "step": 250
    },
    {
      "epoch": 0.21606757022196033,
      "grad_norm": 0.5458253622055054,
      "learning_rate": 0.0001743006993006993,
      "loss": 0.8572,
      "step": 275
    },
    {
      "epoch": 0.2357100766057749,
      "grad_norm": 0.6044121384620667,
      "learning_rate": 0.00016993006993006995,
      "loss": 0.8494,
      "step": 300
    },
    {
      "epoch": 0.25535258298958946,
      "grad_norm": 0.5752493739128113,
      "learning_rate": 0.00016555944055944056,
      "loss": 0.8381,
      "step": 325
    },
    {
      "epoch": 0.27499508937340406,
      "grad_norm": 0.5365332961082458,
      "learning_rate": 0.0001611888111888112,
      "loss": 0.8516,
      "step": 350
    },
    {
      "epoch": 0.2946375957572186,
      "grad_norm": 0.7016746997833252,
      "learning_rate": 0.00015681818181818182,
      "loss": 0.8359,
      "step": 375
    },
    {
      "epoch": 0.3142801021410332,
      "grad_norm": 0.6072686910629272,
      "learning_rate": 0.00015244755244755244,
      "loss": 0.8178,
      "step": 400
    },
    {
      "epoch": 0.3339226085248478,
      "grad_norm": 0.5570734739303589,
      "learning_rate": 0.00014807692307692308,
      "loss": 0.8127,
      "step": 425
    },
    {
      "epoch": 0.3535651149086623,
      "grad_norm": 0.5914424657821655,
      "learning_rate": 0.00014370629370629372,
      "loss": 0.8128,
      "step": 450
    },
    {
      "epoch": 0.3732076212924769,
      "grad_norm": 0.5375176072120667,
      "learning_rate": 0.00013933566433566434,
      "loss": 0.7828,
      "step": 475
    },
    {
      "epoch": 0.3928501276762915,
      "grad_norm": 0.5489270091056824,
      "learning_rate": 0.00013496503496503496,
      "loss": 0.8109,
      "step": 500
    },
    {
      "epoch": 0.41249263406010606,
      "grad_norm": 0.5411733984947205,
      "learning_rate": 0.0001305944055944056,
      "loss": 0.7862,
      "step": 525
    },
    {
      "epoch": 0.43213514044392065,
      "grad_norm": 0.5604883432388306,
      "learning_rate": 0.00012622377622377624,
      "loss": 0.8028,
      "step": 550
    },
    {
      "epoch": 0.45177764682773525,
      "grad_norm": 0.6268212199211121,
      "learning_rate": 0.00012185314685314686,
      "loss": 0.7969,
      "step": 575
    },
    {
      "epoch": 0.4714201532115498,
      "grad_norm": 0.5777909755706787,
      "learning_rate": 0.00011748251748251749,
      "loss": 0.7803,
      "step": 600
    },
    {
      "epoch": 0.4910626595953644,
      "grad_norm": 0.5517834424972534,
      "learning_rate": 0.0001131118881118881,
      "loss": 0.8052,
      "step": 625
    },
    {
      "epoch": 0.5107051659791789,
      "grad_norm": 0.5613248944282532,
      "learning_rate": 0.00010874125874125876,
      "loss": 0.7731,
      "step": 650
    },
    {
      "epoch": 0.5303476723629935,
      "grad_norm": 0.5555421113967896,
      "learning_rate": 0.00010437062937062938,
      "loss": 0.7959,
      "step": 675
    },
    {
      "epoch": 0.5499901787468081,
      "grad_norm": 0.5249913334846497,
      "learning_rate": 0.0001,
      "loss": 0.8082,
      "step": 700
    },
    {
      "epoch": 0.5696326851306227,
      "grad_norm": 0.578350841999054,
      "learning_rate": 9.562937062937063e-05,
      "loss": 0.7996,
      "step": 725
    },
    {
      "epoch": 0.5892751915144372,
      "grad_norm": 0.5972084403038025,
      "learning_rate": 9.125874125874126e-05,
      "loss": 0.7892,
      "step": 750
    },
    {
      "epoch": 0.6089176978982518,
      "grad_norm": 0.5550151467323303,
      "learning_rate": 8.688811188811189e-05,
      "loss": 0.7544,
      "step": 775
    },
    {
      "epoch": 0.6285602042820664,
      "grad_norm": 0.5595849752426147,
      "learning_rate": 8.251748251748252e-05,
      "loss": 0.7917,
      "step": 800
    },
    {
      "epoch": 0.648202710665881,
      "grad_norm": 0.5400447249412537,
      "learning_rate": 7.814685314685315e-05,
      "loss": 0.7429,
      "step": 825
    },
    {
      "epoch": 0.6678452170496956,
      "grad_norm": 0.5469474196434021,
      "learning_rate": 7.377622377622378e-05,
      "loss": 0.7858,
      "step": 850
    },
    {
      "epoch": 0.6874877234335102,
      "grad_norm": 0.5074354410171509,
      "learning_rate": 6.940559440559441e-05,
      "loss": 0.7378,
      "step": 875
    },
    {
      "epoch": 0.7071302298173247,
      "grad_norm": 0.5348958373069763,
      "learning_rate": 6.503496503496504e-05,
      "loss": 0.7742,
      "step": 900
    },
    {
      "epoch": 0.7267727362011392,
      "grad_norm": 0.5498335957527161,
      "learning_rate": 6.066433566433567e-05,
      "loss": 0.7922,
      "step": 925
    },
    {
      "epoch": 0.7464152425849538,
      "grad_norm": 0.5797409415245056,
      "learning_rate": 5.629370629370629e-05,
      "loss": 0.7567,
      "step": 950
    },
    {
      "epoch": 0.7660577489687684,
      "grad_norm": 0.5608484745025635,
      "learning_rate": 5.192307692307693e-05,
      "loss": 0.7533,
      "step": 975
    },
    {
      "epoch": 0.785700255352583,
      "grad_norm": 0.5730789303779602,
      "learning_rate": 4.755244755244756e-05,
      "loss": 0.7608,
      "step": 1000
    },
    {
      "epoch": 0.8053427617363975,
      "grad_norm": 0.5161120295524597,
      "learning_rate": 4.318181818181819e-05,
      "loss": 0.7644,
      "step": 1025
    },
    {
      "epoch": 0.8249852681202121,
      "grad_norm": 0.6298760175704956,
      "learning_rate": 3.8811188811188816e-05,
      "loss": 0.7678,
      "step": 1050
    },
    {
      "epoch": 0.8446277745040267,
      "grad_norm": 0.559695839881897,
      "learning_rate": 3.4440559440559445e-05,
      "loss": 0.7627,
      "step": 1075
    },
    {
      "epoch": 0.8642702808878413,
      "grad_norm": 0.5945947170257568,
      "learning_rate": 3.0069930069930068e-05,
      "loss": 0.7767,
      "step": 1100
    },
    {
      "epoch": 0.8839127872716559,
      "grad_norm": 0.5842404365539551,
      "learning_rate": 2.5699300699300697e-05,
      "loss": 0.7752,
      "step": 1125
    },
    {
      "epoch": 0.9035552936554705,
      "grad_norm": 0.5409468412399292,
      "learning_rate": 2.132867132867133e-05,
      "loss": 0.7667,
      "step": 1150
    },
    {
      "epoch": 0.923197800039285,
      "grad_norm": 0.6497332453727722,
      "learning_rate": 1.695804195804196e-05,
      "loss": 0.7817,
      "step": 1175
    },
    {
      "epoch": 0.9428403064230996,
      "grad_norm": 0.5824007987976074,
      "learning_rate": 1.2587412587412589e-05,
      "loss": 0.7951,
      "step": 1200
    },
    {
      "epoch": 0.9624828128069142,
      "grad_norm": 0.6233786940574646,
      "learning_rate": 8.216783216783217e-06,
      "loss": 0.7926,
      "step": 1225
    },
    {
      "epoch": 0.9821253191907288,
      "grad_norm": 0.5785284042358398,
      "learning_rate": 3.846153846153847e-06,
      "loss": 0.7632,
      "step": 1250
    }
  ],
  "logging_steps": 25,
  "max_steps": 1272,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "total_flos": 1.2402531158196224e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}