{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 433, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02313475997686524, "grad_norm": 3.5443220138549805, "learning_rate": 0.00013846153846153847, "loss": 2.4906261444091795, "step": 10 }, { "epoch": 0.04626951995373048, "grad_norm": 0.4789237976074219, "learning_rate": 0.00019714285714285716, "loss": 0.13569670915603638, "step": 20 }, { "epoch": 0.06940427993059572, "grad_norm": 0.0071147712878882885, "learning_rate": 0.0001923809523809524, "loss": 0.023090672492980958, "step": 30 }, { "epoch": 0.09253903990746096, "grad_norm": 0.031249770894646645, "learning_rate": 0.00018761904761904763, "loss": 0.07923950552940369, "step": 40 }, { "epoch": 0.1156737998843262, "grad_norm": 0.23233748972415924, "learning_rate": 0.00018285714285714286, "loss": 0.01179821714758873, "step": 50 }, { "epoch": 0.13880855986119145, "grad_norm": 1.0764169692993164, "learning_rate": 0.0001780952380952381, "loss": 0.04595586061477661, "step": 60 }, { "epoch": 0.16194331983805668, "grad_norm": 0.4203377366065979, "learning_rate": 0.00017333333333333334, "loss": 0.020573070645332335, "step": 70 }, { "epoch": 0.18507807981492191, "grad_norm": 0.06960943341255188, "learning_rate": 0.00016857142857142857, "loss": 0.02428939938545227, "step": 80 }, { "epoch": 0.20821283979178715, "grad_norm": 0.761617124080658, "learning_rate": 0.0001638095238095238, "loss": 0.03832740783691406, "step": 90 }, { "epoch": 0.2313475997686524, "grad_norm": 0.024328596889972687, "learning_rate": 0.00015904761904761904, "loss": 0.010582192242145539, "step": 100 }, { "epoch": 0.25448235974551764, "grad_norm": 0.03712387755513191, "learning_rate": 0.0001542857142857143, "loss": 0.014394421875476838, "step": 110 }, { "epoch": 0.2776171197223829, "grad_norm": 0.35509419441223145, "learning_rate": 0.00014952380952380954, "loss": 0.03347426652908325, "step": 120 }, { "epoch": 0.3007518796992481, "grad_norm": 0.11692740023136139, "learning_rate": 0.00014476190476190475, "loss": 0.06869672536849976, "step": 130 }, { "epoch": 0.32388663967611336, "grad_norm": 0.025060011073946953, "learning_rate": 0.00014, "loss": 0.002869569510221481, "step": 140 }, { "epoch": 0.3470213996529786, "grad_norm": 0.04316815361380577, "learning_rate": 0.00013523809523809525, "loss": 0.017471878230571745, "step": 150 }, { "epoch": 0.37015615962984383, "grad_norm": 0.24662676453590393, "learning_rate": 0.0001304761904761905, "loss": 0.04017368853092194, "step": 160 }, { "epoch": 0.3932909196067091, "grad_norm": 0.013658199459314346, "learning_rate": 0.00012571428571428572, "loss": 0.011992159485816955, "step": 170 }, { "epoch": 0.4164256795835743, "grad_norm": 0.011875756084918976, "learning_rate": 0.00012095238095238095, "loss": 0.003486642986536026, "step": 180 }, { "epoch": 0.43956043956043955, "grad_norm": 0.008698398247361183, "learning_rate": 0.00011619047619047621, "loss": 0.009918726235628127, "step": 190 }, { "epoch": 0.4626951995373048, "grad_norm": 0.012751366011798382, "learning_rate": 0.00011142857142857144, "loss": 0.0135767862200737, "step": 200 }, { "epoch": 0.48582995951417, "grad_norm": 0.013968385756015778, "learning_rate": 0.00010666666666666667, "loss": 0.0009812915697693825, "step": 210 }, { "epoch": 0.5089647194910353, "grad_norm": 0.38197270035743713, "learning_rate": 0.0001019047619047619, "loss": 0.010036008059978485, "step": 220 }, { "epoch": 0.5320994794679005, "grad_norm": 0.008353643119335175, "learning_rate": 9.714285714285715e-05, "loss": 0.0008777316659688949, "step": 230 }, { "epoch": 0.5552342394447658, "grad_norm": 0.03482387587428093, "learning_rate": 9.238095238095239e-05, "loss": 0.06720049977302552, "step": 240 }, { "epoch": 0.578368999421631, "grad_norm": 0.017207808792591095, "learning_rate": 8.761904761904762e-05, "loss": 0.009993697702884673, "step": 250 }, { "epoch": 0.6015037593984962, "grad_norm": 0.04262904077768326, "learning_rate": 8.285714285714287e-05, "loss": 0.005409357324242592, "step": 260 }, { "epoch": 0.6246385193753615, "grad_norm": 0.010086641646921635, "learning_rate": 7.80952380952381e-05, "loss": 0.0022824501618742945, "step": 270 }, { "epoch": 0.6477732793522267, "grad_norm": 0.019428474828600883, "learning_rate": 7.333333333333333e-05, "loss": 0.009420862048864364, "step": 280 }, { "epoch": 0.6709080393290919, "grad_norm": 0.010999761521816254, "learning_rate": 6.857142857142858e-05, "loss": 0.017436870932579042, "step": 290 }, { "epoch": 0.6940427993059572, "grad_norm": 0.03521187976002693, "learning_rate": 6.38095238095238e-05, "loss": 0.007618572562932968, "step": 300 }, { "epoch": 0.7171775592828225, "grad_norm": 0.02904939278960228, "learning_rate": 5.904761904761905e-05, "loss": 0.009538635611534119, "step": 310 }, { "epoch": 0.7403123192596877, "grad_norm": 0.28503215312957764, "learning_rate": 5.428571428571428e-05, "loss": 0.003535139188170433, "step": 320 }, { "epoch": 0.763447079236553, "grad_norm": 0.1933276355266571, "learning_rate": 4.9523809523809525e-05, "loss": 0.00619364008307457, "step": 330 }, { "epoch": 0.7865818392134182, "grad_norm": 0.007664988283067942, "learning_rate": 4.476190476190477e-05, "loss": 0.0007373106665909291, "step": 340 }, { "epoch": 0.8097165991902834, "grad_norm": 0.5651599764823914, "learning_rate": 4e-05, "loss": 0.07755469083786011, "step": 350 }, { "epoch": 0.8328513591671486, "grad_norm": 0.05437934026122093, "learning_rate": 3.523809523809524e-05, "loss": 0.015709011256694792, "step": 360 }, { "epoch": 0.8559861191440139, "grad_norm": 0.061594076454639435, "learning_rate": 3.0476190476190482e-05, "loss": 0.04147002398967743, "step": 370 }, { "epoch": 0.8791208791208791, "grad_norm": 0.03086649812757969, "learning_rate": 2.5714285714285714e-05, "loss": 0.0031579844653606415, "step": 380 }, { "epoch": 0.9022556390977443, "grad_norm": 0.3715185523033142, "learning_rate": 2.0952380952380954e-05, "loss": 0.01193058043718338, "step": 390 }, { "epoch": 0.9253903990746096, "grad_norm": 0.017640365287661552, "learning_rate": 1.6190476190476193e-05, "loss": 0.00927691012620926, "step": 400 }, { "epoch": 0.9485251590514748, "grad_norm": 0.02302401326596737, "learning_rate": 1.1428571428571429e-05, "loss": 0.07495037913322448, "step": 410 }, { "epoch": 0.97165991902834, "grad_norm": 0.013834159821271896, "learning_rate": 6.666666666666667e-06, "loss": 0.015109787881374358, "step": 420 }, { "epoch": 0.9947946790052054, "grad_norm": 0.023592131212353706, "learning_rate": 1.9047619047619051e-06, "loss": 0.012308744341135025, "step": 430 } ], "logging_steps": 10, "max_steps": 433, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3979819485155328.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }