| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9974102848686645, |
| "eval_steps": 100, |
| "global_step": 337, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.014798372179060304, |
| "grad_norm": 2.5319928724744716, |
| "learning_rate": 2.9411764705882355e-06, |
| "loss": 1.0934, |
| "mean_token_accuracy": 0.710963543585844, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.029596744358120607, |
| "grad_norm": 1.4244723552718639, |
| "learning_rate": 5.882352941176471e-06, |
| "loss": 1.0769, |
| "mean_token_accuracy": 0.7137204635526403, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.04439511653718091, |
| "grad_norm": 1.2482381526688695, |
| "learning_rate": 8.823529411764707e-06, |
| "loss": 0.9837, |
| "mean_token_accuracy": 0.729854775606871, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.059193488716241215, |
| "grad_norm": 0.8840186116418455, |
| "learning_rate": 1.1764705882352942e-05, |
| "loss": 0.959, |
| "mean_token_accuracy": 0.7300877725865283, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.07399186089530152, |
| "grad_norm": 0.7992527652377878, |
| "learning_rate": 1.4705882352941179e-05, |
| "loss": 0.8946, |
| "mean_token_accuracy": 0.7439422970769893, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.08879023307436182, |
| "grad_norm": 0.7193278191451224, |
| "learning_rate": 1.7647058823529414e-05, |
| "loss": 0.8869, |
| "mean_token_accuracy": 0.7441348945078509, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.10358860525342212, |
| "grad_norm": 0.620194874837715, |
| "learning_rate": 1.9999462497359468e-05, |
| "loss": 0.846, |
| "mean_token_accuracy": 0.753654310037237, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.11838697743248243, |
| "grad_norm": 0.605036208438505, |
| "learning_rate": 1.9980655971335944e-05, |
| "loss": 0.8244, |
| "mean_token_accuracy": 0.7586147304733527, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.13318534961154274, |
| "grad_norm": 0.5325835774353814, |
| "learning_rate": 1.993503206718859e-05, |
| "loss": 0.8234, |
| "mean_token_accuracy": 0.7580317217224875, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.14798372179060304, |
| "grad_norm": 0.5922728516285096, |
| "learning_rate": 1.986271337340182e-05, |
| "loss": 0.8061, |
| "mean_token_accuracy": 0.7622985291883017, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.16278209396966334, |
| "grad_norm": 0.5378220329694758, |
| "learning_rate": 1.976389420563607e-05, |
| "loss": 0.8147, |
| "mean_token_accuracy": 0.759805779032566, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.17758046614872364, |
| "grad_norm": 0.5259675117637312, |
| "learning_rate": 1.9638840084614182e-05, |
| "loss": 0.8023, |
| "mean_token_accuracy": 0.7627232625492987, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.19237883832778394, |
| "grad_norm": 0.5762209896140157, |
| "learning_rate": 1.9487887022684336e-05, |
| "loss": 0.792, |
| "mean_token_accuracy": 0.7641335052616005, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.20717721050684423, |
| "grad_norm": 0.4808227147945767, |
| "learning_rate": 1.9311440620976597e-05, |
| "loss": 0.7952, |
| "mean_token_accuracy": 0.7634592815770592, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.22197558268590456, |
| "grad_norm": 0.5001394930884526, |
| "learning_rate": 1.9109974979578852e-05, |
| "loss": 0.7766, |
| "mean_token_accuracy": 0.768052524785652, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.23677395486496486, |
| "grad_norm": 0.5439973784585546, |
| "learning_rate": 1.8884031423660492e-05, |
| "loss": 0.7858, |
| "mean_token_accuracy": 0.7659040802193098, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.25157232704402516, |
| "grad_norm": 0.4626471578447052, |
| "learning_rate": 1.8634217048966638e-05, |
| "loss": 0.7898, |
| "mean_token_accuracy": 0.7641630963096735, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.2663706992230855, |
| "grad_norm": 0.4953360320003589, |
| "learning_rate": 1.836120309059107e-05, |
| "loss": 0.7653, |
| "mean_token_accuracy": 0.7711456471218759, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.28116907140214575, |
| "grad_norm": 0.4920623807025407, |
| "learning_rate": 1.8065723119410885e-05, |
| "loss": 0.7729, |
| "mean_token_accuracy": 0.7689427685113397, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.2959674435812061, |
| "grad_norm": 0.5103694322535611, |
| "learning_rate": 1.77485710710289e-05, |
| "loss": 0.7769, |
| "mean_token_accuracy": 0.7670994908657515, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.2959674435812061, |
| "eval_loss": 0.7992530465126038, |
| "eval_mean_token_accuracy": 0.7520903997377723, |
| "eval_runtime": 37.942, |
| "eval_samples_per_second": 3.4, |
| "eval_steps_per_second": 0.237, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.31076581576026635, |
| "grad_norm": 0.5055132167951369, |
| "learning_rate": 1.741059911251997e-05, |
| "loss": 0.7743, |
| "mean_token_accuracy": 0.7680788037040351, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.3255641879393267, |
| "grad_norm": 0.506430933020723, |
| "learning_rate": 1.7052715352713076e-05, |
| "loss": 0.7701, |
| "mean_token_accuracy": 0.768703717503791, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.340362560118387, |
| "grad_norm": 0.5043957879151227, |
| "learning_rate": 1.667588140216154e-05, |
| "loss": 0.7809, |
| "mean_token_accuracy": 0.7663542286457965, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.3551609322974473, |
| "grad_norm": 0.4950305678035583, |
| "learning_rate": 1.628110978935756e-05, |
| "loss": 0.7858, |
| "mean_token_accuracy": 0.7642025057223836, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.3699593044765076, |
| "grad_norm": 0.48560370957454324, |
| "learning_rate": 1.586946124013354e-05, |
| "loss": 0.7721, |
| "mean_token_accuracy": 0.7674092863960766, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.38475767665556787, |
| "grad_norm": 0.4630554031524683, |
| "learning_rate": 1.5442041827560274e-05, |
| "loss": 0.7701, |
| "mean_token_accuracy": 0.7682268770584184, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.3995560488346282, |
| "grad_norm": 0.48760057057176687, |
| "learning_rate": 1.5000000000000002e-05, |
| "loss": 0.7784, |
| "mean_token_accuracy": 0.7661367681955669, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.41435442101368847, |
| "grad_norm": 0.5733716326658232, |
| "learning_rate": 1.4544523495299843e-05, |
| "loss": 0.7479, |
| "mean_token_accuracy": 0.7748393560242525, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.4291527931927488, |
| "grad_norm": 0.4674485766276026, |
| "learning_rate": 1.4076836149416889e-05, |
| "loss": 0.7624, |
| "mean_token_accuracy": 0.7700554565613973, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.4439511653718091, |
| "grad_norm": 0.45437614184568614, |
| "learning_rate": 1.3598194608050011e-05, |
| "loss": 0.7615, |
| "mean_token_accuracy": 0.771047339890566, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.4587495375508694, |
| "grad_norm": 0.5161211912106276, |
| "learning_rate": 1.3109884950114007e-05, |
| "loss": 0.7483, |
| "mean_token_accuracy": 0.7738057195374108, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.4735479097299297, |
| "grad_norm": 0.48299986888606256, |
| "learning_rate": 1.2613219232128608e-05, |
| "loss": 0.7785, |
| "mean_token_accuracy": 0.7653209447855766, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.48834628190899, |
| "grad_norm": 0.4972165634793764, |
| "learning_rate": 1.2109531962807333e-05, |
| "loss": 0.7584, |
| "mean_token_accuracy": 0.7717189958506077, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.5031446540880503, |
| "grad_norm": 0.5038295262718444, |
| "learning_rate": 1.1600176517318742e-05, |
| "loss": 0.7697, |
| "mean_token_accuracy": 0.7677580608314103, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.5179430262671106, |
| "grad_norm": 0.506972102629947, |
| "learning_rate": 1.1086521500854746e-05, |
| "loss": 0.7644, |
| "mean_token_accuracy": 0.7690021577852479, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.532741398446171, |
| "grad_norm": 0.4607643963694159, |
| "learning_rate": 1.0569947071276847e-05, |
| "loss": 0.7508, |
| "mean_token_accuracy": 0.7737279353788342, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.5475397706252312, |
| "grad_norm": 0.4690988373460707, |
| "learning_rate": 1.0051841230721065e-05, |
| "loss": 0.7571, |
| "mean_token_accuracy": 0.7718751333144616, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.5623381428042915, |
| "grad_norm": 0.5230684589508905, |
| "learning_rate": 9.533596096125826e-06, |
| "loss": 0.7473, |
| "mean_token_accuracy": 0.7735613336731262, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.5771365149833518, |
| "grad_norm": 0.4965020439000996, |
| "learning_rate": 9.016604158703654e-06, |
| "loss": 0.7366, |
| "mean_token_accuracy": 0.7761915520691296, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.5919348871624122, |
| "grad_norm": 0.47470212551853064, |
| "learning_rate": 8.502254542407186e-06, |
| "loss": 0.7509, |
| "mean_token_accuracy": 0.7725352138788509, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.5919348871624122, |
| "eval_loss": 0.7736743092536926, |
| "eval_mean_token_accuracy": 0.7563969146133266, |
| "eval_runtime": 36.8476, |
| "eval_samples_per_second": 3.501, |
| "eval_steps_per_second": 0.244, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.6067332593414725, |
| "grad_norm": 0.460020351716667, |
| "learning_rate": 7.991929271442817e-06, |
| "loss": 0.7406, |
| "mean_token_accuracy": 0.7756438027345837, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.6215316315205327, |
| "grad_norm": 0.4298188029300264, |
| "learning_rate": 7.48699955686089e-06, |
| "loss": 0.7481, |
| "mean_token_accuracy": 0.7735235687761747, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.636330003699593, |
| "grad_norm": 0.4966460976529198, |
| "learning_rate": 6.988822112200157e-06, |
| "loss": 0.775, |
| "mean_token_accuracy": 0.7659616415859062, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.6511283758786534, |
| "grad_norm": 0.41853465241085613, |
| "learning_rate": 6.498735508086094e-06, |
| "loss": 0.7498, |
| "mean_token_accuracy": 0.7732758807008967, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.6659267480577137, |
| "grad_norm": 0.4702312396884364, |
| "learning_rate": 6.018056575578075e-06, |
| "loss": 0.7592, |
| "mean_token_accuracy": 0.7700622405709057, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.680725120236774, |
| "grad_norm": 0.42975452969024625, |
| "learning_rate": 5.548076867929331e-06, |
| "loss": 0.7378, |
| "mean_token_accuracy": 0.7767314400830869, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.6955234924158342, |
| "grad_norm": 0.4108583246355971, |
| "learning_rate": 5.090059190266779e-06, |
| "loss": 0.7406, |
| "mean_token_accuracy": 0.7765217843280351, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.7103218645948945, |
| "grad_norm": 0.45304370273166905, |
| "learning_rate": 4.645234206515171e-06, |
| "loss": 0.7316, |
| "mean_token_accuracy": 0.7780036833059428, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.7251202367739549, |
| "grad_norm": 0.4232322187564541, |
| "learning_rate": 4.214797132682597e-06, |
| "loss": 0.744, |
| "mean_token_accuracy": 0.7749563602812362, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.7399186089530152, |
| "grad_norm": 0.46727693453523017, |
| "learning_rate": 3.799904525392251e-06, |
| "loss": 0.7444, |
| "mean_token_accuracy": 0.7747609637386101, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.7547169811320755, |
| "grad_norm": 0.4558283663119779, |
| "learning_rate": 3.401671174289469e-06, |
| "loss": 0.7487, |
| "mean_token_accuracy": 0.7735000671597567, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.7695153533111357, |
| "grad_norm": 0.4473149950802802, |
| "learning_rate": 3.021167106673928e-06, |
| "loss": 0.7445, |
| "mean_token_accuracy": 0.7743566228811749, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.7843137254901961, |
| "grad_norm": 0.4461233002812556, |
| "learning_rate": 2.6594147124053983e-06, |
| "loss": 0.7611, |
| "mean_token_accuracy": 0.7693132343191504, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.7991120976692564, |
| "grad_norm": 0.4288144300029119, |
| "learning_rate": 2.317385996808195e-06, |
| "loss": 0.7445, |
| "mean_token_accuracy": 0.7748136727030183, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.8139104698483167, |
| "grad_norm": 0.42462095484083673, |
| "learning_rate": 1.9959999689556407e-06, |
| "loss": 0.7434, |
| "mean_token_accuracy": 0.7749245362781049, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.8287088420273769, |
| "grad_norm": 0.4063322872281588, |
| "learning_rate": 1.6961201723520248e-06, |
| "loss": 0.7449, |
| "mean_token_accuracy": 0.7749680039942058, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.8435072142064373, |
| "grad_norm": 0.397264999375389, |
| "learning_rate": 1.4185523646469822e-06, |
| "loss": 0.7453, |
| "mean_token_accuracy": 0.7743507033929469, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.8583055863854976, |
| "grad_norm": 0.4451350635097519, |
| "learning_rate": 1.1640423526166987e-06, |
| "loss": 0.742, |
| "mean_token_accuracy": 0.7751791694228443, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.8731039585645579, |
| "grad_norm": 0.43301511965025696, |
| "learning_rate": 9.332739882292752e-07, |
| "loss": 0.7389, |
| "mean_token_accuracy": 0.7757602051567191, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.8879023307436182, |
| "grad_norm": 0.4096833414120662, |
| "learning_rate": 7.268673311786378e-07, |
| "loss": 0.731, |
| "mean_token_accuracy": 0.7785745911127538, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.8879023307436182, |
| "eval_loss": 0.7652355432510376, |
| "eval_mean_token_accuracy": 0.7601538929893327, |
| "eval_runtime": 37.6456, |
| "eval_samples_per_second": 3.427, |
| "eval_steps_per_second": 0.239, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.9027007029226785, |
| "grad_norm": 0.40132760343408064, |
| "learning_rate": 5.453769828241872e-07, |
| "loss": 0.7391, |
| "mean_token_accuracy": 0.7762386779688674, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.9174990751017388, |
| "grad_norm": 0.3932064251891102, |
| "learning_rate": 3.8929059601275463e-07, |
| "loss": 0.712, |
| "mean_token_accuracy": 0.7837027776073648, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.9322974472807991, |
| "grad_norm": 0.404346485308974, |
| "learning_rate": 2.5902756478688674e-07, |
| "loss": 0.74, |
| "mean_token_accuracy": 0.7763130349280738, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.9470958194598594, |
| "grad_norm": 0.4014282081908607, |
| "learning_rate": 1.5493789750014032e-07, |
| "loss": 0.7373, |
| "mean_token_accuracy": 0.7765861246677741, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.9618941916389198, |
| "grad_norm": 0.411881298783951, |
| "learning_rate": 7.730127636723539e-08, |
| "loss": 0.7236, |
| "mean_token_accuracy": 0.7803048063818399, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.97669256381798, |
| "grad_norm": 0.38863002242821104, |
| "learning_rate": 2.6326305976001054e-08, |
| "loss": 0.7384, |
| "mean_token_accuracy": 0.7764187201948197, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.9914909359970403, |
| "grad_norm": 0.4208440998084639, |
| "learning_rate": 2.149952780321485e-09, |
| "loss": 0.7368, |
| "mean_token_accuracy": 0.7763385910458351, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.9974102848686645, |
| "mean_token_accuracy": 0.7691396070282631, |
| "step": 337, |
| "total_flos": 76774385909760.0, |
| "train_loss": 0.7816258675974271, |
| "train_runtime": 9729.0212, |
| "train_samples_per_second": 2.223, |
| "train_steps_per_second": 0.035 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 337, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": false, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 76774385909760.0, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|