{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 50, "global_step": 670, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.14925373134328357, "grad_norm": 35.596871044953, "learning_rate": 1.4925373134328358e-06, "loss": 1.9213, "step": 10 }, { "epoch": 0.29850746268656714, "grad_norm": 20.609759250234525, "learning_rate": 2.9850746268656716e-06, "loss": 1.151, "step": 20 }, { "epoch": 0.44776119402985076, "grad_norm": 17.51541675553714, "learning_rate": 4.477611940298508e-06, "loss": 0.9953, "step": 30 }, { "epoch": 0.5970149253731343, "grad_norm": 12.529056819699033, "learning_rate": 5.970149253731343e-06, "loss": 1.0143, "step": 40 }, { "epoch": 0.746268656716418, "grad_norm": 12.891928164600921, "learning_rate": 7.46268656716418e-06, "loss": 1.0534, "step": 50 }, { "epoch": 0.746268656716418, "eval_loss": 1.2635215520858765, "eval_runtime": 0.9888, "eval_samples_per_second": 22.25, "eval_steps_per_second": 3.034, "step": 50 }, { "epoch": 0.8955223880597015, "grad_norm": 13.18139035026108, "learning_rate": 8.955223880597016e-06, "loss": 1.0096, "step": 60 }, { "epoch": 1.044776119402985, "grad_norm": 10.740181523405264, "learning_rate": 9.999389284703265e-06, "loss": 0.9688, "step": 70 }, { "epoch": 1.1940298507462686, "grad_norm": 10.70923590419392, "learning_rate": 9.988536273658876e-06, "loss": 0.711, "step": 80 }, { "epoch": 1.3432835820895521, "grad_norm": 11.258468673854201, "learning_rate": 9.964145714351633e-06, "loss": 0.7263, "step": 90 }, { "epoch": 1.4925373134328357, "grad_norm": 13.780566009646858, "learning_rate": 9.926283796211796e-06, "loss": 0.8118, "step": 100 }, { "epoch": 1.4925373134328357, "eval_loss": 1.3804577589035034, "eval_runtime": 0.9931, "eval_samples_per_second": 22.152, "eval_steps_per_second": 3.021, "step": 100 }, { "epoch": 1.6417910447761193, "grad_norm": 9.18658078765986, "learning_rate": 9.87505326632108e-06, "loss": 0.8126, "step": 110 }, { "epoch": 1.7910447761194028, "grad_norm": 10.993066748048813, "learning_rate": 9.810593150584658e-06, "loss": 0.8099, "step": 120 }, { "epoch": 1.9402985074626866, "grad_norm": 8.891553991106202, "learning_rate": 9.733078376452172e-06, "loss": 0.804, "step": 130 }, { "epoch": 2.08955223880597, "grad_norm": 8.268316338255294, "learning_rate": 9.642719298211602e-06, "loss": 0.4978, "step": 140 }, { "epoch": 2.2388059701492535, "grad_norm": 8.48854187035418, "learning_rate": 9.539761126144193e-06, "loss": 0.3889, "step": 150 }, { "epoch": 2.2388059701492535, "eval_loss": 1.6006965637207031, "eval_runtime": 0.9927, "eval_samples_per_second": 22.162, "eval_steps_per_second": 3.022, "step": 150 }, { "epoch": 2.388059701492537, "grad_norm": 6.609934011045634, "learning_rate": 9.424483261089584e-06, "loss": 0.3846, "step": 160 }, { "epoch": 2.5373134328358207, "grad_norm": 7.50682908737009, "learning_rate": 9.297198536226927e-06, "loss": 0.4125, "step": 170 }, { "epoch": 2.6865671641791042, "grad_norm": 8.821510256598781, "learning_rate": 9.158252368129628e-06, "loss": 0.4261, "step": 180 }, { "epoch": 2.835820895522388, "grad_norm": 7.416255028966862, "learning_rate": 9.008021819397488e-06, "loss": 0.4488, "step": 190 }, { "epoch": 2.9850746268656714, "grad_norm": 8.233603986842171, "learning_rate": 8.846914575410035e-06, "loss": 0.4361, "step": 200 }, { "epoch": 2.9850746268656714, "eval_loss": 1.532719612121582, "eval_runtime": 0.9841, "eval_samples_per_second": 22.354, "eval_steps_per_second": 3.048, "step": 200 }, { "epoch": 3.1343283582089554, "grad_norm": 8.824258454974311, "learning_rate": 8.675367837977848e-06, "loss": 0.2399, "step": 210 }, { "epoch": 3.283582089552239, "grad_norm": 7.063462594451812, "learning_rate": 8.49384713889421e-06, "loss": 0.2246, "step": 220 }, { "epoch": 3.4328358208955225, "grad_norm": 7.331660049583603, "learning_rate": 8.302845076606786e-06, "loss": 0.2415, "step": 230 }, { "epoch": 3.582089552238806, "grad_norm": 6.424375821012275, "learning_rate": 8.10287997943769e-06, "loss": 0.2615, "step": 240 }, { "epoch": 3.7313432835820897, "grad_norm": 9.043879070705094, "learning_rate": 7.894494498979558e-06, "loss": 0.265, "step": 250 }, { "epoch": 3.7313432835820897, "eval_loss": 1.6067496538162231, "eval_runtime": 0.99, "eval_samples_per_second": 22.223, "eval_steps_per_second": 3.03, "step": 250 }, { "epoch": 3.8805970149253732, "grad_norm": 7.232856290764463, "learning_rate": 7.678254137484797e-06, "loss": 0.2371, "step": 260 }, { "epoch": 4.029850746268656, "grad_norm": 3.885849230377616, "learning_rate": 7.4547457132442895e-06, "loss": 0.2234, "step": 270 }, { "epoch": 4.17910447761194, "grad_norm": 8.201942272896055, "learning_rate": 7.2245757681200835e-06, "loss": 0.1273, "step": 280 }, { "epoch": 4.3283582089552235, "grad_norm": 6.742950931031516, "learning_rate": 6.988368921553601e-06, "loss": 0.1232, "step": 290 }, { "epoch": 4.477611940298507, "grad_norm": 7.18067667554869, "learning_rate": 6.746766175516159e-06, "loss": 0.1347, "step": 300 }, { "epoch": 4.477611940298507, "eval_loss": 1.8177189826965332, "eval_runtime": 0.9881, "eval_samples_per_second": 22.265, "eval_steps_per_second": 3.036, "step": 300 }, { "epoch": 4.6268656716417915, "grad_norm": 4.134448034439449, "learning_rate": 6.500423175001705e-06, "loss": 0.1326, "step": 310 }, { "epoch": 4.776119402985074, "grad_norm": 5.984462599094163, "learning_rate": 6.2500084287822925e-06, "loss": 0.1424, "step": 320 }, { "epoch": 4.925373134328359, "grad_norm": 7.603214916490548, "learning_rate": 5.996201495254757e-06, "loss": 0.1489, "step": 330 }, { "epoch": 5.074626865671641, "grad_norm": 4.174840343630945, "learning_rate": 5.73969113830165e-06, "loss": 0.0922, "step": 340 }, { "epoch": 5.223880597014926, "grad_norm": 4.1502544333430995, "learning_rate": 5.481173458170952e-06, "loss": 0.0857, "step": 350 }, { "epoch": 5.223880597014926, "eval_loss": 1.977053165435791, "eval_runtime": 0.9844, "eval_samples_per_second": 22.349, "eval_steps_per_second": 3.048, "step": 350 }, { "epoch": 5.373134328358209, "grad_norm": 6.862487683858245, "learning_rate": 5.221350002446882e-06, "loss": 0.0723, "step": 360 }, { "epoch": 5.522388059701493, "grad_norm": 4.515093540059101, "learning_rate": 4.96092586223808e-06, "loss": 0.0693, "step": 370 }, { "epoch": 5.6716417910447765, "grad_norm": 5.571199681266628, "learning_rate": 4.700607758749626e-06, "loss": 0.0978, "step": 380 }, { "epoch": 5.82089552238806, "grad_norm": 4.718390811556805, "learning_rate": 4.441102125431398e-06, "loss": 0.0792, "step": 390 }, { "epoch": 5.970149253731344, "grad_norm": 3.7184414288353422, "learning_rate": 4.183113190907349e-06, "loss": 0.0709, "step": 400 }, { "epoch": 5.970149253731344, "eval_loss": 1.9007922410964966, "eval_runtime": 0.9935, "eval_samples_per_second": 22.145, "eval_steps_per_second": 3.02, "step": 400 }, { "epoch": 6.119402985074627, "grad_norm": 3.9943470043032603, "learning_rate": 3.927341067888065e-06, "loss": 0.0309, "step": 410 }, { "epoch": 6.268656716417911, "grad_norm": 2.122106221543816, "learning_rate": 3.6744798532528137e-06, "loss": 0.0318, "step": 420 }, { "epoch": 6.417910447761194, "grad_norm": 5.125755419214468, "learning_rate": 3.4252157444569478e-06, "loss": 0.0388, "step": 430 }, { "epoch": 6.567164179104478, "grad_norm": 3.6926779235153173, "learning_rate": 3.1802251773762294e-06, "loss": 0.0549, "step": 440 }, { "epoch": 6.7164179104477615, "grad_norm": 3.2890545865060345, "learning_rate": 2.9401729906414385e-06, "loss": 0.0474, "step": 450 }, { "epoch": 6.7164179104477615, "eval_loss": 2.131742238998413, "eval_runtime": 0.9906, "eval_samples_per_second": 22.208, "eval_steps_per_second": 3.028, "step": 450 }, { "epoch": 6.865671641791045, "grad_norm": 2.1811442357317303, "learning_rate": 2.7057106214448216e-06, "loss": 0.0502, "step": 460 }, { "epoch": 7.014925373134329, "grad_norm": 1.2788531958401737, "learning_rate": 2.4774743377144265e-06, "loss": 0.0228, "step": 470 }, { "epoch": 7.164179104477612, "grad_norm": 5.299430914553839, "learning_rate": 2.256083511453747e-06, "loss": 0.0145, "step": 480 }, { "epoch": 7.313432835820896, "grad_norm": 2.916687972196201, "learning_rate": 2.042138937932388e-06, "loss": 0.019, "step": 490 }, { "epoch": 7.462686567164179, "grad_norm": 0.1327510881497756, "learning_rate": 1.8362212052889827e-06, "loss": 0.0286, "step": 500 }, { "epoch": 7.462686567164179, "eval_loss": 2.2198660373687744, "eval_runtime": 0.9905, "eval_samples_per_second": 22.21, "eval_steps_per_second": 3.029, "step": 500 }, { "epoch": 7.611940298507463, "grad_norm": 2.6883922474481623, "learning_rate": 1.63888911897084e-06, "loss": 0.0177, "step": 510 }, { "epoch": 7.7611940298507465, "grad_norm": 2.30912712845238, "learning_rate": 1.4506781852859836e-06, "loss": 0.0247, "step": 520 }, { "epoch": 7.91044776119403, "grad_norm": 0.9713338532013991, "learning_rate": 1.2720991581827852e-06, "loss": 0.0129, "step": 530 }, { "epoch": 8.059701492537313, "grad_norm": 0.21817578611988062, "learning_rate": 1.1036366532008552e-06, "loss": 0.0118, "step": 540 }, { "epoch": 8.208955223880597, "grad_norm": 0.9407606204330533, "learning_rate": 9.457478323545749e-07, "loss": 0.0091, "step": 550 }, { "epoch": 8.208955223880597, "eval_loss": 2.2086477279663086, "eval_runtime": 0.9909, "eval_samples_per_second": 22.202, "eval_steps_per_second": 3.028, "step": 550 }, { "epoch": 8.35820895522388, "grad_norm": 0.49920720779292554, "learning_rate": 7.988611635181099e-07, "loss": 0.0065, "step": 560 }, { "epoch": 8.507462686567164, "grad_norm": 0.4774032163274442, "learning_rate": 6.633752576786251e-07, "loss": 0.0104, "step": 570 }, { "epoch": 8.656716417910447, "grad_norm": 0.5916059752548383, "learning_rate": 5.396577872130676e-07, "loss": 0.006, "step": 580 }, { "epoch": 8.805970149253731, "grad_norm": 0.2966687083499466, "learning_rate": 4.2804448812404754e-07, "loss": 0.006, "step": 590 }, { "epoch": 8.955223880597014, "grad_norm": 0.7007110357286702, "learning_rate": 3.288382489424502e-07, "loss": 0.0054, "step": 600 }, { "epoch": 8.955223880597014, "eval_loss": 2.2865116596221924, "eval_runtime": 0.9893, "eval_samples_per_second": 22.239, "eval_steps_per_second": 3.033, "step": 600 }, { "epoch": 9.104477611940299, "grad_norm": 1.48416033550746, "learning_rate": 2.4230828876927293e-07, "loss": 0.006, "step": 610 }, { "epoch": 9.253731343283581, "grad_norm": 1.0330803943081195, "learning_rate": 1.6868942668726408e-07, "loss": 0.0033, "step": 620 }, { "epoch": 9.402985074626866, "grad_norm": 0.9190962016297656, "learning_rate": 1.0818144452496293e-07, "loss": 0.0064, "step": 630 }, { "epoch": 9.552238805970148, "grad_norm": 0.03995791087011875, "learning_rate": 6.094854470245326e-08, "loss": 0.0023, "step": 640 }, { "epoch": 9.701492537313433, "grad_norm": 0.04136050995455753, "learning_rate": 2.711890463007405e-08, "loss": 0.0038, "step": 650 }, { "epoch": 9.701492537313433, "eval_loss": 2.301581621170044, "eval_runtime": 0.9856, "eval_samples_per_second": 22.322, "eval_steps_per_second": 3.044, "step": 650 }, { "epoch": 9.850746268656717, "grad_norm": 0.875079509506855, "learning_rate": 6.784328869339218e-09, "loss": 0.0037, "step": 660 }, { "epoch": 10.0, "grad_norm": 0.27404452578171096, "learning_rate": 0.0, "loss": 0.0034, "step": 670 }, { "epoch": 10.0, "step": 670, "total_flos": 51054080163840.0, "train_loss": 0.28853574658174125, "train_runtime": 2679.9258, "train_samples_per_second": 3.989, "train_steps_per_second": 0.25 } ], "logging_steps": 10, "max_steps": 670, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 51054080163840.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }