{ "best_global_step": 400, "best_metric": 2.051694393157959, "best_model_checkpoint": "models/MNLP_M3_rag_model_test/checkpoint-400", "epoch": 2.08355091383812, "eval_steps": 200, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05221932114882506, "grad_norm": 17.204919815063477, "learning_rate": 8.000000000000001e-07, "loss": 3.6299, "step": 10 }, { "epoch": 0.10443864229765012, "grad_norm": 10.213739395141602, "learning_rate": 1.8000000000000001e-06, "loss": 3.263, "step": 20 }, { "epoch": 0.1566579634464752, "grad_norm": 10.35930347442627, "learning_rate": 2.8000000000000003e-06, "loss": 2.7275, "step": 30 }, { "epoch": 0.20887728459530025, "grad_norm": 13.71743106842041, "learning_rate": 3.8000000000000005e-06, "loss": 2.7291, "step": 40 }, { "epoch": 0.26109660574412535, "grad_norm": 9.006918907165527, "learning_rate": 4.800000000000001e-06, "loss": 2.551, "step": 50 }, { "epoch": 0.3133159268929504, "grad_norm": 7.384293556213379, "learning_rate": 5.8e-06, "loss": 2.4662, "step": 60 }, { "epoch": 0.36553524804177545, "grad_norm": 7.212233543395996, "learning_rate": 6.800000000000001e-06, "loss": 2.3987, "step": 70 }, { "epoch": 0.4177545691906005, "grad_norm": 8.014248847961426, "learning_rate": 7.800000000000002e-06, "loss": 2.3993, "step": 80 }, { "epoch": 0.4699738903394256, "grad_norm": 7.093222618103027, "learning_rate": 8.8e-06, "loss": 2.3603, "step": 90 }, { "epoch": 0.5221932114882507, "grad_norm": 9.149968147277832, "learning_rate": 9.800000000000001e-06, "loss": 2.3923, "step": 100 }, { "epoch": 0.5744125326370757, "grad_norm": 8.030098915100098, "learning_rate": 9.943977591036416e-06, "loss": 2.2983, "step": 110 }, { "epoch": 0.6266318537859008, "grad_norm": 7.210897445678711, "learning_rate": 9.873949579831935e-06, "loss": 2.3049, "step": 120 }, { "epoch": 0.6788511749347258, "grad_norm": 7.169530391693115, "learning_rate": 9.803921568627451e-06, "loss": 2.2872, "step": 130 }, { "epoch": 0.7310704960835509, "grad_norm": 7.033287525177002, "learning_rate": 9.73389355742297e-06, "loss": 2.279, "step": 140 }, { "epoch": 0.783289817232376, "grad_norm": 7.60181188583374, "learning_rate": 9.663865546218488e-06, "loss": 2.2453, "step": 150 }, { "epoch": 0.835509138381201, "grad_norm": 7.923684597015381, "learning_rate": 9.593837535014006e-06, "loss": 2.2335, "step": 160 }, { "epoch": 0.8877284595300261, "grad_norm": 6.4125165939331055, "learning_rate": 9.523809523809525e-06, "loss": 2.2682, "step": 170 }, { "epoch": 0.9399477806788512, "grad_norm": 9.670829772949219, "learning_rate": 9.453781512605043e-06, "loss": 2.1589, "step": 180 }, { "epoch": 0.9921671018276762, "grad_norm": 6.57144832611084, "learning_rate": 9.383753501400561e-06, "loss": 2.04, "step": 190 }, { "epoch": 1.04177545691906, "grad_norm": 6.862838268280029, "learning_rate": 9.31372549019608e-06, "loss": 1.867, "step": 200 }, { "epoch": 1.04177545691906, "eval_loss": 2.155261278152466, "eval_runtime": 4.7541, "eval_samples_per_second": 35.969, "eval_steps_per_second": 4.628, "step": 200 }, { "epoch": 1.0939947780678851, "grad_norm": 12.507933616638184, "learning_rate": 9.243697478991598e-06, "loss": 1.7035, "step": 210 }, { "epoch": 1.1462140992167102, "grad_norm": 8.804220199584961, "learning_rate": 9.173669467787115e-06, "loss": 1.6772, "step": 220 }, { "epoch": 1.1984334203655354, "grad_norm": 8.77147102355957, "learning_rate": 9.103641456582633e-06, "loss": 1.6972, "step": 230 }, { "epoch": 1.2506527415143602, "grad_norm": 8.48830795288086, "learning_rate": 9.033613445378152e-06, "loss": 1.5959, "step": 240 }, { "epoch": 1.3028720626631853, "grad_norm": 7.4547648429870605, "learning_rate": 8.96358543417367e-06, "loss": 1.6931, "step": 250 }, { "epoch": 1.3550913838120104, "grad_norm": 10.059548377990723, "learning_rate": 8.893557422969188e-06, "loss": 1.6475, "step": 260 }, { "epoch": 1.4073107049608355, "grad_norm": 8.088046073913574, "learning_rate": 8.823529411764707e-06, "loss": 1.7921, "step": 270 }, { "epoch": 1.4595300261096606, "grad_norm": 7.830897808074951, "learning_rate": 8.753501400560225e-06, "loss": 1.6743, "step": 280 }, { "epoch": 1.5117493472584855, "grad_norm": 10.651778221130371, "learning_rate": 8.683473389355744e-06, "loss": 1.614, "step": 290 }, { "epoch": 1.5639686684073109, "grad_norm": 8.227381706237793, "learning_rate": 8.613445378151262e-06, "loss": 1.612, "step": 300 }, { "epoch": 1.6161879895561357, "grad_norm": 8.035416603088379, "learning_rate": 8.543417366946779e-06, "loss": 1.6633, "step": 310 }, { "epoch": 1.6684073107049608, "grad_norm": 8.318634033203125, "learning_rate": 8.473389355742297e-06, "loss": 1.6989, "step": 320 }, { "epoch": 1.720626631853786, "grad_norm": 6.585997581481934, "learning_rate": 8.403361344537815e-06, "loss": 1.6686, "step": 330 }, { "epoch": 1.7728459530026108, "grad_norm": 6.752286911010742, "learning_rate": 8.333333333333334e-06, "loss": 1.7218, "step": 340 }, { "epoch": 1.8250652741514362, "grad_norm": 8.7686128616333, "learning_rate": 8.263305322128852e-06, "loss": 1.588, "step": 350 }, { "epoch": 1.877284595300261, "grad_norm": 7.1550703048706055, "learning_rate": 8.19327731092437e-06, "loss": 1.5188, "step": 360 }, { "epoch": 1.9295039164490861, "grad_norm": 8.00949478149414, "learning_rate": 8.123249299719889e-06, "loss": 1.5708, "step": 370 }, { "epoch": 1.9817232375979112, "grad_norm": 8.731331825256348, "learning_rate": 8.053221288515407e-06, "loss": 1.565, "step": 380 }, { "epoch": 2.031331592689295, "grad_norm": 8.245712280273438, "learning_rate": 7.983193277310926e-06, "loss": 1.3907, "step": 390 }, { "epoch": 2.08355091383812, "grad_norm": 10.174018859863281, "learning_rate": 7.913165266106442e-06, "loss": 1.1657, "step": 400 }, { "epoch": 2.08355091383812, "eval_loss": 2.051694393157959, "eval_runtime": 4.6523, "eval_samples_per_second": 36.756, "eval_steps_per_second": 4.729, "step": 400 } ], "logging_steps": 10, "max_steps": 1528, "num_input_tokens_seen": 0, "num_train_epochs": 8, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4316440119214080.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }