{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.995899772209567, "eval_steps": 500, "global_step": 822, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03644646924829157, "grad_norm": 13.273536409408566, "learning_rate": 5e-06, "loss": 1.0215, "step": 10 }, { "epoch": 0.07289293849658314, "grad_norm": 2.0256934340222124, "learning_rate": 5e-06, "loss": 0.901, "step": 20 }, { "epoch": 0.10933940774487472, "grad_norm": 1.1110745496419128, "learning_rate": 5e-06, "loss": 0.8577, "step": 30 }, { "epoch": 0.14578587699316628, "grad_norm": 1.2500399664289437, "learning_rate": 5e-06, "loss": 0.8319, "step": 40 }, { "epoch": 0.18223234624145787, "grad_norm": 0.993614371192878, "learning_rate": 5e-06, "loss": 0.8081, "step": 50 }, { "epoch": 0.21867881548974943, "grad_norm": 0.97363008151794, "learning_rate": 5e-06, "loss": 0.7963, "step": 60 }, { "epoch": 0.255125284738041, "grad_norm": 0.9129099469883597, "learning_rate": 5e-06, "loss": 0.7888, "step": 70 }, { "epoch": 0.29157175398633256, "grad_norm": 0.9344193654869811, "learning_rate": 5e-06, "loss": 0.7758, "step": 80 }, { "epoch": 0.32801822323462415, "grad_norm": 1.0188210165751896, "learning_rate": 5e-06, "loss": 0.775, "step": 90 }, { "epoch": 0.36446469248291574, "grad_norm": 0.9232347440747942, "learning_rate": 5e-06, "loss": 0.7692, "step": 100 }, { "epoch": 0.4009111617312073, "grad_norm": 0.6783140292643164, "learning_rate": 5e-06, "loss": 0.7621, "step": 110 }, { "epoch": 0.43735763097949887, "grad_norm": 0.6184031681315292, "learning_rate": 5e-06, "loss": 0.7556, "step": 120 }, { "epoch": 0.47380410022779046, "grad_norm": 0.5469349607429109, "learning_rate": 5e-06, "loss": 0.7551, "step": 130 }, { "epoch": 0.510250569476082, "grad_norm": 0.8877445271536203, "learning_rate": 5e-06, "loss": 0.7554, "step": 140 }, { "epoch": 0.5466970387243736, "grad_norm": 1.0291306012367956, "learning_rate": 5e-06, "loss": 0.7495, "step": 150 }, { "epoch": 0.5831435079726651, "grad_norm": 0.6607692936776239, "learning_rate": 5e-06, "loss": 0.7459, "step": 160 }, { "epoch": 0.6195899772209568, "grad_norm": 0.6336927673267501, "learning_rate": 5e-06, "loss": 0.748, "step": 170 }, { "epoch": 0.6560364464692483, "grad_norm": 0.8944901762240539, "learning_rate": 5e-06, "loss": 0.7452, "step": 180 }, { "epoch": 0.6924829157175398, "grad_norm": 0.8843897330408937, "learning_rate": 5e-06, "loss": 0.7445, "step": 190 }, { "epoch": 0.7289293849658315, "grad_norm": 0.6972008986400734, "learning_rate": 5e-06, "loss": 0.7385, "step": 200 }, { "epoch": 0.765375854214123, "grad_norm": 0.5894622066220608, "learning_rate": 5e-06, "loss": 0.7375, "step": 210 }, { "epoch": 0.8018223234624146, "grad_norm": 0.5755055010916849, "learning_rate": 5e-06, "loss": 0.7366, "step": 220 }, { "epoch": 0.8382687927107062, "grad_norm": 0.6956357933104967, "learning_rate": 5e-06, "loss": 0.7418, "step": 230 }, { "epoch": 0.8747152619589977, "grad_norm": 0.7222014717098794, "learning_rate": 5e-06, "loss": 0.7289, "step": 240 }, { "epoch": 0.9111617312072893, "grad_norm": 0.6509301599056833, "learning_rate": 5e-06, "loss": 0.7353, "step": 250 }, { "epoch": 0.9476082004555809, "grad_norm": 0.5946415096003963, "learning_rate": 5e-06, "loss": 0.739, "step": 260 }, { "epoch": 0.9840546697038725, "grad_norm": 0.6399983053851734, "learning_rate": 5e-06, "loss": 0.7315, "step": 270 }, { "epoch": 0.9986332574031891, "eval_loss": 0.7315455079078674, "eval_runtime": 290.5947, "eval_samples_per_second": 25.437, "eval_steps_per_second": 0.399, "step": 274 }, { "epoch": 1.020501138952164, "grad_norm": 0.8395093526707935, "learning_rate": 5e-06, "loss": 0.747, "step": 280 }, { "epoch": 1.0569476082004556, "grad_norm": 0.9199729633560787, "learning_rate": 5e-06, "loss": 0.6787, "step": 290 }, { "epoch": 1.0933940774487472, "grad_norm": 0.6628677057491944, "learning_rate": 5e-06, "loss": 0.6791, "step": 300 }, { "epoch": 1.1298405466970387, "grad_norm": 0.6614989831751948, "learning_rate": 5e-06, "loss": 0.6854, "step": 310 }, { "epoch": 1.1662870159453302, "grad_norm": 0.6964522514874895, "learning_rate": 5e-06, "loss": 0.6763, "step": 320 }, { "epoch": 1.2027334851936218, "grad_norm": 0.7090306269606215, "learning_rate": 5e-06, "loss": 0.6753, "step": 330 }, { "epoch": 1.2391799544419135, "grad_norm": 0.648532712130652, "learning_rate": 5e-06, "loss": 0.68, "step": 340 }, { "epoch": 1.275626423690205, "grad_norm": 0.7822954196339824, "learning_rate": 5e-06, "loss": 0.6817, "step": 350 }, { "epoch": 1.3120728929384966, "grad_norm": 0.6766423459315555, "learning_rate": 5e-06, "loss": 0.6803, "step": 360 }, { "epoch": 1.3485193621867881, "grad_norm": 0.7731309625470634, "learning_rate": 5e-06, "loss": 0.6788, "step": 370 }, { "epoch": 1.3849658314350797, "grad_norm": 0.6229285700860081, "learning_rate": 5e-06, "loss": 0.6856, "step": 380 }, { "epoch": 1.4214123006833712, "grad_norm": 0.6927410350677501, "learning_rate": 5e-06, "loss": 0.6808, "step": 390 }, { "epoch": 1.4578587699316627, "grad_norm": 0.834486739783265, "learning_rate": 5e-06, "loss": 0.6772, "step": 400 }, { "epoch": 1.4943052391799545, "grad_norm": 0.7099676513539387, "learning_rate": 5e-06, "loss": 0.6803, "step": 410 }, { "epoch": 1.530751708428246, "grad_norm": 0.6104516289365347, "learning_rate": 5e-06, "loss": 0.683, "step": 420 }, { "epoch": 1.5671981776765376, "grad_norm": 0.5971848121166693, "learning_rate": 5e-06, "loss": 0.6787, "step": 430 }, { "epoch": 1.603644646924829, "grad_norm": 0.6649414637192727, "learning_rate": 5e-06, "loss": 0.6849, "step": 440 }, { "epoch": 1.6400911161731209, "grad_norm": 0.7320907085872882, "learning_rate": 5e-06, "loss": 0.6817, "step": 450 }, { "epoch": 1.6765375854214124, "grad_norm": 0.5705453457499549, "learning_rate": 5e-06, "loss": 0.6836, "step": 460 }, { "epoch": 1.712984054669704, "grad_norm": 0.6288020854363963, "learning_rate": 5e-06, "loss": 0.6788, "step": 470 }, { "epoch": 1.7494305239179955, "grad_norm": 0.5726327402033801, "learning_rate": 5e-06, "loss": 0.6808, "step": 480 }, { "epoch": 1.785876993166287, "grad_norm": 0.5173548522448698, "learning_rate": 5e-06, "loss": 0.6799, "step": 490 }, { "epoch": 1.8223234624145785, "grad_norm": 0.5790339638087626, "learning_rate": 5e-06, "loss": 0.6747, "step": 500 }, { "epoch": 1.85876993166287, "grad_norm": 0.6584239869836397, "learning_rate": 5e-06, "loss": 0.677, "step": 510 }, { "epoch": 1.8952164009111616, "grad_norm": 0.5311060458966043, "learning_rate": 5e-06, "loss": 0.6854, "step": 520 }, { "epoch": 1.9316628701594531, "grad_norm": 0.6512560331845895, "learning_rate": 5e-06, "loss": 0.6784, "step": 530 }, { "epoch": 1.968109339407745, "grad_norm": 0.584794911589519, "learning_rate": 5e-06, "loss": 0.6805, "step": 540 }, { "epoch": 1.9972665148063782, "eval_loss": 0.7187947630882263, "eval_runtime": 291.938, "eval_samples_per_second": 25.32, "eval_steps_per_second": 0.397, "step": 548 }, { "epoch": 2.0045558086560367, "grad_norm": 0.885264185792607, "learning_rate": 5e-06, "loss": 0.7193, "step": 550 }, { "epoch": 2.041002277904328, "grad_norm": 0.7654881044126012, "learning_rate": 5e-06, "loss": 0.6233, "step": 560 }, { "epoch": 2.0774487471526197, "grad_norm": 0.6151945001911823, "learning_rate": 5e-06, "loss": 0.6283, "step": 570 }, { "epoch": 2.1138952164009113, "grad_norm": 0.8374365216828517, "learning_rate": 5e-06, "loss": 0.6207, "step": 580 }, { "epoch": 2.150341685649203, "grad_norm": 0.7031485699411321, "learning_rate": 5e-06, "loss": 0.6244, "step": 590 }, { "epoch": 2.1867881548974943, "grad_norm": 0.8633299586157721, "learning_rate": 5e-06, "loss": 0.6258, "step": 600 }, { "epoch": 2.223234624145786, "grad_norm": 0.7458455143129973, "learning_rate": 5e-06, "loss": 0.628, "step": 610 }, { "epoch": 2.2596810933940774, "grad_norm": 1.0109682737601884, "learning_rate": 5e-06, "loss": 0.6264, "step": 620 }, { "epoch": 2.296127562642369, "grad_norm": 0.6211087319945944, "learning_rate": 5e-06, "loss": 0.6242, "step": 630 }, { "epoch": 2.3325740318906605, "grad_norm": 0.7036393621519607, "learning_rate": 5e-06, "loss": 0.6305, "step": 640 }, { "epoch": 2.369020501138952, "grad_norm": 0.6058323256112293, "learning_rate": 5e-06, "loss": 0.6297, "step": 650 }, { "epoch": 2.4054669703872436, "grad_norm": 0.6608686938446593, "learning_rate": 5e-06, "loss": 0.631, "step": 660 }, { "epoch": 2.4419134396355355, "grad_norm": 0.6199043838308076, "learning_rate": 5e-06, "loss": 0.6264, "step": 670 }, { "epoch": 2.478359908883827, "grad_norm": 0.607673754805363, "learning_rate": 5e-06, "loss": 0.6266, "step": 680 }, { "epoch": 2.5148063781321186, "grad_norm": 0.9277091135129097, "learning_rate": 5e-06, "loss": 0.6263, "step": 690 }, { "epoch": 2.55125284738041, "grad_norm": 0.9381891238069503, "learning_rate": 5e-06, "loss": 0.6317, "step": 700 }, { "epoch": 2.5876993166287017, "grad_norm": 0.6592786383334494, "learning_rate": 5e-06, "loss": 0.6264, "step": 710 }, { "epoch": 2.624145785876993, "grad_norm": 0.7421181566721138, "learning_rate": 5e-06, "loss": 0.6295, "step": 720 }, { "epoch": 2.6605922551252847, "grad_norm": 0.6781081672896357, "learning_rate": 5e-06, "loss": 0.6273, "step": 730 }, { "epoch": 2.6970387243735763, "grad_norm": 0.609137054982541, "learning_rate": 5e-06, "loss": 0.6328, "step": 740 }, { "epoch": 2.733485193621868, "grad_norm": 0.6919361244155826, "learning_rate": 5e-06, "loss": 0.6333, "step": 750 }, { "epoch": 2.7699316628701594, "grad_norm": 0.6379259386020866, "learning_rate": 5e-06, "loss": 0.6306, "step": 760 }, { "epoch": 2.806378132118451, "grad_norm": 0.6035608731746878, "learning_rate": 5e-06, "loss": 0.6338, "step": 770 }, { "epoch": 2.8428246013667424, "grad_norm": 0.7325417971133363, "learning_rate": 5e-06, "loss": 0.6352, "step": 780 }, { "epoch": 2.879271070615034, "grad_norm": 0.8532590605538493, "learning_rate": 5e-06, "loss": 0.6284, "step": 790 }, { "epoch": 2.9157175398633255, "grad_norm": 0.6185281977585761, "learning_rate": 5e-06, "loss": 0.6306, "step": 800 }, { "epoch": 2.9521640091116175, "grad_norm": 0.6806046770942457, "learning_rate": 5e-06, "loss": 0.6402, "step": 810 }, { "epoch": 2.988610478359909, "grad_norm": 0.6802410015239903, "learning_rate": 5e-06, "loss": 0.6348, "step": 820 }, { "epoch": 2.995899772209567, "eval_loss": 0.7211272120475769, "eval_runtime": 291.822, "eval_samples_per_second": 25.331, "eval_steps_per_second": 0.398, "step": 822 }, { "epoch": 2.995899772209567, "step": 822, "total_flos": 1376671236096000.0, "train_loss": 0.6964337152866261, "train_runtime": 48380.1477, "train_samples_per_second": 8.709, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 822, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1376671236096000.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }