{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.995899772209567,
  "eval_steps": 500,
  "global_step": 822,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03644646924829157,
      "grad_norm": 13.273536409408566,
      "learning_rate": 5e-06,
      "loss": 1.0215,
      "step": 10
    },
    {
      "epoch": 0.07289293849658314,
      "grad_norm": 2.0256934340222124,
      "learning_rate": 5e-06,
      "loss": 0.901,
      "step": 20
    },
    {
      "epoch": 0.10933940774487472,
      "grad_norm": 1.1110745496419128,
      "learning_rate": 5e-06,
      "loss": 0.8577,
      "step": 30
    },
    {
      "epoch": 0.14578587699316628,
      "grad_norm": 1.2500399664289437,
      "learning_rate": 5e-06,
      "loss": 0.8319,
      "step": 40
    },
    {
      "epoch": 0.18223234624145787,
      "grad_norm": 0.993614371192878,
      "learning_rate": 5e-06,
      "loss": 0.8081,
      "step": 50
    },
    {
      "epoch": 0.21867881548974943,
      "grad_norm": 0.97363008151794,
      "learning_rate": 5e-06,
      "loss": 0.7963,
      "step": 60
    },
    {
      "epoch": 0.255125284738041,
      "grad_norm": 0.9129099469883597,
      "learning_rate": 5e-06,
      "loss": 0.7888,
      "step": 70
    },
    {
      "epoch": 0.29157175398633256,
      "grad_norm": 0.9344193654869811,
      "learning_rate": 5e-06,
      "loss": 0.7758,
      "step": 80
    },
    {
      "epoch": 0.32801822323462415,
      "grad_norm": 1.0188210165751896,
      "learning_rate": 5e-06,
      "loss": 0.775,
      "step": 90
    },
    {
      "epoch": 0.36446469248291574,
      "grad_norm": 0.9232347440747942,
      "learning_rate": 5e-06,
      "loss": 0.7692,
      "step": 100
    },
    {
      "epoch": 0.4009111617312073,
      "grad_norm": 0.6783140292643164,
      "learning_rate": 5e-06,
      "loss": 0.7621,
      "step": 110
    },
    {
      "epoch": 0.43735763097949887,
      "grad_norm": 0.6184031681315292,
      "learning_rate": 5e-06,
      "loss": 0.7556,
      "step": 120
    },
    {
      "epoch": 0.47380410022779046,
      "grad_norm": 0.5469349607429109,
      "learning_rate": 5e-06,
      "loss": 0.7551,
      "step": 130
    },
    {
      "epoch": 0.510250569476082,
      "grad_norm": 0.8877445271536203,
      "learning_rate": 5e-06,
      "loss": 0.7554,
      "step": 140
    },
    {
      "epoch": 0.5466970387243736,
      "grad_norm": 1.0291306012367956,
      "learning_rate": 5e-06,
      "loss": 0.7495,
      "step": 150
    },
    {
      "epoch": 0.5831435079726651,
      "grad_norm": 0.6607692936776239,
      "learning_rate": 5e-06,
      "loss": 0.7459,
      "step": 160
    },
    {
      "epoch": 0.6195899772209568,
      "grad_norm": 0.6336927673267501,
      "learning_rate": 5e-06,
      "loss": 0.748,
      "step": 170
    },
    {
      "epoch": 0.6560364464692483,
      "grad_norm": 0.8944901762240539,
      "learning_rate": 5e-06,
      "loss": 0.7452,
      "step": 180
    },
    {
      "epoch": 0.6924829157175398,
      "grad_norm": 0.8843897330408937,
      "learning_rate": 5e-06,
      "loss": 0.7445,
      "step": 190
    },
    {
      "epoch": 0.7289293849658315,
      "grad_norm": 0.6972008986400734,
      "learning_rate": 5e-06,
      "loss": 0.7385,
      "step": 200
    },
    {
      "epoch": 0.765375854214123,
      "grad_norm": 0.5894622066220608,
      "learning_rate": 5e-06,
      "loss": 0.7375,
      "step": 210
    },
    {
      "epoch": 0.8018223234624146,
      "grad_norm": 0.5755055010916849,
      "learning_rate": 5e-06,
      "loss": 0.7366,
      "step": 220
    },
    {
      "epoch": 0.8382687927107062,
      "grad_norm": 0.6956357933104967,
      "learning_rate": 5e-06,
      "loss": 0.7418,
      "step": 230
    },
    {
      "epoch": 0.8747152619589977,
      "grad_norm": 0.7222014717098794,
      "learning_rate": 5e-06,
      "loss": 0.7289,
      "step": 240
    },
    {
      "epoch": 0.9111617312072893,
      "grad_norm": 0.6509301599056833,
      "learning_rate": 5e-06,
      "loss": 0.7353,
      "step": 250
    },
    {
      "epoch": 0.9476082004555809,
      "grad_norm": 0.5946415096003963,
      "learning_rate": 5e-06,
      "loss": 0.739,
      "step": 260
    },
    {
      "epoch": 0.9840546697038725,
      "grad_norm": 0.6399983053851734,
      "learning_rate": 5e-06,
      "loss": 0.7315,
      "step": 270
    },
    {
      "epoch": 0.9986332574031891,
      "eval_loss": 0.7315455079078674,
      "eval_runtime": 290.5947,
      "eval_samples_per_second": 25.437,
      "eval_steps_per_second": 0.399,
      "step": 274
    },
    {
      "epoch": 1.020501138952164,
      "grad_norm": 0.8395093526707935,
      "learning_rate": 5e-06,
      "loss": 0.747,
      "step": 280
    },
    {
      "epoch": 1.0569476082004556,
      "grad_norm": 0.9199729633560787,
      "learning_rate": 5e-06,
      "loss": 0.6787,
      "step": 290
    },
    {
      "epoch": 1.0933940774487472,
      "grad_norm": 0.6628677057491944,
      "learning_rate": 5e-06,
      "loss": 0.6791,
      "step": 300
    },
    {
      "epoch": 1.1298405466970387,
      "grad_norm": 0.6614989831751948,
      "learning_rate": 5e-06,
      "loss": 0.6854,
      "step": 310
    },
    {
      "epoch": 1.1662870159453302,
      "grad_norm": 0.6964522514874895,
      "learning_rate": 5e-06,
      "loss": 0.6763,
      "step": 320
    },
    {
      "epoch": 1.2027334851936218,
      "grad_norm": 0.7090306269606215,
      "learning_rate": 5e-06,
      "loss": 0.6753,
      "step": 330
    },
    {
      "epoch": 1.2391799544419135,
      "grad_norm": 0.648532712130652,
      "learning_rate": 5e-06,
      "loss": 0.68,
      "step": 340
    },
    {
      "epoch": 1.275626423690205,
      "grad_norm": 0.7822954196339824,
      "learning_rate": 5e-06,
      "loss": 0.6817,
      "step": 350
    },
    {
      "epoch": 1.3120728929384966,
      "grad_norm": 0.6766423459315555,
      "learning_rate": 5e-06,
      "loss": 0.6803,
      "step": 360
    },
    {
      "epoch": 1.3485193621867881,
      "grad_norm": 0.7731309625470634,
      "learning_rate": 5e-06,
      "loss": 0.6788,
      "step": 370
    },
    {
      "epoch": 1.3849658314350797,
      "grad_norm": 0.6229285700860081,
      "learning_rate": 5e-06,
      "loss": 0.6856,
      "step": 380
    },
    {
      "epoch": 1.4214123006833712,
      "grad_norm": 0.6927410350677501,
      "learning_rate": 5e-06,
      "loss": 0.6808,
      "step": 390
    },
    {
      "epoch": 1.4578587699316627,
      "grad_norm": 0.834486739783265,
      "learning_rate": 5e-06,
      "loss": 0.6772,
      "step": 400
    },
    {
      "epoch": 1.4943052391799545,
      "grad_norm": 0.7099676513539387,
      "learning_rate": 5e-06,
      "loss": 0.6803,
      "step": 410
    },
    {
      "epoch": 1.530751708428246,
      "grad_norm": 0.6104516289365347,
      "learning_rate": 5e-06,
      "loss": 0.683,
      "step": 420
    },
    {
      "epoch": 1.5671981776765376,
      "grad_norm": 0.5971848121166693,
      "learning_rate": 5e-06,
      "loss": 0.6787,
      "step": 430
    },
    {
      "epoch": 1.603644646924829,
      "grad_norm": 0.6649414637192727,
      "learning_rate": 5e-06,
      "loss": 0.6849,
      "step": 440
    },
    {
      "epoch": 1.6400911161731209,
      "grad_norm": 0.7320907085872882,
      "learning_rate": 5e-06,
      "loss": 0.6817,
      "step": 450
    },
    {
      "epoch": 1.6765375854214124,
      "grad_norm": 0.5705453457499549,
      "learning_rate": 5e-06,
      "loss": 0.6836,
      "step": 460
    },
    {
      "epoch": 1.712984054669704,
      "grad_norm": 0.6288020854363963,
      "learning_rate": 5e-06,
      "loss": 0.6788,
      "step": 470
    },
    {
      "epoch": 1.7494305239179955,
      "grad_norm": 0.5726327402033801,
      "learning_rate": 5e-06,
      "loss": 0.6808,
      "step": 480
    },
    {
      "epoch": 1.785876993166287,
      "grad_norm": 0.5173548522448698,
      "learning_rate": 5e-06,
      "loss": 0.6799,
      "step": 490
    },
    {
      "epoch": 1.8223234624145785,
      "grad_norm": 0.5790339638087626,
      "learning_rate": 5e-06,
      "loss": 0.6747,
      "step": 500
    },
    {
      "epoch": 1.85876993166287,
      "grad_norm": 0.6584239869836397,
      "learning_rate": 5e-06,
      "loss": 0.677,
      "step": 510
    },
    {
      "epoch": 1.8952164009111616,
      "grad_norm": 0.5311060458966043,
      "learning_rate": 5e-06,
      "loss": 0.6854,
      "step": 520
    },
    {
      "epoch": 1.9316628701594531,
      "grad_norm": 0.6512560331845895,
      "learning_rate": 5e-06,
      "loss": 0.6784,
      "step": 530
    },
    {
      "epoch": 1.968109339407745,
      "grad_norm": 0.584794911589519,
      "learning_rate": 5e-06,
      "loss": 0.6805,
      "step": 540
    },
    {
      "epoch": 1.9972665148063782,
      "eval_loss": 0.7187947630882263,
      "eval_runtime": 291.938,
      "eval_samples_per_second": 25.32,
      "eval_steps_per_second": 0.397,
      "step": 548
    },
    {
      "epoch": 2.0045558086560367,
      "grad_norm": 0.885264185792607,
      "learning_rate": 5e-06,
      "loss": 0.7193,
      "step": 550
    },
    {
      "epoch": 2.041002277904328,
      "grad_norm": 0.7654881044126012,
      "learning_rate": 5e-06,
      "loss": 0.6233,
      "step": 560
    },
    {
      "epoch": 2.0774487471526197,
      "grad_norm": 0.6151945001911823,
      "learning_rate": 5e-06,
      "loss": 0.6283,
      "step": 570
    },
    {
      "epoch": 2.1138952164009113,
      "grad_norm": 0.8374365216828517,
      "learning_rate": 5e-06,
      "loss": 0.6207,
      "step": 580
    },
    {
      "epoch": 2.150341685649203,
      "grad_norm": 0.7031485699411321,
      "learning_rate": 5e-06,
      "loss": 0.6244,
      "step": 590
    },
    {
      "epoch": 2.1867881548974943,
      "grad_norm": 0.8633299586157721,
      "learning_rate": 5e-06,
      "loss": 0.6258,
      "step": 600
    },
    {
      "epoch": 2.223234624145786,
      "grad_norm": 0.7458455143129973,
      "learning_rate": 5e-06,
      "loss": 0.628,
      "step": 610
    },
    {
      "epoch": 2.2596810933940774,
      "grad_norm": 1.0109682737601884,
      "learning_rate": 5e-06,
      "loss": 0.6264,
      "step": 620
    },
    {
      "epoch": 2.296127562642369,
      "grad_norm": 0.6211087319945944,
      "learning_rate": 5e-06,
      "loss": 0.6242,
      "step": 630
    },
    {
      "epoch": 2.3325740318906605,
      "grad_norm": 0.7036393621519607,
      "learning_rate": 5e-06,
      "loss": 0.6305,
      "step": 640
    },
    {
      "epoch": 2.369020501138952,
      "grad_norm": 0.6058323256112293,
      "learning_rate": 5e-06,
      "loss": 0.6297,
      "step": 650
    },
    {
      "epoch": 2.4054669703872436,
      "grad_norm": 0.6608686938446593,
      "learning_rate": 5e-06,
      "loss": 0.631,
      "step": 660
    },
    {
      "epoch": 2.4419134396355355,
      "grad_norm": 0.6199043838308076,
      "learning_rate": 5e-06,
      "loss": 0.6264,
      "step": 670
    },
    {
      "epoch": 2.478359908883827,
      "grad_norm": 0.607673754805363,
      "learning_rate": 5e-06,
      "loss": 0.6266,
      "step": 680
    },
    {
      "epoch": 2.5148063781321186,
      "grad_norm": 0.9277091135129097,
      "learning_rate": 5e-06,
      "loss": 0.6263,
      "step": 690
    },
    {
      "epoch": 2.55125284738041,
      "grad_norm": 0.9381891238069503,
      "learning_rate": 5e-06,
      "loss": 0.6317,
      "step": 700
    },
    {
      "epoch": 2.5876993166287017,
      "grad_norm": 0.6592786383334494,
      "learning_rate": 5e-06,
      "loss": 0.6264,
      "step": 710
    },
    {
      "epoch": 2.624145785876993,
      "grad_norm": 0.7421181566721138,
      "learning_rate": 5e-06,
      "loss": 0.6295,
      "step": 720
    },
    {
      "epoch": 2.6605922551252847,
      "grad_norm": 0.6781081672896357,
      "learning_rate": 5e-06,
      "loss": 0.6273,
      "step": 730
    },
    {
      "epoch": 2.6970387243735763,
      "grad_norm": 0.609137054982541,
      "learning_rate": 5e-06,
      "loss": 0.6328,
      "step": 740
    },
    {
      "epoch": 2.733485193621868,
      "grad_norm": 0.6919361244155826,
      "learning_rate": 5e-06,
      "loss": 0.6333,
      "step": 750
    },
    {
      "epoch": 2.7699316628701594,
      "grad_norm": 0.6379259386020866,
      "learning_rate": 5e-06,
      "loss": 0.6306,
      "step": 760
    },
    {
      "epoch": 2.806378132118451,
      "grad_norm": 0.6035608731746878,
      "learning_rate": 5e-06,
      "loss": 0.6338,
      "step": 770
    },
    {
      "epoch": 2.8428246013667424,
      "grad_norm": 0.7325417971133363,
      "learning_rate": 5e-06,
      "loss": 0.6352,
      "step": 780
    },
    {
      "epoch": 2.879271070615034,
      "grad_norm": 0.8532590605538493,
      "learning_rate": 5e-06,
      "loss": 0.6284,
      "step": 790
    },
    {
      "epoch": 2.9157175398633255,
      "grad_norm": 0.6185281977585761,
      "learning_rate": 5e-06,
      "loss": 0.6306,
      "step": 800
    },
    {
      "epoch": 2.9521640091116175,
      "grad_norm": 0.6806046770942457,
      "learning_rate": 5e-06,
      "loss": 0.6402,
      "step": 810
    },
    {
      "epoch": 2.988610478359909,
      "grad_norm": 0.6802410015239903,
      "learning_rate": 5e-06,
      "loss": 0.6348,
      "step": 820
    },
    {
      "epoch": 2.995899772209567,
      "eval_loss": 0.7211272120475769,
      "eval_runtime": 291.822,
      "eval_samples_per_second": 25.331,
      "eval_steps_per_second": 0.398,
      "step": 822
    },
    {
      "epoch": 2.995899772209567,
      "step": 822,
      "total_flos": 1376671236096000.0,
      "train_loss": 0.6964337152866261,
      "train_runtime": 48380.1477,
      "train_samples_per_second": 8.709,
      "train_steps_per_second": 0.017
    }
  ],
  "logging_steps": 10,
  "max_steps": 822,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1376671236096000.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}