| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 825, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.03636363636363636, |
| "grad_norm": 3.2293971017711174, |
| "learning_rate": 5e-06, |
| "loss": 1.0337, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.07272727272727272, |
| "grad_norm": 1.2461654883314972, |
| "learning_rate": 5e-06, |
| "loss": 0.9092, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.10909090909090909, |
| "grad_norm": 1.0937923635217501, |
| "learning_rate": 5e-06, |
| "loss": 0.8658, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.14545454545454545, |
| "grad_norm": 1.3350225945199414, |
| "learning_rate": 5e-06, |
| "loss": 0.844, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.18181818181818182, |
| "grad_norm": 1.029425810987488, |
| "learning_rate": 5e-06, |
| "loss": 0.8249, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.21818181818181817, |
| "grad_norm": 1.5219290967515304, |
| "learning_rate": 5e-06, |
| "loss": 0.8068, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.2545454545454545, |
| "grad_norm": 1.6740495880819521, |
| "learning_rate": 5e-06, |
| "loss": 0.7989, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.2909090909090909, |
| "grad_norm": 1.2973735477904815, |
| "learning_rate": 5e-06, |
| "loss": 0.7921, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.32727272727272727, |
| "grad_norm": 0.8566363002967183, |
| "learning_rate": 5e-06, |
| "loss": 0.781, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.36363636363636365, |
| "grad_norm": 0.9961672641644985, |
| "learning_rate": 5e-06, |
| "loss": 0.7745, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 0.8927257959121373, |
| "learning_rate": 5e-06, |
| "loss": 0.7754, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.43636363636363634, |
| "grad_norm": 0.6135178704985191, |
| "learning_rate": 5e-06, |
| "loss": 0.772, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.4727272727272727, |
| "grad_norm": 0.7431505188106242, |
| "learning_rate": 5e-06, |
| "loss": 0.7686, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.509090909090909, |
| "grad_norm": 0.7150787812569424, |
| "learning_rate": 5e-06, |
| "loss": 0.7618, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.5454545454545454, |
| "grad_norm": 0.6352342662453642, |
| "learning_rate": 5e-06, |
| "loss": 0.7611, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.5818181818181818, |
| "grad_norm": 0.6257901300873526, |
| "learning_rate": 5e-06, |
| "loss": 0.7569, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.6181818181818182, |
| "grad_norm": 0.6387102446786417, |
| "learning_rate": 5e-06, |
| "loss": 0.7611, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.6545454545454545, |
| "grad_norm": 0.5983754152683597, |
| "learning_rate": 5e-06, |
| "loss": 0.7546, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.6909090909090909, |
| "grad_norm": 0.7480127979666656, |
| "learning_rate": 5e-06, |
| "loss": 0.7566, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.7272727272727273, |
| "grad_norm": 0.5804396007389026, |
| "learning_rate": 5e-06, |
| "loss": 0.75, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.7636363636363637, |
| "grad_norm": 0.682148918886327, |
| "learning_rate": 5e-06, |
| "loss": 0.7476, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 0.8039336411015884, |
| "learning_rate": 5e-06, |
| "loss": 0.7462, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.8363636363636363, |
| "grad_norm": 0.6876607052536684, |
| "learning_rate": 5e-06, |
| "loss": 0.7411, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.8727272727272727, |
| "grad_norm": 0.6588151842699974, |
| "learning_rate": 5e-06, |
| "loss": 0.7469, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.9090909090909091, |
| "grad_norm": 0.6715213794720472, |
| "learning_rate": 5e-06, |
| "loss": 0.7378, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.9454545454545454, |
| "grad_norm": 0.5870957383826958, |
| "learning_rate": 5e-06, |
| "loss": 0.7457, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.9818181818181818, |
| "grad_norm": 0.6643986810785624, |
| "learning_rate": 5e-06, |
| "loss": 0.7466, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.7417545914649963, |
| "eval_runtime": 26.6218, |
| "eval_samples_per_second": 278.268, |
| "eval_steps_per_second": 1.089, |
| "step": 275 |
| }, |
| { |
| "epoch": 1.018181818181818, |
| "grad_norm": 0.9781239153342394, |
| "learning_rate": 5e-06, |
| "loss": 0.7136, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.0545454545454545, |
| "grad_norm": 0.7152925984087143, |
| "learning_rate": 5e-06, |
| "loss": 0.6871, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.0909090909090908, |
| "grad_norm": 0.6929492576277494, |
| "learning_rate": 5e-06, |
| "loss": 0.6894, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.1272727272727272, |
| "grad_norm": 0.728764264622129, |
| "learning_rate": 5e-06, |
| "loss": 0.6935, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.1636363636363636, |
| "grad_norm": 0.7252517543389313, |
| "learning_rate": 5e-06, |
| "loss": 0.6945, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.2, |
| "grad_norm": 0.6665160391388197, |
| "learning_rate": 5e-06, |
| "loss": 0.6886, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.2363636363636363, |
| "grad_norm": 0.7161659905517039, |
| "learning_rate": 5e-06, |
| "loss": 0.6898, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.2727272727272727, |
| "grad_norm": 0.5719039452566653, |
| "learning_rate": 5e-06, |
| "loss": 0.6934, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.309090909090909, |
| "grad_norm": 0.6060853746189843, |
| "learning_rate": 5e-06, |
| "loss": 0.6922, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.3454545454545455, |
| "grad_norm": 0.6563719933283224, |
| "learning_rate": 5e-06, |
| "loss": 0.6912, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.3818181818181818, |
| "grad_norm": 0.5958006047997326, |
| "learning_rate": 5e-06, |
| "loss": 0.6904, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.4181818181818182, |
| "grad_norm": 0.7430218105320606, |
| "learning_rate": 5e-06, |
| "loss": 0.688, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.4545454545454546, |
| "grad_norm": 0.6322073230662588, |
| "learning_rate": 5e-06, |
| "loss": 0.6883, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.490909090909091, |
| "grad_norm": 0.7151221978666452, |
| "learning_rate": 5e-06, |
| "loss": 0.6934, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.5272727272727273, |
| "grad_norm": 0.6184168187218901, |
| "learning_rate": 5e-06, |
| "loss": 0.6916, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.5636363636363635, |
| "grad_norm": 0.6280848540221795, |
| "learning_rate": 5e-06, |
| "loss": 0.6916, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 0.6568705155050817, |
| "learning_rate": 5e-06, |
| "loss": 0.6856, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.6363636363636362, |
| "grad_norm": 0.6359258851827682, |
| "learning_rate": 5e-06, |
| "loss": 0.6851, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.6727272727272728, |
| "grad_norm": 0.710888538426671, |
| "learning_rate": 5e-06, |
| "loss": 0.6872, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.709090909090909, |
| "grad_norm": 0.7584066029266229, |
| "learning_rate": 5e-06, |
| "loss": 0.6849, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.7454545454545456, |
| "grad_norm": 0.5960492892442344, |
| "learning_rate": 5e-06, |
| "loss": 0.6891, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.7818181818181817, |
| "grad_norm": 0.5629377755020811, |
| "learning_rate": 5e-06, |
| "loss": 0.6847, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.8181818181818183, |
| "grad_norm": 0.589716689792314, |
| "learning_rate": 5e-06, |
| "loss": 0.6871, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.8545454545454545, |
| "grad_norm": 0.5740509121739076, |
| "learning_rate": 5e-06, |
| "loss": 0.6888, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.8909090909090909, |
| "grad_norm": 0.5891046247600111, |
| "learning_rate": 5e-06, |
| "loss": 0.6884, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.9272727272727272, |
| "grad_norm": 0.6447276827053491, |
| "learning_rate": 5e-06, |
| "loss": 0.6893, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.9636363636363636, |
| "grad_norm": 0.6935516132206995, |
| "learning_rate": 5e-06, |
| "loss": 0.6868, |
| "step": 540 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.5781509823001448, |
| "learning_rate": 5e-06, |
| "loss": 0.6841, |
| "step": 550 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.7281343340873718, |
| "eval_runtime": 26.4698, |
| "eval_samples_per_second": 279.867, |
| "eval_steps_per_second": 1.096, |
| "step": 550 |
| }, |
| { |
| "epoch": 2.036363636363636, |
| "grad_norm": 0.7551729949207574, |
| "learning_rate": 5e-06, |
| "loss": 0.6351, |
| "step": 560 |
| }, |
| { |
| "epoch": 2.0727272727272728, |
| "grad_norm": 0.6070448901420726, |
| "learning_rate": 5e-06, |
| "loss": 0.6307, |
| "step": 570 |
| }, |
| { |
| "epoch": 2.109090909090909, |
| "grad_norm": 0.7225948313371118, |
| "learning_rate": 5e-06, |
| "loss": 0.6357, |
| "step": 580 |
| }, |
| { |
| "epoch": 2.1454545454545455, |
| "grad_norm": 1.3944109200671733, |
| "learning_rate": 5e-06, |
| "loss": 0.6375, |
| "step": 590 |
| }, |
| { |
| "epoch": 2.1818181818181817, |
| "grad_norm": 1.1390572133302885, |
| "learning_rate": 5e-06, |
| "loss": 0.635, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.2181818181818183, |
| "grad_norm": 0.7900509422330505, |
| "learning_rate": 5e-06, |
| "loss": 0.6383, |
| "step": 610 |
| }, |
| { |
| "epoch": 2.2545454545454544, |
| "grad_norm": 0.594871030626621, |
| "learning_rate": 5e-06, |
| "loss": 0.6321, |
| "step": 620 |
| }, |
| { |
| "epoch": 2.290909090909091, |
| "grad_norm": 0.665898906007086, |
| "learning_rate": 5e-06, |
| "loss": 0.6341, |
| "step": 630 |
| }, |
| { |
| "epoch": 2.327272727272727, |
| "grad_norm": 0.6509722897169726, |
| "learning_rate": 5e-06, |
| "loss": 0.6326, |
| "step": 640 |
| }, |
| { |
| "epoch": 2.3636363636363638, |
| "grad_norm": 0.6231670817005929, |
| "learning_rate": 5e-06, |
| "loss": 0.6385, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.4, |
| "grad_norm": 0.6425410588561774, |
| "learning_rate": 5e-06, |
| "loss": 0.6373, |
| "step": 660 |
| }, |
| { |
| "epoch": 2.4363636363636365, |
| "grad_norm": 0.621241338432262, |
| "learning_rate": 5e-06, |
| "loss": 0.6399, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.4727272727272727, |
| "grad_norm": 0.6924233110335524, |
| "learning_rate": 5e-06, |
| "loss": 0.6393, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.509090909090909, |
| "grad_norm": 0.6419114963815122, |
| "learning_rate": 5e-06, |
| "loss": 0.6405, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.5454545454545454, |
| "grad_norm": 0.7336852368102121, |
| "learning_rate": 5e-06, |
| "loss": 0.6385, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.581818181818182, |
| "grad_norm": 0.7922288944252411, |
| "learning_rate": 5e-06, |
| "loss": 0.6377, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.618181818181818, |
| "grad_norm": 0.6500377491351792, |
| "learning_rate": 5e-06, |
| "loss": 0.6427, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.6545454545454543, |
| "grad_norm": 0.6853834065254241, |
| "learning_rate": 5e-06, |
| "loss": 0.6346, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.690909090909091, |
| "grad_norm": 0.8156333668312422, |
| "learning_rate": 5e-06, |
| "loss": 0.6414, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.7272727272727275, |
| "grad_norm": 0.6294215183471213, |
| "learning_rate": 5e-06, |
| "loss": 0.6363, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.7636363636363637, |
| "grad_norm": 0.8237171162592375, |
| "learning_rate": 5e-06, |
| "loss": 0.6421, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.8, |
| "grad_norm": 0.6772752476166749, |
| "learning_rate": 5e-06, |
| "loss": 0.6356, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.8363636363636364, |
| "grad_norm": 0.7780500988065099, |
| "learning_rate": 5e-06, |
| "loss": 0.6425, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.8727272727272726, |
| "grad_norm": 0.6862874007983163, |
| "learning_rate": 5e-06, |
| "loss": 0.6368, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.909090909090909, |
| "grad_norm": 0.5748210856771035, |
| "learning_rate": 5e-06, |
| "loss": 0.6405, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.9454545454545453, |
| "grad_norm": 0.6351457621560951, |
| "learning_rate": 5e-06, |
| "loss": 0.6357, |
| "step": 810 |
| }, |
| { |
| "epoch": 2.981818181818182, |
| "grad_norm": 0.586627253325874, |
| "learning_rate": 5e-06, |
| "loss": 0.6412, |
| "step": 820 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.7299705147743225, |
| "eval_runtime": 25.9228, |
| "eval_samples_per_second": 285.772, |
| "eval_steps_per_second": 1.119, |
| "step": 825 |
| }, |
| { |
| "epoch": 3.0, |
| "step": 825, |
| "total_flos": 1381905727488000.0, |
| "train_loss": 0.7044297796307188, |
| "train_runtime": 5353.5806, |
| "train_samples_per_second": 78.866, |
| "train_steps_per_second": 0.154 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 825, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1381905727488000.0, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|