{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 975,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03076923076923077,
      "grad_norm": 2.3324186560475026,
      "learning_rate": 5e-06,
      "loss": 0.9796,
      "step": 10
    },
    {
      "epoch": 0.06153846153846154,
      "grad_norm": 1.0624345768919523,
      "learning_rate": 5e-06,
      "loss": 0.8629,
      "step": 20
    },
    {
      "epoch": 0.09230769230769231,
      "grad_norm": 1.0856708355070386,
      "learning_rate": 5e-06,
      "loss": 0.8232,
      "step": 30
    },
    {
      "epoch": 0.12307692307692308,
      "grad_norm": 1.3950701294784864,
      "learning_rate": 5e-06,
      "loss": 0.8034,
      "step": 40
    },
    {
      "epoch": 0.15384615384615385,
      "grad_norm": 0.9965613643303693,
      "learning_rate": 5e-06,
      "loss": 0.7826,
      "step": 50
    },
    {
      "epoch": 0.18461538461538463,
      "grad_norm": 0.835067800401052,
      "learning_rate": 5e-06,
      "loss": 0.7723,
      "step": 60
    },
    {
      "epoch": 0.2153846153846154,
      "grad_norm": 0.781647957469068,
      "learning_rate": 5e-06,
      "loss": 0.767,
      "step": 70
    },
    {
      "epoch": 0.24615384615384617,
      "grad_norm": 1.1712549789017386,
      "learning_rate": 5e-06,
      "loss": 0.7559,
      "step": 80
    },
    {
      "epoch": 0.27692307692307694,
      "grad_norm": 1.0092940618549995,
      "learning_rate": 5e-06,
      "loss": 0.7469,
      "step": 90
    },
    {
      "epoch": 0.3076923076923077,
      "grad_norm": 0.5624307958541879,
      "learning_rate": 5e-06,
      "loss": 0.7414,
      "step": 100
    },
    {
      "epoch": 0.3384615384615385,
      "grad_norm": 0.5739018842323079,
      "learning_rate": 5e-06,
      "loss": 0.7402,
      "step": 110
    },
    {
      "epoch": 0.36923076923076925,
      "grad_norm": 0.5881069634317425,
      "learning_rate": 5e-06,
      "loss": 0.7409,
      "step": 120
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.7381656504744467,
      "learning_rate": 5e-06,
      "loss": 0.7376,
      "step": 130
    },
    {
      "epoch": 0.4307692307692308,
      "grad_norm": 0.5677544829127201,
      "learning_rate": 5e-06,
      "loss": 0.7298,
      "step": 140
    },
    {
      "epoch": 0.46153846153846156,
      "grad_norm": 0.7988060295240222,
      "learning_rate": 5e-06,
      "loss": 0.734,
      "step": 150
    },
    {
      "epoch": 0.49230769230769234,
      "grad_norm": 0.6718712556340437,
      "learning_rate": 5e-06,
      "loss": 0.7301,
      "step": 160
    },
    {
      "epoch": 0.5230769230769231,
      "grad_norm": 0.6738769041614976,
      "learning_rate": 5e-06,
      "loss": 0.7316,
      "step": 170
    },
    {
      "epoch": 0.5538461538461539,
      "grad_norm": 0.5838925373137313,
      "learning_rate": 5e-06,
      "loss": 0.7253,
      "step": 180
    },
    {
      "epoch": 0.5846153846153846,
      "grad_norm": 0.6725428665358475,
      "learning_rate": 5e-06,
      "loss": 0.7241,
      "step": 190
    },
    {
      "epoch": 0.6153846153846154,
      "grad_norm": 0.5603996763654583,
      "learning_rate": 5e-06,
      "loss": 0.7228,
      "step": 200
    },
    {
      "epoch": 0.6461538461538462,
      "grad_norm": 0.7449029762390439,
      "learning_rate": 5e-06,
      "loss": 0.7204,
      "step": 210
    },
    {
      "epoch": 0.676923076923077,
      "grad_norm": 0.5236476916978245,
      "learning_rate": 5e-06,
      "loss": 0.717,
      "step": 220
    },
    {
      "epoch": 0.7076923076923077,
      "grad_norm": 0.6865974040573393,
      "learning_rate": 5e-06,
      "loss": 0.7134,
      "step": 230
    },
    {
      "epoch": 0.7384615384615385,
      "grad_norm": 0.6717184095724111,
      "learning_rate": 5e-06,
      "loss": 0.7159,
      "step": 240
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 0.8235556500295945,
      "learning_rate": 5e-06,
      "loss": 0.711,
      "step": 250
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.5131501040264781,
      "learning_rate": 5e-06,
      "loss": 0.7165,
      "step": 260
    },
    {
      "epoch": 0.8307692307692308,
      "grad_norm": 0.5193918430229191,
      "learning_rate": 5e-06,
      "loss": 0.7055,
      "step": 270
    },
    {
      "epoch": 0.8615384615384616,
      "grad_norm": 0.5456897002979056,
      "learning_rate": 5e-06,
      "loss": 0.711,
      "step": 280
    },
    {
      "epoch": 0.8923076923076924,
      "grad_norm": 0.5750072559309753,
      "learning_rate": 5e-06,
      "loss": 0.7084,
      "step": 290
    },
    {
      "epoch": 0.9230769230769231,
      "grad_norm": 0.6116232515773206,
      "learning_rate": 5e-06,
      "loss": 0.7128,
      "step": 300
    },
    {
      "epoch": 0.9538461538461539,
      "grad_norm": 0.5681793375147939,
      "learning_rate": 5e-06,
      "loss": 0.7086,
      "step": 310
    },
    {
      "epoch": 0.9846153846153847,
      "grad_norm": 0.6343675909171526,
      "learning_rate": 5e-06,
      "loss": 0.7104,
      "step": 320
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.7103046774864197,
      "eval_runtime": 341.5784,
      "eval_samples_per_second": 25.637,
      "eval_steps_per_second": 0.401,
      "step": 325
    },
    {
      "epoch": 1.0153846153846153,
      "grad_norm": 0.7692870192954333,
      "learning_rate": 5e-06,
      "loss": 0.6897,
      "step": 330
    },
    {
      "epoch": 1.0461538461538462,
      "grad_norm": 0.5505626206403108,
      "learning_rate": 5e-06,
      "loss": 0.6636,
      "step": 340
    },
    {
      "epoch": 1.0769230769230769,
      "grad_norm": 0.5824910302345806,
      "learning_rate": 5e-06,
      "loss": 0.665,
      "step": 350
    },
    {
      "epoch": 1.1076923076923078,
      "grad_norm": 0.6417795632315966,
      "learning_rate": 5e-06,
      "loss": 0.664,
      "step": 360
    },
    {
      "epoch": 1.1384615384615384,
      "grad_norm": 0.6909469669994084,
      "learning_rate": 5e-06,
      "loss": 0.6634,
      "step": 370
    },
    {
      "epoch": 1.1692307692307693,
      "grad_norm": 0.5616732809499269,
      "learning_rate": 5e-06,
      "loss": 0.6653,
      "step": 380
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.567347803253029,
      "learning_rate": 5e-06,
      "loss": 0.6605,
      "step": 390
    },
    {
      "epoch": 1.2307692307692308,
      "grad_norm": 0.6020788318645328,
      "learning_rate": 5e-06,
      "loss": 0.6649,
      "step": 400
    },
    {
      "epoch": 1.2615384615384615,
      "grad_norm": 0.6247830402384781,
      "learning_rate": 5e-06,
      "loss": 0.6662,
      "step": 410
    },
    {
      "epoch": 1.2923076923076924,
      "grad_norm": 0.7223035560412274,
      "learning_rate": 5e-06,
      "loss": 0.6631,
      "step": 420
    },
    {
      "epoch": 1.323076923076923,
      "grad_norm": 0.7277266215407076,
      "learning_rate": 5e-06,
      "loss": 0.6596,
      "step": 430
    },
    {
      "epoch": 1.353846153846154,
      "grad_norm": 0.5531058222331595,
      "learning_rate": 5e-06,
      "loss": 0.6659,
      "step": 440
    },
    {
      "epoch": 1.3846153846153846,
      "grad_norm": 0.7560378640801586,
      "learning_rate": 5e-06,
      "loss": 0.6654,
      "step": 450
    },
    {
      "epoch": 1.4153846153846155,
      "grad_norm": 0.6002650702602214,
      "learning_rate": 5e-06,
      "loss": 0.6602,
      "step": 460
    },
    {
      "epoch": 1.4461538461538461,
      "grad_norm": 0.5446217104790005,
      "learning_rate": 5e-06,
      "loss": 0.6633,
      "step": 470
    },
    {
      "epoch": 1.476923076923077,
      "grad_norm": 0.6427854645725265,
      "learning_rate": 5e-06,
      "loss": 0.6606,
      "step": 480
    },
    {
      "epoch": 1.5076923076923077,
      "grad_norm": 0.5777163615379136,
      "learning_rate": 5e-06,
      "loss": 0.6619,
      "step": 490
    },
    {
      "epoch": 1.5384615384615383,
      "grad_norm": 0.5645927733372786,
      "learning_rate": 5e-06,
      "loss": 0.6613,
      "step": 500
    },
    {
      "epoch": 1.5692307692307692,
      "grad_norm": 0.5877237236567665,
      "learning_rate": 5e-06,
      "loss": 0.6666,
      "step": 510
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.7318508117188922,
      "learning_rate": 5e-06,
      "loss": 0.6657,
      "step": 520
    },
    {
      "epoch": 1.6307692307692307,
      "grad_norm": 0.5337461204074161,
      "learning_rate": 5e-06,
      "loss": 0.6678,
      "step": 530
    },
    {
      "epoch": 1.6615384615384614,
      "grad_norm": 0.6743103368990547,
      "learning_rate": 5e-06,
      "loss": 0.6593,
      "step": 540
    },
    {
      "epoch": 1.6923076923076923,
      "grad_norm": 0.7414434681428989,
      "learning_rate": 5e-06,
      "loss": 0.6575,
      "step": 550
    },
    {
      "epoch": 1.7230769230769232,
      "grad_norm": 0.8444136346063817,
      "learning_rate": 5e-06,
      "loss": 0.6631,
      "step": 560
    },
    {
      "epoch": 1.7538461538461538,
      "grad_norm": 0.5509955001744837,
      "learning_rate": 5e-06,
      "loss": 0.6592,
      "step": 570
    },
    {
      "epoch": 1.7846153846153845,
      "grad_norm": 0.6630480503790037,
      "learning_rate": 5e-06,
      "loss": 0.6594,
      "step": 580
    },
    {
      "epoch": 1.8153846153846154,
      "grad_norm": 0.5120012670268558,
      "learning_rate": 5e-06,
      "loss": 0.6568,
      "step": 590
    },
    {
      "epoch": 1.8461538461538463,
      "grad_norm": 0.5752130113945352,
      "learning_rate": 5e-06,
      "loss": 0.6599,
      "step": 600
    },
    {
      "epoch": 1.876923076923077,
      "grad_norm": 0.5835189539855182,
      "learning_rate": 5e-06,
      "loss": 0.6591,
      "step": 610
    },
    {
      "epoch": 1.9076923076923076,
      "grad_norm": 0.5603728195599548,
      "learning_rate": 5e-06,
      "loss": 0.6629,
      "step": 620
    },
    {
      "epoch": 1.9384615384615385,
      "grad_norm": 0.6856529088244573,
      "learning_rate": 5e-06,
      "loss": 0.6586,
      "step": 630
    },
    {
      "epoch": 1.9692307692307693,
      "grad_norm": 0.5160023610012462,
      "learning_rate": 5e-06,
      "loss": 0.6603,
      "step": 640
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.6268329172925445,
      "learning_rate": 5e-06,
      "loss": 0.658,
      "step": 650
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.6978012323379517,
      "eval_runtime": 343.0061,
      "eval_samples_per_second": 25.53,
      "eval_steps_per_second": 0.399,
      "step": 650
    },
    {
      "epoch": 2.0307692307692307,
      "grad_norm": 0.7968437785991972,
      "learning_rate": 5e-06,
      "loss": 0.6088,
      "step": 660
    },
    {
      "epoch": 2.0615384615384613,
      "grad_norm": 1.1539294867806145,
      "learning_rate": 5e-06,
      "loss": 0.6083,
      "step": 670
    },
    {
      "epoch": 2.0923076923076924,
      "grad_norm": 0.9623521768346174,
      "learning_rate": 5e-06,
      "loss": 0.6149,
      "step": 680
    },
    {
      "epoch": 2.123076923076923,
      "grad_norm": 0.6040507943945268,
      "learning_rate": 5e-06,
      "loss": 0.6164,
      "step": 690
    },
    {
      "epoch": 2.1538461538461537,
      "grad_norm": 0.5851170034060879,
      "learning_rate": 5e-06,
      "loss": 0.6115,
      "step": 700
    },
    {
      "epoch": 2.184615384615385,
      "grad_norm": 0.600096702685487,
      "learning_rate": 5e-06,
      "loss": 0.6153,
      "step": 710
    },
    {
      "epoch": 2.2153846153846155,
      "grad_norm": 0.5589523777176305,
      "learning_rate": 5e-06,
      "loss": 0.6133,
      "step": 720
    },
    {
      "epoch": 2.246153846153846,
      "grad_norm": 0.5907169270955183,
      "learning_rate": 5e-06,
      "loss": 0.6131,
      "step": 730
    },
    {
      "epoch": 2.276923076923077,
      "grad_norm": 0.6628827223280479,
      "learning_rate": 5e-06,
      "loss": 0.6159,
      "step": 740
    },
    {
      "epoch": 2.3076923076923075,
      "grad_norm": 0.6215774766516785,
      "learning_rate": 5e-06,
      "loss": 0.6149,
      "step": 750
    },
    {
      "epoch": 2.3384615384615386,
      "grad_norm": 0.6649815302458084,
      "learning_rate": 5e-06,
      "loss": 0.6132,
      "step": 760
    },
    {
      "epoch": 2.3692307692307693,
      "grad_norm": 0.699779615122456,
      "learning_rate": 5e-06,
      "loss": 0.6128,
      "step": 770
    },
    {
      "epoch": 2.4,
      "grad_norm": 0.546711048417698,
      "learning_rate": 5e-06,
      "loss": 0.6164,
      "step": 780
    },
    {
      "epoch": 2.430769230769231,
      "grad_norm": 0.6327118907910535,
      "learning_rate": 5e-06,
      "loss": 0.623,
      "step": 790
    },
    {
      "epoch": 2.4615384615384617,
      "grad_norm": 0.5174866030508046,
      "learning_rate": 5e-06,
      "loss": 0.6166,
      "step": 800
    },
    {
      "epoch": 2.4923076923076923,
      "grad_norm": 0.5497949972140503,
      "learning_rate": 5e-06,
      "loss": 0.615,
      "step": 810
    },
    {
      "epoch": 2.523076923076923,
      "grad_norm": 0.5993580492245987,
      "learning_rate": 5e-06,
      "loss": 0.614,
      "step": 820
    },
    {
      "epoch": 2.5538461538461537,
      "grad_norm": 0.6442768713339674,
      "learning_rate": 5e-06,
      "loss": 0.6222,
      "step": 830
    },
    {
      "epoch": 2.5846153846153848,
      "grad_norm": 0.6428843121497717,
      "learning_rate": 5e-06,
      "loss": 0.617,
      "step": 840
    },
    {
      "epoch": 2.6153846153846154,
      "grad_norm": 0.6117392354746288,
      "learning_rate": 5e-06,
      "loss": 0.62,
      "step": 850
    },
    {
      "epoch": 2.646153846153846,
      "grad_norm": 0.7180295224765794,
      "learning_rate": 5e-06,
      "loss": 0.619,
      "step": 860
    },
    {
      "epoch": 2.676923076923077,
      "grad_norm": 0.6501140616713876,
      "learning_rate": 5e-06,
      "loss": 0.6164,
      "step": 870
    },
    {
      "epoch": 2.707692307692308,
      "grad_norm": 0.5412288061836285,
      "learning_rate": 5e-06,
      "loss": 0.6218,
      "step": 880
    },
    {
      "epoch": 2.7384615384615385,
      "grad_norm": 0.7302614499001912,
      "learning_rate": 5e-06,
      "loss": 0.6176,
      "step": 890
    },
    {
      "epoch": 2.769230769230769,
      "grad_norm": 0.6661401784715161,
      "learning_rate": 5e-06,
      "loss": 0.6217,
      "step": 900
    },
    {
      "epoch": 2.8,
      "grad_norm": 0.7263597509162613,
      "learning_rate": 5e-06,
      "loss": 0.6154,
      "step": 910
    },
    {
      "epoch": 2.830769230769231,
      "grad_norm": 0.7506085991128838,
      "learning_rate": 5e-06,
      "loss": 0.6189,
      "step": 920
    },
    {
      "epoch": 2.8615384615384616,
      "grad_norm": 0.5917421117589724,
      "learning_rate": 5e-06,
      "loss": 0.6211,
      "step": 930
    },
    {
      "epoch": 2.8923076923076922,
      "grad_norm": 0.5306708247183923,
      "learning_rate": 5e-06,
      "loss": 0.6175,
      "step": 940
    },
    {
      "epoch": 2.9230769230769234,
      "grad_norm": 0.6648394376559398,
      "learning_rate": 5e-06,
      "loss": 0.6222,
      "step": 950
    },
    {
      "epoch": 2.953846153846154,
      "grad_norm": 0.5459019838181679,
      "learning_rate": 5e-06,
      "loss": 0.6174,
      "step": 960
    },
    {
      "epoch": 2.9846153846153847,
      "grad_norm": 0.6951553333219783,
      "learning_rate": 5e-06,
      "loss": 0.6176,
      "step": 970
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.6986876726150513,
      "eval_runtime": 342.0684,
      "eval_samples_per_second": 25.6,
      "eval_steps_per_second": 0.401,
      "step": 975
    },
    {
      "epoch": 3.0,
      "step": 975,
      "total_flos": 1632951934648320.0,
      "train_loss": 0.6751218311603253,
      "train_runtime": 57099.1896,
      "train_samples_per_second": 8.741,
      "train_steps_per_second": 0.017
    }
  ],
  "logging_steps": 10,
  "max_steps": 975,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1632951934648320.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}