| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 12.0, |
| "eval_steps": 500, |
| "global_step": 108, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.11490125673249552, |
| "grad_norm": 1.8050661087036133, |
| "learning_rate": 0.0, |
| "loss": 0.2697, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.22980251346499103, |
| "grad_norm": 1.9665236473083496, |
| "learning_rate": 9.09090909090909e-08, |
| "loss": 0.2879, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.34470377019748655, |
| "grad_norm": 1.8855077028274536, |
| "learning_rate": 1.818181818181818e-07, |
| "loss": 0.2696, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.45960502692998206, |
| "grad_norm": 1.998793125152588, |
| "learning_rate": 2.727272727272727e-07, |
| "loss": 0.3002, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.5745062836624776, |
| "grad_norm": 1.83466637134552, |
| "learning_rate": 3.636363636363636e-07, |
| "loss": 0.2759, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.6894075403949731, |
| "grad_norm": 1.848267912864685, |
| "learning_rate": 4.545454545454545e-07, |
| "loss": 0.2759, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.8043087971274686, |
| "grad_norm": 1.7061188220977783, |
| "learning_rate": 5.454545454545454e-07, |
| "loss": 0.2666, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.9192100538599641, |
| "grad_norm": 1.8305636644363403, |
| "learning_rate": 6.363636363636363e-07, |
| "loss": 0.2805, |
| "step": 8 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 1.634291172027588, |
| "learning_rate": 7.272727272727272e-07, |
| "loss": 0.2622, |
| "step": 9 |
| }, |
| { |
| "epoch": 1.1149012567324954, |
| "grad_norm": 2.2485928535461426, |
| "learning_rate": 8.181818181818182e-07, |
| "loss": 0.2544, |
| "step": 10 |
| }, |
| { |
| "epoch": 1.229802513464991, |
| "grad_norm": 1.6178233623504639, |
| "learning_rate": 9.09090909090909e-07, |
| "loss": 0.2777, |
| "step": 11 |
| }, |
| { |
| "epoch": 1.3447037701974867, |
| "grad_norm": 1.6333025693893433, |
| "learning_rate": 1e-06, |
| "loss": 0.2825, |
| "step": 12 |
| }, |
| { |
| "epoch": 1.459605026929982, |
| "grad_norm": 1.5703474283218384, |
| "learning_rate": 9.997640060704816e-07, |
| "loss": 0.2715, |
| "step": 13 |
| }, |
| { |
| "epoch": 1.5745062836624775, |
| "grad_norm": 1.486207127571106, |
| "learning_rate": 9.990562718069702e-07, |
| "loss": 0.283, |
| "step": 14 |
| }, |
| { |
| "epoch": 1.689407540394973, |
| "grad_norm": 1.2863779067993164, |
| "learning_rate": 9.978775395249762e-07, |
| "loss": 0.2554, |
| "step": 15 |
| }, |
| { |
| "epoch": 1.8043087971274687, |
| "grad_norm": 1.2911548614501953, |
| "learning_rate": 9.962290455518912e-07, |
| "loss": 0.2629, |
| "step": 16 |
| }, |
| { |
| "epoch": 1.9192100538599641, |
| "grad_norm": 1.2554247379302979, |
| "learning_rate": 9.941125189302508e-07, |
| "loss": 0.2486, |
| "step": 17 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 1.239845871925354, |
| "learning_rate": 9.915301796042075e-07, |
| "loss": 0.2368, |
| "step": 18 |
| }, |
| { |
| "epoch": 2.1149012567324954, |
| "grad_norm": 1.3584400415420532, |
| "learning_rate": 9.884847360911167e-07, |
| "loss": 0.2628, |
| "step": 19 |
| }, |
| { |
| "epoch": 2.229802513464991, |
| "grad_norm": 1.2527025938034058, |
| "learning_rate": 9.84979382640675e-07, |
| "loss": 0.2419, |
| "step": 20 |
| }, |
| { |
| "epoch": 2.3447037701974867, |
| "grad_norm": 1.2097444534301758, |
| "learning_rate": 9.81017795884594e-07, |
| "loss": 0.2394, |
| "step": 21 |
| }, |
| { |
| "epoch": 2.459605026929982, |
| "grad_norm": 1.0766065120697021, |
| "learning_rate": 9.766041309803217e-07, |
| "loss": 0.2463, |
| "step": 22 |
| }, |
| { |
| "epoch": 2.5745062836624775, |
| "grad_norm": 0.922896146774292, |
| "learning_rate": 9.717430172528546e-07, |
| "loss": 0.2435, |
| "step": 23 |
| }, |
| { |
| "epoch": 2.6894075403949733, |
| "grad_norm": 0.8510429263114929, |
| "learning_rate": 9.66439553339217e-07, |
| "loss": 0.2445, |
| "step": 24 |
| }, |
| { |
| "epoch": 2.8043087971274687, |
| "grad_norm": 0.8421075344085693, |
| "learning_rate": 9.60699301840693e-07, |
| "loss": 0.222, |
| "step": 25 |
| }, |
| { |
| "epoch": 2.919210053859964, |
| "grad_norm": 0.8339232802391052, |
| "learning_rate": 9.54528283488428e-07, |
| "loss": 0.2197, |
| "step": 26 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 1.2300190925598145, |
| "learning_rate": 9.479329708285106e-07, |
| "loss": 0.2308, |
| "step": 27 |
| }, |
| { |
| "epoch": 3.1149012567324954, |
| "grad_norm": 0.8397951126098633, |
| "learning_rate": 9.409202814331679e-07, |
| "loss": 0.2355, |
| "step": 28 |
| }, |
| { |
| "epoch": 3.229802513464991, |
| "grad_norm": 0.774864137172699, |
| "learning_rate": 9.334975706451861e-07, |
| "loss": 0.2316, |
| "step": 29 |
| }, |
| { |
| "epoch": 3.3447037701974867, |
| "grad_norm": 0.8159404993057251, |
| "learning_rate": 9.256726238631719e-07, |
| "loss": 0.2309, |
| "step": 30 |
| }, |
| { |
| "epoch": 3.459605026929982, |
| "grad_norm": 0.6702387928962708, |
| "learning_rate": 9.174536483757448e-07, |
| "loss": 0.2132, |
| "step": 31 |
| }, |
| { |
| "epoch": 3.5745062836624775, |
| "grad_norm": 0.6168584823608398, |
| "learning_rate": 9.088492647532243e-07, |
| "loss": 0.2064, |
| "step": 32 |
| }, |
| { |
| "epoch": 3.6894075403949733, |
| "grad_norm": 0.6055068373680115, |
| "learning_rate": 8.998684978058422e-07, |
| "loss": 0.2158, |
| "step": 33 |
| }, |
| { |
| "epoch": 3.8043087971274687, |
| "grad_norm": 0.6474173665046692, |
| "learning_rate": 8.905207671179627e-07, |
| "loss": 0.225, |
| "step": 34 |
| }, |
| { |
| "epoch": 3.919210053859964, |
| "grad_norm": 0.5831088423728943, |
| "learning_rate": 8.808158771682401e-07, |
| "loss": 0.2271, |
| "step": 35 |
| }, |
| { |
| "epoch": 4.0, |
| "grad_norm": 0.7952995896339417, |
| "learning_rate": 8.707640070460731e-07, |
| "loss": 0.1925, |
| "step": 36 |
| }, |
| { |
| "epoch": 4.114901256732495, |
| "grad_norm": 0.5484858751296997, |
| "learning_rate": 8.60375699775147e-07, |
| "loss": 0.2108, |
| "step": 37 |
| }, |
| { |
| "epoch": 4.229802513464991, |
| "grad_norm": 0.5406907200813293, |
| "learning_rate": 8.496618512552564e-07, |
| "loss": 0.2082, |
| "step": 38 |
| }, |
| { |
| "epoch": 4.344703770197486, |
| "grad_norm": 0.6845734119415283, |
| "learning_rate": 8.386336988340129e-07, |
| "loss": 0.2266, |
| "step": 39 |
| }, |
| { |
| "epoch": 4.459605026929982, |
| "grad_norm": 0.8074614405632019, |
| "learning_rate": 8.273028095204173e-07, |
| "loss": 0.2342, |
| "step": 40 |
| }, |
| { |
| "epoch": 4.574506283662478, |
| "grad_norm": 0.7541784048080444, |
| "learning_rate": 8.156810678526652e-07, |
| "loss": 0.2178, |
| "step": 41 |
| }, |
| { |
| "epoch": 4.689407540394973, |
| "grad_norm": 0.6619231700897217, |
| "learning_rate": 8.037806634329078e-07, |
| "loss": 0.1966, |
| "step": 42 |
| }, |
| { |
| "epoch": 4.804308797127469, |
| "grad_norm": 0.6785462498664856, |
| "learning_rate": 7.916140781420428e-07, |
| "loss": 0.2001, |
| "step": 43 |
| }, |
| { |
| "epoch": 4.919210053859964, |
| "grad_norm": 0.6341767907142639, |
| "learning_rate": 7.791940730479434e-07, |
| "loss": 0.1885, |
| "step": 44 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.6831431984901428, |
| "learning_rate": 7.665336750208623e-07, |
| "loss": 0.2095, |
| "step": 45 |
| }, |
| { |
| "epoch": 5.114901256732495, |
| "grad_norm": 0.5810871720314026, |
| "learning_rate": 7.536461630700425e-07, |
| "loss": 0.2121, |
| "step": 46 |
| }, |
| { |
| "epoch": 5.229802513464991, |
| "grad_norm": 0.49919337034225464, |
| "learning_rate": 7.405450544158706e-07, |
| "loss": 0.1943, |
| "step": 47 |
| }, |
| { |
| "epoch": 5.344703770197486, |
| "grad_norm": 0.5304792523384094, |
| "learning_rate": 7.272440903121791e-07, |
| "loss": 0.2043, |
| "step": 48 |
| }, |
| { |
| "epoch": 5.459605026929982, |
| "grad_norm": 0.5594088435173035, |
| "learning_rate": 7.137572216335694e-07, |
| "loss": 0.2177, |
| "step": 49 |
| }, |
| { |
| "epoch": 5.574506283662478, |
| "grad_norm": 0.7475131154060364, |
| "learning_rate": 7.000985942428693e-07, |
| "loss": 0.1891, |
| "step": 50 |
| }, |
| { |
| "epoch": 5.689407540394973, |
| "grad_norm": 0.554442822933197, |
| "learning_rate": 6.862825341540778e-07, |
| "loss": 0.1867, |
| "step": 51 |
| }, |
| { |
| "epoch": 5.804308797127469, |
| "grad_norm": 0.5080501437187195, |
| "learning_rate": 6.723235325063543e-07, |
| "loss": 0.2069, |
| "step": 52 |
| }, |
| { |
| "epoch": 5.919210053859964, |
| "grad_norm": 0.6186196804046631, |
| "learning_rate": 6.582362303648142e-07, |
| "loss": 0.2093, |
| "step": 53 |
| }, |
| { |
| "epoch": 6.0, |
| "grad_norm": 0.6012147665023804, |
| "learning_rate": 6.440354033640738e-07, |
| "loss": 0.1712, |
| "step": 54 |
| }, |
| { |
| "epoch": 6.114901256732495, |
| "grad_norm": 0.5101262927055359, |
| "learning_rate": 6.297359462106502e-07, |
| "loss": 0.1974, |
| "step": 55 |
| }, |
| { |
| "epoch": 6.229802513464991, |
| "grad_norm": 0.4870443642139435, |
| "learning_rate": 6.153528570604707e-07, |
| "loss": 0.2023, |
| "step": 56 |
| }, |
| { |
| "epoch": 6.344703770197486, |
| "grad_norm": 0.46721193194389343, |
| "learning_rate": 6.00901221787878e-07, |
| "loss": 0.1897, |
| "step": 57 |
| }, |
| { |
| "epoch": 6.459605026929982, |
| "grad_norm": 0.4450569152832031, |
| "learning_rate": 5.86396198162632e-07, |
| "loss": 0.1932, |
| "step": 58 |
| }, |
| { |
| "epoch": 6.574506283662478, |
| "grad_norm": 0.4395124912261963, |
| "learning_rate": 5.718529999515017e-07, |
| "loss": 0.1851, |
| "step": 59 |
| }, |
| { |
| "epoch": 6.689407540394973, |
| "grad_norm": 0.4729479253292084, |
| "learning_rate": 5.572868809611257e-07, |
| "loss": 0.2042, |
| "step": 60 |
| }, |
| { |
| "epoch": 6.804308797127469, |
| "grad_norm": 0.4454880952835083, |
| "learning_rate": 5.427131190388743e-07, |
| "loss": 0.1925, |
| "step": 61 |
| }, |
| { |
| "epoch": 6.919210053859964, |
| "grad_norm": 0.40814733505249023, |
| "learning_rate": 5.281470000484985e-07, |
| "loss": 0.1776, |
| "step": 62 |
| }, |
| { |
| "epoch": 7.0, |
| "grad_norm": 0.48163506388664246, |
| "learning_rate": 5.136038018373682e-07, |
| "loss": 0.2039, |
| "step": 63 |
| }, |
| { |
| "epoch": 7.114901256732495, |
| "grad_norm": 0.4723432660102844, |
| "learning_rate": 4.990987782121221e-07, |
| "loss": 0.2026, |
| "step": 64 |
| }, |
| { |
| "epoch": 7.229802513464991, |
| "grad_norm": 0.4103083610534668, |
| "learning_rate": 4.846471429395295e-07, |
| "loss": 0.184, |
| "step": 65 |
| }, |
| { |
| "epoch": 7.344703770197486, |
| "grad_norm": 0.4164697527885437, |
| "learning_rate": 4.7026405378934975e-07, |
| "loss": 0.1951, |
| "step": 66 |
| }, |
| { |
| "epoch": 7.459605026929982, |
| "grad_norm": 0.43192094564437866, |
| "learning_rate": 4.5596459663592625e-07, |
| "loss": 0.1888, |
| "step": 67 |
| }, |
| { |
| "epoch": 7.574506283662478, |
| "grad_norm": 0.40324753522872925, |
| "learning_rate": 4.41763769635186e-07, |
| "loss": 0.1969, |
| "step": 68 |
| }, |
| { |
| "epoch": 7.689407540394973, |
| "grad_norm": 0.4374394416809082, |
| "learning_rate": 4.2767646749364574e-07, |
| "loss": 0.1801, |
| "step": 69 |
| }, |
| { |
| "epoch": 7.804308797127469, |
| "grad_norm": 0.40512681007385254, |
| "learning_rate": 4.1371746584592227e-07, |
| "loss": 0.1849, |
| "step": 70 |
| }, |
| { |
| "epoch": 7.919210053859964, |
| "grad_norm": 0.40997177362442017, |
| "learning_rate": 3.999014057571308e-07, |
| "loss": 0.1922, |
| "step": 71 |
| }, |
| { |
| "epoch": 8.0, |
| "grad_norm": 0.48699864745140076, |
| "learning_rate": 3.862427783664306e-07, |
| "loss": 0.1774, |
| "step": 72 |
| }, |
| { |
| "epoch": 8.114901256732496, |
| "grad_norm": 0.4171445965766907, |
| "learning_rate": 3.7275590968782087e-07, |
| "loss": 0.1887, |
| "step": 73 |
| }, |
| { |
| "epoch": 8.22980251346499, |
| "grad_norm": 0.3887261152267456, |
| "learning_rate": 3.594549455841296e-07, |
| "loss": 0.183, |
| "step": 74 |
| }, |
| { |
| "epoch": 8.344703770197487, |
| "grad_norm": 0.41559508442878723, |
| "learning_rate": 3.4635383692995755e-07, |
| "loss": 0.1858, |
| "step": 75 |
| }, |
| { |
| "epoch": 8.459605026929982, |
| "grad_norm": 0.41108807921409607, |
| "learning_rate": 3.3346632497913773e-07, |
| "loss": 0.1818, |
| "step": 76 |
| }, |
| { |
| "epoch": 8.574506283662478, |
| "grad_norm": 0.4802042245864868, |
| "learning_rate": 3.208059269520568e-07, |
| "loss": 0.1952, |
| "step": 77 |
| }, |
| { |
| "epoch": 8.689407540394972, |
| "grad_norm": 0.3916759192943573, |
| "learning_rate": 3.083859218579573e-07, |
| "loss": 0.181, |
| "step": 78 |
| }, |
| { |
| "epoch": 8.804308797127469, |
| "grad_norm": 0.42845413088798523, |
| "learning_rate": 2.9621933656709207e-07, |
| "loss": 0.1948, |
| "step": 79 |
| }, |
| { |
| "epoch": 8.919210053859963, |
| "grad_norm": 0.4223984181880951, |
| "learning_rate": 2.843189321473349e-07, |
| "loss": 0.1951, |
| "step": 80 |
| }, |
| { |
| "epoch": 9.0, |
| "grad_norm": 0.44743412733078003, |
| "learning_rate": 2.7269719047958267e-07, |
| "loss": 0.1727, |
| "step": 81 |
| }, |
| { |
| "epoch": 9.114901256732496, |
| "grad_norm": 0.41905954480171204, |
| "learning_rate": 2.613663011659871e-07, |
| "loss": 0.1757, |
| "step": 82 |
| }, |
| { |
| "epoch": 9.22980251346499, |
| "grad_norm": 0.399945467710495, |
| "learning_rate": 2.5033814874474356e-07, |
| "loss": 0.1937, |
| "step": 83 |
| }, |
| { |
| "epoch": 9.344703770197487, |
| "grad_norm": 0.3881186842918396, |
| "learning_rate": 2.3962430022485305e-07, |
| "loss": 0.1761, |
| "step": 84 |
| }, |
| { |
| "epoch": 9.459605026929982, |
| "grad_norm": 0.3914501368999481, |
| "learning_rate": 2.2923599295392694e-07, |
| "loss": 0.19, |
| "step": 85 |
| }, |
| { |
| "epoch": 9.574506283662478, |
| "grad_norm": 0.41606462001800537, |
| "learning_rate": 2.1918412283175994e-07, |
| "loss": 0.189, |
| "step": 86 |
| }, |
| { |
| "epoch": 9.689407540394972, |
| "grad_norm": 0.3928024470806122, |
| "learning_rate": 2.0947923288203713e-07, |
| "loss": 0.1857, |
| "step": 87 |
| }, |
| { |
| "epoch": 9.804308797127469, |
| "grad_norm": 0.3906983435153961, |
| "learning_rate": 2.0013150219415793e-07, |
| "loss": 0.1822, |
| "step": 88 |
| }, |
| { |
| "epoch": 9.919210053859963, |
| "grad_norm": 0.3903934061527252, |
| "learning_rate": 1.9115073524677572e-07, |
| "loss": 0.1856, |
| "step": 89 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.5587829947471619, |
| "learning_rate": 1.8254635162425503e-07, |
| "loss": 0.1863, |
| "step": 90 |
| }, |
| { |
| "epoch": 10.114901256732496, |
| "grad_norm": 0.3936590850353241, |
| "learning_rate": 1.7432737613682807e-07, |
| "loss": 0.1954, |
| "step": 91 |
| }, |
| { |
| "epoch": 10.22980251346499, |
| "grad_norm": 0.3949429988861084, |
| "learning_rate": 1.6650242935481388e-07, |
| "loss": 0.1861, |
| "step": 92 |
| }, |
| { |
| "epoch": 10.344703770197487, |
| "grad_norm": 0.4262572228908539, |
| "learning_rate": 1.59079718566832e-07, |
| "loss": 0.198, |
| "step": 93 |
| }, |
| { |
| "epoch": 10.459605026929982, |
| "grad_norm": 0.37948721647262573, |
| "learning_rate": 1.5206702917148945e-07, |
| "loss": 0.1711, |
| "step": 94 |
| }, |
| { |
| "epoch": 10.574506283662478, |
| "grad_norm": 0.3690933585166931, |
| "learning_rate": 1.4547171651157214e-07, |
| "loss": 0.1642, |
| "step": 95 |
| }, |
| { |
| "epoch": 10.689407540394972, |
| "grad_norm": 0.3985244333744049, |
| "learning_rate": 1.3930069815930697e-07, |
| "loss": 0.1866, |
| "step": 96 |
| }, |
| { |
| "epoch": 10.804308797127469, |
| "grad_norm": 0.3972547650337219, |
| "learning_rate": 1.3356044666078315e-07, |
| "loss": 0.1816, |
| "step": 97 |
| }, |
| { |
| "epoch": 10.919210053859963, |
| "grad_norm": 0.403357595205307, |
| "learning_rate": 1.2825698274714542e-07, |
| "loss": 0.1804, |
| "step": 98 |
| }, |
| { |
| "epoch": 11.0, |
| "grad_norm": 0.49961942434310913, |
| "learning_rate": 1.233958690196783e-07, |
| "loss": 0.192, |
| "step": 99 |
| }, |
| { |
| "epoch": 11.114901256732496, |
| "grad_norm": 0.38747331500053406, |
| "learning_rate": 1.1898220411540583e-07, |
| "loss": 0.1722, |
| "step": 100 |
| }, |
| { |
| "epoch": 11.22980251346499, |
| "grad_norm": 0.41416335105895996, |
| "learning_rate": 1.1502061735932499e-07, |
| "loss": 0.1891, |
| "step": 101 |
| }, |
| { |
| "epoch": 11.344703770197487, |
| "grad_norm": 0.41525566577911377, |
| "learning_rate": 1.115152639088833e-07, |
| "loss": 0.1732, |
| "step": 102 |
| }, |
| { |
| "epoch": 11.459605026929982, |
| "grad_norm": 0.39968863129615784, |
| "learning_rate": 1.0846982039579242e-07, |
| "loss": 0.191, |
| "step": 103 |
| }, |
| { |
| "epoch": 11.574506283662478, |
| "grad_norm": 0.4091079831123352, |
| "learning_rate": 1.0588748106974918e-07, |
| "loss": 0.1959, |
| "step": 104 |
| }, |
| { |
| "epoch": 11.689407540394972, |
| "grad_norm": 0.38746750354766846, |
| "learning_rate": 1.0377095444810871e-07, |
| "loss": 0.1811, |
| "step": 105 |
| }, |
| { |
| "epoch": 11.804308797127469, |
| "grad_norm": 0.3729393184185028, |
| "learning_rate": 1.0212246047502372e-07, |
| "loss": 0.1757, |
| "step": 106 |
| }, |
| { |
| "epoch": 11.919210053859963, |
| "grad_norm": 0.42114880681037903, |
| "learning_rate": 1.0094372819302977e-07, |
| "loss": 0.1775, |
| "step": 107 |
| }, |
| { |
| "epoch": 12.0, |
| "grad_norm": 0.5820653438568115, |
| "learning_rate": 1.0023599392951829e-07, |
| "loss": 0.198, |
| "step": 108 |
| }, |
| { |
| "epoch": 12.0, |
| "step": 108, |
| "total_flos": 8.159286503861453e+17, |
| "train_loss": 0.21086449852144276, |
| "train_runtime": 7716.2568, |
| "train_samples_per_second": 0.866, |
| "train_steps_per_second": 0.014 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 108, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 12, |
| "save_steps": 64000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 8.159286503861453e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|