| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 900, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.03340989768218835, |
| "grad_norm": 8.104019844032948, |
| "learning_rate": 1.0000000000000002e-06, |
| "loss": 1.6231, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0668197953643767, |
| "grad_norm": 3.4693710845262706, |
| "learning_rate": 2.1111111111111114e-06, |
| "loss": 1.489, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.10022969304656504, |
| "grad_norm": 2.3368868000890224, |
| "learning_rate": 3.2222222222222227e-06, |
| "loss": 1.3346, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.1336395907287534, |
| "grad_norm": 1.9194151908776884, |
| "learning_rate": 4.333333333333334e-06, |
| "loss": 1.2275, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.16704948841094175, |
| "grad_norm": 1.7081126245466087, |
| "learning_rate": 5.444444444444445e-06, |
| "loss": 1.1712, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.20045938609313008, |
| "grad_norm": 1.7461363381025887, |
| "learning_rate": 6.555555555555556e-06, |
| "loss": 1.1295, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.23386928377531843, |
| "grad_norm": 1.6467172594579205, |
| "learning_rate": 7.666666666666667e-06, |
| "loss": 1.1083, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.2672791814575068, |
| "grad_norm": 1.7574333391127064, |
| "learning_rate": 8.777777777777778e-06, |
| "loss": 1.0889, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.3006890791396951, |
| "grad_norm": 1.5506512108382777, |
| "learning_rate": 9.88888888888889e-06, |
| "loss": 1.0634, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.3340989768218835, |
| "grad_norm": 1.637392091372257, |
| "learning_rate": 9.99695413509548e-06, |
| "loss": 1.0842, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.3675088745040718, |
| "grad_norm": 1.640058641808361, |
| "learning_rate": 9.986429983545127e-06, |
| "loss": 1.0614, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.40091877218626015, |
| "grad_norm": 1.6505780161101127, |
| "learning_rate": 9.968405767630857e-06, |
| "loss": 1.0735, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.43432866986844854, |
| "grad_norm": 1.6260516030561405, |
| "learning_rate": 9.942908597485558e-06, |
| "loss": 1.0568, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.46773856755063686, |
| "grad_norm": 1.4772413330384198, |
| "learning_rate": 9.909976823275143e-06, |
| "loss": 1.0812, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.5011484652328252, |
| "grad_norm": 1.5820640912826012, |
| "learning_rate": 9.869659977516261e-06, |
| "loss": 1.0285, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.5345583629150136, |
| "grad_norm": 1.591148450789976, |
| "learning_rate": 9.822018700574696e-06, |
| "loss": 1.0376, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.567968260597202, |
| "grad_norm": 1.5272109606251885, |
| "learning_rate": 9.767124649456484e-06, |
| "loss": 1.0283, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.6013781582793902, |
| "grad_norm": 1.4986155360090707, |
| "learning_rate": 9.705060390028979e-06, |
| "loss": 1.0271, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.6347880559615786, |
| "grad_norm": 1.5634069287930357, |
| "learning_rate": 9.635919272833938e-06, |
| "loss": 1.0261, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.668197953643767, |
| "grad_norm": 1.4240257902836764, |
| "learning_rate": 9.559805292679445e-06, |
| "loss": 1.0091, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.7016078513259553, |
| "grad_norm": 1.5540706999114942, |
| "learning_rate": 9.476832932221835e-06, |
| "loss": 1.0104, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.7350177490081437, |
| "grad_norm": 1.5662551532037128, |
| "learning_rate": 9.38712698977291e-06, |
| "loss": 0.994, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.768427646690332, |
| "grad_norm": 1.5052024562975517, |
| "learning_rate": 9.290822391591418e-06, |
| "loss": 1.0006, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.8018375443725203, |
| "grad_norm": 1.6065362027025494, |
| "learning_rate": 9.188063988941147e-06, |
| "loss": 1.0096, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.8352474420547087, |
| "grad_norm": 1.4428707337680362, |
| "learning_rate": 9.079006340220862e-06, |
| "loss": 0.9901, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.8686573397368971, |
| "grad_norm": 1.5314846163892415, |
| "learning_rate": 8.963813478493788e-06, |
| "loss": 0.9863, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.9020672374190855, |
| "grad_norm": 1.4351095172894568, |
| "learning_rate": 8.842658664766317e-06, |
| "loss": 1.0219, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.9354771351012737, |
| "grad_norm": 1.507687415590904, |
| "learning_rate": 8.715724127386971e-06, |
| "loss": 1.0047, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.9688870327834621, |
| "grad_norm": 1.5014644267583146, |
| "learning_rate": 8.58320078795768e-06, |
| "loss": 1.0026, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 1.425943640955651, |
| "learning_rate": 8.44528797416954e-06, |
| "loss": 0.9653, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.0334098976821884, |
| "grad_norm": 1.4718810021174351, |
| "learning_rate": 8.302193119995038e-06, |
| "loss": 0.8486, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.0668197953643768, |
| "grad_norm": 1.4077200885873793, |
| "learning_rate": 8.154131453687657e-06, |
| "loss": 0.8382, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.1002296930465651, |
| "grad_norm": 1.5972201624551428, |
| "learning_rate": 8.001325674058124e-06, |
| "loss": 0.8283, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.1336395907287533, |
| "grad_norm": 1.5226433404562436, |
| "learning_rate": 7.84400561551426e-06, |
| "loss": 0.8351, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.1670494884109417, |
| "grad_norm": 1.5106301998037033, |
| "learning_rate": 7.68240790236819e-06, |
| "loss": 0.838, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.20045938609313, |
| "grad_norm": 1.5215853204266103, |
| "learning_rate": 7.5167755929309e-06, |
| "loss": 0.8322, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.2338692837753185, |
| "grad_norm": 1.5217322186980389, |
| "learning_rate": 7.347357813929455e-06, |
| "loss": 0.8227, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.2672791814575068, |
| "grad_norm": 1.6755977423651072, |
| "learning_rate": 7.174409385796726e-06, |
| "loss": 0.8287, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.300689079139695, |
| "grad_norm": 1.5944600963804338, |
| "learning_rate": 6.998190439397262e-06, |
| "loss": 0.8486, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.3340989768218834, |
| "grad_norm": 1.5643959643386598, |
| "learning_rate": 6.818966024765758e-06, |
| "loss": 0.855, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.3675088745040718, |
| "grad_norm": 1.5456085160299915, |
| "learning_rate": 6.637005712446622e-06, |
| "loss": 0.8664, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.4009187721862602, |
| "grad_norm": 1.5082758452600278, |
| "learning_rate": 6.452583188034275e-06, |
| "loss": 0.8555, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.4343286698684485, |
| "grad_norm": 1.6264152681857857, |
| "learning_rate": 6.26597584052401e-06, |
| "loss": 0.8425, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.467738567550637, |
| "grad_norm": 1.6079625298586488, |
| "learning_rate": 6.077464345092601e-06, |
| "loss": 0.8463, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.5011484652328253, |
| "grad_norm": 1.494204317440263, |
| "learning_rate": 5.887332240936177e-06, |
| "loss": 0.8373, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.5345583629150137, |
| "grad_norm": 1.552091126608121, |
| "learning_rate": 5.695865504800328e-06, |
| "loss": 0.8415, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.567968260597202, |
| "grad_norm": 1.4687706324296175, |
| "learning_rate": 5.503352120843923e-06, |
| "loss": 0.8364, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.6013781582793902, |
| "grad_norm": 1.5165507426860219, |
| "learning_rate": 5.310081647483577e-06, |
| "loss": 0.8317, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.6347880559615786, |
| "grad_norm": 1.5074289196814759, |
| "learning_rate": 5.116344781870282e-06, |
| "loss": 0.8313, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.668197953643767, |
| "grad_norm": 1.4816310929794339, |
| "learning_rate": 4.922432922653284e-06, |
| "loss": 0.8514, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.7016078513259552, |
| "grad_norm": 1.5057236332446566, |
| "learning_rate": 4.728637731688832e-06, |
| "loss": 0.8335, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.7350177490081435, |
| "grad_norm": 1.5771287925137742, |
| "learning_rate": 4.53525069535304e-06, |
| "loss": 0.8517, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.768427646690332, |
| "grad_norm": 1.5190684133581556, |
| "learning_rate": 4.342562686118687e-06, |
| "loss": 0.8366, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.8018375443725203, |
| "grad_norm": 1.5589303795357043, |
| "learning_rate": 4.150863525055397e-06, |
| "loss": 0.843, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.8352474420547087, |
| "grad_norm": 1.5319164868806459, |
| "learning_rate": 3.960441545911205e-06, |
| "loss": 0.8402, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.868657339736897, |
| "grad_norm": 1.508115730305158, |
| "learning_rate": 3.7715831614312184e-06, |
| "loss": 0.8415, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.9020672374190855, |
| "grad_norm": 1.5207204322039876, |
| "learning_rate": 3.5845724325656485e-06, |
| "loss": 0.8391, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.9354771351012738, |
| "grad_norm": 1.6058398309364263, |
| "learning_rate": 3.399690641215142e-06, |
| "loss": 0.8333, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.9688870327834622, |
| "grad_norm": 1.6417685296920548, |
| "learning_rate": 3.2172158671561005e-06, |
| "loss": 0.8078, |
| "step": 590 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 1.582963981597533, |
| "learning_rate": 3.0374225697822645e-06, |
| "loss": 0.8428, |
| "step": 600 |
| }, |
| { |
| "epoch": 2.0334098976821884, |
| "grad_norm": 1.6865316423186703, |
| "learning_rate": 2.86058117529173e-06, |
| "loss": 0.6855, |
| "step": 610 |
| }, |
| { |
| "epoch": 2.0668197953643768, |
| "grad_norm": 1.6853603645992963, |
| "learning_rate": 2.686957669940242e-06, |
| "loss": 0.6695, |
| "step": 620 |
| }, |
| { |
| "epoch": 2.100229693046565, |
| "grad_norm": 1.769846698754755, |
| "learning_rate": 2.5168131999726203e-06, |
| "loss": 0.6845, |
| "step": 630 |
| }, |
| { |
| "epoch": 2.1336395907287535, |
| "grad_norm": 1.743572647134592, |
| "learning_rate": 2.3504036788339763e-06, |
| "loss": 0.6683, |
| "step": 640 |
| }, |
| { |
| "epoch": 2.167049488410942, |
| "grad_norm": 1.827986152296155, |
| "learning_rate": 2.1879794022516006e-06, |
| "loss": 0.6723, |
| "step": 650 |
| }, |
| { |
| "epoch": 2.2004593860931303, |
| "grad_norm": 1.6862359445877158, |
| "learning_rate": 2.0297846717664043e-06, |
| "loss": 0.6788, |
| "step": 660 |
| }, |
| { |
| "epoch": 2.2338692837753182, |
| "grad_norm": 1.7265559141551987, |
| "learning_rate": 1.8760574272802002e-06, |
| "loss": 0.686, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.2672791814575066, |
| "grad_norm": 1.6784351972141776, |
| "learning_rate": 1.7270288891714814e-06, |
| "loss": 0.6759, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.300689079139695, |
| "grad_norm": 1.7845523210344045, |
| "learning_rate": 1.5829232105180143e-06, |
| "loss": 0.6875, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.3340989768218834, |
| "grad_norm": 1.7713046558357322, |
| "learning_rate": 1.4439571399493146e-06, |
| "loss": 0.6866, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.3675088745040718, |
| "grad_norm": 1.7123867307563565, |
| "learning_rate": 1.310339695636118e-06, |
| "loss": 0.6683, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.40091877218626, |
| "grad_norm": 1.7098768284883517, |
| "learning_rate": 1.182271850907199e-06, |
| "loss": 0.6636, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.4343286698684485, |
| "grad_norm": 1.7523526171373818, |
| "learning_rate": 1.0599462319663906e-06, |
| "loss": 0.6907, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.467738567550637, |
| "grad_norm": 1.79044869887376, |
| "learning_rate": 9.435468281644799e-07, |
| "loss": 0.6606, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.5011484652328253, |
| "grad_norm": 1.8797791077083694, |
| "learning_rate": 8.332487152617424e-07, |
| "loss": 0.6858, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.5345583629150137, |
| "grad_norm": 1.6542367973465413, |
| "learning_rate": 7.292177920973726e-07, |
| "loss": 0.6634, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.567968260597202, |
| "grad_norm": 1.85288428953335, |
| "learning_rate": 6.316105310618664e-07, |
| "loss": 0.6636, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.60137815827939, |
| "grad_norm": 1.6896224399244537, |
| "learning_rate": 5.405737427476854e-07, |
| "loss": 0.6786, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.634788055961579, |
| "grad_norm": 1.7762167435792457, |
| "learning_rate": 4.562443551321788e-07, |
| "loss": 0.6845, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.6681979536437668, |
| "grad_norm": 1.8054703475251148, |
| "learning_rate": 3.787492076248994e-07, |
| "loss": 0.6754, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.701607851325955, |
| "grad_norm": 1.679344404898734, |
| "learning_rate": 3.082048602890808e-07, |
| "loss": 0.6744, |
| "step": 810 |
| }, |
| { |
| "epoch": 2.7350177490081435, |
| "grad_norm": 1.725378103601795, |
| "learning_rate": 2.447174185242324e-07, |
| "loss": 0.6524, |
| "step": 820 |
| }, |
| { |
| "epoch": 2.768427646690332, |
| "grad_norm": 1.8002958981650867, |
| "learning_rate": 1.8838237347353848e-07, |
| "loss": 0.6831, |
| "step": 830 |
| }, |
| { |
| "epoch": 2.8018375443725203, |
| "grad_norm": 1.7661404831387393, |
| "learning_rate": 1.3928445839610782e-07, |
| "loss": 0.677, |
| "step": 840 |
| }, |
| { |
| "epoch": 2.8352474420547087, |
| "grad_norm": 1.8674906196125884, |
| "learning_rate": 9.749752122010347e-08, |
| "loss": 0.6811, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.868657339736897, |
| "grad_norm": 1.7100605833681841, |
| "learning_rate": 6.308441346844386e-08, |
| "loss": 0.6714, |
| "step": 860 |
| }, |
| { |
| "epoch": 2.9020672374190855, |
| "grad_norm": 1.7104584417164503, |
| "learning_rate": 3.6096895724141435e-08, |
| "loss": 0.6674, |
| "step": 870 |
| }, |
| { |
| "epoch": 2.935477135101274, |
| "grad_norm": 1.833257936964389, |
| "learning_rate": 1.657555977746972e-08, |
| "loss": 0.6712, |
| "step": 880 |
| }, |
| { |
| "epoch": 2.968887032783462, |
| "grad_norm": 1.7160362714366442, |
| "learning_rate": 4.5497675720540535e-09, |
| "loss": 0.6732, |
| "step": 890 |
| }, |
| { |
| "epoch": 3.0, |
| "grad_norm": 1.8048277473083167, |
| "learning_rate": 3.760704171962282e-11, |
| "loss": 0.6532, |
| "step": 900 |
| }, |
| { |
| "epoch": 3.0, |
| "step": 900, |
| "total_flos": 147705369985024.0, |
| "train_loss": 0.8683794037501017, |
| "train_runtime": 170018.3735, |
| "train_samples_per_second": 0.338, |
| "train_steps_per_second": 0.005 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 900, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 200, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 147705369985024.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|