{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9934123847167324,
  "eval_steps": 500,
  "global_step": 852,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03513394817742644,
      "grad_norm": 0.43717896938323975,
      "learning_rate": 0.00019230769230769233,
      "loss": 1.994,
      "step": 10
    },
    {
      "epoch": 0.07026789635485288,
      "grad_norm": 0.4105484187602997,
      "learning_rate": 0.00038461538461538467,
      "loss": 1.7122,
      "step": 20
    },
    {
      "epoch": 0.10540184453227931,
      "grad_norm": 0.39099714159965515,
      "learning_rate": 0.0004999710691449165,
      "loss": 1.7017,
      "step": 30
    },
    {
      "epoch": 0.14053579270970576,
      "grad_norm": 0.418653279542923,
      "learning_rate": 0.0004996456739191905,
      "loss": 1.7533,
      "step": 40
    },
    {
      "epoch": 0.1756697408871322,
      "grad_norm": 0.402630478143692,
      "learning_rate": 0.0004989591921187147,
      "loss": 1.6842,
      "step": 50
    },
    {
      "epoch": 0.21080368906455862,
      "grad_norm": 0.4017012119293213,
      "learning_rate": 0.0004979126166682133,
      "loss": 1.6915,
      "step": 60
    },
    {
      "epoch": 0.24593763724198506,
      "grad_norm": 0.5224213004112244,
      "learning_rate": 0.0004965074613305277,
      "loss": 1.7208,
      "step": 70
    },
    {
      "epoch": 0.2810715854194115,
      "grad_norm": 0.5111001133918762,
      "learning_rate": 0.0004947457585171148,
      "loss": 1.6386,
      "step": 80
    },
    {
      "epoch": 0.31620553359683795,
      "grad_norm": 0.6671903133392334,
      "learning_rate": 0.000492630056348375,
      "loss": 1.6592,
      "step": 90
    },
    {
      "epoch": 0.3513394817742644,
      "grad_norm": 0.4950112998485565,
      "learning_rate": 0.0004901634149680608,
      "loss": 1.6741,
      "step": 100
    },
    {
      "epoch": 0.3864734299516908,
      "grad_norm": 0.5058407783508301,
      "learning_rate": 0.0004873494021170953,
      "loss": 1.7568,
      "step": 110
    },
    {
      "epoch": 0.42160737812911725,
      "grad_norm": 0.8665825724601746,
      "learning_rate": 0.00048419208797320564,
      "loss": 1.7356,
      "step": 120
    },
    {
      "epoch": 0.4567413263065437,
      "grad_norm": 0.5124475955963135,
      "learning_rate": 0.00048069603926383277,
      "loss": 1.7199,
      "step": 130
    },
    {
      "epoch": 0.4918752744839701,
      "grad_norm": 0.505696177482605,
      "learning_rate": 0.0004768663126608342,
      "loss": 1.6813,
      "step": 140
    },
    {
      "epoch": 0.5270092226613966,
      "grad_norm": 0.509445071220398,
      "learning_rate": 0.0004727084474665322,
      "loss": 1.7074,
      "step": 150
    },
    {
      "epoch": 0.562143170838823,
      "grad_norm": 0.5511056184768677,
      "learning_rate": 0.00046822845760168783,
      "loss": 1.6766,
      "step": 160
    },
    {
      "epoch": 0.5972771190162495,
      "grad_norm": 0.46946030855178833,
      "learning_rate": 0.0004634328229069881,
      "loss": 1.7018,
      "step": 170
    },
    {
      "epoch": 0.6324110671936759,
      "grad_norm": 0.49886614084243774,
      "learning_rate": 0.00045832847977062875,
      "loss": 1.7293,
      "step": 180
    },
    {
      "epoch": 0.6675450153711023,
      "grad_norm": 0.49424228072166443,
      "learning_rate": 0.0004529228110955478,
      "loss": 1.7306,
      "step": 190
    },
    {
      "epoch": 0.7026789635485288,
      "grad_norm": 0.5640605092048645,
      "learning_rate": 0.00044722363562082237,
      "loss": 1.7369,
      "step": 200
    },
    {
      "epoch": 0.7378129117259552,
      "grad_norm": 0.5802103281021118,
      "learning_rate": 0.0004412391966126735,
      "loss": 1.7463,
      "step": 210
    },
    {
      "epoch": 0.7729468599033816,
      "grad_norm": 0.5365092158317566,
      "learning_rate": 0.0004349781499414369,
      "loss": 1.7198,
      "step": 220
    },
    {
      "epoch": 0.8080808080808081,
      "grad_norm": 0.5591799020767212,
      "learning_rate": 0.00042844955156174345,
      "loss": 1.7298,
      "step": 230
    },
    {
      "epoch": 0.8432147562582345,
      "grad_norm": 0.5256664156913757,
      "learning_rate": 0.000421662844414021,
      "loss": 1.6863,
      "step": 240
    },
    {
      "epoch": 0.8783487044356609,
      "grad_norm": 0.6382088661193848,
      "learning_rate": 0.0004146278447662597,
      "loss": 1.7195,
      "step": 250
    },
    {
      "epoch": 0.9134826526130874,
      "grad_norm": 0.5260637402534485,
      "learning_rate": 0.00040735472801579887,
      "loss": 1.7135,
      "step": 260
    },
    {
      "epoch": 0.9486166007905138,
      "grad_norm": 0.5244185328483582,
      "learning_rate": 0.0003998540139716701,
      "loss": 1.6944,
      "step": 270
    },
    {
      "epoch": 0.9837505489679402,
      "grad_norm": 0.5998988747596741,
      "learning_rate": 0.00039213655163878436,
      "loss": 1.6982,
      "step": 280
    },
    {
      "epoch": 1.0188844971453668,
      "grad_norm": 0.6315314173698425,
      "learning_rate": 0.00038421350352597195,
      "loss": 1.5473,
      "step": 290
    },
    {
      "epoch": 1.0540184453227932,
      "grad_norm": 0.6495606899261475,
      "learning_rate": 0.00037609632950057095,
      "loss": 1.3535,
      "step": 300
    },
    {
      "epoch": 1.0891523935002196,
      "grad_norm": 0.6094574928283691,
      "learning_rate": 0.0003677967702129177,
      "loss": 1.3452,
      "step": 310
    },
    {
      "epoch": 1.124286341677646,
      "grad_norm": 0.5786643028259277,
      "learning_rate": 0.0003593268301147139,
      "loss": 1.3433,
      "step": 320
    },
    {
      "epoch": 1.1594202898550725,
      "grad_norm": 0.6357799172401428,
      "learning_rate": 0.00035069876009583234,
      "loss": 1.4166,
      "step": 330
    },
    {
      "epoch": 1.194554238032499,
      "grad_norm": 0.6568606495857239,
      "learning_rate": 0.00034192503976467525,
      "loss": 1.323,
      "step": 340
    },
    {
      "epoch": 1.2296881862099254,
      "grad_norm": 0.6073617339134216,
      "learning_rate": 0.0003330183593977152,
      "loss": 1.389,
      "step": 350
    },
    {
      "epoch": 1.2648221343873518,
      "grad_norm": 0.5338005423545837,
      "learning_rate": 0.00032399160158432606,
      "loss": 1.3739,
      "step": 360
    },
    {
      "epoch": 1.2999560825647782,
      "grad_norm": 0.6769421100616455,
      "learning_rate": 0.00031485782259345406,
      "loss": 1.4024,
      "step": 370
    },
    {
      "epoch": 1.3350900307422047,
      "grad_norm": 0.6495899558067322,
      "learning_rate": 0.0003056302334890786,
      "loss": 1.3615,
      "step": 380
    },
    {
      "epoch": 1.370223978919631,
      "grad_norm": 0.6488791108131409,
      "learning_rate": 0.0002963221810217786,
      "loss": 1.3548,
      "step": 390
    },
    {
      "epoch": 1.4053579270970575,
      "grad_norm": 0.746868908405304,
      "learning_rate": 0.00028694712832404195,
      "loss": 1.3749,
      "step": 400
    },
    {
      "epoch": 1.440491875274484,
      "grad_norm": 0.6373124122619629,
      "learning_rate": 0.0002775186354372408,
      "loss": 1.3555,
      "step": 410
    },
    {
      "epoch": 1.4756258234519104,
      "grad_norm": 0.6247098445892334,
      "learning_rate": 0.0002680503396984382,
      "loss": 1.3977,
      "step": 420
    },
    {
      "epoch": 1.5107597716293368,
      "grad_norm": 0.6373061537742615,
      "learning_rate": 0.00025855593601539415,
      "loss": 1.3637,
      "step": 430
    },
    {
      "epoch": 1.5458937198067633,
      "grad_norm": 0.7269704341888428,
      "learning_rate": 0.00024904915705830234,
      "loss": 1.4263,
      "step": 440
    },
    {
      "epoch": 1.5810276679841897,
      "grad_norm": 0.6946704983711243,
      "learning_rate": 0.0002395437533969069,
      "loss": 1.3822,
      "step": 450
    },
    {
      "epoch": 1.6161616161616161,
      "grad_norm": 0.6399605870246887,
      "learning_rate": 0.0002300534736117292,
      "loss": 1.4348,
      "step": 460
    },
    {
      "epoch": 1.6512955643390426,
      "grad_norm": 0.6449073553085327,
      "learning_rate": 0.00022059204440817246,
      "loss": 1.3793,
      "step": 470
    },
    {
      "epoch": 1.686429512516469,
      "grad_norm": 0.616384744644165,
      "learning_rate": 0.00021117315076226557,
      "loss": 1.3917,
      "step": 480
    },
    {
      "epoch": 1.7215634606938954,
      "grad_norm": 0.6383651494979858,
      "learning_rate": 0.0002018104161267652,
      "loss": 1.4097,
      "step": 490
    },
    {
      "epoch": 1.7566974088713219,
      "grad_norm": 0.720451295375824,
      "learning_rate": 0.00019251738272624416,
      "loss": 1.3997,
      "step": 500
    },
    {
      "epoch": 1.7918313570487485,
      "grad_norm": 0.6487829685211182,
      "learning_rate": 0.00018330749196966806,
      "loss": 1.4366,
      "step": 510
    },
    {
      "epoch": 1.826965305226175,
      "grad_norm": 0.6398463249206543,
      "learning_rate": 0.00017419406500879115,
      "loss": 1.3536,
      "step": 520
    },
    {
      "epoch": 1.8620992534036014,
      "grad_norm": 0.7006503939628601,
      "learning_rate": 0.00016519028347049242,
      "loss": 1.3934,
      "step": 530
    },
    {
      "epoch": 1.8972332015810278,
      "grad_norm": 0.6170542240142822,
      "learning_rate": 0.00015630917039091919,
      "loss": 1.4171,
      "step": 540
    },
    {
      "epoch": 1.9323671497584543,
      "grad_norm": 0.6998418569564819,
      "learning_rate": 0.00014756357137901604,
      "loss": 1.3809,
      "step": 550
    },
    {
      "epoch": 1.9675010979358807,
      "grad_norm": 0.6567032933235168,
      "learning_rate": 0.00013896613603668365,
      "loss": 1.3223,
      "step": 560
    },
    {
      "epoch": 2.002635046113307,
      "grad_norm": 0.6512529253959656,
      "learning_rate": 0.00013052929966244216,
      "loss": 1.3693,
      "step": 570
    },
    {
      "epoch": 2.0377689942907335,
      "grad_norm": 0.8671336770057678,
      "learning_rate": 0.00012226526526506093,
      "loss": 1.0046,
      "step": 580
    },
    {
      "epoch": 2.07290294246816,
      "grad_norm": 0.7728586792945862,
      "learning_rate": 0.00011418598591317242,
      "loss": 1.0138,
      "step": 590
    },
    {
      "epoch": 2.1080368906455864,
      "grad_norm": 0.8196272253990173,
      "learning_rate": 0.0001063031474463983,
      "loss": 0.9985,
      "step": 600
    },
    {
      "epoch": 2.143170838823013,
      "grad_norm": 0.899488091468811,
      "learning_rate": 9.862815157299391e-05,
      "loss": 0.9397,
      "step": 610
    },
    {
      "epoch": 2.1783047870004393,
      "grad_norm": 0.7770416736602783,
      "learning_rate": 9.117209937846053e-05,
      "loss": 0.9307,
      "step": 620
    },
    {
      "epoch": 2.2134387351778657,
      "grad_norm": 0.8353050947189331,
      "learning_rate": 8.394577526897565e-05,
      "loss": 0.9334,
      "step": 630
    },
    {
      "epoch": 2.248572683355292,
      "grad_norm": 0.7152834534645081,
      "learning_rate": 7.69596313728691e-05,
      "loss": 0.9483,
      "step": 640
    },
    {
      "epoch": 2.2837066315327186,
      "grad_norm": 1.0104659795761108,
      "learning_rate": 7.022377242270251e-05,
      "loss": 0.942,
      "step": 650
    },
    {
      "epoch": 2.318840579710145,
      "grad_norm": 0.7319175004959106,
      "learning_rate": 6.374794113982232e-05,
      "loss": 0.9242,
      "step": 660
    },
    {
      "epoch": 2.3539745278875714,
      "grad_norm": 0.7895592451095581,
      "learning_rate": 5.7541504142523406e-05,
      "loss": 0.99,
      "step": 670
    },
    {
      "epoch": 2.389108476064998,
      "grad_norm": 0.9462332725524902,
      "learning_rate": 5.161343839820762e-05,
      "loss": 0.9733,
      "step": 680
    },
    {
      "epoch": 2.4242424242424243,
      "grad_norm": 0.9174964427947998,
      "learning_rate": 4.597231823913112e-05,
      "loss": 0.9478,
      "step": 690
    },
    {
      "epoch": 2.4593763724198507,
      "grad_norm": 0.7435886859893799,
      "learning_rate": 4.062630296052222e-05,
      "loss": 0.9487,
      "step": 700
    },
    {
      "epoch": 2.494510320597277,
      "grad_norm": 0.908430278301239,
      "learning_rate": 3.558312501900718e-05,
      "loss": 0.9517,
      "step": 710
    },
    {
      "epoch": 2.5296442687747036,
      "grad_norm": 0.8427926301956177,
      "learning_rate": 3.0850078848413704e-05,
      "loss": 0.93,
      "step": 720
    },
    {
      "epoch": 2.56477821695213,
      "grad_norm": 0.8396946787834167,
      "learning_rate": 2.643401030912876e-05,
      "loss": 0.9528,
      "step": 730
    },
    {
      "epoch": 2.5999121651295565,
      "grad_norm": 0.8085779547691345,
      "learning_rate": 2.234130678627169e-05,
      "loss": 0.9257,
      "step": 740
    },
    {
      "epoch": 2.635046113306983,
      "grad_norm": 0.872416079044342,
      "learning_rate": 1.8577887951004264e-05,
      "loss": 0.9294,
      "step": 750
    },
    {
      "epoch": 2.6701800614844093,
      "grad_norm": 0.8148744702339172,
      "learning_rate": 1.5149197198340014e-05,
      "loss": 0.9166,
      "step": 760
    },
    {
      "epoch": 2.7053140096618358,
      "grad_norm": 0.8573042750358582,
      "learning_rate": 1.206019377383813e-05,
      "loss": 0.9481,
      "step": 770
    },
    {
      "epoch": 2.740447957839262,
      "grad_norm": 0.7604958415031433,
      "learning_rate": 9.315345600569069e-06,
      "loss": 0.9425,
      "step": 780
    },
    {
      "epoch": 2.7755819060166886,
      "grad_norm": 0.870971143245697,
      "learning_rate": 6.918622816727255e-06,
      "loss": 0.9087,
      "step": 790
    },
    {
      "epoch": 2.810715854194115,
      "grad_norm": 0.9157975316047668,
      "learning_rate": 4.873492033237864e-06,
      "loss": 0.9547,
      "step": 800
    },
    {
      "epoch": 2.8458498023715415,
      "grad_norm": 0.9207829236984253,
      "learning_rate": 3.1829113196638614e-06,
      "loss": 0.9275,
      "step": 810
    },
    {
      "epoch": 2.880983750548968,
      "grad_norm": 0.9513605237007141,
      "learning_rate": 1.8493259256649187e-06,
      "loss": 0.981,
      "step": 820
    },
    {
      "epoch": 2.9161176987263944,
      "grad_norm": 0.854158878326416,
      "learning_rate": 8.746647441975619e-07,
      "loss": 0.9124,
      "step": 830
    },
    {
      "epoch": 2.951251646903821,
      "grad_norm": 0.8659394383430481,
      "learning_rate": 2.603375215716186e-07,
      "loss": 0.9643,
      "step": 840
    },
    {
      "epoch": 2.9863855950812472,
      "grad_norm": 0.94295734167099,
      "learning_rate": 7.23281839820622e-09,
      "loss": 0.9282,
      "step": 850
    }
  ],
  "logging_steps": 10,
  "max_steps": 852,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 0,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.0935759953200589e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}