| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.996640537513998, |
| "eval_steps": 500, |
| "global_step": 1002, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.029861888764464353, |
| "grad_norm": 7.79118038465759, |
| "learning_rate": 5e-06, |
| "loss": 0.7999, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.059723777528928705, |
| "grad_norm": 1.5869082965398478, |
| "learning_rate": 5e-06, |
| "loss": 0.7154, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.08958566629339305, |
| "grad_norm": 3.7241425813134916, |
| "learning_rate": 5e-06, |
| "loss": 0.6981, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.11944755505785741, |
| "grad_norm": 3.249970766677569, |
| "learning_rate": 5e-06, |
| "loss": 0.6739, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.14930944382232175, |
| "grad_norm": 1.0372275497781698, |
| "learning_rate": 5e-06, |
| "loss": 0.6619, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.1791713325867861, |
| "grad_norm": 1.049476294965026, |
| "learning_rate": 5e-06, |
| "loss": 0.6628, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.20903322135125046, |
| "grad_norm": 0.9772063955863265, |
| "learning_rate": 5e-06, |
| "loss": 0.6491, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.23889511011571482, |
| "grad_norm": 1.0522293475008135, |
| "learning_rate": 5e-06, |
| "loss": 0.6421, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.2687569988801792, |
| "grad_norm": 0.5111334927310792, |
| "learning_rate": 5e-06, |
| "loss": 0.6352, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.2986188876446435, |
| "grad_norm": 1.016498124540543, |
| "learning_rate": 5e-06, |
| "loss": 0.63, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.3284807764091079, |
| "grad_norm": 0.7265701675113714, |
| "learning_rate": 5e-06, |
| "loss": 0.6298, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.3583426651735722, |
| "grad_norm": 0.8876229395020231, |
| "learning_rate": 5e-06, |
| "loss": 0.6274, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.3882045539380366, |
| "grad_norm": 0.8364258497620431, |
| "learning_rate": 5e-06, |
| "loss": 0.628, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.41806644270250093, |
| "grad_norm": 0.6291309325942491, |
| "learning_rate": 5e-06, |
| "loss": 0.6285, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.4479283314669653, |
| "grad_norm": 0.5172436583673952, |
| "learning_rate": 5e-06, |
| "loss": 0.6159, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.47779022023142964, |
| "grad_norm": 0.8893038012313684, |
| "learning_rate": 5e-06, |
| "loss": 0.6265, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.507652108995894, |
| "grad_norm": 0.6070103390806936, |
| "learning_rate": 5e-06, |
| "loss": 0.6245, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.5375139977603584, |
| "grad_norm": 0.6482213904721557, |
| "learning_rate": 5e-06, |
| "loss": 0.6248, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.5673758865248227, |
| "grad_norm": 0.591959233594621, |
| "learning_rate": 5e-06, |
| "loss": 0.617, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.597237775289287, |
| "grad_norm": 0.5327681984410563, |
| "learning_rate": 5e-06, |
| "loss": 0.617, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.6270996640537514, |
| "grad_norm": 0.745776773912409, |
| "learning_rate": 5e-06, |
| "loss": 0.6185, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.6569615528182158, |
| "grad_norm": 0.5965336383692366, |
| "learning_rate": 5e-06, |
| "loss": 0.6187, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.68682344158268, |
| "grad_norm": 0.6671576319263424, |
| "learning_rate": 5e-06, |
| "loss": 0.6047, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.7166853303471444, |
| "grad_norm": 0.5476041049624493, |
| "learning_rate": 5e-06, |
| "loss": 0.606, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.7465472191116088, |
| "grad_norm": 0.5232305944209529, |
| "learning_rate": 5e-06, |
| "loss": 0.6033, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.7764091078760732, |
| "grad_norm": 0.5030607638626264, |
| "learning_rate": 5e-06, |
| "loss": 0.6069, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.8062709966405375, |
| "grad_norm": 0.78688217701205, |
| "learning_rate": 5e-06, |
| "loss": 0.6076, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.8361328854050019, |
| "grad_norm": 0.5082382138926317, |
| "learning_rate": 5e-06, |
| "loss": 0.6107, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.8659947741694662, |
| "grad_norm": 0.5406317903998468, |
| "learning_rate": 5e-06, |
| "loss": 0.6155, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.8958566629339306, |
| "grad_norm": 0.4838227961227224, |
| "learning_rate": 5e-06, |
| "loss": 0.61, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.9257185516983949, |
| "grad_norm": 0.538038360003273, |
| "learning_rate": 5e-06, |
| "loss": 0.6104, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.9555804404628593, |
| "grad_norm": 0.579477307027621, |
| "learning_rate": 5e-06, |
| "loss": 0.6022, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.9854423292273237, |
| "grad_norm": 0.6031475350206188, |
| "learning_rate": 5e-06, |
| "loss": 0.5957, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.9973870847331093, |
| "eval_loss": 0.6085198521614075, |
| "eval_runtime": 222.4684, |
| "eval_samples_per_second": 40.559, |
| "eval_steps_per_second": 0.634, |
| "step": 334 |
| }, |
| { |
| "epoch": 1.0175438596491229, |
| "grad_norm": 0.49851756303987316, |
| "learning_rate": 5e-06, |
| "loss": 0.632, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.0474057484135872, |
| "grad_norm": 0.708831205926716, |
| "learning_rate": 5e-06, |
| "loss": 0.5529, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.0772676371780516, |
| "grad_norm": 0.8882690672159985, |
| "learning_rate": 5e-06, |
| "loss": 0.5678, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.1071295259425158, |
| "grad_norm": 0.568283145894615, |
| "learning_rate": 5e-06, |
| "loss": 0.5534, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.1369914147069802, |
| "grad_norm": 0.6014068413121769, |
| "learning_rate": 5e-06, |
| "loss": 0.5584, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.1668533034714446, |
| "grad_norm": 1.0968783782743532, |
| "learning_rate": 5e-06, |
| "loss": 0.5566, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.196715192235909, |
| "grad_norm": 0.6321045752759027, |
| "learning_rate": 5e-06, |
| "loss": 0.5488, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.2265770810003733, |
| "grad_norm": 0.5497221776868997, |
| "learning_rate": 5e-06, |
| "loss": 0.5613, |
| "step": 410 |
| }, |
| { |
| "epoch": 1.2564389697648375, |
| "grad_norm": 0.5105034226415843, |
| "learning_rate": 5e-06, |
| "loss": 0.5583, |
| "step": 420 |
| }, |
| { |
| "epoch": 1.286300858529302, |
| "grad_norm": 0.5163025039187094, |
| "learning_rate": 5e-06, |
| "loss": 0.5566, |
| "step": 430 |
| }, |
| { |
| "epoch": 1.3161627472937663, |
| "grad_norm": 0.5078792546389537, |
| "learning_rate": 5e-06, |
| "loss": 0.549, |
| "step": 440 |
| }, |
| { |
| "epoch": 1.3460246360582306, |
| "grad_norm": 0.47338534371884383, |
| "learning_rate": 5e-06, |
| "loss": 0.5559, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.375886524822695, |
| "grad_norm": 0.508128752801008, |
| "learning_rate": 5e-06, |
| "loss": 0.5559, |
| "step": 460 |
| }, |
| { |
| "epoch": 1.4057484135871594, |
| "grad_norm": 0.44672474181920574, |
| "learning_rate": 5e-06, |
| "loss": 0.5607, |
| "step": 470 |
| }, |
| { |
| "epoch": 1.4356103023516238, |
| "grad_norm": 0.6660991317912517, |
| "learning_rate": 5e-06, |
| "loss": 0.5605, |
| "step": 480 |
| }, |
| { |
| "epoch": 1.4654721911160882, |
| "grad_norm": 0.6191494078915318, |
| "learning_rate": 5e-06, |
| "loss": 0.5592, |
| "step": 490 |
| }, |
| { |
| "epoch": 1.4953340798805526, |
| "grad_norm": 0.5600347616787826, |
| "learning_rate": 5e-06, |
| "loss": 0.5565, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.5251959686450167, |
| "grad_norm": 0.5169032233443931, |
| "learning_rate": 5e-06, |
| "loss": 0.5585, |
| "step": 510 |
| }, |
| { |
| "epoch": 1.555057857409481, |
| "grad_norm": 0.4879130577592748, |
| "learning_rate": 5e-06, |
| "loss": 0.562, |
| "step": 520 |
| }, |
| { |
| "epoch": 1.5849197461739455, |
| "grad_norm": 0.4765806580223471, |
| "learning_rate": 5e-06, |
| "loss": 0.5558, |
| "step": 530 |
| }, |
| { |
| "epoch": 1.6147816349384099, |
| "grad_norm": 0.5162828779150767, |
| "learning_rate": 5e-06, |
| "loss": 0.5552, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.6446435237028743, |
| "grad_norm": 0.6168949576647439, |
| "learning_rate": 5e-06, |
| "loss": 0.5571, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.6745054124673384, |
| "grad_norm": 0.5347449788152595, |
| "learning_rate": 5e-06, |
| "loss": 0.5487, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.704367301231803, |
| "grad_norm": 0.5183677617273615, |
| "learning_rate": 5e-06, |
| "loss": 0.5501, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.7342291899962672, |
| "grad_norm": 0.5222132486337936, |
| "learning_rate": 5e-06, |
| "loss": 0.5526, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.7640910787607316, |
| "grad_norm": 0.5153859356904258, |
| "learning_rate": 5e-06, |
| "loss": 0.5614, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.793952967525196, |
| "grad_norm": 0.550927637906372, |
| "learning_rate": 5e-06, |
| "loss": 0.5597, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.8238148562896603, |
| "grad_norm": 0.5061632894558703, |
| "learning_rate": 5e-06, |
| "loss": 0.5522, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.8536767450541247, |
| "grad_norm": 0.5066134636744783, |
| "learning_rate": 5e-06, |
| "loss": 0.5569, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.8835386338185889, |
| "grad_norm": 0.51338686981817, |
| "learning_rate": 5e-06, |
| "loss": 0.5605, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.9134005225830535, |
| "grad_norm": 0.5128301058927047, |
| "learning_rate": 5e-06, |
| "loss": 0.5529, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.9432624113475176, |
| "grad_norm": 0.44341728146025833, |
| "learning_rate": 5e-06, |
| "loss": 0.5501, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.973124300111982, |
| "grad_norm": 0.5942836803667937, |
| "learning_rate": 5e-06, |
| "loss": 0.5557, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.9970138111235536, |
| "eval_loss": 0.5998166799545288, |
| "eval_runtime": 229.4383, |
| "eval_samples_per_second": 39.326, |
| "eval_steps_per_second": 0.615, |
| "step": 668 |
| }, |
| { |
| "epoch": 2.005225830533781, |
| "grad_norm": 0.5802432479761296, |
| "learning_rate": 5e-06, |
| "loss": 0.594, |
| "step": 670 |
| }, |
| { |
| "epoch": 2.0350877192982457, |
| "grad_norm": 0.5662155354308591, |
| "learning_rate": 5e-06, |
| "loss": 0.5113, |
| "step": 680 |
| }, |
| { |
| "epoch": 2.06494960806271, |
| "grad_norm": 0.5633082161780731, |
| "learning_rate": 5e-06, |
| "loss": 0.5055, |
| "step": 690 |
| }, |
| { |
| "epoch": 2.0948114968271745, |
| "grad_norm": 0.6900750030857515, |
| "learning_rate": 5e-06, |
| "loss": 0.5033, |
| "step": 700 |
| }, |
| { |
| "epoch": 2.1246733855916387, |
| "grad_norm": 0.5246660332076203, |
| "learning_rate": 5e-06, |
| "loss": 0.5057, |
| "step": 710 |
| }, |
| { |
| "epoch": 2.1545352743561033, |
| "grad_norm": 0.49576742726803086, |
| "learning_rate": 5e-06, |
| "loss": 0.5043, |
| "step": 720 |
| }, |
| { |
| "epoch": 2.1843971631205674, |
| "grad_norm": 0.5842758806376896, |
| "learning_rate": 5e-06, |
| "loss": 0.5007, |
| "step": 730 |
| }, |
| { |
| "epoch": 2.2142590518850316, |
| "grad_norm": 0.4993809622395107, |
| "learning_rate": 5e-06, |
| "loss": 0.5059, |
| "step": 740 |
| }, |
| { |
| "epoch": 2.244120940649496, |
| "grad_norm": 0.5166071459346999, |
| "learning_rate": 5e-06, |
| "loss": 0.5063, |
| "step": 750 |
| }, |
| { |
| "epoch": 2.2739828294139603, |
| "grad_norm": 0.5111085144892387, |
| "learning_rate": 5e-06, |
| "loss": 0.5037, |
| "step": 760 |
| }, |
| { |
| "epoch": 2.303844718178425, |
| "grad_norm": 0.576142870826061, |
| "learning_rate": 5e-06, |
| "loss": 0.5028, |
| "step": 770 |
| }, |
| { |
| "epoch": 2.333706606942889, |
| "grad_norm": 0.5410227430534348, |
| "learning_rate": 5e-06, |
| "loss": 0.5059, |
| "step": 780 |
| }, |
| { |
| "epoch": 2.3635684957073533, |
| "grad_norm": 0.5135660021965249, |
| "learning_rate": 5e-06, |
| "loss": 0.5071, |
| "step": 790 |
| }, |
| { |
| "epoch": 2.393430384471818, |
| "grad_norm": 0.4639020925564197, |
| "learning_rate": 5e-06, |
| "loss": 0.5043, |
| "step": 800 |
| }, |
| { |
| "epoch": 2.423292273236282, |
| "grad_norm": 0.535934151004682, |
| "learning_rate": 5e-06, |
| "loss": 0.5106, |
| "step": 810 |
| }, |
| { |
| "epoch": 2.4531541620007467, |
| "grad_norm": 0.5086724526817524, |
| "learning_rate": 5e-06, |
| "loss": 0.5115, |
| "step": 820 |
| }, |
| { |
| "epoch": 2.483016050765211, |
| "grad_norm": 0.6279988186129756, |
| "learning_rate": 5e-06, |
| "loss": 0.5028, |
| "step": 830 |
| }, |
| { |
| "epoch": 2.512877939529675, |
| "grad_norm": 0.5761751573387168, |
| "learning_rate": 5e-06, |
| "loss": 0.5047, |
| "step": 840 |
| }, |
| { |
| "epoch": 2.5427398282941396, |
| "grad_norm": 0.47474901886125564, |
| "learning_rate": 5e-06, |
| "loss": 0.5092, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.572601717058604, |
| "grad_norm": 0.5788604577321906, |
| "learning_rate": 5e-06, |
| "loss": 0.502, |
| "step": 860 |
| }, |
| { |
| "epoch": 2.6024636058230683, |
| "grad_norm": 0.532963628175296, |
| "learning_rate": 5e-06, |
| "loss": 0.5055, |
| "step": 870 |
| }, |
| { |
| "epoch": 2.6323254945875325, |
| "grad_norm": 0.5495325925375805, |
| "learning_rate": 5e-06, |
| "loss": 0.5094, |
| "step": 880 |
| }, |
| { |
| "epoch": 2.662187383351997, |
| "grad_norm": 0.5542802761836805, |
| "learning_rate": 5e-06, |
| "loss": 0.5103, |
| "step": 890 |
| }, |
| { |
| "epoch": 2.6920492721164613, |
| "grad_norm": 0.5166995770046318, |
| "learning_rate": 5e-06, |
| "loss": 0.5072, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.721911160880926, |
| "grad_norm": 0.5164196659122122, |
| "learning_rate": 5e-06, |
| "loss": 0.5118, |
| "step": 910 |
| }, |
| { |
| "epoch": 2.75177304964539, |
| "grad_norm": 0.5715114900194982, |
| "learning_rate": 5e-06, |
| "loss": 0.5062, |
| "step": 920 |
| }, |
| { |
| "epoch": 2.781634938409854, |
| "grad_norm": 0.49790842007119634, |
| "learning_rate": 5e-06, |
| "loss": 0.5123, |
| "step": 930 |
| }, |
| { |
| "epoch": 2.811496827174319, |
| "grad_norm": 0.4962762066824268, |
| "learning_rate": 5e-06, |
| "loss": 0.5027, |
| "step": 940 |
| }, |
| { |
| "epoch": 2.841358715938783, |
| "grad_norm": 0.5347697025577659, |
| "learning_rate": 5e-06, |
| "loss": 0.5113, |
| "step": 950 |
| }, |
| { |
| "epoch": 2.8712206047032476, |
| "grad_norm": 0.5207475991953775, |
| "learning_rate": 5e-06, |
| "loss": 0.5113, |
| "step": 960 |
| }, |
| { |
| "epoch": 2.9010824934677117, |
| "grad_norm": 0.5426588692816309, |
| "learning_rate": 5e-06, |
| "loss": 0.5109, |
| "step": 970 |
| }, |
| { |
| "epoch": 2.9309443822321763, |
| "grad_norm": 0.5227682582590668, |
| "learning_rate": 5e-06, |
| "loss": 0.5098, |
| "step": 980 |
| }, |
| { |
| "epoch": 2.9608062709966405, |
| "grad_norm": 0.517717134903454, |
| "learning_rate": 5e-06, |
| "loss": 0.5103, |
| "step": 990 |
| }, |
| { |
| "epoch": 2.990668159761105, |
| "grad_norm": 0.5280565704672064, |
| "learning_rate": 5e-06, |
| "loss": 0.5009, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.996640537513998, |
| "eval_loss": 0.6044070720672607, |
| "eval_runtime": 223.5055, |
| "eval_samples_per_second": 40.37, |
| "eval_steps_per_second": 0.631, |
| "step": 1002 |
| }, |
| { |
| "epoch": 2.996640537513998, |
| "step": 1002, |
| "total_flos": 1678177940275200.0, |
| "train_loss": 0.5665482763282791, |
| "train_runtime": 33246.3656, |
| "train_samples_per_second": 15.469, |
| "train_steps_per_second": 0.03 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1002, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1678177940275200.0, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|