| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 1923, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.015600624024960999, |
| "grad_norm": 12.956160545349121, |
| "learning_rate": 2.7835051546391753e-06, |
| "loss": 1.1012, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.031201248049921998, |
| "grad_norm": 12.30282974243164, |
| "learning_rate": 5.876288659793814e-06, |
| "loss": 0.8156, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.046801872074883, |
| "grad_norm": 9.357769012451172, |
| "learning_rate": 8.969072164948454e-06, |
| "loss": 0.7969, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.062402496099843996, |
| "grad_norm": 3.8810768127441406, |
| "learning_rate": 1.2061855670103093e-05, |
| "loss": 0.3807, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.078003120124805, |
| "grad_norm": 3.557420253753662, |
| "learning_rate": 1.5154639175257731e-05, |
| "loss": 0.3692, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.093603744149766, |
| "grad_norm": 1.7188315391540527, |
| "learning_rate": 1.824742268041237e-05, |
| "loss": 0.2744, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.10920436817472699, |
| "grad_norm": 1.362113118171692, |
| "learning_rate": 2.134020618556701e-05, |
| "loss": 0.2794, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.12480499219968799, |
| "grad_norm": 3.378025770187378, |
| "learning_rate": 2.443298969072165e-05, |
| "loss": 0.2633, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.14040561622464898, |
| "grad_norm": 1.2982730865478516, |
| "learning_rate": 2.7525773195876287e-05, |
| "loss": 0.2838, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.15600624024961, |
| "grad_norm": 1.5544312000274658, |
| "learning_rate": 2.9999911198761025e-05, |
| "loss": 0.2872, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.17160686427457097, |
| "grad_norm": 2.317873001098633, |
| "learning_rate": 2.999680326579471e-05, |
| "loss": 0.2635, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.187207488299532, |
| "grad_norm": 1.5369235277175903, |
| "learning_rate": 2.998925632224497e-05, |
| "loss": 0.2571, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.20280811232449297, |
| "grad_norm": 1.3865573406219482, |
| "learning_rate": 2.9977272601985376e-05, |
| "loss": 0.2297, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.21840873634945399, |
| "grad_norm": 1.2493661642074585, |
| "learning_rate": 2.9960855652162606e-05, |
| "loss": 0.2551, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.23400936037441497, |
| "grad_norm": 1.4932328462600708, |
| "learning_rate": 2.994001033214654e-05, |
| "loss": 0.2507, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.24960998439937598, |
| "grad_norm": 1.5236423015594482, |
| "learning_rate": 2.9914742812091878e-05, |
| "loss": 0.245, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.26521060842433697, |
| "grad_norm": 1.3999335765838623, |
| "learning_rate": 2.9885060571111795e-05, |
| "loss": 0.2361, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.28081123244929795, |
| "grad_norm": 0.8492459058761597, |
| "learning_rate": 2.985097239506416e-05, |
| "loss": 0.2581, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.296411856474259, |
| "grad_norm": 1.2287479639053345, |
| "learning_rate": 2.9812488373950918e-05, |
| "loss": 0.2581, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.31201248049922, |
| "grad_norm": 1.5606762170791626, |
| "learning_rate": 2.9769619898931505e-05, |
| "loss": 0.2614, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.32761310452418096, |
| "grad_norm": 1.4384827613830566, |
| "learning_rate": 2.9722379658951095e-05, |
| "loss": 0.228, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.34321372854914195, |
| "grad_norm": 0.8863910436630249, |
| "learning_rate": 2.9670781636984686e-05, |
| "loss": 0.2408, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.358814352574103, |
| "grad_norm": 1.1458382606506348, |
| "learning_rate": 2.96148411058982e-05, |
| "loss": 0.223, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.374414976599064, |
| "grad_norm": 1.1114429235458374, |
| "learning_rate": 2.955457462392777e-05, |
| "loss": 0.2607, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.39001560062402496, |
| "grad_norm": 0.9094457626342773, |
| "learning_rate": 2.9490000029778514e-05, |
| "loss": 0.2267, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.40561622464898595, |
| "grad_norm": 2.2145955562591553, |
| "learning_rate": 2.9421136437344358e-05, |
| "loss": 0.2514, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.42121684867394693, |
| "grad_norm": 0.9296560287475586, |
| "learning_rate": 2.934800423005037e-05, |
| "loss": 0.2339, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.43681747269890797, |
| "grad_norm": 1.1148079633712769, |
| "learning_rate": 2.927062505481933e-05, |
| "loss": 0.2093, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.45241809672386896, |
| "grad_norm": 1.0592412948608398, |
| "learning_rate": 2.9189021815664287e-05, |
| "loss": 0.2627, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.46801872074882994, |
| "grad_norm": 0.6646207571029663, |
| "learning_rate": 2.910321866690906e-05, |
| "loss": 0.2257, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.4836193447737909, |
| "grad_norm": 1.14007568359375, |
| "learning_rate": 2.901324100603861e-05, |
| "loss": 0.2554, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.49921996879875197, |
| "grad_norm": 1.2674685716629028, |
| "learning_rate": 2.8919115466181455e-05, |
| "loss": 0.2222, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.514820592823713, |
| "grad_norm": 1.4763505458831787, |
| "learning_rate": 2.882086990822637e-05, |
| "loss": 0.2234, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.5304212168486739, |
| "grad_norm": 0.9958274960517883, |
| "learning_rate": 2.8718533412575613e-05, |
| "loss": 0.2532, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.5460218408736349, |
| "grad_norm": 1.1231701374053955, |
| "learning_rate": 2.8612136270537206e-05, |
| "loss": 0.2221, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.5616224648985959, |
| "grad_norm": 1.0642913579940796, |
| "learning_rate": 2.8501709975358828e-05, |
| "loss": 0.223, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.5772230889235569, |
| "grad_norm": 1.683724284172058, |
| "learning_rate": 2.8387287212905888e-05, |
| "loss": 0.2255, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.592823712948518, |
| "grad_norm": 0.9343106150627136, |
| "learning_rate": 2.826890185198658e-05, |
| "loss": 0.2726, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.608424336973479, |
| "grad_norm": 0.8196080923080444, |
| "learning_rate": 2.8146588934326855e-05, |
| "loss": 0.2227, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.62402496099844, |
| "grad_norm": 1.024862289428711, |
| "learning_rate": 2.8020384664198134e-05, |
| "loss": 0.2337, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.6396255850234009, |
| "grad_norm": 1.8752734661102295, |
| "learning_rate": 2.7890326397700974e-05, |
| "loss": 0.2206, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.6552262090483619, |
| "grad_norm": 1.076338768005371, |
| "learning_rate": 2.7756452631707753e-05, |
| "loss": 0.2174, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.6708268330733229, |
| "grad_norm": 1.7308673858642578, |
| "learning_rate": 2.7618802992467718e-05, |
| "loss": 0.2517, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.6864274570982839, |
| "grad_norm": 0.9615013003349304, |
| "learning_rate": 2.747741822387772e-05, |
| "loss": 0.2595, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.7020280811232449, |
| "grad_norm": 1.0114976167678833, |
| "learning_rate": 2.733234017542215e-05, |
| "loss": 0.2129, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.717628705148206, |
| "grad_norm": 1.1466189622879028, |
| "learning_rate": 2.7183611789785597e-05, |
| "loss": 0.23, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.733229329173167, |
| "grad_norm": 0.9956033229827881, |
| "learning_rate": 2.7031277090141938e-05, |
| "loss": 0.1992, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.748829953198128, |
| "grad_norm": 0.8617722392082214, |
| "learning_rate": 2.687538116712363e-05, |
| "loss": 0.2076, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.7644305772230889, |
| "grad_norm": 1.1923143863677979, |
| "learning_rate": 2.6715970165474982e-05, |
| "loss": 0.2123, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.7800312012480499, |
| "grad_norm": 0.8156811594963074, |
| "learning_rate": 2.6553091270393456e-05, |
| "loss": 0.225, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.7800312012480499, |
| "eval_loss": 0.2404618114233017, |
| "eval_runtime": 222.9357, |
| "eval_samples_per_second": 8.621, |
| "eval_steps_per_second": 8.621, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.7956318252730109, |
| "grad_norm": 0.9273092150688171, |
| "learning_rate": 2.6386792693562992e-05, |
| "loss": 0.2312, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.8112324492979719, |
| "grad_norm": 1.1608659029006958, |
| "learning_rate": 2.621712365888347e-05, |
| "loss": 0.209, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.8268330733229329, |
| "grad_norm": 1.0634217262268066, |
| "learning_rate": 2.6044134387900598e-05, |
| "loss": 0.207, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.8424336973478939, |
| "grad_norm": 0.863330066204071, |
| "learning_rate": 2.586787608494046e-05, |
| "loss": 0.2295, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.858034321372855, |
| "grad_norm": 0.7286916971206665, |
| "learning_rate": 2.5688400921953197e-05, |
| "loss": 0.2349, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.8736349453978159, |
| "grad_norm": 1.5327069759368896, |
| "learning_rate": 2.5505762023070265e-05, |
| "loss": 0.2505, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.8892355694227769, |
| "grad_norm": 0.9913854002952576, |
| "learning_rate": 2.5320013448879812e-05, |
| "loss": 0.2405, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.9048361934477379, |
| "grad_norm": 0.7204810976982117, |
| "learning_rate": 2.513121018042494e-05, |
| "loss": 0.1962, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.9204368174726989, |
| "grad_norm": 0.8549244999885559, |
| "learning_rate": 2.4939408102929457e-05, |
| "loss": 0.2358, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.9360374414976599, |
| "grad_norm": 0.8404679894447327, |
| "learning_rate": 2.474466398925601e-05, |
| "loss": 0.2517, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.9516380655226209, |
| "grad_norm": 1.46381413936615, |
| "learning_rate": 2.4547035483101474e-05, |
| "loss": 0.2414, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.9672386895475819, |
| "grad_norm": 1.4833369255065918, |
| "learning_rate": 2.43465810819346e-05, |
| "loss": 0.2236, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.982839313572543, |
| "grad_norm": 1.524997353553772, |
| "learning_rate": 2.4143360119680928e-05, |
| "loss": 0.2404, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.9984399375975039, |
| "grad_norm": 0.5853265523910522, |
| "learning_rate": 2.3937432749160113e-05, |
| "loss": 0.2316, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.0140405616224648, |
| "grad_norm": 1.2552859783172607, |
| "learning_rate": 2.3728859924280858e-05, |
| "loss": 0.2286, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.029641185647426, |
| "grad_norm": 1.6384315490722656, |
| "learning_rate": 2.351770338199875e-05, |
| "loss": 0.2048, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.045241809672387, |
| "grad_norm": 1.1269832849502563, |
| "learning_rate": 2.3304025624042265e-05, |
| "loss": 0.1966, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.0608424336973479, |
| "grad_norm": 1.2746951580047607, |
| "learning_rate": 2.308788989841249e-05, |
| "loss": 0.2279, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.076443057722309, |
| "grad_norm": 0.9508239030838013, |
| "learning_rate": 2.2869360180661844e-05, |
| "loss": 0.2257, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.0920436817472698, |
| "grad_norm": 0.7616346478462219, |
| "learning_rate": 2.264850115495752e-05, |
| "loss": 0.2217, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.107644305772231, |
| "grad_norm": 0.8200326561927795, |
| "learning_rate": 2.2425378194935163e-05, |
| "loss": 0.1953, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.1232449297971918, |
| "grad_norm": 1.595325231552124, |
| "learning_rate": 2.220005734434847e-05, |
| "loss": 0.247, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.138845553822153, |
| "grad_norm": 0.860009491443634, |
| "learning_rate": 2.1972605297520388e-05, |
| "loss": 0.183, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.154446177847114, |
| "grad_norm": 1.3904730081558228, |
| "learning_rate": 2.1743089379601842e-05, |
| "loss": 0.217, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.1700468018720749, |
| "grad_norm": 0.9801428914070129, |
| "learning_rate": 2.1511577526643646e-05, |
| "loss": 0.2296, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.185647425897036, |
| "grad_norm": 1.275903344154358, |
| "learning_rate": 2.1278138265487627e-05, |
| "loss": 0.2245, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.2012480499219969, |
| "grad_norm": 1.0965229272842407, |
| "learning_rate": 2.1042840693482907e-05, |
| "loss": 0.1869, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.216848673946958, |
| "grad_norm": 1.6734888553619385, |
| "learning_rate": 2.080575445803326e-05, |
| "loss": 0.2308, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.2324492979719188, |
| "grad_norm": 1.082309603691101, |
| "learning_rate": 2.056694973598169e-05, |
| "loss": 0.1968, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.24804992199688, |
| "grad_norm": 1.0727277994155884, |
| "learning_rate": 2.0326497212838283e-05, |
| "loss": 0.219, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.2636505460218408, |
| "grad_norm": 1.4003663063049316, |
| "learning_rate": 2.008446806185751e-05, |
| "loss": 0.2348, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.2792511700468019, |
| "grad_norm": 1.290280818939209, |
| "learning_rate": 1.9840933922971144e-05, |
| "loss": 0.203, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.294851794071763, |
| "grad_norm": 1.0119068622589111, |
| "learning_rate": 1.9595966881583032e-05, |
| "loss": 0.1948, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.3104524180967239, |
| "grad_norm": 0.9613682627677917, |
| "learning_rate": 1.9349639447232046e-05, |
| "loss": 0.1845, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.3260530421216847, |
| "grad_norm": 1.4405962228775024, |
| "learning_rate": 1.9102024532129452e-05, |
| "loss": 0.1807, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.3416536661466458, |
| "grad_norm": 1.6739627122879028, |
| "learning_rate": 1.8853195429577124e-05, |
| "loss": 0.2036, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.357254290171607, |
| "grad_norm": 1.0754071474075317, |
| "learning_rate": 1.8603225792272897e-05, |
| "loss": 0.2174, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.3728549141965678, |
| "grad_norm": 1.03872811794281, |
| "learning_rate": 1.8352189610509642e-05, |
| "loss": 0.2496, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.388455538221529, |
| "grad_norm": 1.1556732654571533, |
| "learning_rate": 1.8100161190274293e-05, |
| "loss": 0.2099, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.4040561622464898, |
| "grad_norm": 1.54631769657135, |
| "learning_rate": 1.7847215131253534e-05, |
| "loss": 0.2034, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.4196567862714509, |
| "grad_norm": 2.124622344970703, |
| "learning_rate": 1.759342630475247e-05, |
| "loss": 0.1891, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.435257410296412, |
| "grad_norm": 0.9314622282981873, |
| "learning_rate": 1.7338869831532962e-05, |
| "loss": 0.2302, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.4508580343213728, |
| "grad_norm": 0.9277411103248596, |
| "learning_rate": 1.7083621059578093e-05, |
| "loss": 0.2167, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.466458658346334, |
| "grad_norm": 1.0425158739089966, |
| "learning_rate": 1.6827755541789363e-05, |
| "loss": 0.2091, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.4820592823712948, |
| "grad_norm": 1.8837333917617798, |
| "learning_rate": 1.657134901362329e-05, |
| "loss": 0.2039, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.497659906396256, |
| "grad_norm": 1.4193438291549683, |
| "learning_rate": 1.6314477370673874e-05, |
| "loss": 0.2343, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.513260530421217, |
| "grad_norm": 1.1180918216705322, |
| "learning_rate": 1.6057216646207774e-05, |
| "loss": 0.2061, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.5288611544461779, |
| "grad_norm": 0.8356765508651733, |
| "learning_rate": 1.579964298865865e-05, |
| "loss": 0.2248, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.5444617784711387, |
| "grad_norm": 1.0649689435958862, |
| "learning_rate": 1.554183263908745e-05, |
| "loss": 0.1944, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.5600624024960998, |
| "grad_norm": 0.9574515223503113, |
| "learning_rate": 1.5283861908615286e-05, |
| "loss": 0.2144, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.5600624024960998, |
| "eval_loss": 0.22991187870502472, |
| "eval_runtime": 212.0865, |
| "eval_samples_per_second": 9.062, |
| "eval_steps_per_second": 9.062, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.575663026521061, |
| "grad_norm": 1.9467267990112305, |
| "learning_rate": 1.5025807155835557e-05, |
| "loss": 0.2127, |
| "step": 1010 |
| }, |
| { |
| "epoch": 1.5912636505460218, |
| "grad_norm": 1.6572612524032593, |
| "learning_rate": 1.4767744764212002e-05, |
| "loss": 0.1974, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.6068642745709827, |
| "grad_norm": 1.8733537197113037, |
| "learning_rate": 1.450975111946947e-05, |
| "loss": 0.2245, |
| "step": 1030 |
| }, |
| { |
| "epoch": 1.6224648985959438, |
| "grad_norm": 1.1123310327529907, |
| "learning_rate": 1.42519025869839e-05, |
| "loss": 0.1946, |
| "step": 1040 |
| }, |
| { |
| "epoch": 1.6380655226209049, |
| "grad_norm": 1.3432146310806274, |
| "learning_rate": 1.3994275489178445e-05, |
| "loss": 0.2209, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.653666146645866, |
| "grad_norm": 1.7226901054382324, |
| "learning_rate": 1.3736946082932203e-05, |
| "loss": 0.1922, |
| "step": 1060 |
| }, |
| { |
| "epoch": 1.6692667706708268, |
| "grad_norm": 0.8934373259544373, |
| "learning_rate": 1.347999053700846e-05, |
| "loss": 0.1996, |
| "step": 1070 |
| }, |
| { |
| "epoch": 1.6848673946957877, |
| "grad_norm": 1.0361180305480957, |
| "learning_rate": 1.3223484909508899e-05, |
| "loss": 0.1875, |
| "step": 1080 |
| }, |
| { |
| "epoch": 1.7004680187207488, |
| "grad_norm": 1.2863364219665527, |
| "learning_rate": 1.296750512536065e-05, |
| "loss": 0.1951, |
| "step": 1090 |
| }, |
| { |
| "epoch": 1.71606864274571, |
| "grad_norm": 1.3062769174575806, |
| "learning_rate": 1.2712126953842734e-05, |
| "loss": 0.2124, |
| "step": 1100 |
| }, |
| { |
| "epoch": 1.7316692667706708, |
| "grad_norm": 1.2328706979751587, |
| "learning_rate": 1.245742598615855e-05, |
| "loss": 0.182, |
| "step": 1110 |
| }, |
| { |
| "epoch": 1.7472698907956317, |
| "grad_norm": 1.0888112783432007, |
| "learning_rate": 1.2203477613061136e-05, |
| "loss": 0.2155, |
| "step": 1120 |
| }, |
| { |
| "epoch": 1.7628705148205928, |
| "grad_norm": 0.9680263996124268, |
| "learning_rate": 1.1950357002537672e-05, |
| "loss": 0.2049, |
| "step": 1130 |
| }, |
| { |
| "epoch": 1.7784711388455539, |
| "grad_norm": 1.1818790435791016, |
| "learning_rate": 1.1698139077560021e-05, |
| "loss": 0.2048, |
| "step": 1140 |
| }, |
| { |
| "epoch": 1.794071762870515, |
| "grad_norm": 1.0962156057357788, |
| "learning_rate": 1.1446898493907707e-05, |
| "loss": 0.2145, |
| "step": 1150 |
| }, |
| { |
| "epoch": 1.8096723868954758, |
| "grad_norm": 1.179892659187317, |
| "learning_rate": 1.1196709618070055e-05, |
| "loss": 0.2048, |
| "step": 1160 |
| }, |
| { |
| "epoch": 1.8252730109204367, |
| "grad_norm": 0.753470242023468, |
| "learning_rate": 1.0947646505233888e-05, |
| "loss": 0.2039, |
| "step": 1170 |
| }, |
| { |
| "epoch": 1.8408736349453978, |
| "grad_norm": 1.2412258386611938, |
| "learning_rate": 1.0699782877363435e-05, |
| "loss": 0.2305, |
| "step": 1180 |
| }, |
| { |
| "epoch": 1.856474258970359, |
| "grad_norm": 0.9623850584030151, |
| "learning_rate": 1.0453192101378812e-05, |
| "loss": 0.2028, |
| "step": 1190 |
| }, |
| { |
| "epoch": 1.8720748829953198, |
| "grad_norm": 1.9972094297409058, |
| "learning_rate": 1.0207947167439665e-05, |
| "loss": 0.185, |
| "step": 1200 |
| }, |
| { |
| "epoch": 1.8876755070202809, |
| "grad_norm": 1.424430251121521, |
| "learning_rate": 9.964120667340252e-06, |
| "loss": 0.2163, |
| "step": 1210 |
| }, |
| { |
| "epoch": 1.9032761310452417, |
| "grad_norm": 0.9559535980224609, |
| "learning_rate": 9.721784773022505e-06, |
| "loss": 0.1774, |
| "step": 1220 |
| }, |
| { |
| "epoch": 1.9188767550702028, |
| "grad_norm": 1.1195014715194702, |
| "learning_rate": 9.481011215213333e-06, |
| "loss": 0.212, |
| "step": 1230 |
| }, |
| { |
| "epoch": 1.934477379095164, |
| "grad_norm": 2.1014575958251953, |
| "learning_rate": 9.241871262192553e-06, |
| "loss": 0.1878, |
| "step": 1240 |
| }, |
| { |
| "epoch": 1.9500780031201248, |
| "grad_norm": 0.9159352779388428, |
| "learning_rate": 9.004435698697638e-06, |
| "loss": 0.1992, |
| "step": 1250 |
| }, |
| { |
| "epoch": 1.9656786271450857, |
| "grad_norm": 1.7032984495162964, |
| "learning_rate": 8.768774804971705e-06, |
| "loss": 0.2284, |
| "step": 1260 |
| }, |
| { |
| "epoch": 1.9812792511700468, |
| "grad_norm": 0.9082481861114502, |
| "learning_rate": 8.534958335960701e-06, |
| "loss": 0.1948, |
| "step": 1270 |
| }, |
| { |
| "epoch": 1.9968798751950079, |
| "grad_norm": 1.1760114431381226, |
| "learning_rate": 8.303055500666185e-06, |
| "loss": 0.1939, |
| "step": 1280 |
| }, |
| { |
| "epoch": 2.012480499219969, |
| "grad_norm": 1.515101432800293, |
| "learning_rate": 8.073134941659631e-06, |
| "loss": 0.2213, |
| "step": 1290 |
| }, |
| { |
| "epoch": 2.0280811232449296, |
| "grad_norm": 0.7668793201446533, |
| "learning_rate": 7.845264714764464e-06, |
| "loss": 0.1783, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.0436817472698907, |
| "grad_norm": 1.2395737171173096, |
| "learning_rate": 7.619512268911687e-06, |
| "loss": 0.201, |
| "step": 1310 |
| }, |
| { |
| "epoch": 2.059282371294852, |
| "grad_norm": 1.0414377450942993, |
| "learning_rate": 7.395944426175209e-06, |
| "loss": 0.1866, |
| "step": 1320 |
| }, |
| { |
| "epoch": 2.074882995319813, |
| "grad_norm": 1.3810758590698242, |
| "learning_rate": 7.174627361992733e-06, |
| "loss": 0.2028, |
| "step": 1330 |
| }, |
| { |
| "epoch": 2.090483619344774, |
| "grad_norm": 1.0247944593429565, |
| "learning_rate": 6.955626585577968e-06, |
| "loss": 0.1705, |
| "step": 1340 |
| }, |
| { |
| "epoch": 2.1060842433697347, |
| "grad_norm": 1.0820848941802979, |
| "learning_rate": 6.73900692053012e-06, |
| "loss": 0.2025, |
| "step": 1350 |
| }, |
| { |
| "epoch": 2.1216848673946958, |
| "grad_norm": 1.0407085418701172, |
| "learning_rate": 6.5248324856462825e-06, |
| "loss": 0.1858, |
| "step": 1360 |
| }, |
| { |
| "epoch": 2.137285491419657, |
| "grad_norm": 0.9962034821510315, |
| "learning_rate": 6.313166675942475e-06, |
| "loss": 0.1958, |
| "step": 1370 |
| }, |
| { |
| "epoch": 2.152886115444618, |
| "grad_norm": 1.0945219993591309, |
| "learning_rate": 6.104072143888874e-06, |
| "loss": 0.1727, |
| "step": 1380 |
| }, |
| { |
| "epoch": 2.1684867394695786, |
| "grad_norm": 1.7445493936538696, |
| "learning_rate": 5.897610780864885e-06, |
| "loss": 0.2164, |
| "step": 1390 |
| }, |
| { |
| "epoch": 2.1840873634945397, |
| "grad_norm": 1.2947014570236206, |
| "learning_rate": 5.693843698839448e-06, |
| "loss": 0.2124, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.199687987519501, |
| "grad_norm": 1.7516402006149292, |
| "learning_rate": 5.4928312122821106e-06, |
| "loss": 0.169, |
| "step": 1410 |
| }, |
| { |
| "epoch": 2.215288611544462, |
| "grad_norm": 0.8668001294136047, |
| "learning_rate": 5.294632820310068e-06, |
| "loss": 0.1479, |
| "step": 1420 |
| }, |
| { |
| "epoch": 2.230889235569423, |
| "grad_norm": 1.2017163038253784, |
| "learning_rate": 5.099307189076637e-06, |
| "loss": 0.1909, |
| "step": 1430 |
| }, |
| { |
| "epoch": 2.2464898595943836, |
| "grad_norm": 1.1504899263381958, |
| "learning_rate": 4.906912134406216e-06, |
| "loss": 0.188, |
| "step": 1440 |
| }, |
| { |
| "epoch": 2.2620904836193447, |
| "grad_norm": 1.597075343132019, |
| "learning_rate": 4.717504604680997e-06, |
| "loss": 0.1938, |
| "step": 1450 |
| }, |
| { |
| "epoch": 2.277691107644306, |
| "grad_norm": 1.7201188802719116, |
| "learning_rate": 4.531140663984368e-06, |
| "loss": 0.1867, |
| "step": 1460 |
| }, |
| { |
| "epoch": 2.293291731669267, |
| "grad_norm": 1.5416109561920166, |
| "learning_rate": 4.3478754755061526e-06, |
| "loss": 0.1807, |
| "step": 1470 |
| }, |
| { |
| "epoch": 2.308892355694228, |
| "grad_norm": 1.2527023553848267, |
| "learning_rate": 4.167763285214421e-06, |
| "loss": 0.1839, |
| "step": 1480 |
| }, |
| { |
| "epoch": 2.3244929797191887, |
| "grad_norm": 1.764147162437439, |
| "learning_rate": 3.990857405798876e-06, |
| "loss": 0.2142, |
| "step": 1490 |
| }, |
| { |
| "epoch": 2.3400936037441498, |
| "grad_norm": 1.8016437292099, |
| "learning_rate": 3.817210200890411e-06, |
| "loss": 0.195, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.3400936037441498, |
| "eval_loss": 0.24235089123249054, |
| "eval_runtime": 211.616, |
| "eval_samples_per_second": 9.082, |
| "eval_steps_per_second": 9.082, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.355694227769111, |
| "grad_norm": 1.1703391075134277, |
| "learning_rate": 3.6468730695616733e-06, |
| "loss": 0.1855, |
| "step": 1510 |
| }, |
| { |
| "epoch": 2.371294851794072, |
| "grad_norm": 1.0875886678695679, |
| "learning_rate": 3.479896431113043e-06, |
| "loss": 0.1939, |
| "step": 1520 |
| }, |
| { |
| "epoch": 2.3868954758190326, |
| "grad_norm": 0.8574866652488708, |
| "learning_rate": 3.3163297101486995e-06, |
| "loss": 0.1834, |
| "step": 1530 |
| }, |
| { |
| "epoch": 2.4024960998439937, |
| "grad_norm": 0.8828145265579224, |
| "learning_rate": 3.156221321947055e-06, |
| "loss": 0.1806, |
| "step": 1540 |
| }, |
| { |
| "epoch": 2.418096723868955, |
| "grad_norm": 1.8176056146621704, |
| "learning_rate": 2.999618658129983e-06, |
| "loss": 0.1717, |
| "step": 1550 |
| }, |
| { |
| "epoch": 2.433697347893916, |
| "grad_norm": 0.9186724424362183, |
| "learning_rate": 2.846568072635042e-06, |
| "loss": 0.1721, |
| "step": 1560 |
| }, |
| { |
| "epoch": 2.4492979719188765, |
| "grad_norm": 1.5324952602386475, |
| "learning_rate": 2.6971148679948256e-06, |
| "loss": 0.1928, |
| "step": 1570 |
| }, |
| { |
| "epoch": 2.4648985959438376, |
| "grad_norm": 1.231868028640747, |
| "learning_rate": 2.551303281927559e-06, |
| "loss": 0.1904, |
| "step": 1580 |
| }, |
| { |
| "epoch": 2.4804992199687987, |
| "grad_norm": 2.1853291988372803, |
| "learning_rate": 2.4091764742428483e-06, |
| "loss": 0.1843, |
| "step": 1590 |
| }, |
| { |
| "epoch": 2.49609984399376, |
| "grad_norm": 0.9732924103736877, |
| "learning_rate": 2.2707765140665256e-06, |
| "loss": 0.1801, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.511700468018721, |
| "grad_norm": 1.5212299823760986, |
| "learning_rate": 2.1361443673882688e-06, |
| "loss": 0.1802, |
| "step": 1610 |
| }, |
| { |
| "epoch": 2.5273010920436816, |
| "grad_norm": 1.6022634506225586, |
| "learning_rate": 2.0053198849358323e-06, |
| "loss": 0.1965, |
| "step": 1620 |
| }, |
| { |
| "epoch": 2.5429017160686427, |
| "grad_norm": 1.337381362915039, |
| "learning_rate": 1.8783417903793037e-06, |
| "loss": 0.202, |
| "step": 1630 |
| }, |
| { |
| "epoch": 2.5585023400936038, |
| "grad_norm": 2.099640369415283, |
| "learning_rate": 1.7552476688690482e-06, |
| "loss": 0.2049, |
| "step": 1640 |
| }, |
| { |
| "epoch": 2.574102964118565, |
| "grad_norm": 1.7517180442810059, |
| "learning_rate": 1.6360739559105786e-06, |
| "loss": 0.1762, |
| "step": 1650 |
| }, |
| { |
| "epoch": 2.589703588143526, |
| "grad_norm": 1.3570579290390015, |
| "learning_rate": 1.52085592657977e-06, |
| "loss": 0.1519, |
| "step": 1660 |
| }, |
| { |
| "epoch": 2.6053042121684866, |
| "grad_norm": 1.470625638961792, |
| "learning_rate": 1.409627685081531e-06, |
| "loss": 0.1724, |
| "step": 1670 |
| }, |
| { |
| "epoch": 2.6209048361934477, |
| "grad_norm": 1.7645882368087769, |
| "learning_rate": 1.3024221546550713e-06, |
| "loss": 0.1788, |
| "step": 1680 |
| }, |
| { |
| "epoch": 2.636505460218409, |
| "grad_norm": 1.4340243339538574, |
| "learning_rate": 1.1992710678286929e-06, |
| "loss": 0.17, |
| "step": 1690 |
| }, |
| { |
| "epoch": 2.6521060842433695, |
| "grad_norm": 1.6330922842025757, |
| "learning_rate": 1.100204957027079e-06, |
| "loss": 0.1713, |
| "step": 1700 |
| }, |
| { |
| "epoch": 2.667706708268331, |
| "grad_norm": 1.0880082845687866, |
| "learning_rate": 1.005253145533761e-06, |
| "loss": 0.1758, |
| "step": 1710 |
| }, |
| { |
| "epoch": 2.6833073322932917, |
| "grad_norm": 2.9010987281799316, |
| "learning_rate": 9.144437388115295e-07, |
| "loss": 0.2119, |
| "step": 1720 |
| }, |
| { |
| "epoch": 2.6989079563182528, |
| "grad_norm": 1.8082084655761719, |
| "learning_rate": 8.278036161832869e-07, |
| "loss": 0.1732, |
| "step": 1730 |
| }, |
| { |
| "epoch": 2.714508580343214, |
| "grad_norm": 1.6634337902069092, |
| "learning_rate": 7.453584228758553e-07, |
| "loss": 0.2212, |
| "step": 1740 |
| }, |
| { |
| "epoch": 2.7301092043681745, |
| "grad_norm": 1.1831082105636597, |
| "learning_rate": 6.671325624290503e-07, |
| "loss": 0.1902, |
| "step": 1750 |
| }, |
| { |
| "epoch": 2.7457098283931356, |
| "grad_norm": 1.9070043563842773, |
| "learning_rate": 5.931491894723107e-07, |
| "loss": 0.1737, |
| "step": 1760 |
| }, |
| { |
| "epoch": 2.7613104524180967, |
| "grad_norm": 1.6935715675354004, |
| "learning_rate": 5.234302028710008e-07, |
| "loss": 0.1841, |
| "step": 1770 |
| }, |
| { |
| "epoch": 2.776911076443058, |
| "grad_norm": 1.1902638673782349, |
| "learning_rate": 4.579962392443959e-07, |
| "loss": 0.1501, |
| "step": 1780 |
| }, |
| { |
| "epoch": 2.792511700468019, |
| "grad_norm": 1.1872891187667847, |
| "learning_rate": 3.968666668573179e-07, |
| "loss": 0.195, |
| "step": 1790 |
| }, |
| { |
| "epoch": 2.8081123244929795, |
| "grad_norm": 1.0661053657531738, |
| "learning_rate": 3.4005957988716473e-07, |
| "loss": 0.1911, |
| "step": 1800 |
| }, |
| { |
| "epoch": 2.8237129485179406, |
| "grad_norm": 1.0829426050186157, |
| "learning_rate": 2.8759179306810657e-07, |
| "loss": 0.17, |
| "step": 1810 |
| }, |
| { |
| "epoch": 2.8393135725429017, |
| "grad_norm": 1.3797425031661987, |
| "learning_rate": 2.3947883671396e-07, |
| "loss": 0.192, |
| "step": 1820 |
| }, |
| { |
| "epoch": 2.854914196567863, |
| "grad_norm": 1.4285950660705566, |
| "learning_rate": 1.9573495212126535e-07, |
| "loss": 0.1988, |
| "step": 1830 |
| }, |
| { |
| "epoch": 2.870514820592824, |
| "grad_norm": 0.9140704274177551, |
| "learning_rate": 1.5637308735390044e-07, |
| "loss": 0.2175, |
| "step": 1840 |
| }, |
| { |
| "epoch": 2.8861154446177846, |
| "grad_norm": 1.3677566051483154, |
| "learning_rate": 1.2140489341049777e-07, |
| "loss": 0.1775, |
| "step": 1850 |
| }, |
| { |
| "epoch": 2.9017160686427457, |
| "grad_norm": 1.7777496576309204, |
| "learning_rate": 9.084072077576999e-08, |
| "loss": 0.1381, |
| "step": 1860 |
| }, |
| { |
| "epoch": 2.9173166926677068, |
| "grad_norm": 1.4862127304077148, |
| "learning_rate": 6.468961635680893e-08, |
| "loss": 0.177, |
| "step": 1870 |
| }, |
| { |
| "epoch": 2.932917316692668, |
| "grad_norm": 1.7681442499160767, |
| "learning_rate": 4.295932080521925e-08, |
| "loss": 0.1928, |
| "step": 1880 |
| }, |
| { |
| "epoch": 2.948517940717629, |
| "grad_norm": 3.2976081371307373, |
| "learning_rate": 2.565626622591466e-08, |
| "loss": 0.1981, |
| "step": 1890 |
| }, |
| { |
| "epoch": 2.9641185647425896, |
| "grad_norm": 1.4172863960266113, |
| "learning_rate": 1.2785574273224132e-08, |
| "loss": 0.1983, |
| "step": 1900 |
| }, |
| { |
| "epoch": 2.9797191887675507, |
| "grad_norm": 1.8934816122055054, |
| "learning_rate": 4.3510546349045945e-09, |
| "loss": 0.1933, |
| "step": 1910 |
| }, |
| { |
| "epoch": 2.995319812792512, |
| "grad_norm": 1.8813408613204956, |
| "learning_rate": 3.552039044829591e-10, |
| "loss": 0.1862, |
| "step": 1920 |
| }, |
| { |
| "epoch": 3.0, |
| "step": 1923, |
| "total_flos": 2.3277958164873216e+17, |
| "train_loss": 0.22298803400137868, |
| "train_runtime": 7941.0421, |
| "train_samples_per_second": 5.811, |
| "train_steps_per_second": 0.242 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 1923, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.3277958164873216e+17, |
| "train_batch_size": 3, |
| "trial_name": null, |
| "trial_params": null |
| } |