| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 6.0, |
| "eval_steps": 500, |
| "global_step": 2490, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.060350030175015085, |
| "grad_norm": 0.31135597825050354, |
| "learning_rate": 8.18181818181818e-05, |
| "loss": 1.9235, |
| "mean_token_accuracy": 0.6165974247455597, |
| "num_tokens": 157030.0, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.12070006035003017, |
| "grad_norm": 0.32249024510383606, |
| "learning_rate": 0.00016704545454545452, |
| "loss": 1.0538, |
| "mean_token_accuracy": 0.7450298243761062, |
| "num_tokens": 284204.0, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.18105009052504525, |
| "grad_norm": 0.3015844523906708, |
| "learning_rate": 0.0002522727272727273, |
| "loss": 0.6726, |
| "mean_token_accuracy": 0.8188469475507736, |
| "num_tokens": 441659.0, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.24140012070006034, |
| "grad_norm": 0.5104885697364807, |
| "learning_rate": 0.0002999887132933212, |
| "loss": 0.5348, |
| "mean_token_accuracy": 0.8504667854309083, |
| "num_tokens": 567533.0, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.30175015087507545, |
| "grad_norm": 0.278422474861145, |
| "learning_rate": 0.0002998791256978121, |
| "loss": 0.4144, |
| "mean_token_accuracy": 0.8816475421190262, |
| "num_tokens": 725296.0, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.3621001810500905, |
| "grad_norm": 0.35799795389175415, |
| "learning_rate": 0.0002996530399366737, |
| "loss": 0.3693, |
| "mean_token_accuracy": 0.8962992638349533, |
| "num_tokens": 851708.0, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.4224502112251056, |
| "grad_norm": 0.2955951690673828, |
| "learning_rate": 0.00029931063174202567, |
| "loss": 0.2746, |
| "mean_token_accuracy": 0.921732594370842, |
| "num_tokens": 1008739.0, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.4828002414001207, |
| "grad_norm": 0.3763117790222168, |
| "learning_rate": 0.00029885216726118104, |
| "loss": 0.2386, |
| "mean_token_accuracy": 0.9305408012866974, |
| "num_tokens": 1134832.0, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.5431502715751357, |
| "grad_norm": 0.3240911066532135, |
| "learning_rate": 0.00029827800284977474, |
| "loss": 0.1885, |
| "mean_token_accuracy": 0.9465595126152039, |
| "num_tokens": 1293095.0, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.6035003017501509, |
| "grad_norm": 0.3682485520839691, |
| "learning_rate": 0.00029758858479477575, |
| "loss": 0.1832, |
| "mean_token_accuracy": 0.9486982274055481, |
| "num_tokens": 1419060.0, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.663850331925166, |
| "grad_norm": 0.29996615648269653, |
| "learning_rate": 0.0002967844489675963, |
| "loss": 0.1501, |
| "mean_token_accuracy": 0.9576406496763229, |
| "num_tokens": 1577978.0, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.724200362100181, |
| "grad_norm": 0.3568132221698761, |
| "learning_rate": 0.00029586622040756957, |
| "loss": 0.1246, |
| "mean_token_accuracy": 0.964940841794014, |
| "num_tokens": 1705052.0, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.7845503922751962, |
| "grad_norm": 0.258465439081192, |
| "learning_rate": 0.0002948346128361186, |
| "loss": 0.1089, |
| "mean_token_accuracy": 0.970317553281784, |
| "num_tokens": 1861669.0, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.8449004224502112, |
| "grad_norm": 0.3066297173500061, |
| "learning_rate": 0.00029369042810199416, |
| "loss": 0.1072, |
| "mean_token_accuracy": 0.9702006059885026, |
| "num_tokens": 1988735.0, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.9052504526252263, |
| "grad_norm": 0.22371642291545868, |
| "learning_rate": 0.0002924345555580135, |
| "loss": 0.1117, |
| "mean_token_accuracy": 0.9696371680498124, |
| "num_tokens": 2146989.0, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.9656004828002414, |
| "grad_norm": 0.4630758762359619, |
| "learning_rate": 0.000291067971369783, |
| "loss": 0.0863, |
| "mean_token_accuracy": 0.9765483093261719, |
| "num_tokens": 2274615.0, |
| "step": 400 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.09050165861845016, |
| "eval_mean_token_accuracy": 0.9750120665576006, |
| "eval_num_tokens": 2354180.0, |
| "eval_runtime": 61.8103, |
| "eval_samples_per_second": 5.97, |
| "eval_steps_per_second": 2.993, |
| "step": 415 |
| }, |
| { |
| "epoch": 1.024140012070006, |
| "grad_norm": 0.17408408224582672, |
| "learning_rate": 0.0002895917377569438, |
| "loss": 0.1029, |
| "mean_token_accuracy": 0.9737048333453149, |
| "num_tokens": 2423619.0, |
| "step": 425 |
| }, |
| { |
| "epoch": 1.0844900422450212, |
| "grad_norm": 0.20633958280086517, |
| "learning_rate": 0.00028800700216752875, |
| "loss": 0.0578, |
| "mean_token_accuracy": 0.9834175568819046, |
| "num_tokens": 2566868.0, |
| "step": 450 |
| }, |
| { |
| "epoch": 1.1448400724200363, |
| "grad_norm": 0.1839175969362259, |
| "learning_rate": 0.00028631499638607285, |
| "loss": 0.0801, |
| "mean_token_accuracy": 0.9778328466415406, |
| "num_tokens": 2709054.0, |
| "step": 475 |
| }, |
| { |
| "epoch": 1.2051901025950513, |
| "grad_norm": 0.25615933537483215, |
| "learning_rate": 0.0002845170355761712, |
| "loss": 0.0614, |
| "mean_token_accuracy": 0.9824786186218262, |
| "num_tokens": 2851375.0, |
| "step": 500 |
| }, |
| { |
| "epoch": 1.2655401327700664, |
| "grad_norm": 0.16976642608642578, |
| "learning_rate": 0.0002826145172582274, |
| "loss": 0.0701, |
| "mean_token_accuracy": 0.9807569885253906, |
| "num_tokens": 2993397.0, |
| "step": 525 |
| }, |
| { |
| "epoch": 1.3258901629450814, |
| "grad_norm": 0.15633493661880493, |
| "learning_rate": 0.00028060892022318764, |
| "loss": 0.0555, |
| "mean_token_accuracy": 0.9843500143289566, |
| "num_tokens": 3136497.0, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.3862401931200965, |
| "grad_norm": 0.16217610239982605, |
| "learning_rate": 0.0002785018033831051, |
| "loss": 0.0652, |
| "mean_token_accuracy": 0.982210916876793, |
| "num_tokens": 3277611.0, |
| "step": 575 |
| }, |
| { |
| "epoch": 1.4465902232951118, |
| "grad_norm": 0.10138845443725586, |
| "learning_rate": 0.0002762948045594276, |
| "loss": 0.0523, |
| "mean_token_accuracy": 0.985246667265892, |
| "num_tokens": 3420182.0, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.5069402534701268, |
| "grad_norm": 0.12522730231285095, |
| "learning_rate": 0.0002739896392099502, |
| "loss": 0.063, |
| "mean_token_accuracy": 0.9825242179632186, |
| "num_tokens": 3561580.0, |
| "step": 625 |
| }, |
| { |
| "epoch": 1.567290283645142, |
| "grad_norm": 0.16772225499153137, |
| "learning_rate": 0.00027158809909542307, |
| "loss": 0.0458, |
| "mean_token_accuracy": 0.9863684195280075, |
| "num_tokens": 3703648.0, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.627640313820157, |
| "grad_norm": 0.11150436848402023, |
| "learning_rate": 0.00026909205088685, |
| "loss": 0.0701, |
| "mean_token_accuracy": 0.9805551666021347, |
| "num_tokens": 3846275.0, |
| "step": 675 |
| }, |
| { |
| "epoch": 1.687990343995172, |
| "grad_norm": 0.13114990293979645, |
| "learning_rate": 0.0002665034347145612, |
| "loss": 0.047, |
| "mean_token_accuracy": 0.9866662234067917, |
| "num_tokens": 3990181.0, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.748340374170187, |
| "grad_norm": 0.1373077780008316, |
| "learning_rate": 0.000263824262660187, |
| "loss": 0.0658, |
| "mean_token_accuracy": 0.9822418791055679, |
| "num_tokens": 4132061.0, |
| "step": 725 |
| }, |
| { |
| "epoch": 1.8086904043452021, |
| "grad_norm": 0.1755433827638626, |
| "learning_rate": 0.0002610566171927056, |
| "loss": 0.0393, |
| "mean_token_accuracy": 0.9886017143726349, |
| "num_tokens": 4273899.0, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.8690404345202172, |
| "grad_norm": 0.12637771666049957, |
| "learning_rate": 0.00025820264954977976, |
| "loss": 0.0632, |
| "mean_token_accuracy": 0.9825896525382996, |
| "num_tokens": 4414846.0, |
| "step": 775 |
| }, |
| { |
| "epoch": 1.9293904646952322, |
| "grad_norm": 0.0651940405368805, |
| "learning_rate": 0.00025526457806564136, |
| "loss": 0.0419, |
| "mean_token_accuracy": 0.9879031604528428, |
| "num_tokens": 4557169.0, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.9897404948702473, |
| "grad_norm": 0.12048754841089249, |
| "learning_rate": 0.00025224468644682245, |
| "loss": 0.0498, |
| "mean_token_accuracy": 0.9858449596166611, |
| "num_tokens": 4688618.0, |
| "step": 825 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.054688576608896255, |
| "eval_mean_token_accuracy": 0.9851620287508578, |
| "eval_num_tokens": 4708360.0, |
| "eval_runtime": 61.7668, |
| "eval_samples_per_second": 5.974, |
| "eval_steps_per_second": 2.995, |
| "step": 830 |
| }, |
| { |
| "epoch": 2.048280024140012, |
| "grad_norm": 0.10747923702001572, |
| "learning_rate": 0.00024914532199707444, |
| "loss": 0.0491, |
| "mean_token_accuracy": 0.9856173869260808, |
| "num_tokens": 4837968.0, |
| "step": 850 |
| }, |
| { |
| "epoch": 2.1086300543150274, |
| "grad_norm": 0.13443028926849365, |
| "learning_rate": 0.00024596889379285353, |
| "loss": 0.0316, |
| "mean_token_accuracy": 0.9905168610811234, |
| "num_tokens": 4971141.0, |
| "step": 875 |
| }, |
| { |
| "epoch": 2.1689800844900424, |
| "grad_norm": 0.1226191446185112, |
| "learning_rate": 0.00024271787081079228, |
| "loss": 0.0527, |
| "mean_token_accuracy": 0.9845808875560761, |
| "num_tokens": 5124003.0, |
| "step": 900 |
| }, |
| { |
| "epoch": 2.2293301146650575, |
| "grad_norm": 0.13954219222068787, |
| "learning_rate": 0.00023939478000861117, |
| "loss": 0.03, |
| "mean_token_accuracy": 0.991157540678978, |
| "num_tokens": 5256623.0, |
| "step": 925 |
| }, |
| { |
| "epoch": 2.2896801448400725, |
| "grad_norm": 0.07871092110872269, |
| "learning_rate": 0.00023600220436096318, |
| "loss": 0.0401, |
| "mean_token_accuracy": 0.9883328771591187, |
| "num_tokens": 5405783.0, |
| "step": 950 |
| }, |
| { |
| "epoch": 2.3500301750150876, |
| "grad_norm": 0.07625500857830048, |
| "learning_rate": 0.00023254278085173684, |
| "loss": 0.0329, |
| "mean_token_accuracy": 0.9900754565000534, |
| "num_tokens": 5538983.0, |
| "step": 975 |
| }, |
| { |
| "epoch": 2.4103802051901027, |
| "grad_norm": 0.08441416919231415, |
| "learning_rate": 0.00022901919842437972, |
| "loss": 0.045, |
| "mean_token_accuracy": 0.9868701863288879, |
| "num_tokens": 5689941.0, |
| "step": 1000 |
| }, |
| { |
| "epoch": 2.4707302353651177, |
| "grad_norm": 0.10544892400503159, |
| "learning_rate": 0.00022543419589183397, |
| "loss": 0.0282, |
| "mean_token_accuracy": 0.991428604722023, |
| "num_tokens": 5823463.0, |
| "step": 1025 |
| }, |
| { |
| "epoch": 2.5310802655401328, |
| "grad_norm": 0.08695073425769806, |
| "learning_rate": 0.00022179055980770993, |
| "loss": 0.0461, |
| "mean_token_accuracy": 0.9864470016956329, |
| "num_tokens": 5974358.0, |
| "step": 1050 |
| }, |
| { |
| "epoch": 2.591430295715148, |
| "grad_norm": 0.09350312501192093, |
| "learning_rate": 0.0002180911223003513, |
| "loss": 0.0327, |
| "mean_token_accuracy": 0.990239828824997, |
| "num_tokens": 6106913.0, |
| "step": 1075 |
| }, |
| { |
| "epoch": 2.651780325890163, |
| "grad_norm": 0.06971506774425507, |
| "learning_rate": 0.00021433875887147627, |
| "loss": 0.0496, |
| "mean_token_accuracy": 0.9856115758419037, |
| "num_tokens": 6260849.0, |
| "step": 1100 |
| }, |
| { |
| "epoch": 2.712130356065178, |
| "grad_norm": 0.07409551739692688, |
| "learning_rate": 0.00021053638616110525, |
| "loss": 0.031, |
| "mean_token_accuracy": 0.9906682467460632, |
| "num_tokens": 6393788.0, |
| "step": 1125 |
| }, |
| { |
| "epoch": 2.772480386240193, |
| "grad_norm": 0.08459241688251495, |
| "learning_rate": 0.00020668695968051274, |
| "loss": 0.0417, |
| "mean_token_accuracy": 0.9878255265951157, |
| "num_tokens": 6545295.0, |
| "step": 1150 |
| }, |
| { |
| "epoch": 2.832830416415208, |
| "grad_norm": 0.06320163607597351, |
| "learning_rate": 0.00020279347151496482, |
| "loss": 0.0305, |
| "mean_token_accuracy": 0.9911946403980255, |
| "num_tokens": 6677075.0, |
| "step": 1175 |
| }, |
| { |
| "epoch": 2.8931804465902236, |
| "grad_norm": 0.08129168301820755, |
| "learning_rate": 0.00019885894799802922, |
| "loss": 0.0465, |
| "mean_token_accuracy": 0.9863498049974442, |
| "num_tokens": 6828145.0, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.9535304767652386, |
| "grad_norm": 0.054761067032814026, |
| "learning_rate": 0.00019488644735926396, |
| "loss": 0.0298, |
| "mean_token_accuracy": 0.9909803092479705, |
| "num_tokens": 6960911.0, |
| "step": 1225 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.04527445137500763, |
| "eval_mean_token_accuracy": 0.9875696955500423, |
| "eval_num_tokens": 7062540.0, |
| "eval_runtime": 61.7191, |
| "eval_samples_per_second": 5.979, |
| "eval_steps_per_second": 2.997, |
| "step": 1245 |
| }, |
| { |
| "epoch": 3.012070006035003, |
| "grad_norm": 0.061567842960357666, |
| "learning_rate": 0.00019087905734711452, |
| "loss": 0.0384, |
| "mean_token_accuracy": 0.9884870881886826, |
| "num_tokens": 7098246.0, |
| "step": 1250 |
| }, |
| { |
| "epoch": 3.0724200362100182, |
| "grad_norm": 0.07762719690799713, |
| "learning_rate": 0.00018683989282886613, |
| "loss": 0.0252, |
| "mean_token_accuracy": 0.9920413041114807, |
| "num_tokens": 7246634.0, |
| "step": 1275 |
| }, |
| { |
| "epoch": 3.1327700663850333, |
| "grad_norm": 0.07748158276081085, |
| "learning_rate": 0.0001827720933695173, |
| "loss": 0.0313, |
| "mean_token_accuracy": 0.9904079920053482, |
| "num_tokens": 7381014.0, |
| "step": 1300 |
| }, |
| { |
| "epoch": 3.1931200965600484, |
| "grad_norm": 0.06404800713062286, |
| "learning_rate": 0.00017867882079145627, |
| "loss": 0.0276, |
| "mean_token_accuracy": 0.9916363900899887, |
| "num_tokens": 7528065.0, |
| "step": 1325 |
| }, |
| { |
| "epoch": 3.2534701267350634, |
| "grad_norm": 0.09366205334663391, |
| "learning_rate": 0.00017456325671683724, |
| "loss": 0.0288, |
| "mean_token_accuracy": 0.9909596383571625, |
| "num_tokens": 7664596.0, |
| "step": 1350 |
| }, |
| { |
| "epoch": 3.3138201569100785, |
| "grad_norm": 0.06945887953042984, |
| "learning_rate": 0.00017042860009456638, |
| "loss": 0.029, |
| "mean_token_accuracy": 0.9913074505329132, |
| "num_tokens": 7814551.0, |
| "step": 1375 |
| }, |
| { |
| "epoch": 3.3741701870850935, |
| "grad_norm": 0.07191596925258636, |
| "learning_rate": 0.00016627806471382066, |
| "loss": 0.0311, |
| "mean_token_accuracy": 0.9902618253231048, |
| "num_tokens": 7952304.0, |
| "step": 1400 |
| }, |
| { |
| "epoch": 3.4345202172601086, |
| "grad_norm": 0.07108636200428009, |
| "learning_rate": 0.00016211487670603078, |
| "loss": 0.0263, |
| "mean_token_accuracy": 0.9916422361135483, |
| "num_tokens": 8102624.0, |
| "step": 1425 |
| }, |
| { |
| "epoch": 3.4948702474351236, |
| "grad_norm": 0.1367121785879135, |
| "learning_rate": 0.0001579422720372715, |
| "loss": 0.0286, |
| "mean_token_accuracy": 0.9912952971458435, |
| "num_tokens": 8239465.0, |
| "step": 1450 |
| }, |
| { |
| "epoch": 3.5552202776101387, |
| "grad_norm": 0.064446821808815, |
| "learning_rate": 0.00015376349399300745, |
| "loss": 0.0246, |
| "mean_token_accuracy": 0.9919304746389389, |
| "num_tokens": 8387292.0, |
| "step": 1475 |
| }, |
| { |
| "epoch": 3.6155703077851538, |
| "grad_norm": 0.1252497136592865, |
| "learning_rate": 0.0001495817906571492, |
| "loss": 0.031, |
| "mean_token_accuracy": 0.990118277668953, |
| "num_tokens": 8523620.0, |
| "step": 1500 |
| }, |
| { |
| "epoch": 3.675920337960169, |
| "grad_norm": 0.04818420112133026, |
| "learning_rate": 0.00014540041238738055, |
| "loss": 0.0262, |
| "mean_token_accuracy": 0.9917533618211746, |
| "num_tokens": 8673400.0, |
| "step": 1525 |
| }, |
| { |
| "epoch": 3.736270368135184, |
| "grad_norm": 0.08937755227088928, |
| "learning_rate": 0.00014122260928871734, |
| "loss": 0.0307, |
| "mean_token_accuracy": 0.9904319751262665, |
| "num_tokens": 8810209.0, |
| "step": 1550 |
| }, |
| { |
| "epoch": 3.796620398310199, |
| "grad_norm": 0.0582498162984848, |
| "learning_rate": 0.00013705162868726396, |
| "loss": 0.0264, |
| "mean_token_accuracy": 0.9915424859523774, |
| "num_tokens": 8958149.0, |
| "step": 1575 |
| }, |
| { |
| "epoch": 3.856970428485214, |
| "grad_norm": 0.08708362281322479, |
| "learning_rate": 0.00013289071260612855, |
| "loss": 0.0267, |
| "mean_token_accuracy": 0.9918914479017258, |
| "num_tokens": 9093310.0, |
| "step": 1600 |
| }, |
| { |
| "epoch": 3.9173204586602295, |
| "grad_norm": 0.08365115523338318, |
| "learning_rate": 0.00012874309524546083, |
| "loss": 0.0263, |
| "mean_token_accuracy": 0.9918374270200729, |
| "num_tokens": 9238911.0, |
| "step": 1625 |
| }, |
| { |
| "epoch": 3.9776704888352445, |
| "grad_norm": 0.08191365003585815, |
| "learning_rate": 0.00012461200046857084, |
| "loss": 0.0256, |
| "mean_token_accuracy": 0.9919486922025681, |
| "num_tokens": 9369805.0, |
| "step": 1650 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 0.04307638853788376, |
| "eval_mean_token_accuracy": 0.9885664443711977, |
| "eval_num_tokens": 9416720.0, |
| "eval_runtime": 61.7987, |
| "eval_samples_per_second": 5.971, |
| "eval_steps_per_second": 2.994, |
| "step": 1660 |
| }, |
| { |
| "epoch": 4.036210018105009, |
| "grad_norm": 0.04609857127070427, |
| "learning_rate": 0.00012050063929608123, |
| "loss": 0.0261, |
| "mean_token_accuracy": 0.9915816790049838, |
| "num_tokens": 9518620.0, |
| "step": 1675 |
| }, |
| { |
| "epoch": 4.096560048280024, |
| "grad_norm": 0.0421195812523365, |
| "learning_rate": 0.0001164122074100633, |
| "loss": 0.0197, |
| "mean_token_accuracy": 0.9934104865789414, |
| "num_tokens": 9656880.0, |
| "step": 1700 |
| }, |
| { |
| "epoch": 4.15691007845504, |
| "grad_norm": 0.06511716544628143, |
| "learning_rate": 0.00011234988267009415, |
| "loss": 0.0238, |
| "mean_token_accuracy": 0.9923344779014588, |
| "num_tokens": 9803496.0, |
| "step": 1725 |
| }, |
| { |
| "epoch": 4.217260108630055, |
| "grad_norm": 0.04106709733605385, |
| "learning_rate": 0.00010831682264316787, |
| "loss": 0.0196, |
| "mean_token_accuracy": 0.9936438763141632, |
| "num_tokens": 9942120.0, |
| "step": 1750 |
| }, |
| { |
| "epoch": 4.27761013880507, |
| "grad_norm": 0.035884249955415726, |
| "learning_rate": 0.00010431616214937911, |
| "loss": 0.0238, |
| "mean_token_accuracy": 0.992466544508934, |
| "num_tokens": 10088250.0, |
| "step": 1775 |
| }, |
| { |
| "epoch": 4.337960168980085, |
| "grad_norm": 0.08095250278711319, |
| "learning_rate": 0.00010035101082528777, |
| "loss": 0.0194, |
| "mean_token_accuracy": 0.993693078160286, |
| "num_tokens": 10225040.0, |
| "step": 1800 |
| }, |
| { |
| "epoch": 4.3983101991551, |
| "grad_norm": 0.05341358855366707, |
| "learning_rate": 9.642445070685809e-05, |
| "loss": 0.0266, |
| "mean_token_accuracy": 0.9913520681858062, |
| "num_tokens": 10370929.0, |
| "step": 1825 |
| }, |
| { |
| "epoch": 4.458660229330115, |
| "grad_norm": 0.07090672850608826, |
| "learning_rate": 9.253953383385157e-05, |
| "loss": 0.0196, |
| "mean_token_accuracy": 0.9933394527435303, |
| "num_tokens": 10508749.0, |
| "step": 1850 |
| }, |
| { |
| "epoch": 4.51901025950513, |
| "grad_norm": 0.05845542252063751, |
| "learning_rate": 8.869927987753459e-05, |
| "loss": 0.0238, |
| "mean_token_accuracy": 0.992053787112236, |
| "num_tokens": 10655521.0, |
| "step": 1875 |
| }, |
| { |
| "epoch": 4.579360289680145, |
| "grad_norm": 0.04508192464709282, |
| "learning_rate": 8.490667379354661e-05, |
| "loss": 0.0186, |
| "mean_token_accuracy": 0.9938723063468933, |
| "num_tokens": 10792437.0, |
| "step": 1900 |
| }, |
| { |
| "epoch": 4.63971031985516, |
| "grad_norm": 0.0697343647480011, |
| "learning_rate": 8.116466350175079e-05, |
| "loss": 0.0235, |
| "mean_token_accuracy": 0.9926882380247116, |
| "num_tokens": 10938497.0, |
| "step": 1925 |
| }, |
| { |
| "epoch": 4.700060350030175, |
| "grad_norm": 0.038746606558561325, |
| "learning_rate": 7.747615759487304e-05, |
| "loss": 0.0188, |
| "mean_token_accuracy": 0.9936485493183136, |
| "num_tokens": 11076307.0, |
| "step": 1950 |
| }, |
| { |
| "epoch": 4.76041038020519, |
| "grad_norm": 0.059882089495658875, |
| "learning_rate": 7.38440230777085e-05, |
| "loss": 0.0225, |
| "mean_token_accuracy": 0.9926686590909958, |
| "num_tokens": 11222212.0, |
| "step": 1975 |
| }, |
| { |
| "epoch": 4.820760410380205, |
| "grad_norm": 0.058393221348524094, |
| "learning_rate": 7.027108313865378e-05, |
| "loss": 0.0189, |
| "mean_token_accuracy": 0.9937667572498321, |
| "num_tokens": 11359271.0, |
| "step": 2000 |
| }, |
| { |
| "epoch": 4.88111044055522, |
| "grad_norm": 0.06429804116487503, |
| "learning_rate": 6.676011495529687e-05, |
| "loss": 0.024, |
| "mean_token_accuracy": 0.9916912001371384, |
| "num_tokens": 11506584.0, |
| "step": 2025 |
| }, |
| { |
| "epoch": 4.941460470730235, |
| "grad_norm": 0.061373207718133926, |
| "learning_rate": 6.331384753577056e-05, |
| "loss": 0.0189, |
| "mean_token_accuracy": 0.9937462341785431, |
| "num_tokens": 11643764.0, |
| "step": 2050 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.17021211981773376, |
| "learning_rate": 5.993495959754631e-05, |
| "loss": 0.0211, |
| "mean_token_accuracy": 0.99315461301312, |
| "num_tokens": 11770900.0, |
| "step": 2075 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 0.04168040677905083, |
| "eval_mean_token_accuracy": 0.9894425063519865, |
| "eval_num_tokens": 11770900.0, |
| "eval_runtime": 61.7626, |
| "eval_samples_per_second": 5.974, |
| "eval_steps_per_second": 2.995, |
| "step": 2075 |
| }, |
| { |
| "epoch": 5.060350030175015, |
| "grad_norm": 0.04657725617289543, |
| "learning_rate": 5.662607748531929e-05, |
| "loss": 0.0193, |
| "mean_token_accuracy": 0.993717749118805, |
| "num_tokens": 11929041.0, |
| "step": 2100 |
| }, |
| { |
| "epoch": 5.12070006035003, |
| "grad_norm": 0.05181601271033287, |
| "learning_rate": 5.338977312960054e-05, |
| "loss": 0.0169, |
| "mean_token_accuracy": 0.9944871377944946, |
| "num_tokens": 12055264.0, |
| "step": 2125 |
| }, |
| { |
| "epoch": 5.181050090525045, |
| "grad_norm": 0.03697885945439339, |
| "learning_rate": 5.022856204760504e-05, |
| "loss": 0.0172, |
| "mean_token_accuracy": 0.9943871372938156, |
| "num_tokens": 12214801.0, |
| "step": 2150 |
| }, |
| { |
| "epoch": 5.24140012070006, |
| "grad_norm": 0.04903125390410423, |
| "learning_rate": 4.7144901387988374e-05, |
| "loss": 0.0169, |
| "mean_token_accuracy": 0.9945520520210266, |
| "num_tokens": 12340609.0, |
| "step": 2175 |
| }, |
| { |
| "epoch": 5.301750150875075, |
| "grad_norm": 0.03869150951504707, |
| "learning_rate": 4.4141188020952563e-05, |
| "loss": 0.017, |
| "mean_token_accuracy": 0.9942963647842408, |
| "num_tokens": 12498584.0, |
| "step": 2200 |
| }, |
| { |
| "epoch": 5.36210018105009, |
| "grad_norm": 0.09273343533277512, |
| "learning_rate": 4.1219756675204454e-05, |
| "loss": 0.0168, |
| "mean_token_accuracy": 0.994449194073677, |
| "num_tokens": 12625206.0, |
| "step": 2225 |
| }, |
| { |
| "epoch": 5.422450211225105, |
| "grad_norm": 0.04754876345396042, |
| "learning_rate": 3.838287812321625e-05, |
| "loss": 0.019, |
| "mean_token_accuracy": 0.993769793510437, |
| "num_tokens": 12781252.0, |
| "step": 2250 |
| }, |
| { |
| "epoch": 5.4828002414001205, |
| "grad_norm": 0.0511883907020092, |
| "learning_rate": 3.5632757416197544e-05, |
| "loss": 0.0169, |
| "mean_token_accuracy": 0.994466689825058, |
| "num_tokens": 12907745.0, |
| "step": 2275 |
| }, |
| { |
| "epoch": 5.5431502715751355, |
| "grad_norm": 0.033768199384212494, |
| "learning_rate": 3.297153217015155e-05, |
| "loss": 0.0186, |
| "mean_token_accuracy": 0.99390440762043, |
| "num_tokens": 13064137.0, |
| "step": 2300 |
| }, |
| { |
| "epoch": 5.603500301750151, |
| "grad_norm": 0.052670273929834366, |
| "learning_rate": 3.0401270904346992e-05, |
| "loss": 0.0168, |
| "mean_token_accuracy": 0.9942348504066467, |
| "num_tokens": 13191301.0, |
| "step": 2325 |
| }, |
| { |
| "epoch": 5.663850331925166, |
| "grad_norm": 0.07454020529985428, |
| "learning_rate": 2.7923971433497916e-05, |
| "loss": 0.018, |
| "mean_token_accuracy": 0.9938998764753342, |
| "num_tokens": 13349099.0, |
| "step": 2350 |
| }, |
| { |
| "epoch": 5.724200362100181, |
| "grad_norm": 0.05309119448065758, |
| "learning_rate": 2.554155931490085e-05, |
| "loss": 0.0168, |
| "mean_token_accuracy": 0.9944328534603118, |
| "num_tokens": 13475909.0, |
| "step": 2375 |
| }, |
| { |
| "epoch": 5.784550392275197, |
| "grad_norm": 0.0445110946893692, |
| "learning_rate": 2.3255886351735707e-05, |
| "loss": 0.0163, |
| "mean_token_accuracy": 0.9946111005544662, |
| "num_tokens": 13632061.0, |
| "step": 2400 |
| }, |
| { |
| "epoch": 5.844900422450211, |
| "grad_norm": 0.049861881881952286, |
| "learning_rate": 2.1068729153695202e-05, |
| "loss": 0.0168, |
| "mean_token_accuracy": 0.9944065082073211, |
| "num_tokens": 13758814.0, |
| "step": 2425 |
| }, |
| { |
| "epoch": 5.905250452625227, |
| "grad_norm": 0.03456846997141838, |
| "learning_rate": 1.898178775605993e-05, |
| "loss": 0.0169, |
| "mean_token_accuracy": 0.9943563443422317, |
| "num_tokens": 13916713.0, |
| "step": 2450 |
| }, |
| { |
| "epoch": 5.965600482800241, |
| "grad_norm": 0.05459899455308914, |
| "learning_rate": 1.699668429829371e-05, |
| "loss": 0.0167, |
| "mean_token_accuracy": 0.9942887610197068, |
| "num_tokens": 14043576.0, |
| "step": 2475 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_loss": 0.04268869385123253, |
| "eval_mean_token_accuracy": 0.9897750036136524, |
| "eval_num_tokens": 14125080.0, |
| "eval_runtime": 61.7471, |
| "eval_samples_per_second": 5.976, |
| "eval_steps_per_second": 2.996, |
| "step": 2490 |
| } |
| ], |
| "logging_steps": 25, |
| "max_steps": 2905, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 7, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 6.068068549186775e+17, |
| "train_batch_size": 2, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|