| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9989550679205852, |
| "eval_steps": 500, |
| "global_step": 478, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0020898641588296763, |
| "grad_norm": 216.79754638671875, |
| "learning_rate": 6.2499999999999995e-06, |
| "loss": 57.9838, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.01044932079414838, |
| "grad_norm": 184.4412841796875, |
| "learning_rate": 3.125e-05, |
| "loss": 60.093, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.02089864158829676, |
| "grad_norm": 107.91060638427734, |
| "learning_rate": 6.25e-05, |
| "loss": 48.3094, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.03134796238244514, |
| "grad_norm": 17.1436710357666, |
| "learning_rate": 9.374999999999999e-05, |
| "loss": 33.2668, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.04179728317659352, |
| "grad_norm": 12.335116386413574, |
| "learning_rate": 0.000125, |
| "loss": 27.698, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.0522466039707419, |
| "grad_norm": 6.2943196296691895, |
| "learning_rate": 0.00015625, |
| "loss": 25.9692, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.06269592476489028, |
| "grad_norm": 5.466517448425293, |
| "learning_rate": 0.00018749999999999998, |
| "loss": 25.2691, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.07314524555903866, |
| "grad_norm": 9.744288444519043, |
| "learning_rate": 0.00021874999999999998, |
| "loss": 23.7082, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.08359456635318704, |
| "grad_norm": 19.27219581604004, |
| "learning_rate": 0.00025, |
| "loss": 21.3655, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.09404388714733543, |
| "grad_norm": 41.77222442626953, |
| "learning_rate": 0.00028125, |
| "loss": 16.1707, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.1044932079414838, |
| "grad_norm": 18.60293960571289, |
| "learning_rate": 0.0002999839868651235, |
| "loss": 8.0969, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.11494252873563218, |
| "grad_norm": 11.452897071838379, |
| "learning_rate": 0.00029980387835984494, |
| "loss": 4.1367, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.12539184952978055, |
| "grad_norm": 8.422245979309082, |
| "learning_rate": 0.000299423886051382, |
| "loss": 3.1254, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.13584117032392895, |
| "grad_norm": 2.444629669189453, |
| "learning_rate": 0.0002988445169647103, |
| "loss": 2.4463, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.14629049111807732, |
| "grad_norm": 1.307098627090454, |
| "learning_rate": 0.0002980665441538907, |
| "loss": 2.1685, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.15673981191222572, |
| "grad_norm": 2.10964298248291, |
| "learning_rate": 0.0002970910056705806, |
| "loss": 2.0392, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.1671891327063741, |
| "grad_norm": 1.1905853748321533, |
| "learning_rate": 0.0002959192031789579, |
| "loss": 1.9225, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.17763845350052246, |
| "grad_norm": 0.8916841745376587, |
| "learning_rate": 0.0002945527002189068, |
| "loss": 1.8422, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.18808777429467086, |
| "grad_norm": 3.186051845550537, |
| "learning_rate": 0.00029299332011978107, |
| "loss": 1.748, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.19853709508881923, |
| "grad_norm": 3.865817070007324, |
| "learning_rate": 0.00029124314356752967, |
| "loss": 1.7184, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.2089864158829676, |
| "grad_norm": 2.8790738582611084, |
| "learning_rate": 0.0002893045058284311, |
| "loss": 1.6432, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.219435736677116, |
| "grad_norm": 1.6771491765975952, |
| "learning_rate": 0.00028717999363313967, |
| "loss": 1.6567, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.22988505747126436, |
| "grad_norm": 2.725285530090332, |
| "learning_rate": 0.00028487244172520246, |
| "loss": 1.6157, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.24033437826541273, |
| "grad_norm": 2.289280652999878, |
| "learning_rate": 0.0002823849290786517, |
| "loss": 1.6148, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.2507836990595611, |
| "grad_norm": 2.0211188793182373, |
| "learning_rate": 0.0002797207747897198, |
| "loss": 1.5858, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.2612330198537095, |
| "grad_norm": 2.0264103412628174, |
| "learning_rate": 0.00027688353364815834, |
| "loss": 1.5708, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.2716823406478579, |
| "grad_norm": 0.9253348112106323, |
| "learning_rate": 0.0002738769913940706, |
| "loss": 1.5481, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.28213166144200624, |
| "grad_norm": 3.3143184185028076, |
| "learning_rate": 0.00027070515966658604, |
| "loss": 1.5535, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.29258098223615464, |
| "grad_norm": 4.024845600128174, |
| "learning_rate": 0.0002673722706511174, |
| "loss": 1.5542, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.30303030303030304, |
| "grad_norm": 3.718261241912842, |
| "learning_rate": 0.00026388277143234146, |
| "loss": 1.5507, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.31347962382445144, |
| "grad_norm": 1.9526076316833496, |
| "learning_rate": 0.0002602413180604401, |
| "loss": 1.5251, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.3239289446185998, |
| "grad_norm": 1.5725075006484985, |
| "learning_rate": 0.00025645276933851667, |
| "loss": 1.4937, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.3343782654127482, |
| "grad_norm": 4.266882419586182, |
| "learning_rate": 0.00025252218033947993, |
| "loss": 1.4944, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.3448275862068966, |
| "grad_norm": 2.6647915840148926, |
| "learning_rate": 0.0002484547956610429, |
| "loss": 1.4798, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.3552769070010449, |
| "grad_norm": 2.0770153999328613, |
| "learning_rate": 0.0002442560424278399, |
| "loss": 1.4708, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.3657262277951933, |
| "grad_norm": 1.8132774829864502, |
| "learning_rate": 0.00023993152304999582, |
| "loss": 1.4554, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.3761755485893417, |
| "grad_norm": 1.9493850469589233, |
| "learning_rate": 0.00023548700774781242, |
| "loss": 1.485, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.38662486938349006, |
| "grad_norm": 3.6726951599121094, |
| "learning_rate": 0.00023092842685254442, |
| "loss": 1.4584, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.39707419017763845, |
| "grad_norm": 2.253319501876831, |
| "learning_rate": 0.00022626186289353913, |
| "loss": 1.4569, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.40752351097178685, |
| "grad_norm": 3.336820125579834, |
| "learning_rate": 0.00022149354248229784, |
| "loss": 1.4334, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.4179728317659352, |
| "grad_norm": 3.0895018577575684, |
| "learning_rate": 0.0002166298280042877, |
| "loss": 1.4203, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.4284221525600836, |
| "grad_norm": 1.8486225605010986, |
| "learning_rate": 0.00021167720912959004, |
| "loss": 1.414, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.438871473354232, |
| "grad_norm": 0.7216203808784485, |
| "learning_rate": 0.00020664229415371266, |
| "loss": 1.3897, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.44932079414838033, |
| "grad_norm": 2.909454107284546, |
| "learning_rate": 0.0002015318011801192, |
| "loss": 1.3713, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.45977011494252873, |
| "grad_norm": 1.5531753301620483, |
| "learning_rate": 0.0001963525491562421, |
| "loss": 1.4055, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.4702194357366771, |
| "grad_norm": 4.848015308380127, |
| "learning_rate": 0.00019111144877493873, |
| "loss": 1.435, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.48066875653082547, |
| "grad_norm": 4.833097457885742, |
| "learning_rate": 0.00018581549325353126, |
| "loss": 1.417, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.49111807732497387, |
| "grad_norm": 1.415703296661377, |
| "learning_rate": 0.00018047174900273435, |
| "loss": 1.4449, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.5015673981191222, |
| "grad_norm": 0.9621894359588623, |
| "learning_rate": 0.00017508734619791966, |
| "loss": 1.3907, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.5120167189132706, |
| "grad_norm": 2.091428279876709, |
| "learning_rate": 0.0001696694692653004, |
| "loss": 1.3581, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.522466039707419, |
| "grad_norm": 1.3531287908554077, |
| "learning_rate": 0.00016422534729572738, |
| "loss": 1.3717, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.5329153605015674, |
| "grad_norm": 1.8569897413253784, |
| "learning_rate": 0.0001587622443988899, |
| "loss": 1.3811, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.5433646812957158, |
| "grad_norm": 4.248292446136475, |
| "learning_rate": 0.0001532874500107902, |
| "loss": 1.3797, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.5538140020898642, |
| "grad_norm": 2.5460174083709717, |
| "learning_rate": 0.0001478082691674256, |
| "loss": 1.3576, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.5642633228840125, |
| "grad_norm": 1.3485275506973267, |
| "learning_rate": 0.00014233201275765494, |
| "loss": 1.383, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.5747126436781609, |
| "grad_norm": 1.1686965227127075, |
| "learning_rate": 0.00013686598776825563, |
| "loss": 1.3715, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.5851619644723093, |
| "grad_norm": 1.8593087196350098, |
| "learning_rate": 0.0001314174875341878, |
| "loss": 1.3671, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.5956112852664577, |
| "grad_norm": 1.5989689826965332, |
| "learning_rate": 0.0001259937820070732, |
| "loss": 1.3379, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.6060606060606061, |
| "grad_norm": 3.129467248916626, |
| "learning_rate": 0.00012060210805487529, |
| "loss": 1.3436, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.6165099268547545, |
| "grad_norm": 1.071311593055725, |
| "learning_rate": 0.00011524965980572284, |
| "loss": 1.3711, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.6269592476489029, |
| "grad_norm": 2.8161048889160156, |
| "learning_rate": 0.00010994357904876106, |
| "loss": 1.3242, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.6374085684430512, |
| "grad_norm": 0.9445050954818726, |
| "learning_rate": 0.00010469094570483928, |
| "loss": 1.3217, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.6478578892371996, |
| "grad_norm": 1.53034508228302, |
| "learning_rate": 9.949876837974944e-05, |
| "loss": 1.314, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.658307210031348, |
| "grad_norm": 1.8168761730194092, |
| "learning_rate": 9.437397501262026e-05, |
| "loss": 1.3365, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.6687565308254964, |
| "grad_norm": 1.4955302476882935, |
| "learning_rate": 8.932340363194595e-05, |
| "loss": 1.3154, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.6792058516196448, |
| "grad_norm": 1.2552021741867065, |
| "learning_rate": 8.435379323158218e-05, |
| "loss": 1.3366, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.6896551724137931, |
| "grad_norm": 2.914289712905884, |
| "learning_rate": 7.947177477888472e-05, |
| "loss": 1.3233, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.7001044932079414, |
| "grad_norm": 1.3406000137329102, |
| "learning_rate": 7.46838623669881e-05, |
| "loss": 1.3264, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.7105538140020898, |
| "grad_norm": 0.9025297164916992, |
| "learning_rate": 6.999644452302975e-05, |
| "loss": 1.3197, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.7210031347962382, |
| "grad_norm": 1.2824598550796509, |
| "learning_rate": 6.541577568391758e-05, |
| "loss": 1.3201, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.7314524555903866, |
| "grad_norm": 0.9296241998672485, |
| "learning_rate": 6.0947967851014405e-05, |
| "loss": 1.3097, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.741901776384535, |
| "grad_norm": 0.8738858699798584, |
| "learning_rate": 5.659898243487463e-05, |
| "loss": 1.3044, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.7523510971786834, |
| "grad_norm": 1.8482000827789307, |
| "learning_rate": 5.237462230091467e-05, |
| "loss": 1.3108, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.7628004179728317, |
| "grad_norm": 2.537909746170044, |
| "learning_rate": 4.8280524026630565e-05, |
| "loss": 1.3164, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.7732497387669801, |
| "grad_norm": 1.3068586587905884, |
| "learning_rate": 4.432215038069449e-05, |
| "loss": 1.2782, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.7836990595611285, |
| "grad_norm": 1.3742858171463013, |
| "learning_rate": 4.0504783033964645e-05, |
| "loss": 1.3179, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.7941483803552769, |
| "grad_norm": 1.2923156023025513, |
| "learning_rate": 3.6833515512134606e-05, |
| "loss": 1.2904, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.8045977011494253, |
| "grad_norm": 0.7867398262023926, |
| "learning_rate": 3.331324639942526e-05, |
| "loss": 1.3029, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.8150470219435737, |
| "grad_norm": 1.1442195177078247, |
| "learning_rate": 2.9948672802388135e-05, |
| "loss": 1.3069, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.8254963427377221, |
| "grad_norm": 1.4821033477783203, |
| "learning_rate": 2.67442840825406e-05, |
| "loss": 1.3177, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.8359456635318704, |
| "grad_norm": 0.9633380770683289, |
| "learning_rate": 2.3704355866196373e-05, |
| "loss": 1.3249, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.8463949843260188, |
| "grad_norm": 1.2908155918121338, |
| "learning_rate": 2.083294433948324e-05, |
| "loss": 1.3449, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.8568443051201672, |
| "grad_norm": 1.1834619045257568, |
| "learning_rate": 1.813388083616068e-05, |
| "loss": 1.3086, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.8672936259143156, |
| "grad_norm": 1.1399352550506592, |
| "learning_rate": 1.5610766725458834e-05, |
| "loss": 1.315, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.877742946708464, |
| "grad_norm": 1.2300066947937012, |
| "learning_rate": 1.326696860675981e-05, |
| "loss": 1.2894, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.8881922675026124, |
| "grad_norm": 0.9975532293319702, |
| "learning_rate": 1.1105613817532976e-05, |
| "loss": 1.2953, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.8986415882967607, |
| "grad_norm": 0.9357336163520813, |
| "learning_rate": 9.129586260518634e-06, |
| "loss": 1.3159, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.9090909090909091, |
| "grad_norm": 0.7603440880775452, |
| "learning_rate": 7.34152255572697e-06, |
| "loss": 1.2897, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.9195402298850575, |
| "grad_norm": 0.8711851835250854, |
| "learning_rate": 5.743808522387544e-06, |
| "loss": 1.275, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.9299895506792059, |
| "grad_norm": 0.9144044518470764, |
| "learning_rate": 4.33857599554282e-06, |
| "loss": 1.328, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.9404388714733543, |
| "grad_norm": 0.862479567527771, |
| "learning_rate": 3.1276999815337544e-06, |
| "loss": 1.2879, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.9508881922675027, |
| "grad_norm": 0.7352892756462097, |
| "learning_rate": 2.1127961561727193e-06, |
| "loss": 1.2873, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.9613375130616509, |
| "grad_norm": 2.582821846008301, |
| "learning_rate": 1.2952187089419642e-06, |
| "loss": 1.3191, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.9717868338557993, |
| "grad_norm": 0.7060139179229736, |
| "learning_rate": 6.760585360942872e-07, |
| "loss": 1.3047, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.9822361546499477, |
| "grad_norm": 0.8089200258255005, |
| "learning_rate": 2.5614178506644934e-07, |
| "loss": 1.2743, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.9926854754440961, |
| "grad_norm": 1.2739328145980835, |
| "learning_rate": 3.6028752148081766e-08, |
| "loss": 1.3004, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.9989550679205852, |
| "eval_loss": 1.9203195571899414, |
| "eval_runtime": 0.8302, |
| "eval_samples_per_second": 2.409, |
| "eval_steps_per_second": 1.205, |
| "step": 478 |
| }, |
| { |
| "epoch": 0.9989550679205852, |
| "step": 478, |
| "total_flos": 3.643767570437243e+17, |
| "train_loss": 4.360991338805674, |
| "train_runtime": 2613.4355, |
| "train_samples_per_second": 2.928, |
| "train_steps_per_second": 0.183 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 478, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.643767570437243e+17, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|