{ "best_global_step": 6000, "best_metric": 0.95930004, "best_model_checkpoint": "/user/yutianyu/Duplex_Finetune/output/4B_LLaVA_SFT/zero3_0dot6B_LLaVA_SFT_nopacking/v0-20251202-145343/checkpoint-6000", "epoch": 1.4081488957213604, "eval_steps": 100, "global_step": 6000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002347032177811158, "grad_norm": 19.127729366517716, "learning_rate": 4e-09, "loss": 1.9448599815368652, "num_input_tokens_seen": 197295, "step": 1, "token_acc": 0.5819854991634132 }, { "epoch": 0.002347032177811158, "grad_norm": 16.494659300395906, "learning_rate": 4e-08, "loss": 1.92243406507704, "num_input_tokens_seen": 2020272, "step": 10, "token_acc": 0.5839277085360667 }, { "epoch": 0.004694064355622316, "grad_norm": 22.126428554674575, "learning_rate": 8e-08, "loss": 1.916154670715332, "num_input_tokens_seen": 4084884, "step": 20, "token_acc": 0.5835957997637319 }, { "epoch": 0.007041096533433474, "grad_norm": 12.603610772661288, "learning_rate": 1.2e-07, "loss": 1.8856426239013673, "num_input_tokens_seen": 6056667, "step": 30, "token_acc": 0.584776074988841 }, { "epoch": 0.009388128711244632, "grad_norm": 10.779646193701467, "learning_rate": 1.6e-07, "loss": 1.8353569030761718, "num_input_tokens_seen": 8091435, "step": 40, "token_acc": 0.5897754631538845 }, { "epoch": 0.011735160889055789, "grad_norm": 136.86547531489668, "learning_rate": 2e-07, "loss": 1.721211051940918, "num_input_tokens_seen": 10091673, "step": 50, "token_acc": 0.6031858358236022 }, { "epoch": 0.014082193066866948, "grad_norm": 5.080233785680673, "learning_rate": 2.4e-07, "loss": 1.6145668029785156, "num_input_tokens_seen": 12099135, "step": 60, "token_acc": 0.6209089567372474 }, { "epoch": 0.016429225244678103, "grad_norm": 12.207422981911558, "learning_rate": 2.8e-07, "loss": 1.5414657592773438, "num_input_tokens_seen": 14135250, "step": 70, "token_acc": 0.6310852754061408 }, { "epoch": 0.018776257422489263, "grad_norm": 8.118283422146021, "learning_rate": 3.2e-07, "loss": 1.528026008605957, "num_input_tokens_seen": 16200873, "step": 80, "token_acc": 0.6382918453943185 }, { "epoch": 0.02112328960030042, "grad_norm": 3.47292239362735, "learning_rate": 3.6e-07, "loss": 1.4715272903442382, "num_input_tokens_seen": 18177258, "step": 90, "token_acc": 0.6432060553309527 }, { "epoch": 0.023470321778111577, "grad_norm": 2.2332542577632526, "learning_rate": 4e-07, "loss": 1.4491453170776367, "num_input_tokens_seen": 20161581, "step": 100, "token_acc": 0.6477376652924822 }, { "epoch": 0.023470321778111577, "eval_loss": 1.4709749221801758, "eval_runtime": 33.2583, "eval_samples_per_second": 30.068, "eval_steps_per_second": 1.263, "eval_token_acc": 0.6456822326354424, "num_input_tokens_seen": 20161581, "step": 100 }, { "epoch": 0.025817353955922735, "grad_norm": 1.9112779247166412, "learning_rate": 4.3999999999999997e-07, "loss": 1.4166399002075196, "num_input_tokens_seen": 22179654, "step": 110, "token_acc": 0.6549487213586689 }, { "epoch": 0.028164386133733895, "grad_norm": 1.897077748827419, "learning_rate": 4.8e-07, "loss": 1.3960003852844238, "num_input_tokens_seen": 24157263, "step": 120, "token_acc": 0.6563052255139141 }, { "epoch": 0.030511418311545052, "grad_norm": 2.063047689522777, "learning_rate": 5.2e-07, "loss": 1.373966884613037, "num_input_tokens_seen": 26112051, "step": 130, "token_acc": 0.6609459618615088 }, { "epoch": 0.032858450489356206, "grad_norm": 6.453024686764437, "learning_rate": 5.6e-07, "loss": 1.3647557258605958, "num_input_tokens_seen": 28133607, "step": 140, "token_acc": 0.6628235998176535 }, { "epoch": 0.035205482667167366, "grad_norm": 1.9586538878052986, "learning_rate": 6e-07, "loss": 1.394300651550293, "num_input_tokens_seen": 30182052, "step": 150, "token_acc": 0.6566977644780848 }, { "epoch": 0.03755251484497853, "grad_norm": 4.663911418765899, "learning_rate": 6.4e-07, "loss": 1.3559602737426757, "num_input_tokens_seen": 32177622, "step": 160, "token_acc": 0.6647926044470018 }, { "epoch": 0.03989954702278968, "grad_norm": 2.649130437820903, "learning_rate": 6.800000000000001e-07, "loss": 1.320611572265625, "num_input_tokens_seen": 34150332, "step": 170, "token_acc": 0.671068499517214 }, { "epoch": 0.04224657920060084, "grad_norm": 1.988272208775732, "learning_rate": 7.2e-07, "loss": 1.3357341766357422, "num_input_tokens_seen": 36129600, "step": 180, "token_acc": 0.6666514308426074 }, { "epoch": 0.044593611378412, "grad_norm": 2.089282254202976, "learning_rate": 7.599999999999999e-07, "loss": 1.3014695167541503, "num_input_tokens_seen": 38143260, "step": 190, "token_acc": 0.6714680103247839 }, { "epoch": 0.046940643556223155, "grad_norm": 1.6962028687120758, "learning_rate": 8e-07, "loss": 1.316438865661621, "num_input_tokens_seen": 40117473, "step": 200, "token_acc": 0.6719479705996623 }, { "epoch": 0.046940643556223155, "eval_loss": 1.3232439756393433, "eval_runtime": 32.4409, "eval_samples_per_second": 30.825, "eval_steps_per_second": 1.295, "eval_token_acc": 0.6714895778029132, "num_input_tokens_seen": 40117473, "step": 200 }, { "epoch": 0.049287675734034316, "grad_norm": 2.7375989158006453, "learning_rate": 8.399999999999999e-07, "loss": 1.2971059799194335, "num_input_tokens_seen": 42051432, "step": 210, "token_acc": 0.6724870727708412 }, { "epoch": 0.05163470791184547, "grad_norm": 1.8333521024827166, "learning_rate": 8.799999999999999e-07, "loss": 1.251258945465088, "num_input_tokens_seen": 44060409, "step": 220, "token_acc": 0.6825975678761506 }, { "epoch": 0.05398174008965663, "grad_norm": 1.7067135742590114, "learning_rate": 9.2e-07, "loss": 1.2541748046875, "num_input_tokens_seen": 46140576, "step": 230, "token_acc": 0.6838314298998598 }, { "epoch": 0.05632877226746779, "grad_norm": 2.117007115532979, "learning_rate": 9.6e-07, "loss": 1.2801358222961425, "num_input_tokens_seen": 48218463, "step": 240, "token_acc": 0.6776609031540706 }, { "epoch": 0.058675804445278944, "grad_norm": 1.9477588029675073, "learning_rate": 1e-06, "loss": 1.272374153137207, "num_input_tokens_seen": 50212704, "step": 250, "token_acc": 0.6784600688499179 }, { "epoch": 0.061022836623090104, "grad_norm": 2.0653725127756495, "learning_rate": 1.04e-06, "loss": 1.2390222549438477, "num_input_tokens_seen": 52210065, "step": 260, "token_acc": 0.6846190216694448 }, { "epoch": 0.06336986880090126, "grad_norm": 1.6368439315898582, "learning_rate": 1.08e-06, "loss": 1.2289260864257812, "num_input_tokens_seen": 54287394, "step": 270, "token_acc": 0.6870453172664521 }, { "epoch": 0.06571690097871241, "grad_norm": 1.8897694890455825, "learning_rate": 1.12e-06, "loss": 1.2016170501708985, "num_input_tokens_seen": 56286087, "step": 280, "token_acc": 0.6947360302493355 }, { "epoch": 0.06806393315652358, "grad_norm": 4.191924245995845, "learning_rate": 1.16e-06, "loss": 1.2314638137817382, "num_input_tokens_seen": 58259631, "step": 290, "token_acc": 0.6869124082650091 }, { "epoch": 0.07041096533433473, "grad_norm": 2.010449522588459, "learning_rate": 1.2e-06, "loss": 1.2306774139404297, "num_input_tokens_seen": 60262377, "step": 300, "token_acc": 0.6861436424474188 }, { "epoch": 0.07041096533433473, "eval_loss": 1.253986120223999, "eval_runtime": 32.5346, "eval_samples_per_second": 30.737, "eval_steps_per_second": 1.291, "eval_token_acc": 0.6843470834006602, "num_input_tokens_seen": 60262377, "step": 300 }, { "epoch": 0.07275799751214589, "grad_norm": 1.6903068952734268, "learning_rate": 1.24e-06, "loss": 1.2244423866271972, "num_input_tokens_seen": 62257881, "step": 310, "token_acc": 0.6906986736484506 }, { "epoch": 0.07510502968995705, "grad_norm": 2.00802423495143, "learning_rate": 1.28e-06, "loss": 1.2204778671264649, "num_input_tokens_seen": 64218216, "step": 320, "token_acc": 0.6881087345222366 }, { "epoch": 0.07745206186776821, "grad_norm": 2.136483656358153, "learning_rate": 1.32e-06, "loss": 1.1911478996276856, "num_input_tokens_seen": 66217860, "step": 330, "token_acc": 0.6964124357320138 }, { "epoch": 0.07979909404557936, "grad_norm": 1.7796738858497867, "learning_rate": 1.3600000000000001e-06, "loss": 1.195077896118164, "num_input_tokens_seen": 68222130, "step": 340, "token_acc": 0.6935684789950294 }, { "epoch": 0.08214612622339053, "grad_norm": 1.7117012000365959, "learning_rate": 1.4e-06, "loss": 1.2089216232299804, "num_input_tokens_seen": 70209570, "step": 350, "token_acc": 0.6915523828674844 }, { "epoch": 0.08449315840120168, "grad_norm": 1.928181461412703, "learning_rate": 1.44e-06, "loss": 1.195500946044922, "num_input_tokens_seen": 72228831, "step": 360, "token_acc": 0.6958758115748244 }, { "epoch": 0.08684019057901284, "grad_norm": 2.016364423213612, "learning_rate": 1.48e-06, "loss": 1.1975667953491211, "num_input_tokens_seen": 74280357, "step": 370, "token_acc": 0.6944993196346585 }, { "epoch": 0.089187222756824, "grad_norm": 1.9269783210667364, "learning_rate": 1.5199999999999998e-06, "loss": 1.196579933166504, "num_input_tokens_seen": 76255509, "step": 380, "token_acc": 0.693136319725866 }, { "epoch": 0.09153425493463516, "grad_norm": 2.276495563257121, "learning_rate": 1.5599999999999999e-06, "loss": 1.1727699279785155, "num_input_tokens_seen": 78216720, "step": 390, "token_acc": 0.6980378317334839 }, { "epoch": 0.09388128711244631, "grad_norm": 2.665827226302004, "learning_rate": 1.6e-06, "loss": 1.2070913314819336, "num_input_tokens_seen": 80187780, "step": 400, "token_acc": 0.6931723081009408 }, { "epoch": 0.09388128711244631, "eval_loss": 1.210001826286316, "eval_runtime": 32.311, "eval_samples_per_second": 30.949, "eval_steps_per_second": 1.3, "eval_token_acc": 0.693446596338958, "num_input_tokens_seen": 80187780, "step": 400 }, { "epoch": 0.09622831929025746, "grad_norm": 1.8020869110136488, "learning_rate": 1.6399999999999998e-06, "loss": 1.1979689598083496, "num_input_tokens_seen": 82253211, "step": 410, "token_acc": 0.6943290418797176 }, { "epoch": 0.09857535146806863, "grad_norm": 1.613339482743251, "learning_rate": 1.6799999999999998e-06, "loss": 1.1750219345092774, "num_input_tokens_seen": 84275082, "step": 420, "token_acc": 0.6988580180720491 }, { "epoch": 0.10092238364587978, "grad_norm": 2.0225577242890402, "learning_rate": 1.7199999999999998e-06, "loss": 1.160631275177002, "num_input_tokens_seen": 86256174, "step": 430, "token_acc": 0.7049912003932076 }, { "epoch": 0.10326941582369094, "grad_norm": 2.0588425205195047, "learning_rate": 1.7599999999999999e-06, "loss": 1.155072021484375, "num_input_tokens_seen": 88191771, "step": 440, "token_acc": 0.7027978727051616 }, { "epoch": 0.1056164480015021, "grad_norm": 1.750652589288128, "learning_rate": 1.8e-06, "loss": 1.1657937049865723, "num_input_tokens_seen": 90280068, "step": 450, "token_acc": 0.70126095038482 }, { "epoch": 0.10796348017931326, "grad_norm": 1.6965579041737329, "learning_rate": 1.84e-06, "loss": 1.1403490066528321, "num_input_tokens_seen": 92282058, "step": 460, "token_acc": 0.7044917775975158 }, { "epoch": 0.11031051235712441, "grad_norm": 1.8743542107195483, "learning_rate": 1.8799999999999998e-06, "loss": 1.1614572525024414, "num_input_tokens_seen": 94268343, "step": 470, "token_acc": 0.7024686011260286 }, { "epoch": 0.11265754453493558, "grad_norm": 2.2378848353450693, "learning_rate": 1.92e-06, "loss": 1.1589451789855958, "num_input_tokens_seen": 96220941, "step": 480, "token_acc": 0.7040593029694393 }, { "epoch": 0.11500457671274673, "grad_norm": 1.7219641168340587, "learning_rate": 1.96e-06, "loss": 1.139027214050293, "num_input_tokens_seen": 98234790, "step": 490, "token_acc": 0.7069929196641098 }, { "epoch": 0.11735160889055789, "grad_norm": 1.7720161431115489, "learning_rate": 2e-06, "loss": 1.1347829818725585, "num_input_tokens_seen": 100243815, "step": 500, "token_acc": 0.7060081282908567 }, { "epoch": 0.11735160889055789, "eval_loss": 1.1757478713989258, "eval_runtime": 32.3883, "eval_samples_per_second": 30.875, "eval_steps_per_second": 1.297, "eval_token_acc": 0.7003347106484153, "num_input_tokens_seen": 100243815, "step": 500 }, { "epoch": 0.11969864106836904, "grad_norm": 1.8210047186952776, "learning_rate": 1.9999912270311373e-06, "loss": 1.1792086601257323, "num_input_tokens_seen": 102249078, "step": 510, "token_acc": 0.698590893627688 }, { "epoch": 0.12204567324618021, "grad_norm": 1.8609755736841171, "learning_rate": 1.999964908278481e-06, "loss": 1.1209921836853027, "num_input_tokens_seen": 104220897, "step": 520, "token_acc": 0.7090112628579576 }, { "epoch": 0.12439270542399136, "grad_norm": 2.1446809333226584, "learning_rate": 1.9999210442038163e-06, "loss": 1.1469528198242187, "num_input_tokens_seen": 106234191, "step": 530, "token_acc": 0.703947954006619 }, { "epoch": 0.12673973760180252, "grad_norm": 2.046893089210468, "learning_rate": 1.9998596355767802e-06, "loss": 1.1571426391601562, "num_input_tokens_seen": 108272712, "step": 540, "token_acc": 0.7027365001081043 }, { "epoch": 0.12908676977961367, "grad_norm": 1.8591189017227578, "learning_rate": 1.999780683474845e-06, "loss": 1.1333347320556642, "num_input_tokens_seen": 110241915, "step": 550, "token_acc": 0.7072802072223069 }, { "epoch": 0.13143380195742482, "grad_norm": 1.6591451063058131, "learning_rate": 1.9996841892832997e-06, "loss": 1.1434220314025878, "num_input_tokens_seen": 112166943, "step": 560, "token_acc": 0.7056084295682411 }, { "epoch": 0.133780834135236, "grad_norm": 2.020864993257282, "learning_rate": 1.999570154695225e-06, "loss": 1.1571636199951172, "num_input_tokens_seen": 114151494, "step": 570, "token_acc": 0.7044949720967205 }, { "epoch": 0.13612786631304716, "grad_norm": 2.064129107439252, "learning_rate": 1.9994385817114644e-06, "loss": 1.1311494827270507, "num_input_tokens_seen": 116169552, "step": 580, "token_acc": 0.7063148017463998 }, { "epoch": 0.1384748984908583, "grad_norm": 2.0906868028581798, "learning_rate": 1.999289472640589e-06, "loss": 1.1150264739990234, "num_input_tokens_seen": 118161789, "step": 590, "token_acc": 0.7104190105422314 }, { "epoch": 0.14082193066866946, "grad_norm": 1.783399118737723, "learning_rate": 1.999122830098858e-06, "loss": 1.14277925491333, "num_input_tokens_seen": 120188337, "step": 600, "token_acc": 0.7054239286277058 }, { "epoch": 0.14082193066866946, "eval_loss": 1.150290846824646, "eval_runtime": 32.9507, "eval_samples_per_second": 30.348, "eval_steps_per_second": 1.275, "eval_token_acc": 0.7054176958057293, "num_input_tokens_seen": 120188337, "step": 600 }, { "epoch": 0.14316896284648062, "grad_norm": 2.087850842021689, "learning_rate": 1.998938657010171e-06, "loss": 1.1017154693603515, "num_input_tokens_seen": 122187903, "step": 610, "token_acc": 0.7150525542709177 }, { "epoch": 0.14551599502429177, "grad_norm": 1.941074004275762, "learning_rate": 1.9987369566060176e-06, "loss": 1.0946624755859375, "num_input_tokens_seen": 124171368, "step": 620, "token_acc": 0.7163183324905894 }, { "epoch": 0.14786302720210295, "grad_norm": 2.176865017774056, "learning_rate": 1.9985177324254197e-06, "loss": 1.1165874481201172, "num_input_tokens_seen": 126183993, "step": 630, "token_acc": 0.7110827727359269 }, { "epoch": 0.1502100593799141, "grad_norm": 1.7953820815140804, "learning_rate": 1.998280988314872e-06, "loss": 1.1424741744995117, "num_input_tokens_seen": 128176863, "step": 640, "token_acc": 0.7053827925519703 }, { "epoch": 0.15255709155772526, "grad_norm": 1.8929944794579523, "learning_rate": 1.9980267284282714e-06, "loss": 1.1028331756591796, "num_input_tokens_seen": 130125408, "step": 650, "token_acc": 0.7125094339622642 }, { "epoch": 0.15490412373553641, "grad_norm": 1.691459310367227, "learning_rate": 1.9977549572268466e-06, "loss": 1.107553482055664, "num_input_tokens_seen": 132065343, "step": 660, "token_acc": 0.7138998256484975 }, { "epoch": 0.15725115591334757, "grad_norm": 2.1233419395787556, "learning_rate": 1.9974656794790772e-06, "loss": 1.1101640701293944, "num_input_tokens_seen": 134090148, "step": 670, "token_acc": 0.713199782361379 }, { "epoch": 0.15959818809115872, "grad_norm": 1.7559578625602645, "learning_rate": 1.997158900260614e-06, "loss": 1.1094940185546875, "num_input_tokens_seen": 136112988, "step": 680, "token_acc": 0.7122396887639626 }, { "epoch": 0.16194522026896987, "grad_norm": 1.7829226146649233, "learning_rate": 1.9968346249541846e-06, "loss": 1.117540168762207, "num_input_tokens_seen": 138058629, "step": 690, "token_acc": 0.7106555900807559 }, { "epoch": 0.16429225244678106, "grad_norm": 2.1483315176659166, "learning_rate": 1.9964928592495045e-06, "loss": 1.0879833221435546, "num_input_tokens_seen": 140078598, "step": 700, "token_acc": 0.7166827394425921 }, { "epoch": 0.16429225244678106, "eval_loss": 1.1313835382461548, "eval_runtime": 32.334, "eval_samples_per_second": 30.927, "eval_steps_per_second": 1.299, "eval_token_acc": 0.7089033032478474, "num_input_tokens_seen": 140078598, "step": 700 }, { "epoch": 0.1666392846245922, "grad_norm": 1.867471314482214, "learning_rate": 1.9961336091431724e-06, "loss": 1.1190789222717286, "num_input_tokens_seen": 142099659, "step": 710, "token_acc": 0.712375749359721 }, { "epoch": 0.16898631680240336, "grad_norm": 1.9022337846097856, "learning_rate": 1.995756880938569e-06, "loss": 1.0825121879577637, "num_input_tokens_seen": 144092310, "step": 720, "token_acc": 0.7172525783126845 }, { "epoch": 0.17133334898021452, "grad_norm": 1.9984437173736713, "learning_rate": 1.9953626812457438e-06, "loss": 1.095411491394043, "num_input_tokens_seen": 146064039, "step": 730, "token_acc": 0.714463713054313 }, { "epoch": 0.17368038115802567, "grad_norm": 2.1447202509234304, "learning_rate": 1.9949510169813e-06, "loss": 1.1152179718017579, "num_input_tokens_seen": 148112049, "step": 740, "token_acc": 0.712060909164676 }, { "epoch": 0.17602741333583682, "grad_norm": 1.6936993245361356, "learning_rate": 1.994521895368273e-06, "loss": 1.0852348327636718, "num_input_tokens_seen": 150133974, "step": 750, "token_acc": 0.7162799236018076 }, { "epoch": 0.178374445513648, "grad_norm": 2.352601598144833, "learning_rate": 1.9940753239360045e-06, "loss": 1.1107561111450195, "num_input_tokens_seen": 152099280, "step": 760, "token_acc": 0.7127718906860011 }, { "epoch": 0.18072147769145916, "grad_norm": 1.84112693117569, "learning_rate": 1.9936113105200084e-06, "loss": 1.110912036895752, "num_input_tokens_seen": 154146792, "step": 770, "token_acc": 0.7112778436268925 }, { "epoch": 0.1830685098692703, "grad_norm": 2.3592228692729367, "learning_rate": 1.9931298632618353e-06, "loss": 1.127957820892334, "num_input_tokens_seen": 156087093, "step": 780, "token_acc": 0.7073941119432238 }, { "epoch": 0.18541554204708147, "grad_norm": 1.8453081635946817, "learning_rate": 1.9926309906089288e-06, "loss": 1.0826932907104492, "num_input_tokens_seen": 158083548, "step": 790, "token_acc": 0.7176969639197369 }, { "epoch": 0.18776257422489262, "grad_norm": 1.6598465812647105, "learning_rate": 1.9921147013144777e-06, "loss": 1.097795295715332, "num_input_tokens_seen": 160083087, "step": 800, "token_acc": 0.712001722391892 }, { "epoch": 0.18776257422489262, "eval_loss": 1.1155238151550293, "eval_runtime": 32.4633, "eval_samples_per_second": 30.804, "eval_steps_per_second": 1.294, "eval_token_acc": 0.712074975185245, "num_input_tokens_seen": 160083087, "step": 800 }, { "epoch": 0.19010960640270377, "grad_norm": 2.345143984991418, "learning_rate": 1.9915810044372615e-06, "loss": 1.0773065567016602, "num_input_tokens_seen": 162043827, "step": 810, "token_acc": 0.7185854363462685 }, { "epoch": 0.19245663858051493, "grad_norm": 1.6218625881025774, "learning_rate": 1.991029909341493e-06, "loss": 1.1322909355163575, "num_input_tokens_seen": 164065197, "step": 820, "token_acc": 0.7112101172756877 }, { "epoch": 0.1948036707583261, "grad_norm": 3.4128089423104204, "learning_rate": 1.990461425696651e-06, "loss": 1.1018625259399415, "num_input_tokens_seen": 166095825, "step": 830, "token_acc": 0.7132049834650468 }, { "epoch": 0.19715070293613726, "grad_norm": 3.8983014033715273, "learning_rate": 1.9898755634773155e-06, "loss": 1.092278289794922, "num_input_tokens_seen": 168127596, "step": 840, "token_acc": 0.7165934113928826 }, { "epoch": 0.19949773511394842, "grad_norm": 1.7080322418933676, "learning_rate": 1.9892723329629885e-06, "loss": 1.0770910263061524, "num_input_tokens_seen": 170112078, "step": 850, "token_acc": 0.7174151496405977 }, { "epoch": 0.20184476729175957, "grad_norm": 1.6399516756726806, "learning_rate": 1.988651744737914e-06, "loss": 1.119683837890625, "num_input_tokens_seen": 172089120, "step": 860, "token_acc": 0.7093057553740301 }, { "epoch": 0.20419179946957072, "grad_norm": 1.9211847623415963, "learning_rate": 1.988013809690895e-06, "loss": 1.0811002731323243, "num_input_tokens_seen": 174102978, "step": 870, "token_acc": 0.7170278749197704 }, { "epoch": 0.20653883164738188, "grad_norm": 1.7860065176012982, "learning_rate": 1.9873585390151003e-06, "loss": 1.0824663162231445, "num_input_tokens_seen": 176106354, "step": 880, "token_acc": 0.7187242752799151 }, { "epoch": 0.20888586382519303, "grad_norm": 6.212586622472415, "learning_rate": 1.986685944207868e-06, "loss": 1.0738523483276368, "num_input_tokens_seen": 178098096, "step": 890, "token_acc": 0.7199481706694962 }, { "epoch": 0.2112328960030042, "grad_norm": 1.7634279436109257, "learning_rate": 1.985996037070505e-06, "loss": 1.0606145858764648, "num_input_tokens_seen": 180140916, "step": 900, "token_acc": 0.7212711540534449 }, { "epoch": 0.2112328960030042, "eval_loss": 1.102053165435791, "eval_runtime": 32.763, "eval_samples_per_second": 30.522, "eval_steps_per_second": 1.282, "eval_token_acc": 0.7141986565407077, "num_input_tokens_seen": 180140916, "step": 900 }, { "epoch": 0.21357992818081536, "grad_norm": 1.927927016491826, "learning_rate": 1.9852888297080784e-06, "loss": 1.0789798736572265, "num_input_tokens_seen": 182134725, "step": 910, "token_acc": 0.7180467099845159 }, { "epoch": 0.21592696035862652, "grad_norm": 1.633422631466873, "learning_rate": 1.9845643345292055e-06, "loss": 1.075742530822754, "num_input_tokens_seen": 184161738, "step": 920, "token_acc": 0.719577260000721 }, { "epoch": 0.21827399253643767, "grad_norm": 1.763375240928624, "learning_rate": 1.9838225642458328e-06, "loss": 1.0633999824523925, "num_input_tokens_seen": 186250896, "step": 930, "token_acc": 0.7216072711554525 }, { "epoch": 0.22062102471424883, "grad_norm": 1.578730561244102, "learning_rate": 1.9830635318730153e-06, "loss": 1.0807870864868163, "num_input_tokens_seen": 188240646, "step": 940, "token_acc": 0.719998073905838 }, { "epoch": 0.22296805689205998, "grad_norm": 1.9778473417464, "learning_rate": 1.9822872507286887e-06, "loss": 1.0958086013793946, "num_input_tokens_seen": 190240614, "step": 950, "token_acc": 0.715133457837701 }, { "epoch": 0.22531508906987116, "grad_norm": 1.7070736536906375, "learning_rate": 1.9814937344334326e-06, "loss": 1.083117961883545, "num_input_tokens_seen": 192202005, "step": 960, "token_acc": 0.718299042165819 }, { "epoch": 0.22766212124768231, "grad_norm": 1.6694843702106625, "learning_rate": 1.9806829969102353e-06, "loss": 1.0489460945129394, "num_input_tokens_seen": 194152464, "step": 970, "token_acc": 0.7243972802430247 }, { "epoch": 0.23000915342549347, "grad_norm": 1.6802225185406368, "learning_rate": 1.9798550523842466e-06, "loss": 1.055472183227539, "num_input_tokens_seen": 196146252, "step": 980, "token_acc": 0.7222500499869107 }, { "epoch": 0.23235618560330462, "grad_norm": 1.586112316988885, "learning_rate": 1.9790099153825295e-06, "loss": 1.0688490867614746, "num_input_tokens_seen": 198216198, "step": 990, "token_acc": 0.721910041723649 }, { "epoch": 0.23470321778111577, "grad_norm": 1.9657681362344652, "learning_rate": 1.9781476007338054e-06, "loss": 1.0997188568115235, "num_input_tokens_seen": 200266242, "step": 1000, "token_acc": 0.7134316006040672 }, { "epoch": 0.23470321778111577, "eval_loss": 1.0927079916000366, "eval_runtime": 32.3958, "eval_samples_per_second": 30.868, "eval_steps_per_second": 1.296, "eval_token_acc": 0.7168070912490478, "num_input_tokens_seen": 200266242, "step": 1000 }, { "epoch": 0.23705024995892693, "grad_norm": 2.279451886611171, "learning_rate": 1.9772681235681933e-06, "loss": 1.0306278228759767, "num_input_tokens_seen": 202268343, "step": 1010, "token_acc": 0.7296832940863017 }, { "epoch": 0.23939728213673808, "grad_norm": 1.7883291650438458, "learning_rate": 1.976371499316945e-06, "loss": 1.0757831573486327, "num_input_tokens_seen": 204289632, "step": 1020, "token_acc": 0.7182072037465692 }, { "epoch": 0.24174431431454926, "grad_norm": 1.8342347796963645, "learning_rate": 1.975457743712173e-06, "loss": 1.0590785980224608, "num_input_tokens_seen": 206327745, "step": 1030, "token_acc": 0.7225627285705905 }, { "epoch": 0.24409134649236042, "grad_norm": 1.762378045102792, "learning_rate": 1.974526872786577e-06, "loss": 1.0789016723632812, "num_input_tokens_seen": 208322556, "step": 1040, "token_acc": 0.7185882266690018 }, { "epoch": 0.24643837867017157, "grad_norm": 1.7642619697840807, "learning_rate": 1.97357890287316e-06, "loss": 1.090459442138672, "num_input_tokens_seen": 210345396, "step": 1050, "token_acc": 0.715633342030789 }, { "epoch": 0.24878541084798272, "grad_norm": 1.8062010829609079, "learning_rate": 1.9726138506049433e-06, "loss": 1.0327832221984863, "num_input_tokens_seen": 212289177, "step": 1060, "token_acc": 0.728890125802145 }, { "epoch": 0.2511324430257939, "grad_norm": 1.6741852997103905, "learning_rate": 1.971631732914674e-06, "loss": 1.0438125610351563, "num_input_tokens_seen": 214294110, "step": 1070, "token_acc": 0.7274771422710105 }, { "epoch": 0.25347947520360503, "grad_norm": 1.8889183202576878, "learning_rate": 1.970632567034527e-06, "loss": 1.0874737739562987, "num_input_tokens_seen": 216250632, "step": 1080, "token_acc": 0.7169543090609345 }, { "epoch": 0.2558265073814162, "grad_norm": 1.768581214259287, "learning_rate": 1.9696163704958057e-06, "loss": 1.0529390335083009, "num_input_tokens_seen": 218235084, "step": 1090, "token_acc": 0.7233062911737727 }, { "epoch": 0.25817353955922734, "grad_norm": 1.6728742294003298, "learning_rate": 1.968583161128631e-06, "loss": 1.0434741973876953, "num_input_tokens_seen": 220250775, "step": 1100, "token_acc": 0.72555486645587 }, { "epoch": 0.25817353955922734, "eval_loss": 1.0830632448196411, "eval_runtime": 32.7745, "eval_samples_per_second": 30.512, "eval_steps_per_second": 1.281, "eval_token_acc": 0.7187414879619584, "num_input_tokens_seen": 220250775, "step": 1100 }, { "epoch": 0.2605205717370385, "grad_norm": 3.5179698506650827, "learning_rate": 1.9675329570616295e-06, "loss": 1.036564826965332, "num_input_tokens_seen": 222248643, "step": 1110, "token_acc": 0.7253935790918138 }, { "epoch": 0.26286760391484965, "grad_norm": 1.7194590974245725, "learning_rate": 1.9664657767216175e-06, "loss": 1.034214401245117, "num_input_tokens_seen": 224176074, "step": 1120, "token_acc": 0.731699968385116 }, { "epoch": 0.2652146360926608, "grad_norm": 1.6151699401355315, "learning_rate": 1.9653816388332737e-06, "loss": 1.0186534881591798, "num_input_tokens_seen": 226256241, "step": 1130, "token_acc": 0.729031512194937 }, { "epoch": 0.267561668270472, "grad_norm": 1.915048663566233, "learning_rate": 1.9642805624188146e-06, "loss": 1.0460872650146484, "num_input_tokens_seen": 228227991, "step": 1140, "token_acc": 0.7245494456551131 }, { "epoch": 0.26990870044828313, "grad_norm": 2.3808335250565387, "learning_rate": 1.963162566797658e-06, "loss": 1.0558183670043946, "num_input_tokens_seen": 230254347, "step": 1150, "token_acc": 0.7232573802936575 }, { "epoch": 0.2722557326260943, "grad_norm": 1.7367289249419906, "learning_rate": 1.962027671586086e-06, "loss": 1.050713062286377, "num_input_tokens_seen": 232285218, "step": 1160, "token_acc": 0.7248766799700481 }, { "epoch": 0.27460276480390544, "grad_norm": 1.8903258230442381, "learning_rate": 1.9608758966968984e-06, "loss": 1.0442859649658203, "num_input_tokens_seen": 234350787, "step": 1170, "token_acc": 0.7246446168983565 }, { "epoch": 0.2769497969817166, "grad_norm": 2.0858660720659064, "learning_rate": 1.959707262339067e-06, "loss": 1.0628435134887695, "num_input_tokens_seen": 236401623, "step": 1180, "token_acc": 0.7223079815551465 }, { "epoch": 0.2792968291595278, "grad_norm": 2.977405059549, "learning_rate": 1.9585217890173757e-06, "loss": 1.0738126754760742, "num_input_tokens_seen": 238361190, "step": 1190, "token_acc": 0.7190871093733786 }, { "epoch": 0.28164386133733893, "grad_norm": 2.283563309099777, "learning_rate": 1.957319497532067e-06, "loss": 1.0180787086486816, "num_input_tokens_seen": 240437730, "step": 1200, "token_acc": 0.7330017297652685 }, { "epoch": 0.28164386133733893, "eval_loss": 1.0745400190353394, "eval_runtime": 32.4066, "eval_samples_per_second": 30.858, "eval_steps_per_second": 1.296, "eval_token_acc": 0.7201588144317999, "num_input_tokens_seen": 240437730, "step": 1200 }, { "epoch": 0.2839908935151501, "grad_norm": 1.9270515119564795, "learning_rate": 1.956100408978472e-06, "loss": 1.0345954895019531, "num_input_tokens_seen": 242382708, "step": 1210, "token_acc": 0.7277172037115998 }, { "epoch": 0.28633792569296124, "grad_norm": 1.5733358413499778, "learning_rate": 1.954864544746643e-06, "loss": 1.0476463317871094, "num_input_tokens_seen": 244350303, "step": 1220, "token_acc": 0.7255343803753794 }, { "epoch": 0.2886849578707724, "grad_norm": 2.0867528996051345, "learning_rate": 1.9536119265209757e-06, "loss": 1.0576335906982421, "num_input_tokens_seen": 246334116, "step": 1230, "token_acc": 0.7241534895699202 }, { "epoch": 0.29103199004858354, "grad_norm": 1.587056177259835, "learning_rate": 1.952342576279833e-06, "loss": 1.0451471328735351, "num_input_tokens_seen": 248362662, "step": 1240, "token_acc": 0.7264873056477157 }, { "epoch": 0.2933790222263947, "grad_norm": 3.147776681472526, "learning_rate": 1.9510565162951534e-06, "loss": 1.0531164169311524, "num_input_tokens_seen": 250326474, "step": 1250, "token_acc": 0.7241427379495411 }, { "epoch": 0.2957260544042059, "grad_norm": 1.677213988705626, "learning_rate": 1.9497537691320667e-06, "loss": 1.0469918251037598, "num_input_tokens_seen": 252382641, "step": 1260, "token_acc": 0.7247498649880667 }, { "epoch": 0.29807308658201703, "grad_norm": 3.244921913867558, "learning_rate": 1.9484343576484934e-06, "loss": 1.0731307983398437, "num_input_tokens_seen": 254380842, "step": 1270, "token_acc": 0.7198529707146587 }, { "epoch": 0.3004201187598282, "grad_norm": 2.2715118534896424, "learning_rate": 1.9470983049947442e-06, "loss": 1.0327179908752442, "num_input_tokens_seen": 256367745, "step": 1280, "token_acc": 0.7273322442040123 }, { "epoch": 0.30276715093763934, "grad_norm": 2.388511262608066, "learning_rate": 1.9457456346131168e-06, "loss": 1.0295280456542968, "num_input_tokens_seen": 258362418, "step": 1290, "token_acc": 0.7289352257814815 }, { "epoch": 0.3051141831154505, "grad_norm": 2.284896449465709, "learning_rate": 1.944376370237481e-06, "loss": 1.0356334686279296, "num_input_tokens_seen": 260389752, "step": 1300, "token_acc": 0.7264502277424404 }, { "epoch": 0.3051141831154505, "eval_loss": 1.0682131052017212, "eval_runtime": 32.3728, "eval_samples_per_second": 30.89, "eval_steps_per_second": 1.297, "eval_token_acc": 0.7214607234366704, "num_input_tokens_seen": 260389752, "step": 1300 }, { "epoch": 0.30746121529326165, "grad_norm": 1.8462096822584517, "learning_rate": 1.9429905358928646e-06, "loss": 1.0431997299194335, "num_input_tokens_seen": 262425369, "step": 1310, "token_acc": 0.7247579875646393 }, { "epoch": 0.30980824747107283, "grad_norm": 2.7288254092061286, "learning_rate": 1.94158815589503e-06, "loss": 1.03179931640625, "num_input_tokens_seen": 264478839, "step": 1320, "token_acc": 0.7273030599423818 }, { "epoch": 0.312155279648884, "grad_norm": 2.0483013477422563, "learning_rate": 1.9401692548500502e-06, "loss": 1.0194345474243165, "num_input_tokens_seen": 266467188, "step": 1330, "token_acc": 0.7318709842049548 }, { "epoch": 0.31450231182669514, "grad_norm": 3.607937481626218, "learning_rate": 1.938733857653874e-06, "loss": 1.0359786987304687, "num_input_tokens_seen": 268553511, "step": 1340, "token_acc": 0.7270260288085842 }, { "epoch": 0.3168493440045063, "grad_norm": 2.2908695328416244, "learning_rate": 1.9372819894918914e-06, "loss": 1.005875015258789, "num_input_tokens_seen": 270556128, "step": 1350, "token_acc": 0.733425647272143 }, { "epoch": 0.31919637618231744, "grad_norm": 2.2530826851795576, "learning_rate": 1.935813675838491e-06, "loss": 1.0363348007202149, "num_input_tokens_seen": 272585331, "step": 1360, "token_acc": 0.7270068150894993 }, { "epoch": 0.3215434083601286, "grad_norm": 1.6599911510535466, "learning_rate": 1.934328942456612e-06, "loss": 0.9922657012939453, "num_input_tokens_seen": 274625832, "step": 1370, "token_acc": 0.7369969482933556 }, { "epoch": 0.32389044053793975, "grad_norm": 1.6571812543491504, "learning_rate": 1.9328278153972946e-06, "loss": 1.0838043212890625, "num_input_tokens_seen": 276646638, "step": 1380, "token_acc": 0.7254781164111181 }, { "epoch": 0.32623747271575093, "grad_norm": 1.7846961468797993, "learning_rate": 1.9313103209992204e-06, "loss": 1.0071705818176269, "num_input_tokens_seen": 278652339, "step": 1390, "token_acc": 0.733368638373526 }, { "epoch": 0.3285845048935621, "grad_norm": 2.1490918049490717, "learning_rate": 1.929776485888251e-06, "loss": 1.0504549026489258, "num_input_tokens_seen": 280677636, "step": 1400, "token_acc": 0.72332943463746 }, { "epoch": 0.3285845048935621, "eval_loss": 1.061837077140808, "eval_runtime": 32.7164, "eval_samples_per_second": 30.566, "eval_steps_per_second": 1.284, "eval_token_acc": 0.7231458184252441, "num_input_tokens_seen": 280677636, "step": 1400 }, { "epoch": 0.33093153707137324, "grad_norm": 1.776580604562134, "learning_rate": 1.928226336976963e-06, "loss": 1.0266141891479492, "num_input_tokens_seen": 282669069, "step": 1410, "token_acc": 0.7291277131940492 }, { "epoch": 0.3332785692491844, "grad_norm": 8.438214405501748, "learning_rate": 1.926659901464172e-06, "loss": 1.0292797088623047, "num_input_tokens_seen": 284659779, "step": 1420, "token_acc": 0.7288078819771109 }, { "epoch": 0.33562560142699555, "grad_norm": 2.252060217551861, "learning_rate": 1.925077206834458e-06, "loss": 1.0228628158569335, "num_input_tokens_seen": 286673274, "step": 1430, "token_acc": 0.7280252171611444 }, { "epoch": 0.3379726336048067, "grad_norm": 1.4651418770258904, "learning_rate": 1.923478280857682e-06, "loss": 1.0042032241821288, "num_input_tokens_seen": 288677157, "step": 1440, "token_acc": 0.7343410272213868 }, { "epoch": 0.34031966578261785, "grad_norm": 1.6827171089675037, "learning_rate": 1.9218631515885003e-06, "loss": 1.0294583320617676, "num_input_tokens_seen": 290678706, "step": 1450, "token_acc": 0.7304443621152334 }, { "epoch": 0.34266669796042903, "grad_norm": 1.7341043440646111, "learning_rate": 1.9202318473658702e-06, "loss": 0.9965463638305664, "num_input_tokens_seen": 292647750, "step": 1460, "token_acc": 0.736443122122828 }, { "epoch": 0.3450137301382402, "grad_norm": 1.706569258628379, "learning_rate": 1.918584396812554e-06, "loss": 1.0162506103515625, "num_input_tokens_seen": 294701517, "step": 1470, "token_acc": 0.7316330245383567 }, { "epoch": 0.34736076231605134, "grad_norm": 1.6208113959472872, "learning_rate": 1.9169208288346163e-06, "loss": 1.0112849235534669, "num_input_tokens_seen": 296720586, "step": 1480, "token_acc": 0.732423183545091 }, { "epoch": 0.3497077944938625, "grad_norm": 1.7865465491021926, "learning_rate": 1.9152411726209172e-06, "loss": 1.0156356811523437, "num_input_tokens_seen": 298684938, "step": 1490, "token_acc": 0.7308413793103449 }, { "epoch": 0.35205482667167365, "grad_norm": 2.059441241693384, "learning_rate": 1.9135454576426007e-06, "loss": 1.0275184631347656, "num_input_tokens_seen": 300684201, "step": 1500, "token_acc": 0.730526369912453 }, { "epoch": 0.35205482667167365, "eval_loss": 1.0552641153335571, "eval_runtime": 32.4705, "eval_samples_per_second": 30.797, "eval_steps_per_second": 1.293, "eval_token_acc": 0.72383370651647, "num_input_tokens_seen": 300684201, "step": 1500 }, { "epoch": 0.35440185884948483, "grad_norm": 2.3565377610515594, "learning_rate": 1.9118337136525756e-06, "loss": 1.0185004234313966, "num_input_tokens_seen": 302704359, "step": 1510, "token_acc": 0.7304355716162425 }, { "epoch": 0.356748891027296, "grad_norm": 14.877826986152865, "learning_rate": 1.9101059706849955e-06, "loss": 1.019582176208496, "num_input_tokens_seen": 304651629, "step": 1520, "token_acc": 0.731234582403383 }, { "epoch": 0.35909592320510714, "grad_norm": 2.879334483584151, "learning_rate": 1.908362259054731e-06, "loss": 1.0251285552978515, "num_input_tokens_seen": 306641097, "step": 1530, "token_acc": 0.7294201685316217 }, { "epoch": 0.3614429553829183, "grad_norm": 1.7887355243868148, "learning_rate": 1.9066026093568377e-06, "loss": 1.0157214164733888, "num_input_tokens_seen": 308660178, "step": 1540, "token_acc": 0.7307293262997984 }, { "epoch": 0.36378998756072944, "grad_norm": 1.867513936920377, "learning_rate": 1.9048270524660196e-06, "loss": 1.0161379814147948, "num_input_tokens_seen": 310777926, "step": 1550, "token_acc": 0.7304925609175636 }, { "epoch": 0.3661370197385406, "grad_norm": 6.302806843132354, "learning_rate": 1.9030356195360873e-06, "loss": 0.9866199493408203, "num_input_tokens_seen": 312788916, "step": 1560, "token_acc": 0.7381302995035983 }, { "epoch": 0.36848405191635175, "grad_norm": 1.8930345198459555, "learning_rate": 1.9012283419994113e-06, "loss": 1.0311415672302247, "num_input_tokens_seen": 314814855, "step": 1570, "token_acc": 0.7291705656140012 }, { "epoch": 0.37083108409416293, "grad_norm": 2.3487824750816646, "learning_rate": 1.899405251566371e-06, "loss": 1.0350725173950195, "num_input_tokens_seen": 316867344, "step": 1580, "token_acc": 0.7278371704934657 }, { "epoch": 0.3731781162719741, "grad_norm": 2.0782965598493917, "learning_rate": 1.8975663802247975e-06, "loss": 1.0283987998962403, "num_input_tokens_seen": 318871404, "step": 1590, "token_acc": 0.7280485561890748 }, { "epoch": 0.37552514844978524, "grad_norm": 2.8179476770543546, "learning_rate": 1.8957117602394128e-06, "loss": 1.027695655822754, "num_input_tokens_seen": 320871228, "step": 1600, "token_acc": 0.7284322929815703 }, { "epoch": 0.37552514844978524, "eval_loss": 1.0503556728363037, "eval_runtime": 32.4119, "eval_samples_per_second": 30.853, "eval_steps_per_second": 1.296, "eval_token_acc": 0.7255742018882297, "num_input_tokens_seen": 320871228, "step": 1600 }, { "epoch": 0.3778721806275964, "grad_norm": 2.240496844348581, "learning_rate": 1.8938414241512637e-06, "loss": 1.0263992309570313, "num_input_tokens_seen": 322930128, "step": 1610, "token_acc": 0.731757208141934 }, { "epoch": 0.38021921280540755, "grad_norm": 3.896191708778685, "learning_rate": 1.8919554047771507e-06, "loss": 1.0006643295288087, "num_input_tokens_seen": 324982575, "step": 1620, "token_acc": 0.732137966433454 }, { "epoch": 0.38256624498321873, "grad_norm": 1.7935819973243883, "learning_rate": 1.8900537352090523e-06, "loss": 0.9882081985473633, "num_input_tokens_seen": 326990898, "step": 1630, "token_acc": 0.7385387731711782 }, { "epoch": 0.38491327716102985, "grad_norm": 3.1640907355889496, "learning_rate": 1.8881364488135445e-06, "loss": 1.0018336296081543, "num_input_tokens_seen": 329033799, "step": 1640, "token_acc": 0.7350213182627736 }, { "epoch": 0.38726030933884104, "grad_norm": 5.630791095478135, "learning_rate": 1.8862035792312146e-06, "loss": 0.9879220962524414, "num_input_tokens_seen": 331067478, "step": 1650, "token_acc": 0.736295696568692 }, { "epoch": 0.3896073415166522, "grad_norm": 1.5905696004173981, "learning_rate": 1.8842551603760723e-06, "loss": 1.004323387145996, "num_input_tokens_seen": 333089880, "step": 1660, "token_acc": 0.7334599037600028 }, { "epoch": 0.39195437369446334, "grad_norm": 43.2007654518171, "learning_rate": 1.8822912264349532e-06, "loss": 1.0126733779907227, "num_input_tokens_seen": 335093103, "step": 1670, "token_acc": 0.7332479964381122 }, { "epoch": 0.3943014058722745, "grad_norm": 1.6733459020369337, "learning_rate": 1.8803118118669202e-06, "loss": 1.0368854522705078, "num_input_tokens_seen": 337115598, "step": 1680, "token_acc": 0.7274540217150455 }, { "epoch": 0.39664843805008565, "grad_norm": 1.9876180817181506, "learning_rate": 1.8783169514026577e-06, "loss": 1.0030999183654785, "num_input_tokens_seen": 339154959, "step": 1690, "token_acc": 0.7345074320050601 }, { "epoch": 0.39899547022789683, "grad_norm": 1.842434463603931, "learning_rate": 1.8763066800438634e-06, "loss": 0.9946871757507324, "num_input_tokens_seen": 341186700, "step": 1700, "token_acc": 0.7359575477937458 }, { "epoch": 0.39899547022789683, "eval_loss": 1.0446056127548218, "eval_runtime": 33.305, "eval_samples_per_second": 30.026, "eval_steps_per_second": 1.261, "eval_token_acc": 0.7265437085939844, "num_input_tokens_seen": 341186700, "step": 1700 }, { "epoch": 0.40134250240570796, "grad_norm": 1.9481089599377517, "learning_rate": 1.8742810330626335e-06, "loss": 1.0056350708007813, "num_input_tokens_seen": 343197345, "step": 1710, "token_acc": 0.7343789679900354 }, { "epoch": 0.40368953458351914, "grad_norm": 1.8925573831015579, "learning_rate": 1.8722400460008437e-06, "loss": 1.0299295425415038, "num_input_tokens_seen": 345220860, "step": 1720, "token_acc": 0.727836675491576 }, { "epoch": 0.4060365667613303, "grad_norm": 1.568094384198171, "learning_rate": 1.8701837546695256e-06, "loss": 1.011802864074707, "num_input_tokens_seen": 347269032, "step": 1730, "token_acc": 0.731503068944188 }, { "epoch": 0.40838359893914145, "grad_norm": 4.690102343755759, "learning_rate": 1.8681121951482393e-06, "loss": 1.0340707778930665, "num_input_tokens_seen": 349265856, "step": 1740, "token_acc": 0.7287572174652813 }, { "epoch": 0.4107306311169526, "grad_norm": 2.0732894110715776, "learning_rate": 1.8660254037844386e-06, "loss": 1.0054452896118165, "num_input_tokens_seen": 351220833, "step": 1750, "token_acc": 0.7349583487050085 }, { "epoch": 0.41307766329476375, "grad_norm": 4.563573246901434, "learning_rate": 1.863923417192835e-06, "loss": 0.9984481811523438, "num_input_tokens_seen": 353217660, "step": 1760, "token_acc": 0.7346953872236972 }, { "epoch": 0.41542469547257493, "grad_norm": 1.8182323815552697, "learning_rate": 1.861806272254755e-06, "loss": 1.0026565551757813, "num_input_tokens_seen": 355231713, "step": 1770, "token_acc": 0.734238520256768 }, { "epoch": 0.41777172765038606, "grad_norm": 2.3723528968369867, "learning_rate": 1.859674006117491e-06, "loss": 0.9838489532470703, "num_input_tokens_seen": 357318357, "step": 1780, "token_acc": 0.7385274102305481 }, { "epoch": 0.42011875982819724, "grad_norm": 2.413365084744393, "learning_rate": 1.8575266561936522e-06, "loss": 1.0196653366088868, "num_input_tokens_seen": 359351646, "step": 1790, "token_acc": 0.730992332131187 }, { "epoch": 0.4224657920060084, "grad_norm": 7.914722238930336, "learning_rate": 1.8553642601605066e-06, "loss": 0.9948186874389648, "num_input_tokens_seen": 361303284, "step": 1800, "token_acc": 0.7360711800377772 }, { "epoch": 0.4224657920060084, "eval_loss": 1.038891315460205, "eval_runtime": 32.4449, "eval_samples_per_second": 30.821, "eval_steps_per_second": 1.295, "eval_token_acc": 0.7277024999422913, "num_input_tokens_seen": 361303284, "step": 1800 }, { "epoch": 0.42481282418381955, "grad_norm": 1.7384957796876852, "learning_rate": 1.8531868559593203e-06, "loss": 1.0075714111328125, "num_input_tokens_seen": 363290772, "step": 1810, "token_acc": 0.7332521267838883 }, { "epoch": 0.42715985636163073, "grad_norm": 1.73396216177198, "learning_rate": 1.850994481794692e-06, "loss": 1.018679428100586, "num_input_tokens_seen": 365299026, "step": 1820, "token_acc": 0.7299744624828494 }, { "epoch": 0.42950688853944186, "grad_norm": 1.859054699772832, "learning_rate": 1.8487871761338819e-06, "loss": 0.9975422859191895, "num_input_tokens_seen": 367342086, "step": 1830, "token_acc": 0.735841141099147 }, { "epoch": 0.43185392071725304, "grad_norm": 1.6167732458245692, "learning_rate": 1.8465649777061376e-06, "loss": 1.0366539001464843, "num_input_tokens_seen": 369276633, "step": 1840, "token_acc": 0.7277804414793901 }, { "epoch": 0.4342009528950642, "grad_norm": 2.534040309718505, "learning_rate": 1.844327925502015e-06, "loss": 1.0096059799194337, "num_input_tokens_seen": 371265615, "step": 1850, "token_acc": 0.7326266219047257 }, { "epoch": 0.43654798507287534, "grad_norm": 1.9228862468394357, "learning_rate": 1.8420760587726921e-06, "loss": 1.0271913528442382, "num_input_tokens_seen": 373272270, "step": 1860, "token_acc": 0.7302226164565024 }, { "epoch": 0.4388950172506865, "grad_norm": 1.5025282734361622, "learning_rate": 1.8398094170292829e-06, "loss": 1.0059158325195312, "num_input_tokens_seen": 375279099, "step": 1870, "token_acc": 0.7330154465542768 }, { "epoch": 0.44124204942849765, "grad_norm": 4.754818039721933, "learning_rate": 1.8375280400421418e-06, "loss": 0.9967041969299316, "num_input_tokens_seen": 377223396, "step": 1880, "token_acc": 0.7358239778762203 }, { "epoch": 0.44358908160630883, "grad_norm": 1.691685468916323, "learning_rate": 1.8352319678401674e-06, "loss": 0.999173927307129, "num_input_tokens_seen": 379235661, "step": 1890, "token_acc": 0.7347835016672305 }, { "epoch": 0.44593611378411996, "grad_norm": 1.7737231328640157, "learning_rate": 1.8329212407100993e-06, "loss": 0.9919824600219727, "num_input_tokens_seen": 381243486, "step": 1900, "token_acc": 0.7371798315515523 }, { "epoch": 0.44593611378411996, "eval_loss": 1.0355346202850342, "eval_runtime": 32.2582, "eval_samples_per_second": 31.0, "eval_steps_per_second": 1.302, "eval_token_acc": 0.7281641698021745, "num_input_tokens_seen": 381243486, "step": 1900 }, { "epoch": 0.44828314596193114, "grad_norm": 2.5554510353139115, "learning_rate": 1.8305958991958126e-06, "loss": 0.9984329223632813, "num_input_tokens_seen": 383266650, "step": 1910, "token_acc": 0.7348018362631924 }, { "epoch": 0.4506301781397423, "grad_norm": 3.4304227222936854, "learning_rate": 1.8282559840976042e-06, "loss": 0.9989996910095215, "num_input_tokens_seen": 385198056, "step": 1920, "token_acc": 0.7340237302248127 }, { "epoch": 0.45297721031755345, "grad_norm": 1.8203825695395843, "learning_rate": 1.8259015364714785e-06, "loss": 1.005854892730713, "num_input_tokens_seen": 387174645, "step": 1930, "token_acc": 0.7344124724323412 }, { "epoch": 0.45532424249536463, "grad_norm": 2.3790186216357387, "learning_rate": 1.8235325976284273e-06, "loss": 1.0130582809448243, "num_input_tokens_seen": 389123001, "step": 1940, "token_acc": 0.7329481871636396 }, { "epoch": 0.45767127467317575, "grad_norm": 2.2702679233421366, "learning_rate": 1.821149209133704e-06, "loss": 1.0077364921569825, "num_input_tokens_seen": 391185051, "step": 1950, "token_acc": 0.7325617754275695 }, { "epoch": 0.46001830685098694, "grad_norm": 1.7113606013198168, "learning_rate": 1.8187514128060944e-06, "loss": 1.0020957946777345, "num_input_tokens_seen": 393232749, "step": 1960, "token_acc": 0.7342212411181741 }, { "epoch": 0.46236533902879806, "grad_norm": 2.0134995821074524, "learning_rate": 1.816339250717184e-06, "loss": 0.9884714126586914, "num_input_tokens_seen": 395240403, "step": 1970, "token_acc": 0.7366033551966206 }, { "epoch": 0.46471237120660924, "grad_norm": 3.624673089989278, "learning_rate": 1.8139127651906181e-06, "loss": 1.0036752700805665, "num_input_tokens_seen": 397222695, "step": 1980, "token_acc": 0.7327492557949239 }, { "epoch": 0.4670594033844204, "grad_norm": 12.741541567504669, "learning_rate": 1.811471998801361e-06, "loss": 1.0088150024414062, "num_input_tokens_seen": 399265515, "step": 1990, "token_acc": 0.7318671375057033 }, { "epoch": 0.46940643556223155, "grad_norm": 1.9147316254240543, "learning_rate": 1.8090169943749474e-06, "loss": 1.0098794937133788, "num_input_tokens_seen": 401254572, "step": 2000, "token_acc": 0.7348985741915172 }, { "epoch": 0.46940643556223155, "eval_loss": 1.0299264192581177, "eval_runtime": 32.8145, "eval_samples_per_second": 30.474, "eval_steps_per_second": 1.28, "eval_token_acc": 0.7290736594261443, "num_input_tokens_seen": 401254572, "step": 2000 }, { "epoch": 0.47175346774004273, "grad_norm": 1.8961444721894498, "learning_rate": 1.8065477949867325e-06, "loss": 1.016146469116211, "num_input_tokens_seen": 403296912, "step": 2010, "token_acc": 0.7310783889798314 }, { "epoch": 0.47410049991785386, "grad_norm": 1.5674703012341533, "learning_rate": 1.8040644439611345e-06, "loss": 1.0078514099121094, "num_input_tokens_seen": 405292185, "step": 2020, "token_acc": 0.7319825043230597 }, { "epoch": 0.47644753209566504, "grad_norm": 1.9494898023759353, "learning_rate": 1.8015669848708766e-06, "loss": 1.0296178817749024, "num_input_tokens_seen": 407303625, "step": 2030, "token_acc": 0.7293832613834421 }, { "epoch": 0.47879456427347616, "grad_norm": 10.968015568038117, "learning_rate": 1.7990554615362197e-06, "loss": 0.9932464599609375, "num_input_tokens_seen": 409284657, "step": 2040, "token_acc": 0.7361576877608628 }, { "epoch": 0.48114159645128735, "grad_norm": 1.5634395112041464, "learning_rate": 1.7965299180241961e-06, "loss": 0.9930622100830078, "num_input_tokens_seen": 411350526, "step": 2050, "token_acc": 0.7371341064431953 }, { "epoch": 0.4834886286290985, "grad_norm": 4.940871877481185, "learning_rate": 1.7939903986478354e-06, "loss": 0.9968077659606933, "num_input_tokens_seen": 413329158, "step": 2060, "token_acc": 0.7364979106166089 }, { "epoch": 0.48583566080690965, "grad_norm": 1.6357352710651227, "learning_rate": 1.7914369479653857e-06, "loss": 1.0207565307617188, "num_input_tokens_seen": 415301217, "step": 2070, "token_acc": 0.7303749705838948 }, { "epoch": 0.48818269298472083, "grad_norm": 2.246788650609953, "learning_rate": 1.788869610779534e-06, "loss": 1.00274658203125, "num_input_tokens_seen": 417261702, "step": 2080, "token_acc": 0.7341963767701447 }, { "epoch": 0.49052972516253196, "grad_norm": 1.56745308904305, "learning_rate": 1.7862884321366187e-06, "loss": 1.0060449600219727, "num_input_tokens_seen": 419262057, "step": 2090, "token_acc": 0.7324562018430577 }, { "epoch": 0.49287675734034314, "grad_norm": 1.7117337983013203, "learning_rate": 1.7836934573258397e-06, "loss": 0.9900275230407715, "num_input_tokens_seen": 421246710, "step": 2100, "token_acc": 0.7372878593403012 }, { "epoch": 0.49287675734034314, "eval_loss": 1.027020812034607, "eval_runtime": 32.799, "eval_samples_per_second": 30.489, "eval_steps_per_second": 1.281, "eval_token_acc": 0.7296553634495971, "num_input_tokens_seen": 421246710, "step": 2100 }, { "epoch": 0.49522378951815427, "grad_norm": 1.5242891687227014, "learning_rate": 1.781084731878463e-06, "loss": 0.9901479721069336, "num_input_tokens_seen": 423187323, "step": 2110, "token_acc": 0.7374922148637526 }, { "epoch": 0.49757082169596545, "grad_norm": 2.148393307418336, "learning_rate": 1.7784623015670235e-06, "loss": 0.9794765472412109, "num_input_tokens_seen": 425214681, "step": 2120, "token_acc": 0.7396016635749383 }, { "epoch": 0.49991785387377663, "grad_norm": 1.6777795098531292, "learning_rate": 1.7758262124045194e-06, "loss": 1.0104660987854004, "num_input_tokens_seen": 427125735, "step": 2130, "token_acc": 0.7328506355953969 }, { "epoch": 0.5022648860515878, "grad_norm": 1.8399011401453165, "learning_rate": 1.7731765106436071e-06, "loss": 0.9876059532165528, "num_input_tokens_seen": 429143655, "step": 2140, "token_acc": 0.7383790968301517 }, { "epoch": 0.5046119182293989, "grad_norm": 3.2054794139242047, "learning_rate": 1.7705132427757892e-06, "loss": 1.003396987915039, "num_input_tokens_seen": 431161200, "step": 2150, "token_acc": 0.7355545283928578 }, { "epoch": 0.5069589504072101, "grad_norm": 1.5550880678151673, "learning_rate": 1.7678364555305976e-06, "loss": 0.9901845932006836, "num_input_tokens_seen": 433164327, "step": 2160, "token_acc": 0.7361521188091766 }, { "epoch": 0.5093059825850212, "grad_norm": 2.416552637489239, "learning_rate": 1.7651461958747741e-06, "loss": 1.0047142028808593, "num_input_tokens_seen": 435216456, "step": 2170, "token_acc": 0.733555096342685 }, { "epoch": 0.5116530147628324, "grad_norm": 2.387719191103811, "learning_rate": 1.7624425110114479e-06, "loss": 1.0148651123046875, "num_input_tokens_seen": 437206023, "step": 2180, "token_acc": 0.7325390238452453 }, { "epoch": 0.5140000469406436, "grad_norm": 1.481562163308891, "learning_rate": 1.7597254483793048e-06, "loss": 0.9734397888183594, "num_input_tokens_seen": 439163631, "step": 2190, "token_acc": 0.7413863843737306 }, { "epoch": 0.5163470791184547, "grad_norm": 7.115442308152491, "learning_rate": 1.7569950556517563e-06, "loss": 1.019681167602539, "num_input_tokens_seen": 441170622, "step": 2200, "token_acc": 0.7295540569410798 }, { "epoch": 0.5163470791184547, "eval_loss": 1.0215942859649658, "eval_runtime": 32.4901, "eval_samples_per_second": 30.779, "eval_steps_per_second": 1.293, "eval_token_acc": 0.7315389764779207, "num_input_tokens_seen": 441170622, "step": 2200 }, { "epoch": 0.5186941112962659, "grad_norm": 1.6355732837542087, "learning_rate": 1.7542513807361037e-06, "loss": 1.0146623611450196, "num_input_tokens_seen": 443157417, "step": 2210, "token_acc": 0.7331868122856259 }, { "epoch": 0.521041143474077, "grad_norm": 1.5373557972963237, "learning_rate": 1.7514944717726961e-06, "loss": 0.996919822692871, "num_input_tokens_seen": 445115421, "step": 2220, "token_acc": 0.7370941300202442 }, { "epoch": 0.5233881756518882, "grad_norm": 3.095727021967102, "learning_rate": 1.748724377134086e-06, "loss": 1.008862018585205, "num_input_tokens_seen": 447113430, "step": 2230, "token_acc": 0.7321047500353728 }, { "epoch": 0.5257352078296993, "grad_norm": 1.6318669740450855, "learning_rate": 1.7459411454241822e-06, "loss": 1.0091367721557618, "num_input_tokens_seen": 449067504, "step": 2240, "token_acc": 0.7306417201986045 }, { "epoch": 0.5280822400075105, "grad_norm": 1.8958429005632293, "learning_rate": 1.743144825477394e-06, "loss": 0.9806262016296386, "num_input_tokens_seen": 451028514, "step": 2250, "token_acc": 0.7392674057301928 }, { "epoch": 0.5304292721853217, "grad_norm": 1.8300311325163234, "learning_rate": 1.740335466357778e-06, "loss": 0.9876058578491211, "num_input_tokens_seen": 453088446, "step": 2260, "token_acc": 0.7375388829110828 }, { "epoch": 0.5327763043631328, "grad_norm": 1.6283939332628163, "learning_rate": 1.737513117358174e-06, "loss": 1.0128792762756347, "num_input_tokens_seen": 455064009, "step": 2270, "token_acc": 0.7309403491726847 }, { "epoch": 0.535123336540944, "grad_norm": 1.7443727538000593, "learning_rate": 1.7346778279993416e-06, "loss": 1.0167512893676758, "num_input_tokens_seen": 457049565, "step": 2280, "token_acc": 0.7327466353251444 }, { "epoch": 0.5374703687187551, "grad_norm": 2.318872931178241, "learning_rate": 1.731829648029091e-06, "loss": 0.9633228302001953, "num_input_tokens_seen": 459050343, "step": 2290, "token_acc": 0.7410114142684382 }, { "epoch": 0.5398174008965663, "grad_norm": 1.5210715736947538, "learning_rate": 1.7289686274214115e-06, "loss": 0.9929851531982422, "num_input_tokens_seen": 461049750, "step": 2300, "token_acc": 0.7357508251313404 }, { "epoch": 0.5398174008965663, "eval_loss": 1.0185507535934448, "eval_runtime": 32.6195, "eval_samples_per_second": 30.657, "eval_steps_per_second": 1.288, "eval_token_acc": 0.731474342697537, "num_input_tokens_seen": 461049750, "step": 2300 }, { "epoch": 0.5421644330743774, "grad_norm": 1.5749401648234354, "learning_rate": 1.7260948163755917e-06, "loss": 0.9968940734863281, "num_input_tokens_seen": 462989622, "step": 2310, "token_acc": 0.7375997849195517 }, { "epoch": 0.5445114652521886, "grad_norm": 2.5312095421318928, "learning_rate": 1.723208265315342e-06, "loss": 0.9779894828796387, "num_input_tokens_seen": 465006357, "step": 2320, "token_acc": 0.7394803638714152 }, { "epoch": 0.5468584974299998, "grad_norm": 3.2822780472953803, "learning_rate": 1.720309024887907e-06, "loss": 1.0032640457153321, "num_input_tokens_seen": 467017005, "step": 2330, "token_acc": 0.7345803640542331 }, { "epoch": 0.5492055296078109, "grad_norm": 1.6687009392941055, "learning_rate": 1.7173971459631787e-06, "loss": 1.0077280044555663, "num_input_tokens_seen": 468979461, "step": 2340, "token_acc": 0.7342930917761522 }, { "epoch": 0.5515525617856221, "grad_norm": 11.650174621954747, "learning_rate": 1.7144726796328032e-06, "loss": 0.9968754768371582, "num_input_tokens_seen": 470994735, "step": 2350, "token_acc": 0.734416431505073 }, { "epoch": 0.5538995939634332, "grad_norm": 2.599642616517287, "learning_rate": 1.7115356772092855e-06, "loss": 1.0374162673950196, "num_input_tokens_seen": 472979052, "step": 2360, "token_acc": 0.7287551723023211 }, { "epoch": 0.5562466261412444, "grad_norm": 2.7538705299088453, "learning_rate": 1.7085861902250862e-06, "loss": 1.0119436264038086, "num_input_tokens_seen": 475016298, "step": 2370, "token_acc": 0.7321991702851346 }, { "epoch": 0.5585936583190556, "grad_norm": 2.3397709495881682, "learning_rate": 1.7056242704317208e-06, "loss": 0.9402626991271973, "num_input_tokens_seen": 477109281, "step": 2380, "token_acc": 0.7490173941732094 }, { "epoch": 0.5609406904968667, "grad_norm": 1.879207656038821, "learning_rate": 1.7026499697988492e-06, "loss": 0.9886844635009766, "num_input_tokens_seen": 479146713, "step": 2390, "token_acc": 0.7365850879725937 }, { "epoch": 0.5632877226746779, "grad_norm": 1.9704873763682087, "learning_rate": 1.6996633405133653e-06, "loss": 0.9943101882934571, "num_input_tokens_seen": 481102911, "step": 2400, "token_acc": 0.7366662244187203 }, { "epoch": 0.5632877226746779, "eval_loss": 1.015251636505127, "eval_runtime": 32.7961, "eval_samples_per_second": 30.491, "eval_steps_per_second": 1.281, "eval_token_acc": 0.7320929803097804, "num_input_tokens_seen": 481102911, "step": 2400 }, { "epoch": 0.565634754852489, "grad_norm": 1.6632244132905207, "learning_rate": 1.6966644349784808e-06, "loss": 0.9883607864379883, "num_input_tokens_seen": 483084549, "step": 2410, "token_acc": 0.7358879192027988 }, { "epoch": 0.5679817870303002, "grad_norm": 1.5330248452956106, "learning_rate": 1.6936533058128049e-06, "loss": 1.0042284965515136, "num_input_tokens_seen": 485112228, "step": 2420, "token_acc": 0.7344426514994169 }, { "epoch": 0.5703288192081113, "grad_norm": 2.5405918981273867, "learning_rate": 1.6906300058494227e-06, "loss": 0.9880990982055664, "num_input_tokens_seen": 487123020, "step": 2430, "token_acc": 0.7372175131700104 }, { "epoch": 0.5726758513859225, "grad_norm": 3.9012975042201297, "learning_rate": 1.6875945881349673e-06, "loss": 0.9801074981689453, "num_input_tokens_seen": 489120441, "step": 2440, "token_acc": 0.7381837376558823 }, { "epoch": 0.5750228835637337, "grad_norm": 1.6637494968221076, "learning_rate": 1.6845471059286886e-06, "loss": 1.0021610260009766, "num_input_tokens_seen": 491066049, "step": 2450, "token_acc": 0.7346050699774175 }, { "epoch": 0.5773699157415448, "grad_norm": 1.652438429477013, "learning_rate": 1.6814876127015198e-06, "loss": 0.9841398239135742, "num_input_tokens_seen": 493112928, "step": 2460, "token_acc": 0.7378321905180247 }, { "epoch": 0.579716947919356, "grad_norm": 3.543309593586376, "learning_rate": 1.678416162135138e-06, "loss": 0.979088020324707, "num_input_tokens_seen": 495119139, "step": 2470, "token_acc": 0.7399358154268393 }, { "epoch": 0.5820639800971671, "grad_norm": 2.893410134875752, "learning_rate": 1.6753328081210244e-06, "loss": 0.9998300552368165, "num_input_tokens_seen": 497115090, "step": 2480, "token_acc": 0.7359860001129023 }, { "epoch": 0.5844110122749783, "grad_norm": 1.9583144196315403, "learning_rate": 1.6722376047595161e-06, "loss": 0.9970391273498536, "num_input_tokens_seen": 499168851, "step": 2490, "token_acc": 0.7355328073638283 }, { "epoch": 0.5867580444527895, "grad_norm": 5.903330257525673, "learning_rate": 1.669130606358858e-06, "loss": 1.0149246215820313, "num_input_tokens_seen": 501138603, "step": 2500, "token_acc": 0.7320385426697377 }, { "epoch": 0.5867580444527895, "eval_loss": 1.0123026371002197, "eval_runtime": 32.7432, "eval_samples_per_second": 30.541, "eval_steps_per_second": 1.283, "eval_token_acc": 0.7328916691673784, "num_input_tokens_seen": 501138603, "step": 2500 }, { "epoch": 0.5891050766306006, "grad_norm": 1.919536213839018, "learning_rate": 1.6660118674342515e-06, "loss": 0.9900060653686523, "num_input_tokens_seen": 503184900, "step": 2510, "token_acc": 0.7371078337925816 }, { "epoch": 0.5914521088084118, "grad_norm": 3.4511789649891966, "learning_rate": 1.6628814427068952e-06, "loss": 0.9589821815490722, "num_input_tokens_seen": 505223106, "step": 2520, "token_acc": 0.7453759303446423 }, { "epoch": 0.5937991409862229, "grad_norm": 3.9395749071950554, "learning_rate": 1.6597393871030261e-06, "loss": 0.9944395065307617, "num_input_tokens_seen": 507246369, "step": 2530, "token_acc": 0.7347724854980832 }, { "epoch": 0.5961461731640341, "grad_norm": 1.5397013326592903, "learning_rate": 1.6565857557529564e-06, "loss": 0.9756797790527344, "num_input_tokens_seen": 509308893, "step": 2540, "token_acc": 0.7391703562324037 }, { "epoch": 0.5984932053418452, "grad_norm": 1.7526411604347196, "learning_rate": 1.6534206039901055e-06, "loss": 0.9834499359130859, "num_input_tokens_seen": 511244184, "step": 2550, "token_acc": 0.7380458487339893 }, { "epoch": 0.6008402375196564, "grad_norm": 2.2921640319260024, "learning_rate": 1.6502439873500286e-06, "loss": 1.0054790496826171, "num_input_tokens_seen": 513290352, "step": 2560, "token_acc": 0.734738491502126 }, { "epoch": 0.6031872696974675, "grad_norm": 1.9064014496743276, "learning_rate": 1.6470559615694445e-06, "loss": 0.9771562576293945, "num_input_tokens_seen": 515276862, "step": 2570, "token_acc": 0.7392910978769869 }, { "epoch": 0.6055343018752787, "grad_norm": 2.0609613172670764, "learning_rate": 1.6438565825852537e-06, "loss": 0.9563516616821289, "num_input_tokens_seen": 517288296, "step": 2580, "token_acc": 0.744728798321846 }, { "epoch": 0.6078813340530899, "grad_norm": 1.7302019611107595, "learning_rate": 1.6406459065335614e-06, "loss": 0.9771955490112305, "num_input_tokens_seen": 519254622, "step": 2590, "token_acc": 0.740443198920546 }, { "epoch": 0.610228366230901, "grad_norm": 1.8178684355141148, "learning_rate": 1.6374239897486897e-06, "loss": 0.9703773498535156, "num_input_tokens_seen": 521236017, "step": 2600, "token_acc": 0.7407382220106489 }, { "epoch": 0.610228366230901, "eval_loss": 1.0093790292739868, "eval_runtime": 32.6088, "eval_samples_per_second": 30.667, "eval_steps_per_second": 1.288, "eval_token_acc": 0.7333256388356686, "num_input_tokens_seen": 521236017, "step": 2600 }, { "epoch": 0.6125753984087122, "grad_norm": 1.8585384534519827, "learning_rate": 1.6341908887621894e-06, "loss": 0.9817310333251953, "num_input_tokens_seen": 523212513, "step": 2610, "token_acc": 0.738175322879972 }, { "epoch": 0.6149224305865233, "grad_norm": 2.4244702161030625, "learning_rate": 1.6309466603018495e-06, "loss": 0.9609703063964844, "num_input_tokens_seen": 525216327, "step": 2620, "token_acc": 0.7439178110371839 }, { "epoch": 0.6172694627643345, "grad_norm": 1.638774547412265, "learning_rate": 1.6276913612907004e-06, "loss": 0.9597613334655761, "num_input_tokens_seen": 527198007, "step": 2630, "token_acc": 0.7433998992304688 }, { "epoch": 0.6196164949421457, "grad_norm": 1.8052959287052057, "learning_rate": 1.6244250488460155e-06, "loss": 0.9595340728759766, "num_input_tokens_seen": 529328826, "step": 2640, "token_acc": 0.7424487405247924 }, { "epoch": 0.6219635271199568, "grad_norm": 2.9059084324443987, "learning_rate": 1.6211477802783102e-06, "loss": 0.9733432769775391, "num_input_tokens_seen": 531353727, "step": 2650, "token_acc": 0.7391637709236651 }, { "epoch": 0.624310559297768, "grad_norm": 2.4613981471794117, "learning_rate": 1.6178596130903343e-06, "loss": 0.9548052787780762, "num_input_tokens_seen": 533357184, "step": 2660, "token_acc": 0.7445567764998143 }, { "epoch": 0.6266575914755791, "grad_norm": 1.959126642555864, "learning_rate": 1.6145606049760642e-06, "loss": 0.9767616271972657, "num_input_tokens_seen": 535321791, "step": 2670, "token_acc": 0.7381060525928277 }, { "epoch": 0.6290046236533903, "grad_norm": 1.5308332739370312, "learning_rate": 1.6112508138196917e-06, "loss": 0.9835859298706054, "num_input_tokens_seen": 537364758, "step": 2680, "token_acc": 0.7381528449040924 }, { "epoch": 0.6313516558312015, "grad_norm": 1.8506281977228691, "learning_rate": 1.6079302976946053e-06, "loss": 0.9697771072387695, "num_input_tokens_seen": 539423991, "step": 2690, "token_acc": 0.7428583040298499 }, { "epoch": 0.6336986880090126, "grad_norm": 2.143447146073978, "learning_rate": 1.604599114862375e-06, "loss": 0.9710499763488769, "num_input_tokens_seen": 541385301, "step": 2700, "token_acc": 0.7437010271608948 }, { "epoch": 0.6336986880090126, "eval_loss": 1.006402611732483, "eval_runtime": 32.4804, "eval_samples_per_second": 30.788, "eval_steps_per_second": 1.293, "eval_token_acc": 0.7333256388356686, "num_input_tokens_seen": 541385301, "step": 2700 }, { "epoch": 0.6360457201868238, "grad_norm": 1.6759371033254091, "learning_rate": 1.6012573237717265e-06, "loss": 0.9557651519775391, "num_input_tokens_seen": 543498738, "step": 2710, "token_acc": 0.744166114013349 }, { "epoch": 0.6383927523646349, "grad_norm": 1.8250942426916423, "learning_rate": 1.5979049830575188e-06, "loss": 0.9645903587341309, "num_input_tokens_seen": 545489775, "step": 2720, "token_acc": 0.7429352817436318 }, { "epoch": 0.6407397845424461, "grad_norm": 1.9217599973801651, "learning_rate": 1.5945421515397134e-06, "loss": 0.9858356475830078, "num_input_tokens_seen": 547577721, "step": 2730, "token_acc": 0.7375185153736568 }, { "epoch": 0.6430868167202572, "grad_norm": 1.7809745721720633, "learning_rate": 1.591168888222342e-06, "loss": 0.9513526916503906, "num_input_tokens_seen": 549624339, "step": 2740, "token_acc": 0.745696874109412 }, { "epoch": 0.6454338488980684, "grad_norm": 4.185287393591199, "learning_rate": 1.587785252292473e-06, "loss": 1.0034643173217774, "num_input_tokens_seen": 551637576, "step": 2750, "token_acc": 0.7338386568669174 }, { "epoch": 0.6477808810758795, "grad_norm": 1.5787917866107477, "learning_rate": 1.584391303119172e-06, "loss": 0.9657976150512695, "num_input_tokens_seen": 553630620, "step": 2760, "token_acc": 0.7424132245973986 }, { "epoch": 0.6501279132536907, "grad_norm": 1.6169135735671403, "learning_rate": 1.58098710025246e-06, "loss": 0.976175594329834, "num_input_tokens_seen": 555634122, "step": 2770, "token_acc": 0.7385256195920764 }, { "epoch": 0.6524749454315019, "grad_norm": 2.786090764996497, "learning_rate": 1.5775727034222674e-06, "loss": 1.0152118682861329, "num_input_tokens_seen": 557567646, "step": 2780, "token_acc": 0.7318658065576464 }, { "epoch": 0.654821977609313, "grad_norm": 2.3429135221710142, "learning_rate": 1.5741481725373898e-06, "loss": 0.9660276412963867, "num_input_tokens_seen": 559612812, "step": 2790, "token_acc": 0.7423686792009822 }, { "epoch": 0.6571690097871242, "grad_norm": 2.066607274894778, "learning_rate": 1.5707135676844319e-06, "loss": 0.9577510833740235, "num_input_tokens_seen": 561582108, "step": 2800, "token_acc": 0.7451270299890406 }, { "epoch": 0.6571690097871242, "eval_loss": 1.0031476020812988, "eval_runtime": 32.6219, "eval_samples_per_second": 30.654, "eval_steps_per_second": 1.287, "eval_token_acc": 0.7348629994690796, "num_input_tokens_seen": 561582108, "step": 2800 }, { "epoch": 0.6595160419649353, "grad_norm": 2.6635168105713385, "learning_rate": 1.5672689491267565e-06, "loss": 0.9600403785705567, "num_input_tokens_seen": 563559690, "step": 2810, "token_acc": 0.7428777482846697 }, { "epoch": 0.6618630741427465, "grad_norm": 1.7450802216902526, "learning_rate": 1.5638143773034266e-06, "loss": 0.9954195022583008, "num_input_tokens_seen": 565524792, "step": 2820, "token_acc": 0.7348587056347071 }, { "epoch": 0.6642101063205577, "grad_norm": 1.8754649742088336, "learning_rate": 1.5603499128281444e-06, "loss": 0.969937515258789, "num_input_tokens_seen": 567451971, "step": 2830, "token_acc": 0.7414208823996457 }, { "epoch": 0.6665571384983688, "grad_norm": 1.5835197058667514, "learning_rate": 1.556875616488188e-06, "loss": 0.969327163696289, "num_input_tokens_seen": 569462406, "step": 2840, "token_acc": 0.7401524628156212 }, { "epoch": 0.66890417067618, "grad_norm": 2.6099644468289567, "learning_rate": 1.553391549243344e-06, "loss": 0.9504291534423828, "num_input_tokens_seen": 571500279, "step": 2850, "token_acc": 0.7466074001336113 }, { "epoch": 0.6712512028539911, "grad_norm": 2.482803714476407, "learning_rate": 1.54989777222484e-06, "loss": 0.9784445762634277, "num_input_tokens_seen": 573509781, "step": 2860, "token_acc": 0.7380322581926356 }, { "epoch": 0.6735982350318023, "grad_norm": 3.1626495400003227, "learning_rate": 1.546394346734269e-06, "loss": 0.9782054901123047, "num_input_tokens_seen": 575490657, "step": 2870, "token_acc": 0.7396435152006547 }, { "epoch": 0.6759452672096135, "grad_norm": 1.8352007570418352, "learning_rate": 1.5428813342425175e-06, "loss": 0.9893608093261719, "num_input_tokens_seen": 577443624, "step": 2880, "token_acc": 0.7371560289894273 }, { "epoch": 0.6782922993874246, "grad_norm": 11.589107012378998, "learning_rate": 1.5393587963886834e-06, "loss": 0.9795863151550293, "num_input_tokens_seen": 579501576, "step": 2890, "token_acc": 0.738575752796563 }, { "epoch": 0.6806393315652357, "grad_norm": 2.3582930414965713, "learning_rate": 1.5358267949789964e-06, "loss": 0.986695671081543, "num_input_tokens_seen": 581445867, "step": 2900, "token_acc": 0.7377336684807478 }, { "epoch": 0.6806393315652357, "eval_loss": 1.0008372068405151, "eval_runtime": 32.2631, "eval_samples_per_second": 30.995, "eval_steps_per_second": 1.302, "eval_token_acc": 0.7351584681794049, "num_input_tokens_seen": 581445867, "step": 2900 }, { "epoch": 0.6829863637430469, "grad_norm": 1.8683225729956336, "learning_rate": 1.532285391985734e-06, "loss": 0.9824249267578125, "num_input_tokens_seen": 583473981, "step": 2910, "token_acc": 0.7386771656575185 }, { "epoch": 0.6853333959208581, "grad_norm": 1.9051084978341761, "learning_rate": 1.5287346495461316e-06, "loss": 0.9780803680419922, "num_input_tokens_seen": 585488343, "step": 2920, "token_acc": 0.7386755390868261 }, { "epoch": 0.6876804280986692, "grad_norm": 1.7526173635768003, "learning_rate": 1.5251746299612958e-06, "loss": 0.9556564331054688, "num_input_tokens_seen": 587536749, "step": 2930, "token_acc": 0.7437935964230544 }, { "epoch": 0.6900274602764804, "grad_norm": 1.908495061384156, "learning_rate": 1.5216053956951078e-06, "loss": 0.9559732437133789, "num_input_tokens_seen": 589505883, "step": 2940, "token_acc": 0.7442760675515612 }, { "epoch": 0.6923744924542915, "grad_norm": 1.682313702090843, "learning_rate": 1.5180270093731302e-06, "loss": 0.9883411407470704, "num_input_tokens_seen": 591496815, "step": 2950, "token_acc": 0.7374446310537534 }, { "epoch": 0.6947215246321027, "grad_norm": 1.6223526724021116, "learning_rate": 1.5144395337815063e-06, "loss": 0.9544116973876953, "num_input_tokens_seen": 593483805, "step": 2960, "token_acc": 0.7434431431260328 }, { "epoch": 0.6970685568099139, "grad_norm": 3.3044227000595106, "learning_rate": 1.5108430318658599e-06, "loss": 0.9596687316894531, "num_input_tokens_seen": 595472802, "step": 2970, "token_acc": 0.7425995483387807 }, { "epoch": 0.699415588987725, "grad_norm": 1.9946238986715072, "learning_rate": 1.507237566730189e-06, "loss": 0.9447664260864258, "num_input_tokens_seen": 597458052, "step": 2980, "token_acc": 0.7471448055436924 }, { "epoch": 0.7017626211655362, "grad_norm": 1.9985767739510494, "learning_rate": 1.5036232016357608e-06, "loss": 0.9753869056701661, "num_input_tokens_seen": 599511099, "step": 2990, "token_acc": 0.7407295913625692 }, { "epoch": 0.7041096533433473, "grad_norm": 1.7848206984050603, "learning_rate": 1.5e-06, "loss": 0.9929049491882325, "num_input_tokens_seen": 601494039, "step": 3000, "token_acc": 0.733967886177249 }, { "epoch": 0.7041096533433473, "eval_loss": 0.9992188215255737, "eval_runtime": 32.8388, "eval_samples_per_second": 30.452, "eval_steps_per_second": 1.279, "eval_token_acc": 0.7353292860275616, "num_input_tokens_seen": 601494039, "step": 3000 }, { "epoch": 0.7064566855211585, "grad_norm": 1.5971312735806535, "learning_rate": 1.4963680253953767e-06, "loss": 0.9550104141235352, "num_input_tokens_seen": 603479547, "step": 3010, "token_acc": 0.7457036074683664 }, { "epoch": 0.7088037176989697, "grad_norm": 1.7733615613171219, "learning_rate": 1.4927273415482915e-06, "loss": 0.9737858772277832, "num_input_tokens_seen": 605442297, "step": 3020, "token_acc": 0.7412765006450565 }, { "epoch": 0.7111507498767808, "grad_norm": 6.926370874803529, "learning_rate": 1.4890780123379563e-06, "loss": 0.9665937423706055, "num_input_tokens_seen": 607477695, "step": 3030, "token_acc": 0.7405696365107176 }, { "epoch": 0.713497782054592, "grad_norm": 1.6484833491764401, "learning_rate": 1.485420101795274e-06, "loss": 0.95927734375, "num_input_tokens_seen": 609444318, "step": 3040, "token_acc": 0.7442635774417046 }, { "epoch": 0.7158448142324031, "grad_norm": 2.0360741208385913, "learning_rate": 1.4817536741017151e-06, "loss": 0.9595672607421875, "num_input_tokens_seen": 611390574, "step": 3050, "token_acc": 0.743094030233154 }, { "epoch": 0.7181918464102143, "grad_norm": 1.6990138073052663, "learning_rate": 1.4780787935881923e-06, "loss": 0.9530370712280274, "num_input_tokens_seen": 613394736, "step": 3060, "token_acc": 0.7442468822691946 }, { "epoch": 0.7205388785880255, "grad_norm": 1.7762074096620095, "learning_rate": 1.474395524733929e-06, "loss": 0.9581127166748047, "num_input_tokens_seen": 615392505, "step": 3070, "token_acc": 0.7441699918818188 }, { "epoch": 0.7228859107658366, "grad_norm": 2.463738917765937, "learning_rate": 1.4707039321653328e-06, "loss": 0.9451935768127442, "num_input_tokens_seen": 617397957, "step": 3080, "token_acc": 0.7463462899737582 }, { "epoch": 0.7252329429436477, "grad_norm": 1.5266790692018193, "learning_rate": 1.4670040806548554e-06, "loss": 0.9604751586914062, "num_input_tokens_seen": 619431237, "step": 3090, "token_acc": 0.743774946972139 }, { "epoch": 0.7275799751214589, "grad_norm": 2.3079209032431858, "learning_rate": 1.4632960351198617e-06, "loss": 0.958247184753418, "num_input_tokens_seen": 621429906, "step": 3100, "token_acc": 0.7430188770047043 }, { "epoch": 0.7275799751214589, "eval_loss": 0.9967913031578064, "eval_runtime": 32.9622, "eval_samples_per_second": 30.338, "eval_steps_per_second": 1.274, "eval_token_acc": 0.7358602063664273, "num_input_tokens_seen": 621429906, "step": 3100 }, { "epoch": 0.7299270072992701, "grad_norm": 1.9140433022488452, "learning_rate": 1.459579860621488e-06, "loss": 0.9593525886535644, "num_input_tokens_seen": 623425752, "step": 3110, "token_acc": 0.7432277726301421 }, { "epoch": 0.7322740394770813, "grad_norm": 1.8212366585882274, "learning_rate": 1.4558556223635e-06, "loss": 0.9617977142333984, "num_input_tokens_seen": 625420740, "step": 3120, "token_acc": 0.742332781810841 }, { "epoch": 0.7346210716548924, "grad_norm": 1.81167215973652, "learning_rate": 1.4521233856911506e-06, "loss": 0.958807373046875, "num_input_tokens_seen": 627481314, "step": 3130, "token_acc": 0.7424123292987752 }, { "epoch": 0.7369681038327035, "grad_norm": 2.3831847210640373, "learning_rate": 1.4483832160900325e-06, "loss": 0.9585672378540039, "num_input_tokens_seen": 629442897, "step": 3140, "token_acc": 0.7439413187403806 }, { "epoch": 0.7393151360105147, "grad_norm": 2.043229737472813, "learning_rate": 1.4446351791849273e-06, "loss": 0.9544695854187012, "num_input_tokens_seen": 631432200, "step": 3150, "token_acc": 0.7442476653043495 }, { "epoch": 0.7416621681883259, "grad_norm": 4.811000207072732, "learning_rate": 1.4408793407386585e-06, "loss": 0.9843364715576172, "num_input_tokens_seen": 633445356, "step": 3160, "token_acc": 0.7394687633144498 }, { "epoch": 0.744009200366137, "grad_norm": 2.4106731528164027, "learning_rate": 1.4371157666509327e-06, "loss": 0.9410341262817383, "num_input_tokens_seen": 635526396, "step": 3170, "token_acc": 0.7483812367179011 }, { "epoch": 0.7463562325439482, "grad_norm": 1.7671356519717534, "learning_rate": 1.4333445229571873e-06, "loss": 0.9693818092346191, "num_input_tokens_seen": 637512357, "step": 3180, "token_acc": 0.7406820079650566 }, { "epoch": 0.7487032647217593, "grad_norm": 7.649772673674551, "learning_rate": 1.429565675827428e-06, "loss": 0.9459026336669922, "num_input_tokens_seen": 639512292, "step": 3190, "token_acc": 0.7462010482209617 }, { "epoch": 0.7510502968995705, "grad_norm": 1.8216992633152906, "learning_rate": 1.4257792915650725e-06, "loss": 0.9720870971679687, "num_input_tokens_seen": 641562030, "step": 3200, "token_acc": 0.7415353056114234 }, { "epoch": 0.7510502968995705, "eval_loss": 0.9943264722824097, "eval_runtime": 32.5126, "eval_samples_per_second": 30.757, "eval_steps_per_second": 1.292, "eval_token_acc": 0.7364419103898802, "num_input_tokens_seen": 641562030, "step": 3200 }, { "epoch": 0.7533973290773817, "grad_norm": 2.5481168225977733, "learning_rate": 1.421985436605783e-06, "loss": 0.9607316970825195, "num_input_tokens_seen": 643584060, "step": 3210, "token_acc": 0.7432411531496256 }, { "epoch": 0.7557443612551928, "grad_norm": 2.118059690668745, "learning_rate": 1.4181841775163012e-06, "loss": 0.9484768867492676, "num_input_tokens_seen": 645607389, "step": 3220, "token_acc": 0.7466806535620841 }, { "epoch": 0.7580913934330039, "grad_norm": 1.536905846651224, "learning_rate": 1.4143755809932843e-06, "loss": 0.9712394714355469, "num_input_tokens_seen": 647631456, "step": 3230, "token_acc": 0.7404885747138855 }, { "epoch": 0.7604384256108151, "grad_norm": 2.4582256619795935, "learning_rate": 1.4105597138621279e-06, "loss": 0.9821660041809082, "num_input_tokens_seen": 649623648, "step": 3240, "token_acc": 0.7392644424148588 }, { "epoch": 0.7627854577886263, "grad_norm": 1.726231353540505, "learning_rate": 1.4067366430758004e-06, "loss": 0.9590049743652344, "num_input_tokens_seen": 651641892, "step": 3250, "token_acc": 0.7437141846756814 }, { "epoch": 0.7651324899664375, "grad_norm": 1.922673635746528, "learning_rate": 1.4029064357136626e-06, "loss": 0.9750150680541992, "num_input_tokens_seen": 653604150, "step": 3260, "token_acc": 0.7414191376968158 }, { "epoch": 0.7674795221442486, "grad_norm": 1.6204247475263307, "learning_rate": 1.3990691589802952e-06, "loss": 0.9551026344299316, "num_input_tokens_seen": 655600902, "step": 3270, "token_acc": 0.7445325970386258 }, { "epoch": 0.7698265543220597, "grad_norm": 1.4635122389462327, "learning_rate": 1.3952248802043165e-06, "loss": 0.9669751167297364, "num_input_tokens_seen": 657608466, "step": 3280, "token_acc": 0.7429139464814524 }, { "epoch": 0.7721735864998709, "grad_norm": 8.22536747363378, "learning_rate": 1.3913736668372024e-06, "loss": 0.9439043045043946, "num_input_tokens_seen": 659654619, "step": 3290, "token_acc": 0.7456336900472631 }, { "epoch": 0.7745206186776821, "grad_norm": 9.543988869279518, "learning_rate": 1.3875155864521028e-06, "loss": 0.9564947128295899, "num_input_tokens_seen": 661688691, "step": 3300, "token_acc": 0.7438499491922926 }, { "epoch": 0.7745206186776821, "eval_loss": 0.9920927882194519, "eval_runtime": 32.5499, "eval_samples_per_second": 30.722, "eval_steps_per_second": 1.29, "eval_token_acc": 0.7375683848479953, "num_input_tokens_seen": 661688691, "step": 3300 }, { "epoch": 0.7768676508554933, "grad_norm": 1.6682984595331, "learning_rate": 1.3836507067426564e-06, "loss": 0.9715993881225586, "num_input_tokens_seen": 663716223, "step": 3310, "token_acc": 0.7398505776738471 }, { "epoch": 0.7792146830333044, "grad_norm": 1.4995458445383936, "learning_rate": 1.379779095521801e-06, "loss": 0.9635456085205079, "num_input_tokens_seen": 665680179, "step": 3320, "token_acc": 0.7437482270495307 }, { "epoch": 0.7815617152111155, "grad_norm": 2.9414382856417665, "learning_rate": 1.3759008207205866e-06, "loss": 0.955263328552246, "num_input_tokens_seen": 667683303, "step": 3330, "token_acc": 0.7446334146072263 }, { "epoch": 0.7839087473889267, "grad_norm": 2.4991760245357852, "learning_rate": 1.3720159503869814e-06, "loss": 0.9503087997436523, "num_input_tokens_seen": 669640779, "step": 3340, "token_acc": 0.7461391567718691 }, { "epoch": 0.7862557795667379, "grad_norm": 1.6816037590987798, "learning_rate": 1.3681245526846781e-06, "loss": 0.9773989677429199, "num_input_tokens_seen": 671655801, "step": 3350, "token_acc": 0.7381824953149948 }, { "epoch": 0.788602811744549, "grad_norm": 13.867129183446435, "learning_rate": 1.3642266958918981e-06, "loss": 0.9606409072875977, "num_input_tokens_seen": 673618887, "step": 3360, "token_acc": 0.7444690515700922 }, { "epoch": 0.7909498439223602, "grad_norm": 1.68642281844996, "learning_rate": 1.3603224484001947e-06, "loss": 0.9683753967285156, "num_input_tokens_seen": 675600486, "step": 3370, "token_acc": 0.7418183604302765 }, { "epoch": 0.7932968761001713, "grad_norm": 3.6876372609830947, "learning_rate": 1.3564118787132506e-06, "loss": 0.9690577507019043, "num_input_tokens_seen": 677573577, "step": 3380, "token_acc": 0.7409562127336359 }, { "epoch": 0.7956439082779825, "grad_norm": 2.195994268899717, "learning_rate": 1.3524950554456784e-06, "loss": 0.9620229721069335, "num_input_tokens_seen": 679562811, "step": 3390, "token_acc": 0.7450269148735509 }, { "epoch": 0.7979909404557937, "grad_norm": 1.66370232482538, "learning_rate": 1.3485720473218152e-06, "loss": 0.9747153282165527, "num_input_tokens_seen": 681515289, "step": 3400, "token_acc": 0.7406435118536351 }, { "epoch": 0.7979909404557937, "eval_loss": 0.989876389503479, "eval_runtime": 32.5612, "eval_samples_per_second": 30.711, "eval_steps_per_second": 1.29, "eval_token_acc": 0.7377299692989543, "num_input_tokens_seen": 681515289, "step": 3400 }, { "epoch": 0.8003379726336048, "grad_norm": 7.5891295742929765, "learning_rate": 1.344642923174517e-06, "loss": 0.9531444549560547, "num_input_tokens_seen": 683512767, "step": 3410, "token_acc": 0.743843269116981 }, { "epoch": 0.8026850048114159, "grad_norm": 2.227763312783814, "learning_rate": 1.3407077519439517e-06, "loss": 0.9736311912536622, "num_input_tokens_seen": 685506138, "step": 3420, "token_acc": 0.7399243439837672 }, { "epoch": 0.8050320369892271, "grad_norm": 2.113082381566505, "learning_rate": 1.3367666026763882e-06, "loss": 0.9282070159912109, "num_input_tokens_seen": 687553683, "step": 3430, "token_acc": 0.7491222650322436 }, { "epoch": 0.8073790691670383, "grad_norm": 2.6528881128590505, "learning_rate": 1.3328195445229867e-06, "loss": 0.9803478240966796, "num_input_tokens_seen": 689471004, "step": 3440, "token_acc": 0.7387909473555786 }, { "epoch": 0.8097261013448495, "grad_norm": 1.9793103853657699, "learning_rate": 1.3288666467385831e-06, "loss": 0.9667415618896484, "num_input_tokens_seen": 691496667, "step": 3450, "token_acc": 0.7424123423266975 }, { "epoch": 0.8120731335226606, "grad_norm": 1.7709247958435497, "learning_rate": 1.3249079786804764e-06, "loss": 0.9529176712036133, "num_input_tokens_seen": 693546759, "step": 3460, "token_acc": 0.7441175099271877 }, { "epoch": 0.8144201657004717, "grad_norm": 1.5610954433541373, "learning_rate": 1.3209436098072093e-06, "loss": 0.9164794921875, "num_input_tokens_seen": 695642895, "step": 3470, "token_acc": 0.7535160611124015 }, { "epoch": 0.8167671978782829, "grad_norm": 5.4874386973622675, "learning_rate": 1.3169736096773518e-06, "loss": 0.9681709289550782, "num_input_tokens_seen": 697628748, "step": 3480, "token_acc": 0.7417104783717662 }, { "epoch": 0.8191142300560941, "grad_norm": 1.5904173197084162, "learning_rate": 1.3129980479482781e-06, "loss": 0.9423411369323731, "num_input_tokens_seen": 699612816, "step": 3490, "token_acc": 0.7463674068222216 }, { "epoch": 0.8214612622339053, "grad_norm": 2.5766852327480185, "learning_rate": 1.3090169943749473e-06, "loss": 0.9422481536865235, "num_input_tokens_seen": 701681886, "step": 3500, "token_acc": 0.746677911017143 }, { "epoch": 0.8214612622339053, "eval_loss": 0.9871490597724915, "eval_runtime": 32.4224, "eval_samples_per_second": 30.843, "eval_steps_per_second": 1.295, "eval_token_acc": 0.738205489254634, "num_input_tokens_seen": 701681886, "step": 3500 }, { "epoch": 0.8238082944117164, "grad_norm": 1.6839398965190602, "learning_rate": 1.3050305188086776e-06, "loss": 0.9780057907104492, "num_input_tokens_seen": 703749471, "step": 3510, "token_acc": 0.7461169628181562 }, { "epoch": 0.8261553265895275, "grad_norm": 1.6472063314918655, "learning_rate": 1.3010386911959206e-06, "loss": 0.9228075981140137, "num_input_tokens_seen": 705742899, "step": 3520, "token_acc": 0.750938660857144 }, { "epoch": 0.8285023587673387, "grad_norm": 2.0632172614206934, "learning_rate": 1.2970415815770348e-06, "loss": 0.9639385223388672, "num_input_tokens_seen": 707763786, "step": 3530, "token_acc": 0.7435530770762796 }, { "epoch": 0.8308493909451499, "grad_norm": 1.9277876571318946, "learning_rate": 1.2930392600850572e-06, "loss": 0.9361279487609864, "num_input_tokens_seen": 709803774, "step": 3540, "token_acc": 0.7479319140358494 }, { "epoch": 0.833196423122961, "grad_norm": 1.7198703719511412, "learning_rate": 1.2890317969444716e-06, "loss": 0.9535655975341797, "num_input_tokens_seen": 711862587, "step": 3550, "token_acc": 0.7448029965128141 }, { "epoch": 0.8355434553007721, "grad_norm": 2.212450916967764, "learning_rate": 1.285019262469976e-06, "loss": 0.9320892333984375, "num_input_tokens_seen": 713902905, "step": 3560, "token_acc": 0.7496303953267546 }, { "epoch": 0.8378904874785833, "grad_norm": 1.9712068144466057, "learning_rate": 1.281001727065251e-06, "loss": 0.9570484161376953, "num_input_tokens_seen": 715896024, "step": 3570, "token_acc": 0.7434224760474031 }, { "epoch": 0.8402375196563945, "grad_norm": 10.730434108038908, "learning_rate": 1.2769792612217224e-06, "loss": 0.9570381164550781, "num_input_tokens_seen": 717863472, "step": 3580, "token_acc": 0.7445581595776979 }, { "epoch": 0.8425845518342057, "grad_norm": 3.30727503447712, "learning_rate": 1.2729519355173253e-06, "loss": 0.9440830230712891, "num_input_tokens_seen": 719863371, "step": 3590, "token_acc": 0.7474822302083397 }, { "epoch": 0.8449315840120168, "grad_norm": 3.713841498382935, "learning_rate": 1.2689198206152656e-06, "loss": 0.9532724380493164, "num_input_tokens_seen": 721831113, "step": 3600, "token_acc": 0.7449260731906336 }, { "epoch": 0.8449315840120168, "eval_loss": 0.9854407906532288, "eval_runtime": 32.7136, "eval_samples_per_second": 30.568, "eval_steps_per_second": 1.284, "eval_token_acc": 0.738343990212599, "num_input_tokens_seen": 721831113, "step": 3600 }, { "epoch": 0.8472786161898279, "grad_norm": 2.0589116386432122, "learning_rate": 1.2648829872627807e-06, "loss": 0.9483745574951172, "num_input_tokens_seen": 723825324, "step": 3610, "token_acc": 0.745855639432676 }, { "epoch": 0.8496256483676391, "grad_norm": 2.2896157925507143, "learning_rate": 1.2608415062898969e-06, "loss": 0.9875471115112304, "num_input_tokens_seen": 725824848, "step": 3620, "token_acc": 0.736929354012106 }, { "epoch": 0.8519726805454503, "grad_norm": 1.8359922545608438, "learning_rate": 1.2567954486081878e-06, "loss": 0.9514982223510742, "num_input_tokens_seen": 727830747, "step": 3630, "token_acc": 0.7452454133152131 }, { "epoch": 0.8543197127232615, "grad_norm": 3.153372907954943, "learning_rate": 1.2527448852095292e-06, "loss": 0.9558559417724609, "num_input_tokens_seen": 729852828, "step": 3640, "token_acc": 0.7435630305059377 }, { "epoch": 0.8566667449010726, "grad_norm": 3.2189620482043386, "learning_rate": 1.2486898871648551e-06, "loss": 0.9721113204956054, "num_input_tokens_seen": 731850777, "step": 3650, "token_acc": 0.7411079350146542 }, { "epoch": 0.8590137770788837, "grad_norm": 3.3099093175401872, "learning_rate": 1.2446305256229072e-06, "loss": 0.9803009986877441, "num_input_tokens_seen": 733814010, "step": 3660, "token_acc": 0.7365633927510155 }, { "epoch": 0.8613608092566949, "grad_norm": 1.5270344015944395, "learning_rate": 1.2405668718089917e-06, "loss": 0.9435177803039551, "num_input_tokens_seen": 735837123, "step": 3670, "token_acc": 0.746749139522123 }, { "epoch": 0.8637078414345061, "grad_norm": 5.787047190916268, "learning_rate": 1.2364989970237248e-06, "loss": 0.956524658203125, "num_input_tokens_seen": 737845806, "step": 3680, "token_acc": 0.7443589079040083 }, { "epoch": 0.8660548736123173, "grad_norm": 8.359169785563331, "learning_rate": 1.232426972641784e-06, "loss": 0.9011870384216308, "num_input_tokens_seen": 739830486, "step": 3690, "token_acc": 0.75567660422689 }, { "epoch": 0.8684019057901284, "grad_norm": 1.5845135247364173, "learning_rate": 1.2283508701106558e-06, "loss": 0.9817106246948242, "num_input_tokens_seen": 741791226, "step": 3700, "token_acc": 0.7385339271890049 }, { "epoch": 0.8684019057901284, "eval_loss": 0.983921468257904, "eval_runtime": 32.7463, "eval_samples_per_second": 30.538, "eval_steps_per_second": 1.283, "eval_token_acc": 0.7384963412663604, "num_input_tokens_seen": 741791226, "step": 3700 }, { "epoch": 0.8707489379679395, "grad_norm": 2.3840469812175087, "learning_rate": 1.224270760949381e-06, "loss": 0.9575783729553222, "num_input_tokens_seen": 743787261, "step": 3710, "token_acc": 0.7436981812982442 }, { "epoch": 0.8730959701457507, "grad_norm": 1.947777089028747, "learning_rate": 1.2201867167473015e-06, "loss": 0.9696456909179687, "num_input_tokens_seen": 745796382, "step": 3720, "token_acc": 0.7412485623553386 }, { "epoch": 0.8754430023235619, "grad_norm": 1.755420766932852, "learning_rate": 1.2160988091628022e-06, "loss": 0.9615589141845703, "num_input_tokens_seen": 747780156, "step": 3730, "token_acc": 0.7427405478352258 }, { "epoch": 0.877790034501373, "grad_norm": 1.5327100981263035, "learning_rate": 1.2120071099220547e-06, "loss": 0.9285150527954101, "num_input_tokens_seen": 749739183, "step": 3740, "token_acc": 0.7498815184287402 }, { "epoch": 0.8801370666791841, "grad_norm": 1.797316309204294, "learning_rate": 1.207911690817759e-06, "loss": 0.9365687370300293, "num_input_tokens_seen": 751694550, "step": 3750, "token_acc": 0.747152564554286 }, { "epoch": 0.8824840988569953, "grad_norm": 3.689781286827284, "learning_rate": 1.2038126237078849e-06, "loss": 0.953128433227539, "num_input_tokens_seen": 753712974, "step": 3760, "token_acc": 0.7452915604974099 }, { "epoch": 0.8848311310348065, "grad_norm": 1.7805781440802038, "learning_rate": 1.1997099805144068e-06, "loss": 0.9508394241333008, "num_input_tokens_seen": 755748069, "step": 3770, "token_acc": 0.7452503865456881 }, { "epoch": 0.8871781632126177, "grad_norm": 1.6166917326261805, "learning_rate": 1.195603833222048e-06, "loss": 0.9421730995178222, "num_input_tokens_seen": 757732731, "step": 3780, "token_acc": 0.746435002974226 }, { "epoch": 0.8895251953904288, "grad_norm": 2.7425269690357057, "learning_rate": 1.191494253877013e-06, "loss": 0.9745880126953125, "num_input_tokens_seen": 759774399, "step": 3790, "token_acc": 0.7450119697550278 }, { "epoch": 0.8918722275682399, "grad_norm": 1.6146982833566892, "learning_rate": 1.1873813145857248e-06, "loss": 0.9547751426696778, "num_input_tokens_seen": 761780385, "step": 3800, "token_acc": 0.7437249909057839 }, { "epoch": 0.8918722275682399, "eval_loss": 0.9822799563407898, "eval_runtime": 32.7794, "eval_samples_per_second": 30.507, "eval_steps_per_second": 1.281, "eval_token_acc": 0.738865677154267, "num_input_tokens_seen": 761780385, "step": 3800 }, { "epoch": 0.8942192597460511, "grad_norm": 8.557612907531114, "learning_rate": 1.1832650875135597e-06, "loss": 0.9583858489990235, "num_input_tokens_seen": 763769655, "step": 3810, "token_acc": 0.7431487370276885 }, { "epoch": 0.8965662919238623, "grad_norm": 1.5077356512025262, "learning_rate": 1.1791456448835825e-06, "loss": 0.9206510543823242, "num_input_tokens_seen": 765823593, "step": 3820, "token_acc": 0.7506628223950441 }, { "epoch": 0.8989133241016735, "grad_norm": 1.5006830716992956, "learning_rate": 1.175023058975276e-06, "loss": 0.9615950584411621, "num_input_tokens_seen": 767831079, "step": 3830, "token_acc": 0.7423029397870712 }, { "epoch": 0.9012603562794846, "grad_norm": 1.6769633570300284, "learning_rate": 1.1708974021232767e-06, "loss": 0.9534446716308593, "num_input_tokens_seen": 769798548, "step": 3840, "token_acc": 0.7445747944292532 }, { "epoch": 0.9036073884572957, "grad_norm": 1.759779515088976, "learning_rate": 1.1667687467161023e-06, "loss": 0.9459953308105469, "num_input_tokens_seen": 771774078, "step": 3850, "token_acc": 0.744865905394826 }, { "epoch": 0.9059544206351069, "grad_norm": 1.6599709517731647, "learning_rate": 1.1626371651948836e-06, "loss": 0.9330622673034668, "num_input_tokens_seen": 773817642, "step": 3860, "token_acc": 0.7481679393835271 }, { "epoch": 0.9083014528129181, "grad_norm": 1.6573686376498213, "learning_rate": 1.158502730052093e-06, "loss": 0.943012809753418, "num_input_tokens_seen": 775877070, "step": 3870, "token_acc": 0.7472794230837547 }, { "epoch": 0.9106484849907293, "grad_norm": 2.4726992986853444, "learning_rate": 1.1543655138302713e-06, "loss": 0.9866430282592773, "num_input_tokens_seen": 777904599, "step": 3880, "token_acc": 0.7372872068022087 }, { "epoch": 0.9129955171685403, "grad_norm": 1.7326340330977308, "learning_rate": 1.150225589120757e-06, "loss": 0.9427039146423339, "num_input_tokens_seen": 779960793, "step": 3890, "token_acc": 0.7463757958063197 }, { "epoch": 0.9153425493463515, "grad_norm": 1.634253822545075, "learning_rate": 1.1460830285624116e-06, "loss": 0.9683923721313477, "num_input_tokens_seen": 782008791, "step": 3900, "token_acc": 0.741813429536215 }, { "epoch": 0.9153425493463515, "eval_loss": 0.97979736328125, "eval_runtime": 32.457, "eval_samples_per_second": 30.81, "eval_steps_per_second": 1.294, "eval_token_acc": 0.739553565245493, "num_input_tokens_seen": 782008791, "step": 3900 }, { "epoch": 0.9176895815241627, "grad_norm": 5.153362224558377, "learning_rate": 1.1419379048403444e-06, "loss": 0.9662550926208496, "num_input_tokens_seen": 784016886, "step": 3910, "token_acc": 0.7420221405659442 }, { "epoch": 0.9200366137019739, "grad_norm": 1.9857737502868835, "learning_rate": 1.137790290684638e-06, "loss": 0.9286038398742675, "num_input_tokens_seen": 786018876, "step": 3920, "token_acc": 0.7495468248085001 }, { "epoch": 0.922383645879785, "grad_norm": 1.842562371990634, "learning_rate": 1.1336402588690725e-06, "loss": 0.9483222007751465, "num_input_tokens_seen": 788055180, "step": 3930, "token_acc": 0.7456087098512761 }, { "epoch": 0.9247306780575961, "grad_norm": 1.928971592873294, "learning_rate": 1.1294878822098467e-06, "loss": 0.9480892181396484, "num_input_tokens_seen": 790110096, "step": 3940, "token_acc": 0.7468523363829526 }, { "epoch": 0.9270777102354073, "grad_norm": 1.6567939468576487, "learning_rate": 1.1253332335643042e-06, "loss": 0.947171974182129, "num_input_tokens_seen": 792098733, "step": 3950, "token_acc": 0.7463428498622995 }, { "epoch": 0.9294247424132185, "grad_norm": 2.382881124913188, "learning_rate": 1.1211763858296505e-06, "loss": 0.9341253280639649, "num_input_tokens_seen": 794107374, "step": 3960, "token_acc": 0.749001431982777 }, { "epoch": 0.9317717745910297, "grad_norm": 2.385202785146866, "learning_rate": 1.1170174119416775e-06, "loss": 0.9605335235595703, "num_input_tokens_seen": 796145907, "step": 3970, "token_acc": 0.7420721101207574 }, { "epoch": 0.9341188067688408, "grad_norm": 1.6538910226354369, "learning_rate": 1.1128563848734815e-06, "loss": 0.904339599609375, "num_input_tokens_seen": 798189987, "step": 3980, "token_acc": 0.7552502219081598 }, { "epoch": 0.9364658389466519, "grad_norm": 2.1083368115488206, "learning_rate": 1.108693377634185e-06, "loss": 0.9489521980285645, "num_input_tokens_seen": 800197461, "step": 3990, "token_acc": 0.7454285509759317 }, { "epoch": 0.9388128711244631, "grad_norm": 1.9940124977981624, "learning_rate": 1.1045284632676535e-06, "loss": 0.9406743049621582, "num_input_tokens_seen": 802174746, "step": 4000, "token_acc": 0.7459721976990789 }, { "epoch": 0.9388128711244631, "eval_loss": 0.9778164029121399, "eval_runtime": 32.5943, "eval_samples_per_second": 30.68, "eval_steps_per_second": 1.289, "eval_token_acc": 0.7396274324230743, "num_input_tokens_seen": 802174746, "step": 4000 }, { "epoch": 0.9411599033022743, "grad_norm": 1.869832978916969, "learning_rate": 1.1003617148512149e-06, "loss": 0.9346565246582031, "num_input_tokens_seen": 804141819, "step": 4010, "token_acc": 0.7472374245472837 }, { "epoch": 0.9435069354800855, "grad_norm": 2.364187676148168, "learning_rate": 1.0961932054943776e-06, "loss": 0.9504963874816894, "num_input_tokens_seen": 806092293, "step": 4020, "token_acc": 0.7476745370464685 }, { "epoch": 0.9458539676578966, "grad_norm": 1.7457815556862932, "learning_rate": 1.0920230083375472e-06, "loss": 0.9478288650512695, "num_input_tokens_seen": 808096725, "step": 4030, "token_acc": 0.7461893605967633 }, { "epoch": 0.9482009998357077, "grad_norm": 1.7540758806187229, "learning_rate": 1.0878511965507434e-06, "loss": 0.9289562225341796, "num_input_tokens_seen": 810119691, "step": 4040, "token_acc": 0.7498504598729057 }, { "epoch": 0.9505480320135189, "grad_norm": 5.524603084757776, "learning_rate": 1.0836778433323157e-06, "loss": 0.9280494689941406, "num_input_tokens_seen": 812173641, "step": 4050, "token_acc": 0.7489092478671032 }, { "epoch": 0.9528950641913301, "grad_norm": 2.2610221290856205, "learning_rate": 1.0795030219076598e-06, "loss": 0.9323202133178711, "num_input_tokens_seen": 814155057, "step": 4060, "token_acc": 0.7484355792832109 }, { "epoch": 0.9552420963691413, "grad_norm": 1.7453803466041382, "learning_rate": 1.0753268055279328e-06, "loss": 0.9361183166503906, "num_input_tokens_seen": 816203571, "step": 4070, "token_acc": 0.7480308978092947 }, { "epoch": 0.9575891285469523, "grad_norm": 3.200843146499252, "learning_rate": 1.071149267468767e-06, "loss": 0.9665923118591309, "num_input_tokens_seen": 818255160, "step": 4080, "token_acc": 0.7428710890766919 }, { "epoch": 0.9599361607247635, "grad_norm": 2.769528286877977, "learning_rate": 1.066970481028985e-06, "loss": 0.9312915802001953, "num_input_tokens_seen": 820210017, "step": 4090, "token_acc": 0.7505294435331026 }, { "epoch": 0.9622831929025747, "grad_norm": 3.5116532009374186, "learning_rate": 1.0627905195293135e-06, "loss": 0.9360153198242187, "num_input_tokens_seen": 822213030, "step": 4100, "token_acc": 0.7485829324512936 }, { "epoch": 0.9622831929025747, "eval_loss": 0.9762653112411499, "eval_runtime": 32.7782, "eval_samples_per_second": 30.508, "eval_steps_per_second": 1.281, "eval_token_acc": 0.7401121857759516, "num_input_tokens_seen": 822213030, "step": 4100 }, { "epoch": 0.9646302250803859, "grad_norm": 5.045367081523594, "learning_rate": 1.0586094563110963e-06, "loss": 0.9286471366882324, "num_input_tokens_seen": 824216382, "step": 4110, "token_acc": 0.7514687934606761 }, { "epoch": 0.966977257258197, "grad_norm": 2.1231322680588756, "learning_rate": 1.054427364735009e-06, "loss": 0.9417591094970703, "num_input_tokens_seen": 826177221, "step": 4120, "token_acc": 0.746542864029784 }, { "epoch": 0.9693242894360081, "grad_norm": 1.5051650791104427, "learning_rate": 1.0502443181797696e-06, "loss": 0.9733121871948243, "num_input_tokens_seen": 828212934, "step": 4130, "token_acc": 0.7397737060065835 }, { "epoch": 0.9716713216138193, "grad_norm": 1.9170280031638867, "learning_rate": 1.0460603900408523e-06, "loss": 0.9613967895507812, "num_input_tokens_seen": 830208120, "step": 4140, "token_acc": 0.7418330397530002 }, { "epoch": 0.9740183537916305, "grad_norm": 2.477727800782275, "learning_rate": 1.0418756537291995e-06, "loss": 0.920326042175293, "num_input_tokens_seen": 832205229, "step": 4150, "token_acc": 0.7535178501070156 }, { "epoch": 0.9763653859694417, "grad_norm": 1.544900641515008, "learning_rate": 1.0376901826699347e-06, "loss": 0.9237567901611328, "num_input_tokens_seen": 834138633, "step": 4160, "token_acc": 0.7496954091824597 }, { "epoch": 0.9787124181472528, "grad_norm": 1.6877147081648456, "learning_rate": 1.0335040503010715e-06, "loss": 0.9391614913940429, "num_input_tokens_seen": 836153739, "step": 4170, "token_acc": 0.7479080675786391 }, { "epoch": 0.9810594503250639, "grad_norm": 2.055524057953317, "learning_rate": 1.0293173300722284e-06, "loss": 0.9410205841064453, "num_input_tokens_seen": 838071294, "step": 4180, "token_acc": 0.747964305973199 }, { "epoch": 0.9834064825028751, "grad_norm": 1.9825443022012719, "learning_rate": 1.0251300954433374e-06, "loss": 0.9293361663818359, "num_input_tokens_seen": 840082950, "step": 4190, "token_acc": 0.7505939412855415 }, { "epoch": 0.9857535146806863, "grad_norm": 1.6517348379687422, "learning_rate": 1.020942419883357e-06, "loss": 0.9549247741699218, "num_input_tokens_seen": 842083761, "step": 4200, "token_acc": 0.7446830629715671 }, { "epoch": 0.9857535146806863, "eval_loss": 0.9754964709281921, "eval_runtime": 32.4547, "eval_samples_per_second": 30.812, "eval_steps_per_second": 1.294, "eval_token_acc": 0.7408277740587705, "num_input_tokens_seen": 842083761, "step": 4200 }, { "epoch": 0.9881005468584975, "grad_norm": 1.7669813904614138, "learning_rate": 1.0167543768689815e-06, "loss": 0.9350774765014649, "num_input_tokens_seen": 844080483, "step": 4210, "token_acc": 0.7474908930171247 }, { "epoch": 0.9904475790363085, "grad_norm": 1.9977363833715536, "learning_rate": 1.0125660398833527e-06, "loss": 0.9390117645263671, "num_input_tokens_seen": 846069951, "step": 4220, "token_acc": 0.7463500450267371 }, { "epoch": 0.9927946112141197, "grad_norm": 1.6725983628184662, "learning_rate": 1.0083774824147707e-06, "loss": 0.946631908416748, "num_input_tokens_seen": 848098152, "step": 4230, "token_acc": 0.7457750693945103 }, { "epoch": 0.9951416433919309, "grad_norm": 1.7247846754251406, "learning_rate": 1.004188777955404e-06, "loss": 0.9343754768371582, "num_input_tokens_seen": 850113609, "step": 4240, "token_acc": 0.7490662455788695 }, { "epoch": 0.9974886755697421, "grad_norm": 2.0830434897072894, "learning_rate": 1e-06, "loss": 0.9314743041992187, "num_input_tokens_seen": 852105906, "step": 4250, "token_acc": 0.749313829578074 }, { "epoch": 0.9998357077475533, "grad_norm": 1.814610722365582, "learning_rate": 9.958112220445962e-07, "loss": 0.9592094421386719, "num_input_tokens_seen": 854098311, "step": 4260, "token_acc": 0.7431068897769029 }, { "epoch": 1.00211232896003, "grad_norm": 1.5113637229667725, "learning_rate": 9.916225175852293e-07, "loss": 0.894398307800293, "num_input_tokens_seen": 856086594, "step": 4270, "token_acc": 0.7580048741904789 }, { "epoch": 1.0044593611378412, "grad_norm": 4.446393040487181, "learning_rate": 9.874339601166472e-07, "loss": 0.9135477066040039, "num_input_tokens_seen": 858108198, "step": 4280, "token_acc": 0.7531681304263087 }, { "epoch": 1.0068063933156524, "grad_norm": 1.9208454193735196, "learning_rate": 9.832456231310188e-07, "loss": 0.9318746566772461, "num_input_tokens_seen": 860120775, "step": 4290, "token_acc": 0.747537408902533 }, { "epoch": 1.0091534254934635, "grad_norm": 1.5928331203409287, "learning_rate": 9.790575801166431e-07, "loss": 0.9145861625671386, "num_input_tokens_seen": 862143132, "step": 4300, "token_acc": 0.7532685063928213 }, { "epoch": 1.0091534254934635, "eval_loss": 0.9742150902748108, "eval_runtime": 32.578, "eval_samples_per_second": 30.696, "eval_steps_per_second": 1.289, "eval_token_acc": 0.7412802105214561, "num_input_tokens_seen": 862143132, "step": 4300 }, { "epoch": 1.0115004576712747, "grad_norm": 2.2199758281219837, "learning_rate": 9.748699045566625e-07, "loss": 0.9037257194519043, "num_input_tokens_seen": 864130884, "step": 4310, "token_acc": 0.7554067579469933 }, { "epoch": 1.013847489849086, "grad_norm": 2.5403224399288926, "learning_rate": 9.706826699277717e-07, "loss": 0.8928478240966797, "num_input_tokens_seen": 866146368, "step": 4320, "token_acc": 0.7571011279244853 }, { "epoch": 1.016194522026897, "grad_norm": 1.6880663111795373, "learning_rate": 9.664959496989284e-07, "loss": 0.8799491882324219, "num_input_tokens_seen": 868132068, "step": 4330, "token_acc": 0.7608739162744612 }, { "epoch": 1.018541554204708, "grad_norm": 1.9603998555475624, "learning_rate": 9.623098173300653e-07, "loss": 0.9061168670654297, "num_input_tokens_seen": 870168408, "step": 4340, "token_acc": 0.7558231445173181 }, { "epoch": 1.0208885863825192, "grad_norm": 2.052768381078441, "learning_rate": 9.581243462708005e-07, "loss": 0.891018009185791, "num_input_tokens_seen": 872101149, "step": 4350, "token_acc": 0.7599988872462524 }, { "epoch": 1.0232356185603304, "grad_norm": 1.514439023769519, "learning_rate": 9.539396099591476e-07, "loss": 0.9129314422607422, "num_input_tokens_seen": 874087335, "step": 4360, "token_acc": 0.7564216192481887 }, { "epoch": 1.0255826507381416, "grad_norm": 1.8673183879809325, "learning_rate": 9.497556818202304e-07, "loss": 0.9109779357910156, "num_input_tokens_seen": 876059952, "step": 4370, "token_acc": 0.7535195830085737 }, { "epoch": 1.0279296829159528, "grad_norm": 6.147575681746076, "learning_rate": 9.45572635264991e-07, "loss": 0.9013278961181641, "num_input_tokens_seen": 878124633, "step": 4380, "token_acc": 0.756046360357164 }, { "epoch": 1.030276715093764, "grad_norm": 3.3826066958331045, "learning_rate": 9.413905436889033e-07, "loss": 0.8935451507568359, "num_input_tokens_seen": 880109727, "step": 4390, "token_acc": 0.7567750980510352 }, { "epoch": 1.0326237472715751, "grad_norm": 2.791787214417096, "learning_rate": 9.372094804706866e-07, "loss": 0.9111810684204101, "num_input_tokens_seen": 882111045, "step": 4400, "token_acc": 0.7554985194799139 }, { "epoch": 1.0326237472715751, "eval_loss": 0.9730333685874939, "eval_runtime": 32.4657, "eval_samples_per_second": 30.802, "eval_steps_per_second": 1.294, "eval_token_acc": 0.7414048613836246, "num_input_tokens_seen": 882111045, "step": 4400 }, { "epoch": 1.0349707794493863, "grad_norm": 1.927568219024905, "learning_rate": 9.330295189710151e-07, "loss": 0.9100271224975586, "num_input_tokens_seen": 884198595, "step": 4410, "token_acc": 0.7540011119241447 }, { "epoch": 1.0373178116271975, "grad_norm": 2.5062754907489797, "learning_rate": 9.288507325312334e-07, "loss": 0.8903081893920899, "num_input_tokens_seen": 886152855, "step": 4420, "token_acc": 0.7574611181168558 }, { "epoch": 1.0396648438050087, "grad_norm": 1.9923532749108916, "learning_rate": 9.246731944720674e-07, "loss": 0.9105890274047852, "num_input_tokens_seen": 888141444, "step": 4430, "token_acc": 0.7539804724713297 }, { "epoch": 1.0420118759828196, "grad_norm": 1.8502910487817004, "learning_rate": 9.204969780923403e-07, "loss": 0.9087862968444824, "num_input_tokens_seen": 890115771, "step": 4440, "token_acc": 0.7559308727674652 }, { "epoch": 1.0443589081606308, "grad_norm": 5.223223230980478, "learning_rate": 9.163221566676847e-07, "loss": 0.9071809768676757, "num_input_tokens_seen": 892098426, "step": 4450, "token_acc": 0.7547434701771973 }, { "epoch": 1.046705940338442, "grad_norm": 1.5951294272531664, "learning_rate": 9.121488034492568e-07, "loss": 0.9115602493286132, "num_input_tokens_seen": 894150594, "step": 4460, "token_acc": 0.7560878381891606 }, { "epoch": 1.0490529725162532, "grad_norm": 24.227203178087926, "learning_rate": 9.079769916624529e-07, "loss": 0.8929647445678711, "num_input_tokens_seen": 896182068, "step": 4470, "token_acc": 0.7569376280966494 }, { "epoch": 1.0514000046940644, "grad_norm": 4.446148288911931, "learning_rate": 9.038067945056227e-07, "loss": 0.8845357894897461, "num_input_tokens_seen": 898144740, "step": 4480, "token_acc": 0.7596217335121099 }, { "epoch": 1.0537470368718755, "grad_norm": 2.33113822520666, "learning_rate": 8.996382851487849e-07, "loss": 0.9204854011535645, "num_input_tokens_seen": 900153033, "step": 4490, "token_acc": 0.7531009457228544 }, { "epoch": 1.0560940690496867, "grad_norm": 1.6705258835681585, "learning_rate": 8.954715367323466e-07, "loss": 0.9108184814453125, "num_input_tokens_seen": 902159874, "step": 4500, "token_acc": 0.7534851198704926 }, { "epoch": 1.0560940690496867, "eval_loss": 0.9722611308097839, "eval_runtime": 32.6343, "eval_samples_per_second": 30.643, "eval_steps_per_second": 1.287, "eval_token_acc": 0.7414879619584035, "num_input_tokens_seen": 902159874, "step": 4500 }, { "epoch": 1.058441101227498, "grad_norm": 1.814968079519632, "learning_rate": 8.91306622365815e-07, "loss": 0.9042104721069336, "num_input_tokens_seen": 904127259, "step": 4510, "token_acc": 0.7549473429720114 }, { "epoch": 1.060788133405309, "grad_norm": 1.9598731265622114, "learning_rate": 8.871436151265182e-07, "loss": 0.9021028518676758, "num_input_tokens_seen": 906131709, "step": 4520, "token_acc": 0.7555155495065009 }, { "epoch": 1.06313516558312, "grad_norm": 3.5546689619235106, "learning_rate": 8.829825880583226e-07, "loss": 0.8736377716064453, "num_input_tokens_seen": 908144946, "step": 4530, "token_acc": 0.7615734862488263 }, { "epoch": 1.0654821977609312, "grad_norm": 3.1846241923818295, "learning_rate": 8.788236141703497e-07, "loss": 0.9034311294555664, "num_input_tokens_seen": 910148658, "step": 4540, "token_acc": 0.7564678744009387 }, { "epoch": 1.0678292299387424, "grad_norm": 2.027265382942688, "learning_rate": 8.746667664356955e-07, "loss": 0.9266244888305664, "num_input_tokens_seen": 912148779, "step": 4550, "token_acc": 0.7503857571491999 }, { "epoch": 1.0701762621165536, "grad_norm": 1.7499276338815972, "learning_rate": 8.705121177901531e-07, "loss": 0.900362205505371, "num_input_tokens_seen": 914157060, "step": 4560, "token_acc": 0.757182167972395 }, { "epoch": 1.0725232942943648, "grad_norm": 2.8471968306459092, "learning_rate": 8.663597411309278e-07, "loss": 0.8963720321655273, "num_input_tokens_seen": 916145403, "step": 4570, "token_acc": 0.7560617462222132 }, { "epoch": 1.074870326472176, "grad_norm": 1.6540494435074347, "learning_rate": 8.62209709315362e-07, "loss": 0.9004743576049805, "num_input_tokens_seen": 918115113, "step": 4580, "token_acc": 0.7545025247249607 }, { "epoch": 1.0772173586499871, "grad_norm": 2.057030263327695, "learning_rate": 8.580620951596556e-07, "loss": 0.9495843887329102, "num_input_tokens_seen": 920159124, "step": 4590, "token_acc": 0.7448036906164115 }, { "epoch": 1.0795643908277983, "grad_norm": 1.7066770272878358, "learning_rate": 8.539169714375885e-07, "loss": 0.9105659484863281, "num_input_tokens_seen": 922121547, "step": 4600, "token_acc": 0.7536738054675078 }, { "epoch": 1.0795643908277983, "eval_loss": 0.9716529250144958, "eval_runtime": 32.5395, "eval_samples_per_second": 30.732, "eval_steps_per_second": 1.291, "eval_token_acc": 0.7416818632995544, "num_input_tokens_seen": 922121547, "step": 4600 }, { "epoch": 1.0819114230056095, "grad_norm": 1.9597668178542205, "learning_rate": 8.497744108792429e-07, "loss": 0.8963167190551757, "num_input_tokens_seen": 924093546, "step": 4610, "token_acc": 0.7577693693987556 }, { "epoch": 1.0842584551834205, "grad_norm": 1.477104530901047, "learning_rate": 8.456344861697287e-07, "loss": 0.9177652359008789, "num_input_tokens_seen": 926103639, "step": 4620, "token_acc": 0.7516901953627176 }, { "epoch": 1.0866054873612316, "grad_norm": 1.8830008370086135, "learning_rate": 8.414972699479075e-07, "loss": 0.9002264022827149, "num_input_tokens_seen": 928135683, "step": 4630, "token_acc": 0.7559382042427807 }, { "epoch": 1.0889525195390428, "grad_norm": 3.016423460140028, "learning_rate": 8.373628348051163e-07, "loss": 0.8956707000732422, "num_input_tokens_seen": 930127536, "step": 4640, "token_acc": 0.7571149500895269 }, { "epoch": 1.091299551716854, "grad_norm": 1.57022279289949, "learning_rate": 8.332312532838978e-07, "loss": 0.9269239425659179, "num_input_tokens_seen": 932125299, "step": 4650, "token_acc": 0.7517471473920727 }, { "epoch": 1.0936465838946652, "grad_norm": 3.5134027190857435, "learning_rate": 8.291025978767234e-07, "loss": 0.9176504135131835, "num_input_tokens_seen": 934168311, "step": 4660, "token_acc": 0.7548118730939853 }, { "epoch": 1.0959936160724764, "grad_norm": 2.5211326313148623, "learning_rate": 8.249769410247238e-07, "loss": 0.9234855651855469, "num_input_tokens_seen": 936133608, "step": 4670, "token_acc": 0.7515400792838399 }, { "epoch": 1.0983406482502875, "grad_norm": 2.572125880008109, "learning_rate": 8.208543551164177e-07, "loss": 0.8986695289611817, "num_input_tokens_seen": 938147853, "step": 4680, "token_acc": 0.7556977694823225 }, { "epoch": 1.1006876804280987, "grad_norm": 2.988789824663344, "learning_rate": 8.167349124864404e-07, "loss": 0.9072399139404297, "num_input_tokens_seen": 940144569, "step": 4690, "token_acc": 0.7530836929897347 }, { "epoch": 1.10303471260591, "grad_norm": 1.6468695088048304, "learning_rate": 8.126186854142751e-07, "loss": 0.9020254135131835, "num_input_tokens_seen": 942165525, "step": 4700, "token_acc": 0.7548501978958501 }, { "epoch": 1.10303471260591, "eval_loss": 0.9701104164123535, "eval_runtime": 33.0994, "eval_samples_per_second": 30.212, "eval_steps_per_second": 1.269, "eval_token_acc": 0.7415941460261767, "num_input_tokens_seen": 942165525, "step": 4700 }, { "epoch": 1.105381744783721, "grad_norm": 1.6564712470148706, "learning_rate": 8.08505746122987e-07, "loss": 0.8915030479431152, "num_input_tokens_seen": 944177469, "step": 4710, "token_acc": 0.7565814201146365 }, { "epoch": 1.107728776961532, "grad_norm": 2.791755191613104, "learning_rate": 8.043961667779518e-07, "loss": 0.9122766494750977, "num_input_tokens_seen": 946234932, "step": 4720, "token_acc": 0.7535114631778791 }, { "epoch": 1.1100758091393432, "grad_norm": 1.6738087861309878, "learning_rate": 8.002900194855931e-07, "loss": 0.9000448226928711, "num_input_tokens_seen": 948228513, "step": 4730, "token_acc": 0.7559363093706895 }, { "epoch": 1.1124228413171544, "grad_norm": 1.5535937671654965, "learning_rate": 7.961873762921151e-07, "loss": 0.9070523262023926, "num_input_tokens_seen": 950332011, "step": 4740, "token_acc": 0.7553185494918014 }, { "epoch": 1.1147698734949656, "grad_norm": 2.301542689211403, "learning_rate": 7.920883091822408e-07, "loss": 0.90597505569458, "num_input_tokens_seen": 952319049, "step": 4750, "token_acc": 0.7548275049458286 }, { "epoch": 1.1171169056727768, "grad_norm": 1.7473797104994677, "learning_rate": 7.879928900779455e-07, "loss": 0.9030384063720703, "num_input_tokens_seen": 954299892, "step": 4760, "token_acc": 0.756532667257456 }, { "epoch": 1.119463937850588, "grad_norm": 2.558847573037429, "learning_rate": 7.839011908371979e-07, "loss": 0.9100503921508789, "num_input_tokens_seen": 956318847, "step": 4770, "token_acc": 0.7527636165796845 }, { "epoch": 1.1218109700283991, "grad_norm": 1.9894868553546619, "learning_rate": 7.798132832526985e-07, "loss": 0.8903913497924805, "num_input_tokens_seen": 958308174, "step": 4780, "token_acc": 0.7594328320061341 }, { "epoch": 1.1241580022062103, "grad_norm": 1.9090250979917347, "learning_rate": 7.757292390506189e-07, "loss": 0.9077445983886718, "num_input_tokens_seen": 960311976, "step": 4790, "token_acc": 0.7563037639640341 }, { "epoch": 1.1265050343840215, "grad_norm": 1.5195604142033567, "learning_rate": 7.716491298893441e-07, "loss": 0.9030027389526367, "num_input_tokens_seen": 962312673, "step": 4800, "token_acc": 0.7546611261686987 }, { "epoch": 1.1265050343840215, "eval_loss": 0.9690244197845459, "eval_runtime": 32.6363, "eval_samples_per_second": 30.641, "eval_steps_per_second": 1.287, "eval_token_acc": 0.7421065995706471, "num_input_tokens_seen": 962312673, "step": 4800 }, { "epoch": 1.1288520665618327, "grad_norm": 7.06114138514342, "learning_rate": 7.675730273582159e-07, "loss": 0.9238859176635742, "num_input_tokens_seen": 964266690, "step": 4810, "token_acc": 0.7510988303005139 }, { "epoch": 1.1311990987396436, "grad_norm": 1.9887377640273287, "learning_rate": 7.635010029762755e-07, "loss": 0.893895149230957, "num_input_tokens_seen": 966243534, "step": 4820, "token_acc": 0.7578514127725531 }, { "epoch": 1.1335461309174548, "grad_norm": 2.8072244352137545, "learning_rate": 7.594331281910081e-07, "loss": 0.8709514617919922, "num_input_tokens_seen": 968205627, "step": 4830, "token_acc": 0.7630826790971541 }, { "epoch": 1.135893163095266, "grad_norm": 1.5697632247100872, "learning_rate": 7.553694743770927e-07, "loss": 0.8988607406616211, "num_input_tokens_seen": 970177137, "step": 4840, "token_acc": 0.7561233380663482 }, { "epoch": 1.1382401952730772, "grad_norm": 2.446099829583827, "learning_rate": 7.513101128351453e-07, "loss": 0.9138158798217774, "num_input_tokens_seen": 972139821, "step": 4850, "token_acc": 0.7539762326169406 }, { "epoch": 1.1405872274508884, "grad_norm": 2.2189495017577103, "learning_rate": 7.472551147904707e-07, "loss": 0.9274373054504395, "num_input_tokens_seen": 974155848, "step": 4860, "token_acc": 0.750778398745103 }, { "epoch": 1.1429342596286995, "grad_norm": 1.4873269538334397, "learning_rate": 7.432045513918122e-07, "loss": 0.8865886688232422, "num_input_tokens_seen": 976121469, "step": 4870, "token_acc": 0.7581827865316892 }, { "epoch": 1.1452812918065107, "grad_norm": 1.7178629684971727, "learning_rate": 7.391584937101033e-07, "loss": 0.9193226814270019, "num_input_tokens_seen": 978125502, "step": 4880, "token_acc": 0.7524842758549445 }, { "epoch": 1.147628323984322, "grad_norm": 1.7659656442538727, "learning_rate": 7.351170127372191e-07, "loss": 0.8870782852172852, "num_input_tokens_seen": 980151348, "step": 4890, "token_acc": 0.7591273127875505 }, { "epoch": 1.149975356162133, "grad_norm": 2.512190986249406, "learning_rate": 7.310801793847343e-07, "loss": 0.9009071350097656, "num_input_tokens_seen": 982116819, "step": 4900, "token_acc": 0.7555736532655332 }, { "epoch": 1.149975356162133, "eval_loss": 0.967960000038147, "eval_runtime": 33.0751, "eval_samples_per_second": 30.234, "eval_steps_per_second": 1.27, "eval_token_acc": 0.7421296830636412, "num_input_tokens_seen": 982116819, "step": 4900 }, { "epoch": 1.152322388339944, "grad_norm": 1.6304789397632498, "learning_rate": 7.270480644826749e-07, "loss": 0.9345785140991211, "num_input_tokens_seen": 984113586, "step": 4910, "token_acc": 0.7481138414862325 }, { "epoch": 1.1546694205177552, "grad_norm": 1.6773663128682315, "learning_rate": 7.230207387782776e-07, "loss": 0.9058225631713868, "num_input_tokens_seen": 986134590, "step": 4920, "token_acc": 0.7572044267504292 }, { "epoch": 1.1570164526955664, "grad_norm": 2.467806440031501, "learning_rate": 7.18998272934749e-07, "loss": 0.905792236328125, "num_input_tokens_seen": 988128006, "step": 4930, "token_acc": 0.7552128911554362 }, { "epoch": 1.1593634848733776, "grad_norm": 2.3456217936104613, "learning_rate": 7.149807375300238e-07, "loss": 0.8924792289733887, "num_input_tokens_seen": 990097689, "step": 4940, "token_acc": 0.7572508060847032 }, { "epoch": 1.1617105170511888, "grad_norm": 2.059131762842591, "learning_rate": 7.109682030555282e-07, "loss": 0.8982337951660156, "num_input_tokens_seen": 992129379, "step": 4950, "token_acc": 0.7551930966690015 }, { "epoch": 1.164057549229, "grad_norm": 2.9573480896222772, "learning_rate": 7.069607399149426e-07, "loss": 0.8968988418579101, "num_input_tokens_seen": 994140366, "step": 4960, "token_acc": 0.7568408887934138 }, { "epoch": 1.1664045814068111, "grad_norm": 1.7230957488152536, "learning_rate": 7.029584184229652e-07, "loss": 0.909503173828125, "num_input_tokens_seen": 996159930, "step": 4970, "token_acc": 0.7549473717210192 }, { "epoch": 1.1687516135846223, "grad_norm": 1.7012209659002009, "learning_rate": 6.989613088040795e-07, "loss": 0.8788484573364258, "num_input_tokens_seen": 998200734, "step": 4980, "token_acc": 0.7586579539038453 }, { "epoch": 1.1710986457624335, "grad_norm": 1.592891016055058, "learning_rate": 6.949694811913225e-07, "loss": 0.9113107681274414, "num_input_tokens_seen": 1000131159, "step": 4990, "token_acc": 0.7557149987259054 }, { "epoch": 1.1734456779402445, "grad_norm": 5.935924197471871, "learning_rate": 6.909830056250526e-07, "loss": 0.900279426574707, "num_input_tokens_seen": 1002152949, "step": 5000, "token_acc": 0.7555415584180373 }, { "epoch": 1.1734456779402445, "eval_loss": 0.9671830534934998, "eval_runtime": 33.371, "eval_samples_per_second": 29.966, "eval_steps_per_second": 1.259, "eval_token_acc": 0.7425636527319314, "num_input_tokens_seen": 1002152949, "step": 5000 }, { "epoch": 1.1757927101180556, "grad_norm": 2.200894548140831, "learning_rate": 6.870019520517217e-07, "loss": 0.8960202217102051, "num_input_tokens_seen": 1004157984, "step": 5010, "token_acc": 0.7569971090628078 }, { "epoch": 1.1781397422958668, "grad_norm": 1.566572723585561, "learning_rate": 6.830263903226482e-07, "loss": 0.9069774627685547, "num_input_tokens_seen": 1006144677, "step": 5020, "token_acc": 0.7552951138157661 }, { "epoch": 1.180486774473678, "grad_norm": 2.012794050429991, "learning_rate": 6.790563901927906e-07, "loss": 0.903378677368164, "num_input_tokens_seen": 1008183480, "step": 5030, "token_acc": 0.7542571237096386 }, { "epoch": 1.1828338066514892, "grad_norm": 2.6190444654182663, "learning_rate": 6.750920213195237e-07, "loss": 0.9192432403564453, "num_input_tokens_seen": 1010200815, "step": 5040, "token_acc": 0.752674829722257 }, { "epoch": 1.1851808388293004, "grad_norm": 1.876294751139499, "learning_rate": 6.711333532614167e-07, "loss": 0.8876149177551269, "num_input_tokens_seen": 1012244334, "step": 5050, "token_acc": 0.7581334816982072 }, { "epoch": 1.1875278710071115, "grad_norm": 3.023714292115771, "learning_rate": 6.671804554770134e-07, "loss": 0.9129764556884765, "num_input_tokens_seen": 1014307173, "step": 5060, "token_acc": 0.7553209579424762 }, { "epoch": 1.1898749031849227, "grad_norm": 1.9132860678026469, "learning_rate": 6.63233397323612e-07, "loss": 0.9299371719360352, "num_input_tokens_seen": 1016348544, "step": 5070, "token_acc": 0.7510845945047212 }, { "epoch": 1.192221935362734, "grad_norm": 1.7646493320200434, "learning_rate": 6.592922480560483e-07, "loss": 0.8976167678833008, "num_input_tokens_seen": 1018332171, "step": 5080, "token_acc": 0.7562631418499035 }, { "epoch": 1.1945689675405449, "grad_norm": 1.6538220495495426, "learning_rate": 6.55357076825483e-07, "loss": 0.9083082199096679, "num_input_tokens_seen": 1020317589, "step": 5090, "token_acc": 0.7535232253620915 }, { "epoch": 1.196915999718356, "grad_norm": 1.8373787795166967, "learning_rate": 6.51427952678185e-07, "loss": 0.897801399230957, "num_input_tokens_seen": 1022291424, "step": 5100, "token_acc": 0.7568812436238018 }, { "epoch": 1.196915999718356, "eval_loss": 0.965643048286438, "eval_runtime": 32.457, "eval_samples_per_second": 30.81, "eval_steps_per_second": 1.294, "eval_token_acc": 0.7426790701969022, "num_input_tokens_seen": 1022291424, "step": 5100 }, { "epoch": 1.1992630318961672, "grad_norm": 1.6643922341831798, "learning_rate": 6.475049445543214e-07, "loss": 0.8832623481750488, "num_input_tokens_seen": 1024326642, "step": 5110, "token_acc": 0.7609418407772097 }, { "epoch": 1.2016100640739784, "grad_norm": 2.8760528519429527, "learning_rate": 6.435881212867493e-07, "loss": 0.8896665573120117, "num_input_tokens_seen": 1026358665, "step": 5120, "token_acc": 0.7582770940849544 }, { "epoch": 1.2039570962517896, "grad_norm": 2.002315720555266, "learning_rate": 6.396775515998054e-07, "loss": 0.9143696784973144, "num_input_tokens_seen": 1028363571, "step": 5130, "token_acc": 0.7524985799614379 }, { "epoch": 1.2063041284296008, "grad_norm": 2.371576045666034, "learning_rate": 6.357733041081017e-07, "loss": 0.9304786682128906, "num_input_tokens_seen": 1030342941, "step": 5140, "token_acc": 0.7486818472638695 }, { "epoch": 1.208651160607412, "grad_norm": 2.346943055260075, "learning_rate": 6.31875447315322e-07, "loss": 0.9241456031799317, "num_input_tokens_seen": 1032378225, "step": 5150, "token_acc": 0.7510933676127989 }, { "epoch": 1.2109981927852231, "grad_norm": 2.231488980986392, "learning_rate": 6.279840496130188e-07, "loss": 0.9039559364318848, "num_input_tokens_seen": 1034346864, "step": 5160, "token_acc": 0.7524411349410404 }, { "epoch": 1.2133452249630343, "grad_norm": 1.9646213179831136, "learning_rate": 6.240991792794133e-07, "loss": 0.9074276924133301, "num_input_tokens_seen": 1036368729, "step": 5170, "token_acc": 0.7546195549754318 }, { "epoch": 1.2156922571408453, "grad_norm": 1.722457316805155, "learning_rate": 6.202209044781989e-07, "loss": 0.8936328887939453, "num_input_tokens_seen": 1038356424, "step": 5180, "token_acc": 0.7567584358948151 }, { "epoch": 1.2180392893186567, "grad_norm": 3.480235780891435, "learning_rate": 6.163492932573438e-07, "loss": 0.8924088478088379, "num_input_tokens_seen": 1040404614, "step": 5190, "token_acc": 0.759963029202667 }, { "epoch": 1.2203863214964676, "grad_norm": 3.7980987371120305, "learning_rate": 6.124844135478971e-07, "loss": 0.9037540435791016, "num_input_tokens_seen": 1042409814, "step": 5200, "token_acc": 0.7544269749931005 }, { "epoch": 1.2203863214964676, "eval_loss": 0.9651933908462524, "eval_runtime": 32.4721, "eval_samples_per_second": 30.796, "eval_steps_per_second": 1.293, "eval_token_acc": 0.7432146072343667, "num_input_tokens_seen": 1042409814, "step": 5200 }, { "epoch": 1.2227333536742788, "grad_norm": 2.0301844609252324, "learning_rate": 6.086263331627975e-07, "loss": 0.8960711479187011, "num_input_tokens_seen": 1044474747, "step": 5210, "token_acc": 0.7566766133085695 }, { "epoch": 1.22508038585209, "grad_norm": 2.006861134477907, "learning_rate": 6.047751197956838e-07, "loss": 0.8874652862548829, "num_input_tokens_seen": 1046542701, "step": 5220, "token_acc": 0.7577207817130738 }, { "epoch": 1.2274274180299012, "grad_norm": 1.64084154179337, "learning_rate": 6.009308410197047e-07, "loss": 0.9375964164733886, "num_input_tokens_seen": 1048531923, "step": 5230, "token_acc": 0.7477447658832623 }, { "epoch": 1.2297744502077124, "grad_norm": 2.376806108677906, "learning_rate": 5.970935642863374e-07, "loss": 0.9305553436279297, "num_input_tokens_seen": 1050497172, "step": 5240, "token_acc": 0.7477491309741687 }, { "epoch": 1.2321214823855235, "grad_norm": 2.0017133938943603, "learning_rate": 5.932633569241999e-07, "loss": 0.9117889404296875, "num_input_tokens_seen": 1052489067, "step": 5250, "token_acc": 0.7528568241041047 }, { "epoch": 1.2344685145633347, "grad_norm": 1.676786348660199, "learning_rate": 5.89440286137872e-07, "loss": 0.9003104209899903, "num_input_tokens_seen": 1054479834, "step": 5260, "token_acc": 0.7555148409000024 }, { "epoch": 1.236815546741146, "grad_norm": 3.0440164850905087, "learning_rate": 5.856244190067159e-07, "loss": 0.9047473907470703, "num_input_tokens_seen": 1056426330, "step": 5270, "token_acc": 0.755049574664931 }, { "epoch": 1.239162578918957, "grad_norm": 2.869133561984615, "learning_rate": 5.818158224836987e-07, "loss": 0.9154601097106934, "num_input_tokens_seen": 1058453490, "step": 5280, "token_acc": 0.7520037800567009 }, { "epoch": 1.241509611096768, "grad_norm": 3.801710317044165, "learning_rate": 5.780145633942173e-07, "loss": 0.9164340972900391, "num_input_tokens_seen": 1060486977, "step": 5290, "token_acc": 0.752695566601707 }, { "epoch": 1.2438566432745792, "grad_norm": 2.741349679065458, "learning_rate": 5.742207084349273e-07, "loss": 0.871244239807129, "num_input_tokens_seen": 1062507609, "step": 5300, "token_acc": 0.7623417495900512 }, { "epoch": 1.2438566432745792, "eval_loss": 0.9639586210250854, "eval_runtime": 32.2213, "eval_samples_per_second": 31.035, "eval_steps_per_second": 1.303, "eval_token_acc": 0.7430853396735994, "num_input_tokens_seen": 1062507609, "step": 5300 }, { "epoch": 1.2462036754523904, "grad_norm": 2.0632482933314273, "learning_rate": 5.704343241725719e-07, "loss": 0.902606201171875, "num_input_tokens_seen": 1064565387, "step": 5310, "token_acc": 0.7573666940890565 }, { "epoch": 1.2485507076302016, "grad_norm": 2.5206259888398805, "learning_rate": 5.666554770428128e-07, "loss": 0.8999618530273438, "num_input_tokens_seen": 1066547697, "step": 5320, "token_acc": 0.7568080644124454 }, { "epoch": 1.2508977398080128, "grad_norm": 2.5949843975238664, "learning_rate": 5.628842333490673e-07, "loss": 0.9164423942565918, "num_input_tokens_seen": 1068581145, "step": 5330, "token_acc": 0.7550268878909339 }, { "epoch": 1.253244771985824, "grad_norm": 4.036566885568054, "learning_rate": 5.591206592613416e-07, "loss": 0.905246353149414, "num_input_tokens_seen": 1070601372, "step": 5340, "token_acc": 0.7552413610147676 }, { "epoch": 1.2555918041636351, "grad_norm": 5.474286064221549, "learning_rate": 5.553648208150728e-07, "loss": 0.8880559921264648, "num_input_tokens_seen": 1072560906, "step": 5350, "token_acc": 0.7592060617200068 }, { "epoch": 1.2579388363414463, "grad_norm": 1.7453040114014564, "learning_rate": 5.51616783909968e-07, "loss": 0.9003293991088868, "num_input_tokens_seen": 1074501144, "step": 5360, "token_acc": 0.7574462673279918 }, { "epoch": 1.2602858685192575, "grad_norm": 2.437893460638386, "learning_rate": 5.478766143088491e-07, "loss": 0.8865642547607422, "num_input_tokens_seen": 1076535018, "step": 5370, "token_acc": 0.7606810169616077 }, { "epoch": 1.2626329006970685, "grad_norm": 1.8609894370837823, "learning_rate": 5.441443776365002e-07, "loss": 0.8910144805908203, "num_input_tokens_seen": 1078579935, "step": 5380, "token_acc": 0.7576510815314375 }, { "epoch": 1.2649799328748796, "grad_norm": 2.6436944735193184, "learning_rate": 5.404201393785122e-07, "loss": 0.8772344589233398, "num_input_tokens_seen": 1080564321, "step": 5390, "token_acc": 0.7608925444457297 }, { "epoch": 1.2673269650526908, "grad_norm": 2.992758801643484, "learning_rate": 5.367039648801385e-07, "loss": 0.9159189224243164, "num_input_tokens_seen": 1082533953, "step": 5400, "token_acc": 0.7533061633594679 }, { "epoch": 1.2673269650526908, "eval_loss": 0.9630009531974792, "eval_runtime": 32.5066, "eval_samples_per_second": 30.763, "eval_steps_per_second": 1.292, "eval_token_acc": 0.7429468387156345, "num_input_tokens_seen": 1082533953, "step": 5400 }, { "epoch": 1.269673997230502, "grad_norm": 2.692503141737527, "learning_rate": 5.329959193451448e-07, "loss": 0.8941567420959473, "num_input_tokens_seen": 1084574751, "step": 5410, "token_acc": 0.7571006112607204 }, { "epoch": 1.2720210294083132, "grad_norm": 1.5669484406824739, "learning_rate": 5.292960678346674e-07, "loss": 0.8758008003234863, "num_input_tokens_seen": 1086604491, "step": 5420, "token_acc": 0.7609121373438931 }, { "epoch": 1.2743680615861244, "grad_norm": 2.7196923900884538, "learning_rate": 5.256044752660709e-07, "loss": 0.8903736114501953, "num_input_tokens_seen": 1088619414, "step": 5430, "token_acc": 0.7592087326109695 }, { "epoch": 1.2767150937639355, "grad_norm": 3.3252231876281044, "learning_rate": 5.219212064118078e-07, "loss": 0.8977795600891113, "num_input_tokens_seen": 1090588407, "step": 5440, "token_acc": 0.7549231473500579 }, { "epoch": 1.2790621259417467, "grad_norm": 2.7540341423324115, "learning_rate": 5.182463258982846e-07, "loss": 0.9006612777709961, "num_input_tokens_seen": 1092638625, "step": 5450, "token_acc": 0.7552658524098589 }, { "epoch": 1.281409158119558, "grad_norm": 3.5072503422513153, "learning_rate": 5.14579898204726e-07, "loss": 0.907337760925293, "num_input_tokens_seen": 1094630577, "step": 5460, "token_acc": 0.7542059011906609 }, { "epoch": 1.2837561902973689, "grad_norm": 5.240311266290964, "learning_rate": 5.109219876620441e-07, "loss": 0.8758956909179687, "num_input_tokens_seen": 1096660965, "step": 5470, "token_acc": 0.7625046517718082 }, { "epoch": 1.28610322247518, "grad_norm": 3.910915359433382, "learning_rate": 5.072726584517085e-07, "loss": 0.8722602844238281, "num_input_tokens_seen": 1098640854, "step": 5480, "token_acc": 0.7603257317050821 }, { "epoch": 1.2884502546529912, "grad_norm": 1.6770275314193752, "learning_rate": 5.036319746046231e-07, "loss": 0.8983705520629883, "num_input_tokens_seen": 1100637150, "step": 5490, "token_acc": 0.7550046700338503 }, { "epoch": 1.2907972868308024, "grad_norm": 2.5881524894297745, "learning_rate": 5.000000000000002e-07, "loss": 0.894923210144043, "num_input_tokens_seen": 1102652097, "step": 5500, "token_acc": 0.7564139373070671 }, { "epoch": 1.2907972868308024, "eval_loss": 0.9622647762298584, "eval_runtime": 32.3358, "eval_samples_per_second": 30.925, "eval_steps_per_second": 1.299, "eval_token_acc": 0.7433207913021398, "num_input_tokens_seen": 1102652097, "step": 5500 }, { "epoch": 1.2931443190086136, "grad_norm": 2.4458432348948125, "learning_rate": 4.963767983642391e-07, "loss": 0.9156219482421875, "num_input_tokens_seen": 1104675948, "step": 5510, "token_acc": 0.7537255650881494 }, { "epoch": 1.2954913511864248, "grad_norm": 1.6909965435703207, "learning_rate": 4.927624332698109e-07, "loss": 0.8871401786804199, "num_input_tokens_seen": 1106680473, "step": 5520, "token_acc": 0.7581608722152928 }, { "epoch": 1.297838383364236, "grad_norm": 2.752441639954046, "learning_rate": 4.891569681341402e-07, "loss": 0.8774595260620117, "num_input_tokens_seen": 1108675587, "step": 5530, "token_acc": 0.7608597953994441 }, { "epoch": 1.3001854155420471, "grad_norm": 4.312103259940411, "learning_rate": 4.855604662184934e-07, "loss": 0.94571533203125, "num_input_tokens_seen": 1110676452, "step": 5540, "token_acc": 0.7557507607034466 }, { "epoch": 1.3025324477198583, "grad_norm": 15.655690028463368, "learning_rate": 4.819729906268699e-07, "loss": 0.906065559387207, "num_input_tokens_seen": 1112710338, "step": 5550, "token_acc": 0.7553128935752625 }, { "epoch": 1.3048794798976693, "grad_norm": 4.05493729865088, "learning_rate": 4.783946043048922e-07, "loss": 0.8648593902587891, "num_input_tokens_seen": 1114786149, "step": 5560, "token_acc": 0.763232807351506 }, { "epoch": 1.3072265120754807, "grad_norm": 18.132742646186717, "learning_rate": 4.748253700387042e-07, "loss": 0.9057920455932618, "num_input_tokens_seen": 1116792414, "step": 5570, "token_acc": 0.7558468058389578 }, { "epoch": 1.3095735442532916, "grad_norm": 4.473293823942013, "learning_rate": 4.712653504538683e-07, "loss": 0.9168581008911133, "num_input_tokens_seen": 1118755668, "step": 5580, "token_acc": 0.7533578569509507 }, { "epoch": 1.3119205764311028, "grad_norm": 1.8718331058830788, "learning_rate": 4.677146080142663e-07, "loss": 0.8930509567260743, "num_input_tokens_seen": 1120786350, "step": 5590, "token_acc": 0.7578146339884224 }, { "epoch": 1.314267608608914, "grad_norm": 3.8006137217544853, "learning_rate": 4.641732050210031e-07, "loss": 0.8965305328369141, "num_input_tokens_seen": 1122830253, "step": 5600, "token_acc": 0.757193734996655 }, { "epoch": 1.314267608608914, "eval_loss": 0.9616973996162415, "eval_runtime": 32.7101, "eval_samples_per_second": 30.572, "eval_steps_per_second": 1.284, "eval_token_acc": 0.7441794972415225, "num_input_tokens_seen": 1122830253, "step": 5600 }, { "epoch": 1.3166146407867252, "grad_norm": 2.1310327327750103, "learning_rate": 4.6064120361131654e-07, "loss": 0.8685415267944336, "num_input_tokens_seen": 1124770431, "step": 5610, "token_acc": 0.7614362220849722 }, { "epoch": 1.3189616729645364, "grad_norm": 5.594205953222117, "learning_rate": 4.571186657574827e-07, "loss": 0.8749109268188476, "num_input_tokens_seen": 1126803909, "step": 5620, "token_acc": 0.7609511594419571 }, { "epoch": 1.3213087051423476, "grad_norm": 1.4907604209575505, "learning_rate": 4.5360565326573097e-07, "loss": 0.8923271179199219, "num_input_tokens_seen": 1128846693, "step": 5630, "token_acc": 0.7566236892264636 }, { "epoch": 1.3236557373201587, "grad_norm": 1.6487071255049761, "learning_rate": 4.5010222777516016e-07, "loss": 0.8908859252929687, "num_input_tokens_seen": 1130851539, "step": 5640, "token_acc": 0.7570941516923059 }, { "epoch": 1.3260027694979697, "grad_norm": 3.588061851379213, "learning_rate": 4.46608450756656e-07, "loss": 0.8966587066650391, "num_input_tokens_seen": 1132815081, "step": 5650, "token_acc": 0.7556865728413845 }, { "epoch": 1.328349801675781, "grad_norm": 1.8146830930493871, "learning_rate": 4.431243835118124e-07, "loss": 0.8989040374755859, "num_input_tokens_seen": 1134802443, "step": 5660, "token_acc": 0.7558611844953211 }, { "epoch": 1.330696833853592, "grad_norm": 1.549860770891768, "learning_rate": 4.3965008717185546e-07, "loss": 0.9029041290283203, "num_input_tokens_seen": 1136825982, "step": 5670, "token_acc": 0.7547953414140695 }, { "epoch": 1.3330438660314032, "grad_norm": 8.564808279417944, "learning_rate": 4.361856226965732e-07, "loss": 0.9094319343566895, "num_input_tokens_seen": 1138844418, "step": 5680, "token_acc": 0.7534089471178856 }, { "epoch": 1.3353908982092144, "grad_norm": 2.4122342846590117, "learning_rate": 4.327310508732437e-07, "loss": 0.9330079078674316, "num_input_tokens_seen": 1140865437, "step": 5690, "token_acc": 0.7480073371962428 }, { "epoch": 1.3377379303870256, "grad_norm": 2.222842201988777, "learning_rate": 4.292864323155684e-07, "loss": 0.9154201507568359, "num_input_tokens_seen": 1142840739, "step": 5700, "token_acc": 0.7531476710355994 }, { "epoch": 1.3377379303870256, "eval_loss": 0.9612286686897278, "eval_runtime": 32.3029, "eval_samples_per_second": 30.957, "eval_steps_per_second": 1.3, "eval_token_acc": 0.7439994459961682, "num_input_tokens_seen": 1142840739, "step": 5700 }, { "epoch": 1.3400849625648368, "grad_norm": 1.810955978874136, "learning_rate": 4.258518274626103e-07, "loss": 0.8730932235717773, "num_input_tokens_seen": 1144886610, "step": 5710, "token_acc": 0.763370671624448 }, { "epoch": 1.342431994742648, "grad_norm": 4.997305168302919, "learning_rate": 4.224272965777326e-07, "loss": 0.8956947326660156, "num_input_tokens_seen": 1146863130, "step": 5720, "token_acc": 0.756512774681123 }, { "epoch": 1.3447790269204591, "grad_norm": 1.7714657210450584, "learning_rate": 4.1901289974754017e-07, "loss": 0.9034318923950195, "num_input_tokens_seen": 1148825958, "step": 5730, "token_acc": 0.7528903974023187 }, { "epoch": 1.34712605909827, "grad_norm": 1.7970727143535203, "learning_rate": 4.15608696880828e-07, "loss": 0.9018034934997559, "num_input_tokens_seen": 1150869660, "step": 5740, "token_acc": 0.7552370910083663 }, { "epoch": 1.3494730912760815, "grad_norm": 2.3962942580845765, "learning_rate": 4.1221474770752696e-07, "loss": 0.8888204574584961, "num_input_tokens_seen": 1152904527, "step": 5750, "token_acc": 0.7579487303127656 }, { "epoch": 1.3518201234538925, "grad_norm": 4.2459307299089355, "learning_rate": 4.0883111177765793e-07, "loss": 0.882927131652832, "num_input_tokens_seen": 1154856621, "step": 5760, "token_acc": 0.760532270444878 }, { "epoch": 1.3541671556317036, "grad_norm": 8.805122520612176, "learning_rate": 4.05457848460287e-07, "loss": 0.8931197166442871, "num_input_tokens_seen": 1156841811, "step": 5770, "token_acc": 0.7581117296199616 }, { "epoch": 1.3565141878095148, "grad_norm": 1.8029745033128655, "learning_rate": 4.020950169424815e-07, "loss": 0.8755680084228515, "num_input_tokens_seen": 1158825375, "step": 5780, "token_acc": 0.7617876391236407 }, { "epoch": 1.358861219987326, "grad_norm": 2.3214932218170348, "learning_rate": 3.9874267622827326e-07, "loss": 0.8934176445007325, "num_input_tokens_seen": 1160840175, "step": 5790, "token_acc": 0.7589212683515132 }, { "epoch": 1.3612082521651372, "grad_norm": 2.3985162877965873, "learning_rate": 3.9540088513762516e-07, "loss": 0.8847217559814453, "num_input_tokens_seen": 1162829856, "step": 5800, "token_acc": 0.7612809344881545 }, { "epoch": 1.3612082521651372, "eval_loss": 0.9602800607681274, "eval_runtime": 32.5062, "eval_samples_per_second": 30.763, "eval_steps_per_second": 1.292, "eval_token_acc": 0.7438470949424066, "num_input_tokens_seen": 1162829856, "step": 5800 }, { "epoch": 1.3635552843429484, "grad_norm": 2.0010713169478738, "learning_rate": 3.9206970230539484e-07, "loss": 0.8922606468200683, "num_input_tokens_seen": 1164855291, "step": 5810, "token_acc": 0.7569838860463012 }, { "epoch": 1.3659023165207596, "grad_norm": 3.272109870466011, "learning_rate": 3.887491861803085e-07, "loss": 0.9000480651855469, "num_input_tokens_seen": 1166861097, "step": 5820, "token_acc": 0.7566891172207229 }, { "epoch": 1.3682493486985707, "grad_norm": 1.544077325900252, "learning_rate": 3.8543939502393553e-07, "loss": 0.8689347267150879, "num_input_tokens_seen": 1168887147, "step": 5830, "token_acc": 0.7627186945780682 }, { "epoch": 1.370596380876382, "grad_norm": 2.103440792541161, "learning_rate": 3.8214038690966577e-07, "loss": 0.8851211547851563, "num_input_tokens_seen": 1170981615, "step": 5840, "token_acc": 0.7597759262487763 }, { "epoch": 1.3729434130541929, "grad_norm": 1.7677876308306728, "learning_rate": 3.788522197216897e-07, "loss": 0.9024602890014648, "num_input_tokens_seen": 1172878617, "step": 5850, "token_acc": 0.7557560328803166 }, { "epoch": 1.375290445232004, "grad_norm": 1.9402726241839798, "learning_rate": 3.7557495115398443e-07, "loss": 0.9134780883789062, "num_input_tokens_seen": 1174893015, "step": 5860, "token_acc": 0.753564070544764 }, { "epoch": 1.3776374774098152, "grad_norm": 1.72330146825218, "learning_rate": 3.7230863870929963e-07, "loss": 0.8972689628601074, "num_input_tokens_seen": 1176936135, "step": 5870, "token_acc": 0.7560207487897523 }, { "epoch": 1.3799845095876264, "grad_norm": 1.8072698773269937, "learning_rate": 3.690533396981503e-07, "loss": 0.8984692573547364, "num_input_tokens_seen": 1178895693, "step": 5880, "token_acc": 0.756615972827414 }, { "epoch": 1.3823315417654376, "grad_norm": 2.0171801198061714, "learning_rate": 3.6580911123781056e-07, "loss": 0.8955293655395508, "num_input_tokens_seen": 1180888149, "step": 5890, "token_acc": 0.75720176277118 }, { "epoch": 1.3846785739432488, "grad_norm": 1.5526432676933917, "learning_rate": 3.625760102513102e-07, "loss": 0.8883472442626953, "num_input_tokens_seen": 1182949920, "step": 5900, "token_acc": 0.7599701073124605 }, { "epoch": 1.3846785739432488, "eval_loss": 0.9595866799354553, "eval_runtime": 32.1786, "eval_samples_per_second": 31.077, "eval_steps_per_second": 1.305, "eval_token_acc": 0.7442902980078946, "num_input_tokens_seen": 1182949920, "step": 5900 }, { "epoch": 1.38702560612106, "grad_norm": 7.97384438350482, "learning_rate": 3.593540934664383e-07, "loss": 0.889987564086914, "num_input_tokens_seen": 1184970120, "step": 5910, "token_acc": 0.758863473503418 }, { "epoch": 1.3893726382988711, "grad_norm": 1.537068043139113, "learning_rate": 3.561434174147463e-07, "loss": 0.911767578125, "num_input_tokens_seen": 1186953870, "step": 5920, "token_acc": 0.7535046522800585 }, { "epoch": 1.3917196704766823, "grad_norm": 2.792916091074644, "learning_rate": 3.5294403843055597e-07, "loss": 0.8944547653198243, "num_input_tokens_seen": 1188957102, "step": 5930, "token_acc": 0.7568861383047926 }, { "epoch": 1.3940667026544933, "grad_norm": 2.5956068773284557, "learning_rate": 3.497560126499709e-07, "loss": 0.8902932167053222, "num_input_tokens_seen": 1190999568, "step": 5940, "token_acc": 0.7563681534101937 }, { "epoch": 1.3964137348323047, "grad_norm": 1.606752628432743, "learning_rate": 3.465793960098945e-07, "loss": 0.8962507247924805, "num_input_tokens_seen": 1193049609, "step": 5950, "token_acc": 0.7568774963666619 }, { "epoch": 1.3987607670101156, "grad_norm": 3.201548177908894, "learning_rate": 3.434142442470437e-07, "loss": 0.8878293037414551, "num_input_tokens_seen": 1195126131, "step": 5960, "token_acc": 0.7593972961018481 }, { "epoch": 1.4011077991879268, "grad_norm": 2.0402971482769034, "learning_rate": 3.4026061289697396e-07, "loss": 0.8985117912292481, "num_input_tokens_seen": 1197179763, "step": 5970, "token_acc": 0.7568663489501413 }, { "epoch": 1.403454831365738, "grad_norm": 2.0743463496848085, "learning_rate": 3.371185572931048e-07, "loss": 0.9137758255004883, "num_input_tokens_seen": 1199156916, "step": 5980, "token_acc": 0.7521096549123137 }, { "epoch": 1.4058018635435492, "grad_norm": 1.908781885304316, "learning_rate": 3.3398813256574843e-07, "loss": 0.8940442085266114, "num_input_tokens_seen": 1201161525, "step": 5990, "token_acc": 0.7591090088367569 }, { "epoch": 1.4081488957213604, "grad_norm": 1.8574032864901908, "learning_rate": 3.308693936411421e-07, "loss": 0.8737678527832031, "num_input_tokens_seen": 1203195084, "step": 6000, "token_acc": 0.7614718846052603 }, { "epoch": 1.4081488957213604, "eval_loss": 0.9593000411987305, "eval_runtime": 32.4448, "eval_samples_per_second": 30.822, "eval_steps_per_second": 1.295, "eval_token_acc": 0.7439671291059763, "num_input_tokens_seen": 1203195084, "step": 6000 } ], "logging_steps": 10, "max_steps": 8000, "num_input_tokens_seen": 1203195084, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3956677446926336e+16, "train_batch_size": 3, "trial_name": null, "trial_params": null }