{ "best_metric": 1.86671543, "best_model_checkpoint": "/home/anubhab-pg/sm745052/swift/exp_output_paligemma/v1-20250508-175335/checkpoint-3500", "epoch": 3.0, "eval_steps": 500, "global_step": 4944, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006067961165048543, "grad_norm": 27.07073974609375, "learning_rate": 9.999998990554643e-05, "loss": 4.470662593841553, "memory(GiB)": 29.74, "step": 1, "token_acc": 0.2638888888888889, "train_speed(iter/s)": 0.176069 }, { "epoch": 0.003033980582524272, "grad_norm": 11.808771133422852, "learning_rate": 9.999974763886429e-05, "loss": 3.900416851043701, "memory(GiB)": 29.74, "step": 5, "token_acc": 0.2831050228310502, "train_speed(iter/s)": 0.398445 }, { "epoch": 0.006067961165048544, "grad_norm": 12.929791450500488, "learning_rate": 9.999899055800455e-05, "loss": 2.7944046020507813, "memory(GiB)": 29.74, "step": 10, "token_acc": 0.4377358490566038, "train_speed(iter/s)": 0.483776 }, { "epoch": 0.009101941747572815, "grad_norm": 8.702505111694336, "learning_rate": 9.99977287650631e-05, "loss": 2.662510871887207, "memory(GiB)": 38.25, "step": 15, "token_acc": 0.44649446494464945, "train_speed(iter/s)": 0.514334 }, { "epoch": 0.012135922330097087, "grad_norm": 8.632209777832031, "learning_rate": 9.999596227277707e-05, "loss": 2.5844635009765624, "memory(GiB)": 38.86, "step": 20, "token_acc": 0.4867549668874172, "train_speed(iter/s)": 0.516824 }, { "epoch": 0.01516990291262136, "grad_norm": 7.144984245300293, "learning_rate": 9.999369109897819e-05, "loss": 2.760052490234375, "memory(GiB)": 38.86, "step": 25, "token_acc": 0.42524916943521596, "train_speed(iter/s)": 0.531359 }, { "epoch": 0.01820388349514563, "grad_norm": 13.147393226623535, "learning_rate": 9.999091526659272e-05, "loss": 2.7114631652832033, "memory(GiB)": 38.86, "step": 30, "token_acc": 0.42613636363636365, "train_speed(iter/s)": 0.54308 }, { "epoch": 0.021237864077669904, "grad_norm": 10.967350959777832, "learning_rate": 9.998763480364113e-05, "loss": 2.8917694091796875, "memory(GiB)": 38.86, "step": 35, "token_acc": 0.3957703927492447, "train_speed(iter/s)": 0.55201 }, { "epoch": 0.024271844660194174, "grad_norm": 6.367667198181152, "learning_rate": 9.99838497432379e-05, "loss": 2.7323539733886717, "memory(GiB)": 38.86, "step": 40, "token_acc": 0.4843205574912892, "train_speed(iter/s)": 0.554469 }, { "epoch": 0.027305825242718445, "grad_norm": 7.910516262054443, "learning_rate": 9.997956012359109e-05, "loss": 2.541508674621582, "memory(GiB)": 38.86, "step": 45, "token_acc": 0.4327485380116959, "train_speed(iter/s)": 0.558152 }, { "epoch": 0.03033980582524272, "grad_norm": 6.039285182952881, "learning_rate": 9.997476598800203e-05, "loss": 2.543034553527832, "memory(GiB)": 38.86, "step": 50, "token_acc": 0.45058139534883723, "train_speed(iter/s)": 0.56043 }, { "epoch": 0.03337378640776699, "grad_norm": 8.210753440856934, "learning_rate": 9.99694673848649e-05, "loss": 2.3589075088500975, "memory(GiB)": 38.86, "step": 55, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 0.562231 }, { "epoch": 0.03640776699029126, "grad_norm": 9.309414863586426, "learning_rate": 9.996366436766611e-05, "loss": 2.3582067489624023, "memory(GiB)": 38.86, "step": 60, "token_acc": 0.4788273615635179, "train_speed(iter/s)": 0.567887 }, { "epoch": 0.03944174757281554, "grad_norm": 6.9387993812561035, "learning_rate": 9.995735699498394e-05, "loss": 2.5982736587524413, "memory(GiB)": 38.86, "step": 65, "token_acc": 0.436046511627907, "train_speed(iter/s)": 0.573648 }, { "epoch": 0.04247572815533981, "grad_norm": 7.010188579559326, "learning_rate": 9.995054533048777e-05, "loss": 2.508279228210449, "memory(GiB)": 38.86, "step": 70, "token_acc": 0.4281609195402299, "train_speed(iter/s)": 0.579165 }, { "epoch": 0.04550970873786408, "grad_norm": 8.828622817993164, "learning_rate": 9.994322944293763e-05, "loss": 2.588084411621094, "memory(GiB)": 38.86, "step": 75, "token_acc": 0.4558303886925795, "train_speed(iter/s)": 0.583997 }, { "epoch": 0.04854368932038835, "grad_norm": 8.112404823303223, "learning_rate": 9.993540940618334e-05, "loss": 2.316554832458496, "memory(GiB)": 38.86, "step": 80, "token_acc": 0.4552238805970149, "train_speed(iter/s)": 0.582967 }, { "epoch": 0.05157766990291262, "grad_norm": 8.621855735778809, "learning_rate": 9.992708529916379e-05, "loss": 2.366764259338379, "memory(GiB)": 38.86, "step": 85, "token_acc": 0.44025157232704404, "train_speed(iter/s)": 0.585803 }, { "epoch": 0.05461165048543689, "grad_norm": 8.71721076965332, "learning_rate": 9.991825720590626e-05, "loss": 2.346388244628906, "memory(GiB)": 38.86, "step": 90, "token_acc": 0.4457831325301205, "train_speed(iter/s)": 0.588259 }, { "epoch": 0.05764563106796117, "grad_norm": 6.412728786468506, "learning_rate": 9.990892521552546e-05, "loss": 2.4675243377685545, "memory(GiB)": 38.86, "step": 95, "token_acc": 0.4896755162241888, "train_speed(iter/s)": 0.587726 }, { "epoch": 0.06067961165048544, "grad_norm": 10.379164695739746, "learning_rate": 9.989908942222264e-05, "loss": 2.24587345123291, "memory(GiB)": 38.86, "step": 100, "token_acc": 0.5168067226890757, "train_speed(iter/s)": 0.588352 }, { "epoch": 0.06371359223300971, "grad_norm": 6.4799370765686035, "learning_rate": 9.988874992528468e-05, "loss": 2.652623748779297, "memory(GiB)": 38.86, "step": 105, "token_acc": 0.4127906976744186, "train_speed(iter/s)": 0.589898 }, { "epoch": 0.06674757281553398, "grad_norm": 6.027382850646973, "learning_rate": 9.987790682908306e-05, "loss": 2.2998146057128905, "memory(GiB)": 38.86, "step": 110, "token_acc": 0.4807121661721068, "train_speed(iter/s)": 0.589495 }, { "epoch": 0.06978155339805825, "grad_norm": 6.517679214477539, "learning_rate": 9.986656024307286e-05, "loss": 2.5867145538330076, "memory(GiB)": 38.86, "step": 115, "token_acc": 0.455026455026455, "train_speed(iter/s)": 0.589597 }, { "epoch": 0.07281553398058252, "grad_norm": 7.561508655548096, "learning_rate": 9.985471028179154e-05, "loss": 2.4384201049804686, "memory(GiB)": 38.86, "step": 120, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 0.589834 }, { "epoch": 0.07584951456310679, "grad_norm": 7.263455867767334, "learning_rate": 9.984235706485789e-05, "loss": 2.373090362548828, "memory(GiB)": 38.86, "step": 125, "token_acc": 0.4657039711191336, "train_speed(iter/s)": 0.590074 }, { "epoch": 0.07888349514563107, "grad_norm": 6.084628582000732, "learning_rate": 9.98295007169708e-05, "loss": 2.5036380767822264, "memory(GiB)": 38.86, "step": 130, "token_acc": 0.47941176470588237, "train_speed(iter/s)": 0.592166 }, { "epoch": 0.08191747572815535, "grad_norm": 8.00130844116211, "learning_rate": 9.981614136790796e-05, "loss": 2.153367614746094, "memory(GiB)": 39.22, "step": 135, "token_acc": 0.5410764872521246, "train_speed(iter/s)": 0.591399 }, { "epoch": 0.08495145631067962, "grad_norm": 6.030653953552246, "learning_rate": 9.980227915252459e-05, "loss": 2.2291128158569338, "memory(GiB)": 39.22, "step": 140, "token_acc": 0.4910394265232975, "train_speed(iter/s)": 0.591432 }, { "epoch": 0.08798543689320389, "grad_norm": 6.891486167907715, "learning_rate": 9.978791421075206e-05, "loss": 2.5422630310058594, "memory(GiB)": 39.22, "step": 145, "token_acc": 0.4812286689419795, "train_speed(iter/s)": 0.588377 }, { "epoch": 0.09101941747572816, "grad_norm": 7.838645935058594, "learning_rate": 9.97730466875965e-05, "loss": 2.476850128173828, "memory(GiB)": 39.22, "step": 150, "token_acc": 0.4542372881355932, "train_speed(iter/s)": 0.590615 }, { "epoch": 0.09405339805825243, "grad_norm": 7.769046306610107, "learning_rate": 9.975767673313734e-05, "loss": 2.592838096618652, "memory(GiB)": 39.22, "step": 155, "token_acc": 0.4678362573099415, "train_speed(iter/s)": 0.591754 }, { "epoch": 0.0970873786407767, "grad_norm": 5.977383136749268, "learning_rate": 9.974180450252569e-05, "loss": 2.345209503173828, "memory(GiB)": 39.22, "step": 160, "token_acc": 0.4849624060150376, "train_speed(iter/s)": 0.592414 }, { "epoch": 0.10012135922330097, "grad_norm": 6.340784549713135, "learning_rate": 9.972543015598295e-05, "loss": 2.4988531112670898, "memory(GiB)": 39.22, "step": 165, "token_acc": 0.4491525423728814, "train_speed(iter/s)": 0.592598 }, { "epoch": 0.10315533980582524, "grad_norm": 6.322139263153076, "learning_rate": 9.970855385879908e-05, "loss": 2.7641939163208007, "memory(GiB)": 39.22, "step": 170, "token_acc": 0.42450142450142453, "train_speed(iter/s)": 0.593328 }, { "epoch": 0.10618932038834951, "grad_norm": 8.460200309753418, "learning_rate": 9.969117578133089e-05, "loss": 2.4497074127197265, "memory(GiB)": 39.22, "step": 175, "token_acc": 0.4819672131147541, "train_speed(iter/s)": 0.592932 }, { "epoch": 0.10922330097087378, "grad_norm": 6.508354663848877, "learning_rate": 9.96732960990005e-05, "loss": 2.3542524337768556, "memory(GiB)": 39.22, "step": 180, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 0.594793 }, { "epoch": 0.11225728155339806, "grad_norm": 6.436831474304199, "learning_rate": 9.965491499229332e-05, "loss": 2.355543518066406, "memory(GiB)": 39.22, "step": 185, "token_acc": 0.48639455782312924, "train_speed(iter/s)": 0.595112 }, { "epoch": 0.11529126213592233, "grad_norm": 5.326399803161621, "learning_rate": 9.963603264675648e-05, "loss": 2.626679611206055, "memory(GiB)": 39.22, "step": 190, "token_acc": 0.45058139534883723, "train_speed(iter/s)": 0.59601 }, { "epoch": 0.1183252427184466, "grad_norm": 6.522929668426514, "learning_rate": 9.961664925299677e-05, "loss": 2.417061424255371, "memory(GiB)": 39.22, "step": 195, "token_acc": 0.49050632911392406, "train_speed(iter/s)": 0.595902 }, { "epoch": 0.12135922330097088, "grad_norm": 5.905120849609375, "learning_rate": 9.95967650066788e-05, "loss": 2.5360954284667967, "memory(GiB)": 39.22, "step": 200, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 0.59608 }, { "epoch": 0.12439320388349515, "grad_norm": 7.083728790283203, "learning_rate": 9.957638010852301e-05, "loss": 2.5276988983154296, "memory(GiB)": 39.22, "step": 205, "token_acc": 0.43333333333333335, "train_speed(iter/s)": 0.597234 }, { "epoch": 0.12742718446601942, "grad_norm": 6.791469573974609, "learning_rate": 9.955549476430364e-05, "loss": 2.6791542053222654, "memory(GiB)": 39.22, "step": 210, "token_acc": 0.44481605351170567, "train_speed(iter/s)": 0.597785 }, { "epoch": 0.1304611650485437, "grad_norm": 8.691610336303711, "learning_rate": 9.953410918484667e-05, "loss": 2.5277048110961915, "memory(GiB)": 39.22, "step": 215, "token_acc": 0.4937888198757764, "train_speed(iter/s)": 0.598803 }, { "epoch": 0.13349514563106796, "grad_norm": 5.966423988342285, "learning_rate": 9.951222358602763e-05, "loss": 2.5550731658935546, "memory(GiB)": 39.22, "step": 220, "token_acc": 0.4676470588235294, "train_speed(iter/s)": 0.599676 }, { "epoch": 0.13652912621359223, "grad_norm": 8.491061210632324, "learning_rate": 9.948983818876954e-05, "loss": 2.433759880065918, "memory(GiB)": 39.22, "step": 225, "token_acc": 0.4908424908424908, "train_speed(iter/s)": 0.598817 }, { "epoch": 0.1395631067961165, "grad_norm": 4.885462760925293, "learning_rate": 9.946695321904056e-05, "loss": 2.5523433685302734, "memory(GiB)": 39.22, "step": 230, "token_acc": 0.45478723404255317, "train_speed(iter/s)": 0.597615 }, { "epoch": 0.14259708737864077, "grad_norm": 6.235279083251953, "learning_rate": 9.944356890785177e-05, "loss": 2.3788055419921874, "memory(GiB)": 39.22, "step": 235, "token_acc": 0.4809384164222874, "train_speed(iter/s)": 0.598117 }, { "epoch": 0.14563106796116504, "grad_norm": 5.3688130378723145, "learning_rate": 9.941968549125481e-05, "loss": 2.4541061401367186, "memory(GiB)": 39.22, "step": 240, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 0.596655 }, { "epoch": 0.1486650485436893, "grad_norm": 5.759191036224365, "learning_rate": 9.939530321033955e-05, "loss": 2.168326568603516, "memory(GiB)": 39.22, "step": 245, "token_acc": 0.501628664495114, "train_speed(iter/s)": 0.5967 }, { "epoch": 0.15169902912621358, "grad_norm": 8.470988273620605, "learning_rate": 9.937042231123155e-05, "loss": 2.5771547317504884, "memory(GiB)": 39.22, "step": 250, "token_acc": 0.501628664495114, "train_speed(iter/s)": 0.596782 }, { "epoch": 0.15473300970873785, "grad_norm": 6.000228404998779, "learning_rate": 9.934504304508974e-05, "loss": 2.5160358428955076, "memory(GiB)": 39.22, "step": 255, "token_acc": 0.4469914040114613, "train_speed(iter/s)": 0.596955 }, { "epoch": 0.15776699029126215, "grad_norm": 7.762350082397461, "learning_rate": 9.931916566810371e-05, "loss": 2.245794677734375, "memory(GiB)": 39.22, "step": 260, "token_acc": 0.521594684385382, "train_speed(iter/s)": 0.596759 }, { "epoch": 0.16080097087378642, "grad_norm": 7.007081031799316, "learning_rate": 9.929279044149123e-05, "loss": 2.3080322265625, "memory(GiB)": 39.22, "step": 265, "token_acc": 0.4964788732394366, "train_speed(iter/s)": 0.595878 }, { "epoch": 0.1638349514563107, "grad_norm": 5.466193675994873, "learning_rate": 9.926591763149559e-05, "loss": 2.1369998931884764, "memory(GiB)": 39.22, "step": 270, "token_acc": 0.5296052631578947, "train_speed(iter/s)": 0.596845 }, { "epoch": 0.16686893203883496, "grad_norm": 7.380741596221924, "learning_rate": 9.923854750938291e-05, "loss": 2.2451313018798826, "memory(GiB)": 39.22, "step": 275, "token_acc": 0.5, "train_speed(iter/s)": 0.595956 }, { "epoch": 0.16990291262135923, "grad_norm": 6.371977806091309, "learning_rate": 9.921068035143936e-05, "loss": 2.408839797973633, "memory(GiB)": 39.22, "step": 280, "token_acc": 0.46296296296296297, "train_speed(iter/s)": 0.596145 }, { "epoch": 0.1729368932038835, "grad_norm": 7.335880279541016, "learning_rate": 9.918231643896852e-05, "loss": 2.199435234069824, "memory(GiB)": 39.22, "step": 285, "token_acc": 0.5050167224080268, "train_speed(iter/s)": 0.595974 }, { "epoch": 0.17597087378640777, "grad_norm": 7.418302536010742, "learning_rate": 9.915345605828828e-05, "loss": 2.3224533081054686, "memory(GiB)": 39.22, "step": 290, "token_acc": 0.5035971223021583, "train_speed(iter/s)": 0.596204 }, { "epoch": 0.17900485436893204, "grad_norm": 6.758571624755859, "learning_rate": 9.912409950072821e-05, "loss": 2.4346172332763674, "memory(GiB)": 39.22, "step": 295, "token_acc": 0.48429319371727747, "train_speed(iter/s)": 0.597578 }, { "epoch": 0.1820388349514563, "grad_norm": 10.919794082641602, "learning_rate": 9.909424706262647e-05, "loss": 2.4341407775878907, "memory(GiB)": 39.22, "step": 300, "token_acc": 0.48857142857142855, "train_speed(iter/s)": 0.598081 }, { "epoch": 0.18507281553398058, "grad_norm": 5.728007793426514, "learning_rate": 9.906389904532688e-05, "loss": 2.120174026489258, "memory(GiB)": 39.22, "step": 305, "token_acc": 0.5371024734982333, "train_speed(iter/s)": 0.597458 }, { "epoch": 0.18810679611650485, "grad_norm": 6.871456623077393, "learning_rate": 9.903305575517584e-05, "loss": 2.342795181274414, "memory(GiB)": 39.22, "step": 310, "token_acc": 0.498567335243553, "train_speed(iter/s)": 0.596534 }, { "epoch": 0.19114077669902912, "grad_norm": 8.57703971862793, "learning_rate": 9.900171750351925e-05, "loss": 2.6183086395263673, "memory(GiB)": 39.22, "step": 315, "token_acc": 0.4625, "train_speed(iter/s)": 0.597484 }, { "epoch": 0.1941747572815534, "grad_norm": 7.768932342529297, "learning_rate": 9.89698846066994e-05, "loss": 2.291164207458496, "memory(GiB)": 39.22, "step": 320, "token_acc": 0.5154320987654321, "train_speed(iter/s)": 0.596998 }, { "epoch": 0.19720873786407767, "grad_norm": 6.957777976989746, "learning_rate": 9.893755738605171e-05, "loss": 2.211928367614746, "memory(GiB)": 39.22, "step": 325, "token_acc": 0.5045871559633027, "train_speed(iter/s)": 0.597803 }, { "epoch": 0.20024271844660194, "grad_norm": 6.228968143463135, "learning_rate": 9.890473616790154e-05, "loss": 2.4344671249389647, "memory(GiB)": 39.22, "step": 330, "token_acc": 0.47413793103448276, "train_speed(iter/s)": 0.597936 }, { "epoch": 0.2032766990291262, "grad_norm": 5.3689422607421875, "learning_rate": 9.887142128356092e-05, "loss": 2.6146148681640624, "memory(GiB)": 39.22, "step": 335, "token_acc": 0.45222929936305734, "train_speed(iter/s)": 0.59885 }, { "epoch": 0.20631067961165048, "grad_norm": 5.215574264526367, "learning_rate": 9.88376130693251e-05, "loss": 2.0827293395996094, "memory(GiB)": 39.22, "step": 340, "token_acc": 0.5172413793103449, "train_speed(iter/s)": 0.599013 }, { "epoch": 0.20934466019417475, "grad_norm": 5.897531509399414, "learning_rate": 9.880331186646925e-05, "loss": 2.232925796508789, "memory(GiB)": 39.22, "step": 345, "token_acc": 0.4750830564784053, "train_speed(iter/s)": 0.599063 }, { "epoch": 0.21237864077669902, "grad_norm": 5.433231830596924, "learning_rate": 9.876851802124503e-05, "loss": 2.4904659271240233, "memory(GiB)": 39.22, "step": 350, "token_acc": 0.47770700636942676, "train_speed(iter/s)": 0.598348 }, { "epoch": 0.2154126213592233, "grad_norm": 7.521780967712402, "learning_rate": 9.873323188487697e-05, "loss": 2.5035079956054687, "memory(GiB)": 39.22, "step": 355, "token_acc": 0.4612903225806452, "train_speed(iter/s)": 0.597584 }, { "epoch": 0.21844660194174756, "grad_norm": 7.7608256340026855, "learning_rate": 9.869745381355906e-05, "loss": 2.2622493743896483, "memory(GiB)": 39.22, "step": 360, "token_acc": 0.5156695156695157, "train_speed(iter/s)": 0.59833 }, { "epoch": 0.22148058252427186, "grad_norm": 9.321803092956543, "learning_rate": 9.86611841684511e-05, "loss": 2.4146696090698243, "memory(GiB)": 39.22, "step": 365, "token_acc": 0.47335423197492166, "train_speed(iter/s)": 0.599026 }, { "epoch": 0.22451456310679613, "grad_norm": 9.292396545410156, "learning_rate": 9.862442331567503e-05, "loss": 2.3599546432495115, "memory(GiB)": 39.22, "step": 370, "token_acc": 0.4956268221574344, "train_speed(iter/s)": 0.5998 }, { "epoch": 0.2275485436893204, "grad_norm": 8.419163703918457, "learning_rate": 9.858717162631128e-05, "loss": 2.6148075103759765, "memory(GiB)": 39.22, "step": 375, "token_acc": 0.46048109965635736, "train_speed(iter/s)": 0.599846 }, { "epoch": 0.23058252427184467, "grad_norm": 6.27116584777832, "learning_rate": 9.854942947639501e-05, "loss": 2.4621152877807617, "memory(GiB)": 39.22, "step": 380, "token_acc": 0.505524861878453, "train_speed(iter/s)": 0.600276 }, { "epoch": 0.23361650485436894, "grad_norm": 7.211396217346191, "learning_rate": 9.851119724691225e-05, "loss": 2.5144262313842773, "memory(GiB)": 39.22, "step": 385, "token_acc": 0.4525993883792049, "train_speed(iter/s)": 0.600896 }, { "epoch": 0.2366504854368932, "grad_norm": 6.34926700592041, "learning_rate": 9.84724753237962e-05, "loss": 2.4521541595458984, "memory(GiB)": 39.22, "step": 390, "token_acc": 0.5, "train_speed(iter/s)": 0.600572 }, { "epoch": 0.23968446601941748, "grad_norm": 6.972572326660156, "learning_rate": 9.843326409792317e-05, "loss": 2.6046756744384765, "memory(GiB)": 39.22, "step": 395, "token_acc": 0.44884488448844884, "train_speed(iter/s)": 0.600491 }, { "epoch": 0.24271844660194175, "grad_norm": 11.898480415344238, "learning_rate": 9.839356396510875e-05, "loss": 2.3576316833496094, "memory(GiB)": 39.22, "step": 400, "token_acc": 0.4472843450479233, "train_speed(iter/s)": 0.601068 }, { "epoch": 0.24575242718446602, "grad_norm": 5.818270683288574, "learning_rate": 9.835337532610376e-05, "loss": 2.0870508193969726, "memory(GiB)": 39.22, "step": 405, "token_acc": 0.526813880126183, "train_speed(iter/s)": 0.601138 }, { "epoch": 0.2487864077669903, "grad_norm": 8.206275939941406, "learning_rate": 9.831269858659023e-05, "loss": 2.1485408782958983, "memory(GiB)": 39.61, "step": 410, "token_acc": 0.5371900826446281, "train_speed(iter/s)": 0.598807 }, { "epoch": 0.2518203883495146, "grad_norm": 7.233333587646484, "learning_rate": 9.827153415717729e-05, "loss": 2.37838191986084, "memory(GiB)": 39.61, "step": 415, "token_acc": 0.5067114093959731, "train_speed(iter/s)": 0.598786 }, { "epoch": 0.25485436893203883, "grad_norm": 6.615445613861084, "learning_rate": 9.822988245339701e-05, "loss": 2.3126983642578125, "memory(GiB)": 39.61, "step": 420, "token_acc": 0.514018691588785, "train_speed(iter/s)": 0.599013 }, { "epoch": 0.25788834951456313, "grad_norm": 7.5856523513793945, "learning_rate": 9.818774389570027e-05, "loss": 2.4124004364013674, "memory(GiB)": 39.61, "step": 425, "token_acc": 0.511864406779661, "train_speed(iter/s)": 0.598978 }, { "epoch": 0.2609223300970874, "grad_norm": 4.8371381759643555, "learning_rate": 9.814511890945241e-05, "loss": 2.2959733963012696, "memory(GiB)": 39.61, "step": 430, "token_acc": 0.5327380952380952, "train_speed(iter/s)": 0.59816 }, { "epoch": 0.26395631067961167, "grad_norm": 6.623883247375488, "learning_rate": 9.810200792492904e-05, "loss": 2.1788196563720703, "memory(GiB)": 39.61, "step": 435, "token_acc": 0.5016611295681063, "train_speed(iter/s)": 0.597541 }, { "epoch": 0.2669902912621359, "grad_norm": 6.926652431488037, "learning_rate": 9.805841137731164e-05, "loss": 2.1499845504760744, "memory(GiB)": 39.61, "step": 440, "token_acc": 0.5192307692307693, "train_speed(iter/s)": 0.597116 }, { "epoch": 0.2700242718446602, "grad_norm": 9.03418254852295, "learning_rate": 9.801432970668318e-05, "loss": 2.1190351486206054, "memory(GiB)": 39.61, "step": 445, "token_acc": 0.5272727272727272, "train_speed(iter/s)": 0.597517 }, { "epoch": 0.27305825242718446, "grad_norm": 8.781913757324219, "learning_rate": 9.79697633580237e-05, "loss": 2.4038110733032227, "memory(GiB)": 39.61, "step": 450, "token_acc": 0.48179271708683474, "train_speed(iter/s)": 0.59777 }, { "epoch": 0.27609223300970875, "grad_norm": 5.531435489654541, "learning_rate": 9.792471278120573e-05, "loss": 2.3716163635253906, "memory(GiB)": 39.61, "step": 455, "token_acc": 0.4847560975609756, "train_speed(iter/s)": 0.597608 }, { "epoch": 0.279126213592233, "grad_norm": 5.956150054931641, "learning_rate": 9.787917843098989e-05, "loss": 2.181165313720703, "memory(GiB)": 39.61, "step": 460, "token_acc": 0.5051903114186851, "train_speed(iter/s)": 0.597412 }, { "epoch": 0.2821601941747573, "grad_norm": 7.345389366149902, "learning_rate": 9.783316076702019e-05, "loss": 2.4305038452148438, "memory(GiB)": 39.61, "step": 465, "token_acc": 0.47802197802197804, "train_speed(iter/s)": 0.597506 }, { "epoch": 0.28519417475728154, "grad_norm": 5.4440388679504395, "learning_rate": 9.778666025381943e-05, "loss": 2.178025245666504, "memory(GiB)": 39.61, "step": 470, "token_acc": 0.5167785234899329, "train_speed(iter/s)": 0.597433 }, { "epoch": 0.28822815533980584, "grad_norm": 6.164299011230469, "learning_rate": 9.77396773607845e-05, "loss": 2.1623489379882814, "memory(GiB)": 39.61, "step": 475, "token_acc": 0.4915254237288136, "train_speed(iter/s)": 0.597134 }, { "epoch": 0.2912621359223301, "grad_norm": 6.166046619415283, "learning_rate": 9.769221256218164e-05, "loss": 2.3753950119018556, "memory(GiB)": 39.61, "step": 480, "token_acc": 0.4879518072289157, "train_speed(iter/s)": 0.597494 }, { "epoch": 0.2942961165048544, "grad_norm": 6.958017826080322, "learning_rate": 9.764426633714167e-05, "loss": 2.21927547454834, "memory(GiB)": 39.61, "step": 485, "token_acc": 0.5050847457627119, "train_speed(iter/s)": 0.597922 }, { "epoch": 0.2973300970873786, "grad_norm": 6.639190673828125, "learning_rate": 9.759583916965517e-05, "loss": 2.4649885177612303, "memory(GiB)": 39.61, "step": 490, "token_acc": 0.4845360824742268, "train_speed(iter/s)": 0.597703 }, { "epoch": 0.3003640776699029, "grad_norm": 5.950069904327393, "learning_rate": 9.754693154856751e-05, "loss": 2.612634468078613, "memory(GiB)": 39.61, "step": 495, "token_acc": 0.45478723404255317, "train_speed(iter/s)": 0.597528 }, { "epoch": 0.30339805825242716, "grad_norm": 6.54391622543335, "learning_rate": 9.7497543967574e-05, "loss": 2.3269075393676757, "memory(GiB)": 39.61, "step": 500, "token_acc": 0.47770700636942676, "train_speed(iter/s)": 0.597896 }, { "epoch": 0.30339805825242716, "eval_loss": 1.981583833694458, "eval_runtime": 12.577, "eval_samples_per_second": 7.951, "eval_steps_per_second": 7.951, "eval_token_acc": 0.48756906077348067, "step": 500 }, { "epoch": 0.30643203883495146, "grad_norm": 10.183039665222168, "learning_rate": 9.74476769252149e-05, "loss": 2.1522619247436525, "memory(GiB)": 39.61, "step": 505, "token_acc": 0.49651741293532337, "train_speed(iter/s)": 0.587357 }, { "epoch": 0.3094660194174757, "grad_norm": 7.112940788269043, "learning_rate": 9.739733092487035e-05, "loss": 2.388911247253418, "memory(GiB)": 39.61, "step": 510, "token_acc": 0.501577287066246, "train_speed(iter/s)": 0.587206 }, { "epoch": 0.3125, "grad_norm": 6.210618495941162, "learning_rate": 9.73465064747553e-05, "loss": 2.59771614074707, "memory(GiB)": 39.61, "step": 515, "token_acc": 0.4624624624624625, "train_speed(iter/s)": 0.587584 }, { "epoch": 0.3155339805825243, "grad_norm": 6.931279182434082, "learning_rate": 9.729520408791434e-05, "loss": 2.512074279785156, "memory(GiB)": 39.61, "step": 520, "token_acc": 0.4910394265232975, "train_speed(iter/s)": 0.587423 }, { "epoch": 0.31856796116504854, "grad_norm": 6.450678825378418, "learning_rate": 9.72434242822167e-05, "loss": 2.1714031219482424, "memory(GiB)": 39.61, "step": 525, "token_acc": 0.5186721991701245, "train_speed(iter/s)": 0.586863 }, { "epoch": 0.32160194174757284, "grad_norm": 6.14349365234375, "learning_rate": 9.719116758035074e-05, "loss": 2.5791160583496096, "memory(GiB)": 39.61, "step": 530, "token_acc": 0.46153846153846156, "train_speed(iter/s)": 0.585986 }, { "epoch": 0.3246359223300971, "grad_norm": 6.583944797515869, "learning_rate": 9.71384345098189e-05, "loss": 2.3987077713012694, "memory(GiB)": 39.61, "step": 535, "token_acc": 0.4925373134328358, "train_speed(iter/s)": 0.58625 }, { "epoch": 0.3276699029126214, "grad_norm": 5.9314422607421875, "learning_rate": 9.70852256029323e-05, "loss": 2.235941505432129, "memory(GiB)": 39.61, "step": 540, "token_acc": 0.5250737463126843, "train_speed(iter/s)": 0.586408 }, { "epoch": 0.3307038834951456, "grad_norm": 7.305792331695557, "learning_rate": 9.703154139680533e-05, "loss": 2.417573928833008, "memory(GiB)": 39.61, "step": 545, "token_acc": 0.5104477611940299, "train_speed(iter/s)": 0.586501 }, { "epoch": 0.3337378640776699, "grad_norm": 5.719043731689453, "learning_rate": 9.697738243335028e-05, "loss": 2.2177127838134765, "memory(GiB)": 39.61, "step": 550, "token_acc": 0.5016181229773463, "train_speed(iter/s)": 0.586526 }, { "epoch": 0.33677184466019416, "grad_norm": 6.281179428100586, "learning_rate": 9.692274925927185e-05, "loss": 2.1101545333862304, "memory(GiB)": 39.61, "step": 555, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 0.586182 }, { "epoch": 0.33980582524271846, "grad_norm": 5.763940811157227, "learning_rate": 9.686764242606163e-05, "loss": 2.2045364379882812, "memory(GiB)": 39.61, "step": 560, "token_acc": 0.5030864197530864, "train_speed(iter/s)": 0.585339 }, { "epoch": 0.3428398058252427, "grad_norm": 8.95957088470459, "learning_rate": 9.681206248999257e-05, "loss": 2.5135177612304687, "memory(GiB)": 39.61, "step": 565, "token_acc": 0.48986486486486486, "train_speed(iter/s)": 0.584751 }, { "epoch": 0.345873786407767, "grad_norm": 5.587778568267822, "learning_rate": 9.675601001211326e-05, "loss": 2.392421340942383, "memory(GiB)": 39.61, "step": 570, "token_acc": 0.4461538461538462, "train_speed(iter/s)": 0.585118 }, { "epoch": 0.34890776699029125, "grad_norm": 7.836484432220459, "learning_rate": 9.669948555824242e-05, "loss": 2.324014663696289, "memory(GiB)": 39.61, "step": 575, "token_acc": 0.4639175257731959, "train_speed(iter/s)": 0.584897 }, { "epoch": 0.35194174757281554, "grad_norm": 5.96414041519165, "learning_rate": 9.664248969896303e-05, "loss": 2.302785301208496, "memory(GiB)": 39.61, "step": 580, "token_acc": 0.4904109589041096, "train_speed(iter/s)": 0.585033 }, { "epoch": 0.3549757281553398, "grad_norm": 7.691707611083984, "learning_rate": 9.65850230096167e-05, "loss": 2.4697898864746093, "memory(GiB)": 39.61, "step": 585, "token_acc": 0.4444444444444444, "train_speed(iter/s)": 0.584897 }, { "epoch": 0.3580097087378641, "grad_norm": 8.556262016296387, "learning_rate": 9.652708607029779e-05, "loss": 2.2903860092163084, "memory(GiB)": 40.86, "step": 590, "token_acc": 0.47115384615384615, "train_speed(iter/s)": 0.584145 }, { "epoch": 0.36104368932038833, "grad_norm": 6.732985496520996, "learning_rate": 9.646867946584757e-05, "loss": 2.1200277328491213, "memory(GiB)": 40.86, "step": 595, "token_acc": 0.532608695652174, "train_speed(iter/s)": 0.584433 }, { "epoch": 0.3640776699029126, "grad_norm": 6.632906913757324, "learning_rate": 9.64098037858483e-05, "loss": 2.4770671844482424, "memory(GiB)": 40.86, "step": 600, "token_acc": 0.4965753424657534, "train_speed(iter/s)": 0.584177 }, { "epoch": 0.36711165048543687, "grad_norm": 8.074189186096191, "learning_rate": 9.635045962461735e-05, "loss": 2.0175329208374024, "memory(GiB)": 40.86, "step": 605, "token_acc": 0.5444444444444444, "train_speed(iter/s)": 0.584218 }, { "epoch": 0.37014563106796117, "grad_norm": 10.57684326171875, "learning_rate": 9.62906475812011e-05, "loss": 2.471089172363281, "memory(GiB)": 40.86, "step": 610, "token_acc": 0.47604790419161674, "train_speed(iter/s)": 0.584641 }, { "epoch": 0.3731796116504854, "grad_norm": 9.030044555664062, "learning_rate": 9.623036825936898e-05, "loss": 2.4689071655273436, "memory(GiB)": 40.86, "step": 615, "token_acc": 0.4551282051282051, "train_speed(iter/s)": 0.58472 }, { "epoch": 0.3762135922330097, "grad_norm": 12.650615692138672, "learning_rate": 9.616962226760728e-05, "loss": 2.4379999160766603, "memory(GiB)": 40.86, "step": 620, "token_acc": 0.4965753424657534, "train_speed(iter/s)": 0.583902 }, { "epoch": 0.379247572815534, "grad_norm": 6.823087692260742, "learning_rate": 9.610841021911312e-05, "loss": 2.2892841339111327, "memory(GiB)": 40.86, "step": 625, "token_acc": 0.4925373134328358, "train_speed(iter/s)": 0.584344 }, { "epoch": 0.38228155339805825, "grad_norm": 6.585781097412109, "learning_rate": 9.604673273178819e-05, "loss": 2.1564374923706056, "memory(GiB)": 40.86, "step": 630, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 0.584389 }, { "epoch": 0.38531553398058255, "grad_norm": 7.104307174682617, "learning_rate": 9.59845904282325e-05, "loss": 2.1816734313964843, "memory(GiB)": 40.86, "step": 635, "token_acc": 0.5308219178082192, "train_speed(iter/s)": 0.585012 }, { "epoch": 0.3883495145631068, "grad_norm": 7.516766548156738, "learning_rate": 9.592198393573816e-05, "loss": 2.276702308654785, "memory(GiB)": 40.86, "step": 640, "token_acc": 0.5102739726027398, "train_speed(iter/s)": 0.585101 }, { "epoch": 0.3913834951456311, "grad_norm": 8.942841529846191, "learning_rate": 9.585891388628298e-05, "loss": 2.3461095809936525, "memory(GiB)": 40.86, "step": 645, "token_acc": 0.527972027972028, "train_speed(iter/s)": 0.585177 }, { "epoch": 0.39441747572815533, "grad_norm": 7.309288024902344, "learning_rate": 9.579538091652414e-05, "loss": 2.3102886199951174, "memory(GiB)": 40.86, "step": 650, "token_acc": 0.5029585798816568, "train_speed(iter/s)": 0.584698 }, { "epoch": 0.39745145631067963, "grad_norm": 8.047052383422852, "learning_rate": 9.573138566779171e-05, "loss": 2.2706655502319335, "memory(GiB)": 40.86, "step": 655, "token_acc": 0.48942598187311176, "train_speed(iter/s)": 0.584864 }, { "epoch": 0.40048543689320387, "grad_norm": 5.1699442863464355, "learning_rate": 9.566692878608229e-05, "loss": 2.3724884033203124, "memory(GiB)": 40.86, "step": 660, "token_acc": 0.5276872964169381, "train_speed(iter/s)": 0.584586 }, { "epoch": 0.40351941747572817, "grad_norm": 7.834784030914307, "learning_rate": 9.560201092205231e-05, "loss": 2.149821090698242, "memory(GiB)": 40.86, "step": 665, "token_acc": 0.5422535211267606, "train_speed(iter/s)": 0.584347 }, { "epoch": 0.4065533980582524, "grad_norm": 5.076271057128906, "learning_rate": 9.553663273101162e-05, "loss": 2.2725826263427735, "memory(GiB)": 40.86, "step": 670, "token_acc": 0.48223350253807107, "train_speed(iter/s)": 0.584476 }, { "epoch": 0.4095873786407767, "grad_norm": 5.5801005363464355, "learning_rate": 9.54707948729168e-05, "loss": 2.556637001037598, "memory(GiB)": 40.86, "step": 675, "token_acc": 0.49714285714285716, "train_speed(iter/s)": 0.583956 }, { "epoch": 0.41262135922330095, "grad_norm": 7.065471649169922, "learning_rate": 9.540449801236451e-05, "loss": 2.326729393005371, "memory(GiB)": 40.86, "step": 680, "token_acc": 0.5097402597402597, "train_speed(iter/s)": 0.583876 }, { "epoch": 0.41565533980582525, "grad_norm": 5.337322235107422, "learning_rate": 9.533774281858481e-05, "loss": 2.34055118560791, "memory(GiB)": 40.86, "step": 685, "token_acc": 0.45645645645645644, "train_speed(iter/s)": 0.584332 }, { "epoch": 0.4186893203883495, "grad_norm": 6.009404182434082, "learning_rate": 9.527052996543436e-05, "loss": 2.368490791320801, "memory(GiB)": 40.86, "step": 690, "token_acc": 0.49240121580547114, "train_speed(iter/s)": 0.584492 }, { "epoch": 0.4217233009708738, "grad_norm": 7.1615495681762695, "learning_rate": 9.520286013138959e-05, "loss": 2.2751487731933593, "memory(GiB)": 40.86, "step": 695, "token_acc": 0.51875, "train_speed(iter/s)": 0.584253 }, { "epoch": 0.42475728155339804, "grad_norm": 6.305184841156006, "learning_rate": 9.513473399954001e-05, "loss": 2.2249755859375, "memory(GiB)": 40.86, "step": 700, "token_acc": 0.5359477124183006, "train_speed(iter/s)": 0.583644 }, { "epoch": 0.42779126213592233, "grad_norm": 6.879371166229248, "learning_rate": 9.506615225758111e-05, "loss": 2.1284107208251952, "memory(GiB)": 40.86, "step": 705, "token_acc": 0.46647230320699706, "train_speed(iter/s)": 0.583054 }, { "epoch": 0.4308252427184466, "grad_norm": 5.7029523849487305, "learning_rate": 9.499711559780756e-05, "loss": 2.3587778091430662, "memory(GiB)": 40.86, "step": 710, "token_acc": 0.4859154929577465, "train_speed(iter/s)": 0.583094 }, { "epoch": 0.4338592233009709, "grad_norm": 7.390230178833008, "learning_rate": 9.492762471710612e-05, "loss": 2.6136167526245115, "memory(GiB)": 40.86, "step": 715, "token_acc": 0.46646341463414637, "train_speed(iter/s)": 0.582932 }, { "epoch": 0.4368932038834951, "grad_norm": 5.883137226104736, "learning_rate": 9.485768031694872e-05, "loss": 2.2231393814086915, "memory(GiB)": 40.86, "step": 720, "token_acc": 0.49818181818181817, "train_speed(iter/s)": 0.582775 }, { "epoch": 0.4399271844660194, "grad_norm": 6.680229663848877, "learning_rate": 9.478728310338527e-05, "loss": 2.1992170333862306, "memory(GiB)": 40.86, "step": 725, "token_acc": 0.5133531157270029, "train_speed(iter/s)": 0.582635 }, { "epoch": 0.4429611650485437, "grad_norm": 8.902689933776855, "learning_rate": 9.471643378703662e-05, "loss": 2.0395624160766603, "memory(GiB)": 40.86, "step": 730, "token_acc": 0.5493421052631579, "train_speed(iter/s)": 0.582703 }, { "epoch": 0.44599514563106796, "grad_norm": 5.443286895751953, "learning_rate": 9.464513308308734e-05, "loss": 2.506935882568359, "memory(GiB)": 40.86, "step": 735, "token_acc": 0.47368421052631576, "train_speed(iter/s)": 0.583215 }, { "epoch": 0.44902912621359226, "grad_norm": 6.487564563751221, "learning_rate": 9.457338171127847e-05, "loss": 2.2692995071411133, "memory(GiB)": 40.86, "step": 740, "token_acc": 0.5179153094462541, "train_speed(iter/s)": 0.583207 }, { "epoch": 0.4520631067961165, "grad_norm": 7.125478267669678, "learning_rate": 9.450118039590032e-05, "loss": 2.1293052673339843, "memory(GiB)": 40.86, "step": 745, "token_acc": 0.5464285714285714, "train_speed(iter/s)": 0.583504 }, { "epoch": 0.4550970873786408, "grad_norm": 7.087446212768555, "learning_rate": 9.442852986578514e-05, "loss": 2.4458339691162108, "memory(GiB)": 40.86, "step": 750, "token_acc": 0.49736842105263157, "train_speed(iter/s)": 0.583179 }, { "epoch": 0.45813106796116504, "grad_norm": 7.162069320678711, "learning_rate": 9.435543085429972e-05, "loss": 2.3158668518066405, "memory(GiB)": 40.86, "step": 755, "token_acc": 0.4744744744744745, "train_speed(iter/s)": 0.5828 }, { "epoch": 0.46116504854368934, "grad_norm": 5.414243698120117, "learning_rate": 9.428188409933806e-05, "loss": 2.16876335144043, "memory(GiB)": 40.86, "step": 760, "token_acc": 0.5087719298245614, "train_speed(iter/s)": 0.58285 }, { "epoch": 0.4641990291262136, "grad_norm": 6.282864570617676, "learning_rate": 9.420789034331387e-05, "loss": 2.289217948913574, "memory(GiB)": 40.86, "step": 765, "token_acc": 0.512396694214876, "train_speed(iter/s)": 0.582631 }, { "epoch": 0.4672330097087379, "grad_norm": 8.376455307006836, "learning_rate": 9.413345033315307e-05, "loss": 2.428557777404785, "memory(GiB)": 40.86, "step": 770, "token_acc": 0.49038461538461536, "train_speed(iter/s)": 0.582831 }, { "epoch": 0.4702669902912621, "grad_norm": 6.952515602111816, "learning_rate": 9.405856482028627e-05, "loss": 2.5767995834350588, "memory(GiB)": 40.86, "step": 775, "token_acc": 0.43425076452599387, "train_speed(iter/s)": 0.583315 }, { "epoch": 0.4733009708737864, "grad_norm": 9.879197120666504, "learning_rate": 9.398323456064123e-05, "loss": 2.218907356262207, "memory(GiB)": 40.86, "step": 780, "token_acc": 0.4844961240310077, "train_speed(iter/s)": 0.583721 }, { "epoch": 0.47633495145631066, "grad_norm": 7.553537845611572, "learning_rate": 9.39074603146351e-05, "loss": 2.3447980880737305, "memory(GiB)": 40.86, "step": 785, "token_acc": 0.4934640522875817, "train_speed(iter/s)": 0.583893 }, { "epoch": 0.47936893203883496, "grad_norm": 6.071379661560059, "learning_rate": 9.383124284716691e-05, "loss": 2.241764450073242, "memory(GiB)": 40.86, "step": 790, "token_acc": 0.46984126984126984, "train_speed(iter/s)": 0.584261 }, { "epoch": 0.4824029126213592, "grad_norm": 5.881275653839111, "learning_rate": 9.37545829276097e-05, "loss": 2.361056900024414, "memory(GiB)": 40.86, "step": 795, "token_acc": 0.5249169435215947, "train_speed(iter/s)": 0.584506 }, { "epoch": 0.4854368932038835, "grad_norm": 8.599099159240723, "learning_rate": 9.367748132980287e-05, "loss": 2.1997608184814452, "memory(GiB)": 40.86, "step": 800, "token_acc": 0.48518518518518516, "train_speed(iter/s)": 0.584425 }, { "epoch": 0.48847087378640774, "grad_norm": 5.97467565536499, "learning_rate": 9.359993883204425e-05, "loss": 2.2297504425048826, "memory(GiB)": 40.86, "step": 805, "token_acc": 0.5, "train_speed(iter/s)": 0.583965 }, { "epoch": 0.49150485436893204, "grad_norm": 6.91083288192749, "learning_rate": 9.352195621708239e-05, "loss": 1.9850988388061523, "memory(GiB)": 40.86, "step": 810, "token_acc": 0.5147540983606558, "train_speed(iter/s)": 0.584245 }, { "epoch": 0.4945388349514563, "grad_norm": 8.59461784362793, "learning_rate": 9.344353427210852e-05, "loss": 2.421934127807617, "memory(GiB)": 40.86, "step": 815, "token_acc": 0.5176056338028169, "train_speed(iter/s)": 0.5847 }, { "epoch": 0.4975728155339806, "grad_norm": 6.944448947906494, "learning_rate": 9.336467378874871e-05, "loss": 2.4557096481323244, "memory(GiB)": 40.86, "step": 820, "token_acc": 0.47419354838709676, "train_speed(iter/s)": 0.584583 }, { "epoch": 0.5006067961165048, "grad_norm": 5.55971622467041, "learning_rate": 9.328537556305578e-05, "loss": 2.2306629180908204, "memory(GiB)": 40.86, "step": 825, "token_acc": 0.4852459016393443, "train_speed(iter/s)": 0.585006 }, { "epoch": 0.5036407766990292, "grad_norm": 7.160358905792236, "learning_rate": 9.320564039550134e-05, "loss": 2.429665374755859, "memory(GiB)": 40.86, "step": 830, "token_acc": 0.47262247838616717, "train_speed(iter/s)": 0.585348 }, { "epoch": 0.5066747572815534, "grad_norm": 6.570638656616211, "learning_rate": 9.31254690909677e-05, "loss": 2.442539596557617, "memory(GiB)": 40.86, "step": 835, "token_acc": 0.476027397260274, "train_speed(iter/s)": 0.585696 }, { "epoch": 0.5097087378640777, "grad_norm": 6.370124340057373, "learning_rate": 9.304486245873972e-05, "loss": 2.287601089477539, "memory(GiB)": 40.86, "step": 840, "token_acc": 0.4982456140350877, "train_speed(iter/s)": 0.585975 }, { "epoch": 0.5127427184466019, "grad_norm": 6.999332904815674, "learning_rate": 9.296382131249666e-05, "loss": 2.317913818359375, "memory(GiB)": 40.86, "step": 845, "token_acc": 0.5041782729805014, "train_speed(iter/s)": 0.585835 }, { "epoch": 0.5157766990291263, "grad_norm": 5.257606506347656, "learning_rate": 9.288234647030391e-05, "loss": 2.18968505859375, "memory(GiB)": 40.86, "step": 850, "token_acc": 0.5102040816326531, "train_speed(iter/s)": 0.585922 }, { "epoch": 0.5188106796116505, "grad_norm": 5.611077785491943, "learning_rate": 9.280043875460485e-05, "loss": 2.0620901107788088, "memory(GiB)": 40.86, "step": 855, "token_acc": 0.5365079365079365, "train_speed(iter/s)": 0.585966 }, { "epoch": 0.5218446601941747, "grad_norm": 5.108231067657471, "learning_rate": 9.271809899221246e-05, "loss": 2.4372896194458007, "memory(GiB)": 40.86, "step": 860, "token_acc": 0.4479768786127168, "train_speed(iter/s)": 0.585988 }, { "epoch": 0.524878640776699, "grad_norm": 6.733373641967773, "learning_rate": 9.263532801430094e-05, "loss": 2.1579952239990234, "memory(GiB)": 40.86, "step": 865, "token_acc": 0.4807121661721068, "train_speed(iter/s)": 0.585966 }, { "epoch": 0.5279126213592233, "grad_norm": 5.073429107666016, "learning_rate": 9.255212665639744e-05, "loss": 2.1149240493774415, "memory(GiB)": 40.86, "step": 870, "token_acc": 0.5171232876712328, "train_speed(iter/s)": 0.585975 }, { "epoch": 0.5309466019417476, "grad_norm": 6.175984859466553, "learning_rate": 9.246849575837349e-05, "loss": 1.9833623886108398, "memory(GiB)": 40.86, "step": 875, "token_acc": 0.526813880126183, "train_speed(iter/s)": 0.585474 }, { "epoch": 0.5339805825242718, "grad_norm": 8.124624252319336, "learning_rate": 9.238443616443666e-05, "loss": 2.4017959594726563, "memory(GiB)": 40.86, "step": 880, "token_acc": 0.49032258064516127, "train_speed(iter/s)": 0.585597 }, { "epoch": 0.5370145631067961, "grad_norm": 7.752699375152588, "learning_rate": 9.229994872312193e-05, "loss": 2.387744331359863, "memory(GiB)": 40.86, "step": 885, "token_acc": 0.5106382978723404, "train_speed(iter/s)": 0.585668 }, { "epoch": 0.5400485436893204, "grad_norm": 8.323918342590332, "learning_rate": 9.221503428728316e-05, "loss": 2.1385421752929688, "memory(GiB)": 40.86, "step": 890, "token_acc": 0.5551020408163265, "train_speed(iter/s)": 0.586131 }, { "epoch": 0.5430825242718447, "grad_norm": 6.044275760650635, "learning_rate": 9.212969371408449e-05, "loss": 1.9817846298217774, "memory(GiB)": 40.86, "step": 895, "token_acc": 0.5494880546075085, "train_speed(iter/s)": 0.586393 }, { "epoch": 0.5461165048543689, "grad_norm": 6.398566246032715, "learning_rate": 9.204392786499168e-05, "loss": 2.3052085876464843, "memory(GiB)": 40.86, "step": 900, "token_acc": 0.4840764331210191, "train_speed(iter/s)": 0.586567 }, { "epoch": 0.5491504854368932, "grad_norm": 9.17261028289795, "learning_rate": 9.19577376057634e-05, "loss": 2.37634391784668, "memory(GiB)": 40.86, "step": 905, "token_acc": 0.5249169435215947, "train_speed(iter/s)": 0.586787 }, { "epoch": 0.5521844660194175, "grad_norm": 5.751415729522705, "learning_rate": 9.187112380644254e-05, "loss": 2.2847476959228517, "memory(GiB)": 40.86, "step": 910, "token_acc": 0.51338199513382, "train_speed(iter/s)": 0.586408 }, { "epoch": 0.5552184466019418, "grad_norm": 12.116822242736816, "learning_rate": 9.178408734134736e-05, "loss": 2.5225976943969726, "memory(GiB)": 40.86, "step": 915, "token_acc": 0.48253968253968255, "train_speed(iter/s)": 0.586571 }, { "epoch": 0.558252427184466, "grad_norm": 8.28947925567627, "learning_rate": 9.16966290890627e-05, "loss": 2.215795135498047, "memory(GiB)": 40.86, "step": 920, "token_acc": 0.5543071161048689, "train_speed(iter/s)": 0.586881 }, { "epoch": 0.5612864077669902, "grad_norm": 9.582908630371094, "learning_rate": 9.160874993243113e-05, "loss": 2.299172019958496, "memory(GiB)": 40.86, "step": 925, "token_acc": 0.4763636363636364, "train_speed(iter/s)": 0.587069 }, { "epoch": 0.5643203883495146, "grad_norm": 8.669927597045898, "learning_rate": 9.152045075854398e-05, "loss": 2.457051086425781, "memory(GiB)": 40.86, "step": 930, "token_acc": 0.49828178694158076, "train_speed(iter/s)": 0.587349 }, { "epoch": 0.5673543689320388, "grad_norm": 6.801449298858643, "learning_rate": 9.143173245873247e-05, "loss": 2.1124551773071287, "memory(GiB)": 40.86, "step": 935, "token_acc": 0.5018315018315018, "train_speed(iter/s)": 0.58766 }, { "epoch": 0.5703883495145631, "grad_norm": 7.988888263702393, "learning_rate": 9.134259592855861e-05, "loss": 2.3452516555786134, "memory(GiB)": 40.86, "step": 940, "token_acc": 0.49666666666666665, "train_speed(iter/s)": 0.58765 }, { "epoch": 0.5734223300970874, "grad_norm": 7.102814674377441, "learning_rate": 9.125304206780627e-05, "loss": 2.3180185317993165, "memory(GiB)": 40.86, "step": 945, "token_acc": 0.5050847457627119, "train_speed(iter/s)": 0.587449 }, { "epoch": 0.5764563106796117, "grad_norm": 7.604477405548096, "learning_rate": 9.116307178047198e-05, "loss": 2.3972042083740233, "memory(GiB)": 40.86, "step": 950, "token_acc": 0.46283783783783783, "train_speed(iter/s)": 0.587729 }, { "epoch": 0.5794902912621359, "grad_norm": 6.319246292114258, "learning_rate": 9.10726859747559e-05, "loss": 2.103443908691406, "memory(GiB)": 40.86, "step": 955, "token_acc": 0.5191256830601093, "train_speed(iter/s)": 0.588189 }, { "epoch": 0.5825242718446602, "grad_norm": 8.772871017456055, "learning_rate": 9.098188556305263e-05, "loss": 2.073552703857422, "memory(GiB)": 40.86, "step": 960, "token_acc": 0.552901023890785, "train_speed(iter/s)": 0.588218 }, { "epoch": 0.5855582524271845, "grad_norm": 8.01586627960205, "learning_rate": 9.089067146194196e-05, "loss": 1.8984146118164062, "memory(GiB)": 40.86, "step": 965, "token_acc": 0.616504854368932, "train_speed(iter/s)": 0.58821 }, { "epoch": 0.5885922330097088, "grad_norm": 6.168645858764648, "learning_rate": 9.079904459217966e-05, "loss": 2.379282760620117, "memory(GiB)": 40.86, "step": 970, "token_acc": 0.4649122807017544, "train_speed(iter/s)": 0.588446 }, { "epoch": 0.591626213592233, "grad_norm": 6.704972743988037, "learning_rate": 9.070700587868817e-05, "loss": 2.1655595779418944, "memory(GiB)": 40.86, "step": 975, "token_acc": 0.5521885521885522, "train_speed(iter/s)": 0.588386 }, { "epoch": 0.5946601941747572, "grad_norm": 7.025293827056885, "learning_rate": 9.061455625054725e-05, "loss": 2.193133735656738, "memory(GiB)": 40.86, "step": 980, "token_acc": 0.5197368421052632, "train_speed(iter/s)": 0.588278 }, { "epoch": 0.5976941747572816, "grad_norm": 6.618514537811279, "learning_rate": 9.052169664098461e-05, "loss": 2.0073310852050783, "memory(GiB)": 40.86, "step": 985, "token_acc": 0.55893536121673, "train_speed(iter/s)": 0.588288 }, { "epoch": 0.6007281553398058, "grad_norm": 5.154722690582275, "learning_rate": 9.042842798736654e-05, "loss": 2.2399974822998048, "memory(GiB)": 40.86, "step": 990, "token_acc": 0.5195530726256983, "train_speed(iter/s)": 0.588303 }, { "epoch": 0.6037621359223301, "grad_norm": 6.787222862243652, "learning_rate": 9.03347512311884e-05, "loss": 2.3585285186767577, "memory(GiB)": 40.86, "step": 995, "token_acc": 0.46075085324232085, "train_speed(iter/s)": 0.588665 }, { "epoch": 0.6067961165048543, "grad_norm": 4.932912826538086, "learning_rate": 9.024066731806501e-05, "loss": 2.276376724243164, "memory(GiB)": 40.86, "step": 1000, "token_acc": 0.4921135646687697, "train_speed(iter/s)": 0.58881 }, { "epoch": 0.6067961165048543, "eval_loss": 2.31942081451416, "eval_runtime": 12.0489, "eval_samples_per_second": 8.3, "eval_steps_per_second": 8.3, "eval_token_acc": 0.48575305291723203, "step": 1000 }, { "epoch": 0.6098300970873787, "grad_norm": 9.08281421661377, "learning_rate": 9.01461771977214e-05, "loss": 2.333499717712402, "memory(GiB)": 40.86, "step": 1005, "token_acc": 0.4905838041431262, "train_speed(iter/s)": 0.584203 }, { "epoch": 0.6128640776699029, "grad_norm": 7.2298359870910645, "learning_rate": 9.005128182398283e-05, "loss": 2.4393625259399414, "memory(GiB)": 40.86, "step": 1010, "token_acc": 0.48732394366197185, "train_speed(iter/s)": 0.584412 }, { "epoch": 0.6158980582524272, "grad_norm": 5.784246444702148, "learning_rate": 8.995598215476555e-05, "loss": 2.171500587463379, "memory(GiB)": 40.86, "step": 1015, "token_acc": 0.5384615384615384, "train_speed(iter/s)": 0.58417 }, { "epoch": 0.6189320388349514, "grad_norm": 8.403388977050781, "learning_rate": 8.986027915206686e-05, "loss": 2.1093074798583986, "memory(GiB)": 40.86, "step": 1020, "token_acc": 0.5201342281879194, "train_speed(iter/s)": 0.584014 }, { "epoch": 0.6219660194174758, "grad_norm": 7.646571636199951, "learning_rate": 8.976417378195544e-05, "loss": 2.1439834594726563, "memory(GiB)": 40.86, "step": 1025, "token_acc": 0.5295857988165681, "train_speed(iter/s)": 0.583981 }, { "epoch": 0.625, "grad_norm": 6.978275299072266, "learning_rate": 8.966766701456177e-05, "loss": 2.288041687011719, "memory(GiB)": 40.86, "step": 1030, "token_acc": 0.513126491646778, "train_speed(iter/s)": 0.584314 }, { "epoch": 0.6280339805825242, "grad_norm": 6.3236541748046875, "learning_rate": 8.957075982406811e-05, "loss": 2.250352668762207, "memory(GiB)": 40.86, "step": 1035, "token_acc": 0.531986531986532, "train_speed(iter/s)": 0.584575 }, { "epoch": 0.6310679611650486, "grad_norm": 6.21897554397583, "learning_rate": 8.947345318869882e-05, "loss": 2.425637054443359, "memory(GiB)": 40.86, "step": 1040, "token_acc": 0.46859903381642515, "train_speed(iter/s)": 0.584674 }, { "epoch": 0.6341019417475728, "grad_norm": 7.0973358154296875, "learning_rate": 8.937574809071041e-05, "loss": 1.9796913146972657, "memory(GiB)": 40.86, "step": 1045, "token_acc": 0.5555555555555556, "train_speed(iter/s)": 0.584622 }, { "epoch": 0.6371359223300971, "grad_norm": 6.9171833992004395, "learning_rate": 8.927764551638169e-05, "loss": 2.153505325317383, "memory(GiB)": 40.86, "step": 1050, "token_acc": 0.5481727574750831, "train_speed(iter/s)": 0.584749 }, { "epoch": 0.6401699029126213, "grad_norm": 6.10349178314209, "learning_rate": 8.917914645600369e-05, "loss": 2.2978469848632814, "memory(GiB)": 40.86, "step": 1055, "token_acc": 0.5279503105590062, "train_speed(iter/s)": 0.584579 }, { "epoch": 0.6432038834951457, "grad_norm": 8.071660995483398, "learning_rate": 8.908025190386985e-05, "loss": 1.8877496719360352, "memory(GiB)": 40.86, "step": 1060, "token_acc": 0.582089552238806, "train_speed(iter/s)": 0.584349 }, { "epoch": 0.6462378640776699, "grad_norm": 5.858845233917236, "learning_rate": 8.898096285826582e-05, "loss": 2.2511001586914063, "memory(GiB)": 40.86, "step": 1065, "token_acc": 0.4642857142857143, "train_speed(iter/s)": 0.584502 }, { "epoch": 0.6492718446601942, "grad_norm": 7.347677230834961, "learning_rate": 8.888128032145941e-05, "loss": 2.173788833618164, "memory(GiB)": 40.86, "step": 1070, "token_acc": 0.5113636363636364, "train_speed(iter/s)": 0.584862 }, { "epoch": 0.6523058252427184, "grad_norm": 6.6078596115112305, "learning_rate": 8.878120529969061e-05, "loss": 2.1907543182373046, "memory(GiB)": 40.86, "step": 1075, "token_acc": 0.5047318611987381, "train_speed(iter/s)": 0.585062 }, { "epoch": 0.6553398058252428, "grad_norm": 6.744375228881836, "learning_rate": 8.868073880316124e-05, "loss": 2.5921836853027345, "memory(GiB)": 40.86, "step": 1080, "token_acc": 0.4777777777777778, "train_speed(iter/s)": 0.585281 }, { "epoch": 0.658373786407767, "grad_norm": 6.0812153816223145, "learning_rate": 8.857988184602484e-05, "loss": 2.076370620727539, "memory(GiB)": 40.86, "step": 1085, "token_acc": 0.5142118863049095, "train_speed(iter/s)": 0.585629 }, { "epoch": 0.6614077669902912, "grad_norm": 7.485403060913086, "learning_rate": 8.84786354463765e-05, "loss": 2.553455352783203, "memory(GiB)": 40.86, "step": 1090, "token_acc": 0.47151898734177217, "train_speed(iter/s)": 0.58598 }, { "epoch": 0.6644417475728155, "grad_norm": 6.709589958190918, "learning_rate": 8.837700062624245e-05, "loss": 2.1033605575561523, "memory(GiB)": 40.86, "step": 1095, "token_acc": 0.525, "train_speed(iter/s)": 0.586173 }, { "epoch": 0.6674757281553398, "grad_norm": 7.800707817077637, "learning_rate": 8.827497841156986e-05, "loss": 2.4268184661865235, "memory(GiB)": 40.86, "step": 1100, "token_acc": 0.5115511551155115, "train_speed(iter/s)": 0.586144 }, { "epoch": 0.6705097087378641, "grad_norm": 6.759317874908447, "learning_rate": 8.817256983221637e-05, "loss": 2.4730669021606446, "memory(GiB)": 40.86, "step": 1105, "token_acc": 0.45609065155807366, "train_speed(iter/s)": 0.586261 }, { "epoch": 0.6735436893203883, "grad_norm": 7.099054336547852, "learning_rate": 8.806977592193985e-05, "loss": 2.596049118041992, "memory(GiB)": 40.86, "step": 1110, "token_acc": 0.44542772861356933, "train_speed(iter/s)": 0.586514 }, { "epoch": 0.6765776699029126, "grad_norm": 6.84334135055542, "learning_rate": 8.796659771838777e-05, "loss": 2.2642656326293946, "memory(GiB)": 40.86, "step": 1115, "token_acc": 0.5156794425087108, "train_speed(iter/s)": 0.586643 }, { "epoch": 0.6796116504854369, "grad_norm": 6.421635627746582, "learning_rate": 8.786303626308689e-05, "loss": 2.1252628326416017, "memory(GiB)": 40.86, "step": 1120, "token_acc": 0.5263157894736842, "train_speed(iter/s)": 0.586755 }, { "epoch": 0.6826456310679612, "grad_norm": 4.622204303741455, "learning_rate": 8.775909260143266e-05, "loss": 2.2372303009033203, "memory(GiB)": 40.86, "step": 1125, "token_acc": 0.5159574468085106, "train_speed(iter/s)": 0.586869 }, { "epoch": 0.6856796116504854, "grad_norm": 7.592894554138184, "learning_rate": 8.765476778267874e-05, "loss": 2.1323163986206053, "memory(GiB)": 40.86, "step": 1130, "token_acc": 0.4909090909090909, "train_speed(iter/s)": 0.586899 }, { "epoch": 0.6887135922330098, "grad_norm": 7.193599700927734, "learning_rate": 8.755006285992629e-05, "loss": 2.1902294158935547, "memory(GiB)": 40.86, "step": 1135, "token_acc": 0.5234899328859061, "train_speed(iter/s)": 0.586763 }, { "epoch": 0.691747572815534, "grad_norm": 4.904916286468506, "learning_rate": 8.744497889011343e-05, "loss": 2.2312740325927733, "memory(GiB)": 40.86, "step": 1140, "token_acc": 0.48404255319148937, "train_speed(iter/s)": 0.586675 }, { "epoch": 0.6947815533980582, "grad_norm": 8.201228141784668, "learning_rate": 8.733951693400458e-05, "loss": 2.166943168640137, "memory(GiB)": 40.86, "step": 1145, "token_acc": 0.5105633802816901, "train_speed(iter/s)": 0.58693 }, { "epoch": 0.6978155339805825, "grad_norm": 5.049937725067139, "learning_rate": 8.723367805617965e-05, "loss": 2.254404067993164, "memory(GiB)": 40.86, "step": 1150, "token_acc": 0.478125, "train_speed(iter/s)": 0.587058 }, { "epoch": 0.7008495145631068, "grad_norm": 6.745171546936035, "learning_rate": 8.712746332502351e-05, "loss": 2.1543249130249023, "memory(GiB)": 40.86, "step": 1155, "token_acc": 0.5327380952380952, "train_speed(iter/s)": 0.587131 }, { "epoch": 0.7038834951456311, "grad_norm": 10.320196151733398, "learning_rate": 8.702087381271488e-05, "loss": 2.4464441299438477, "memory(GiB)": 40.86, "step": 1160, "token_acc": 0.4897959183673469, "train_speed(iter/s)": 0.587013 }, { "epoch": 0.7069174757281553, "grad_norm": 6.8334503173828125, "learning_rate": 8.691391059521583e-05, "loss": 2.1910587310791017, "memory(GiB)": 40.86, "step": 1165, "token_acc": 0.527972027972028, "train_speed(iter/s)": 0.586856 }, { "epoch": 0.7099514563106796, "grad_norm": 6.28577184677124, "learning_rate": 8.680657475226069e-05, "loss": 1.9499628067016601, "memory(GiB)": 40.86, "step": 1170, "token_acc": 0.6007751937984496, "train_speed(iter/s)": 0.586444 }, { "epoch": 0.7129854368932039, "grad_norm": 6.818657398223877, "learning_rate": 8.669886736734527e-05, "loss": 2.151942825317383, "memory(GiB)": 40.86, "step": 1175, "token_acc": 0.5254777070063694, "train_speed(iter/s)": 0.58655 }, { "epoch": 0.7160194174757282, "grad_norm": 5.253009796142578, "learning_rate": 8.659078952771592e-05, "loss": 2.54516487121582, "memory(GiB)": 40.86, "step": 1180, "token_acc": 0.4984894259818731, "train_speed(iter/s)": 0.586775 }, { "epoch": 0.7190533980582524, "grad_norm": 8.068851470947266, "learning_rate": 8.648234232435845e-05, "loss": 2.3182897567749023, "memory(GiB)": 40.86, "step": 1185, "token_acc": 0.4734982332155477, "train_speed(iter/s)": 0.586773 }, { "epoch": 0.7220873786407767, "grad_norm": 6.965189456939697, "learning_rate": 8.63735268519873e-05, "loss": 2.1850954055786134, "memory(GiB)": 40.86, "step": 1190, "token_acc": 0.5196374622356495, "train_speed(iter/s)": 0.586751 }, { "epoch": 0.725121359223301, "grad_norm": 6.5986409187316895, "learning_rate": 8.626434420903424e-05, "loss": 2.5639453887939454, "memory(GiB)": 40.86, "step": 1195, "token_acc": 0.4631268436578171, "train_speed(iter/s)": 0.586943 }, { "epoch": 0.7281553398058253, "grad_norm": 4.670579433441162, "learning_rate": 8.615479549763756e-05, "loss": 2.406460189819336, "memory(GiB)": 40.86, "step": 1200, "token_acc": 0.48223350253807107, "train_speed(iter/s)": 0.587084 }, { "epoch": 0.7311893203883495, "grad_norm": 6.295917510986328, "learning_rate": 8.604488182363074e-05, "loss": 2.536121940612793, "memory(GiB)": 40.86, "step": 1205, "token_acc": 0.4873417721518987, "train_speed(iter/s)": 0.587357 }, { "epoch": 0.7342233009708737, "grad_norm": 6.125625133514404, "learning_rate": 8.593460429653133e-05, "loss": 2.4063135147094727, "memory(GiB)": 40.86, "step": 1210, "token_acc": 0.48128342245989303, "train_speed(iter/s)": 0.587204 }, { "epoch": 0.7372572815533981, "grad_norm": 6.775357723236084, "learning_rate": 8.582396402952984e-05, "loss": 2.082032585144043, "memory(GiB)": 40.86, "step": 1215, "token_acc": 0.5273311897106109, "train_speed(iter/s)": 0.587489 }, { "epoch": 0.7402912621359223, "grad_norm": 8.1486177444458, "learning_rate": 8.571296213947838e-05, "loss": 1.675777053833008, "memory(GiB)": 40.86, "step": 1220, "token_acc": 0.6021897810218978, "train_speed(iter/s)": 0.587737 }, { "epoch": 0.7433252427184466, "grad_norm": 5.302309036254883, "learning_rate": 8.560159974687952e-05, "loss": 2.1232393264770506, "memory(GiB)": 40.86, "step": 1225, "token_acc": 0.5156695156695157, "train_speed(iter/s)": 0.587809 }, { "epoch": 0.7463592233009708, "grad_norm": 9.10730266571045, "learning_rate": 8.54898779758748e-05, "loss": 2.1305063247680662, "memory(GiB)": 40.86, "step": 1230, "token_acc": 0.53515625, "train_speed(iter/s)": 0.587588 }, { "epoch": 0.7493932038834952, "grad_norm": 6.489813327789307, "learning_rate": 8.537779795423359e-05, "loss": 2.2934566497802735, "memory(GiB)": 40.86, "step": 1235, "token_acc": 0.5161290322580645, "train_speed(iter/s)": 0.587435 }, { "epoch": 0.7524271844660194, "grad_norm": 6.982603549957275, "learning_rate": 8.526536081334152e-05, "loss": 2.2987644195556642, "memory(GiB)": 40.86, "step": 1240, "token_acc": 0.47924528301886793, "train_speed(iter/s)": 0.587522 }, { "epoch": 0.7554611650485437, "grad_norm": 7.774171352386475, "learning_rate": 8.515256768818918e-05, "loss": 2.5817737579345703, "memory(GiB)": 40.86, "step": 1245, "token_acc": 0.5040431266846361, "train_speed(iter/s)": 0.587668 }, { "epoch": 0.758495145631068, "grad_norm": 5.695102691650391, "learning_rate": 8.503941971736062e-05, "loss": 2.298574447631836, "memory(GiB)": 40.86, "step": 1250, "token_acc": 0.5070821529745042, "train_speed(iter/s)": 0.587481 }, { "epoch": 0.7615291262135923, "grad_norm": 5.622751235961914, "learning_rate": 8.492591804302186e-05, "loss": 2.149024772644043, "memory(GiB)": 40.86, "step": 1255, "token_acc": 0.5030674846625767, "train_speed(iter/s)": 0.58761 }, { "epoch": 0.7645631067961165, "grad_norm": 9.05585765838623, "learning_rate": 8.481206381090934e-05, "loss": 2.464432716369629, "memory(GiB)": 40.86, "step": 1260, "token_acc": 0.504950495049505, "train_speed(iter/s)": 0.587385 }, { "epoch": 0.7675970873786407, "grad_norm": 6.5983428955078125, "learning_rate": 8.469785817031841e-05, "loss": 2.203810119628906, "memory(GiB)": 40.86, "step": 1265, "token_acc": 0.5412186379928315, "train_speed(iter/s)": 0.587789 }, { "epoch": 0.7706310679611651, "grad_norm": 4.769191265106201, "learning_rate": 8.458330227409168e-05, "loss": 2.432425308227539, "memory(GiB)": 40.86, "step": 1270, "token_acc": 0.4608433734939759, "train_speed(iter/s)": 0.588056 }, { "epoch": 0.7736650485436893, "grad_norm": 8.539231300354004, "learning_rate": 8.446839727860738e-05, "loss": 2.354892539978027, "memory(GiB)": 40.86, "step": 1275, "token_acc": 0.5155555555555555, "train_speed(iter/s)": 0.588053 }, { "epoch": 0.7766990291262136, "grad_norm": 5.209239959716797, "learning_rate": 8.435314434376773e-05, "loss": 2.296826934814453, "memory(GiB)": 40.86, "step": 1280, "token_acc": 0.5085714285714286, "train_speed(iter/s)": 0.587863 }, { "epoch": 0.7797330097087378, "grad_norm": 7.653853893280029, "learning_rate": 8.423754463298717e-05, "loss": 2.117538261413574, "memory(GiB)": 40.86, "step": 1285, "token_acc": 0.5723076923076923, "train_speed(iter/s)": 0.587922 }, { "epoch": 0.7827669902912622, "grad_norm": 7.506109237670898, "learning_rate": 8.412159931318068e-05, "loss": 2.4975624084472656, "memory(GiB)": 40.86, "step": 1290, "token_acc": 0.46710526315789475, "train_speed(iter/s)": 0.587905 }, { "epoch": 0.7858009708737864, "grad_norm": 5.187159538269043, "learning_rate": 8.400530955475198e-05, "loss": 2.2532814025878904, "memory(GiB)": 40.86, "step": 1295, "token_acc": 0.46987951807228917, "train_speed(iter/s)": 0.587699 }, { "epoch": 0.7888349514563107, "grad_norm": 8.4281587600708, "learning_rate": 8.38886765315817e-05, "loss": 2.3919906616210938, "memory(GiB)": 40.86, "step": 1300, "token_acc": 0.5017182130584192, "train_speed(iter/s)": 0.587556 }, { "epoch": 0.7918689320388349, "grad_norm": 6.116622447967529, "learning_rate": 8.377170142101548e-05, "loss": 2.3181718826293944, "memory(GiB)": 40.86, "step": 1305, "token_acc": 0.5061349693251533, "train_speed(iter/s)": 0.58774 }, { "epoch": 0.7949029126213593, "grad_norm": 8.740164756774902, "learning_rate": 8.365438540385223e-05, "loss": 2.1749797821044923, "memory(GiB)": 40.86, "step": 1310, "token_acc": 0.5187713310580204, "train_speed(iter/s)": 0.587607 }, { "epoch": 0.7979368932038835, "grad_norm": 6.935183048248291, "learning_rate": 8.353672966433206e-05, "loss": 2.314193534851074, "memory(GiB)": 40.86, "step": 1315, "token_acc": 0.47039473684210525, "train_speed(iter/s)": 0.587722 }, { "epoch": 0.8009708737864077, "grad_norm": 7.3493475914001465, "learning_rate": 8.341873539012444e-05, "loss": 2.2399951934814455, "memory(GiB)": 40.86, "step": 1320, "token_acc": 0.5111821086261981, "train_speed(iter/s)": 0.587965 }, { "epoch": 0.804004854368932, "grad_norm": 6.552261829376221, "learning_rate": 8.33004037723161e-05, "loss": 2.223754119873047, "memory(GiB)": 40.86, "step": 1325, "token_acc": 0.5283018867924528, "train_speed(iter/s)": 0.588183 }, { "epoch": 0.8070388349514563, "grad_norm": 6.420342445373535, "learning_rate": 8.318173600539911e-05, "loss": 1.9445220947265625, "memory(GiB)": 40.86, "step": 1330, "token_acc": 0.5394736842105263, "train_speed(iter/s)": 0.588461 }, { "epoch": 0.8100728155339806, "grad_norm": 5.923401355743408, "learning_rate": 8.306273328725878e-05, "loss": 2.1622385025024413, "memory(GiB)": 40.86, "step": 1335, "token_acc": 0.5357142857142857, "train_speed(iter/s)": 0.588601 }, { "epoch": 0.8131067961165048, "grad_norm": 7.1788506507873535, "learning_rate": 8.294339681916154e-05, "loss": 2.1121898651123048, "memory(GiB)": 40.86, "step": 1340, "token_acc": 0.496875, "train_speed(iter/s)": 0.588559 }, { "epoch": 0.8161407766990292, "grad_norm": 6.46894645690918, "learning_rate": 8.282372780574285e-05, "loss": 2.207390022277832, "memory(GiB)": 40.86, "step": 1345, "token_acc": 0.5216049382716049, "train_speed(iter/s)": 0.588706 }, { "epoch": 0.8191747572815534, "grad_norm": 7.959349632263184, "learning_rate": 8.270372745499506e-05, "loss": 2.2782615661621093, "memory(GiB)": 40.86, "step": 1350, "token_acc": 0.5174603174603175, "train_speed(iter/s)": 0.588601 }, { "epoch": 0.8222087378640777, "grad_norm": 7.4319939613342285, "learning_rate": 8.258339697825515e-05, "loss": 1.8879600524902345, "memory(GiB)": 40.86, "step": 1355, "token_acc": 0.5580357142857143, "train_speed(iter/s)": 0.588875 }, { "epoch": 0.8252427184466019, "grad_norm": 7.50739860534668, "learning_rate": 8.246273759019252e-05, "loss": 2.3653688430786133, "memory(GiB)": 40.86, "step": 1360, "token_acc": 0.5179856115107914, "train_speed(iter/s)": 0.588976 }, { "epoch": 0.8282766990291263, "grad_norm": 8.38315486907959, "learning_rate": 8.234175050879684e-05, "loss": 2.0219940185546874, "memory(GiB)": 40.86, "step": 1365, "token_acc": 0.5266903914590747, "train_speed(iter/s)": 0.589206 }, { "epoch": 0.8313106796116505, "grad_norm": 5.579223155975342, "learning_rate": 8.222043695536555e-05, "loss": 2.0323202133178713, "memory(GiB)": 41.25, "step": 1370, "token_acc": 0.5419847328244275, "train_speed(iter/s)": 0.588666 }, { "epoch": 0.8343446601941747, "grad_norm": 7.079959392547607, "learning_rate": 8.20987981544917e-05, "loss": 2.245712661743164, "memory(GiB)": 41.25, "step": 1375, "token_acc": 0.5054545454545455, "train_speed(iter/s)": 0.58865 }, { "epoch": 0.837378640776699, "grad_norm": 5.938848972320557, "learning_rate": 8.197683533405157e-05, "loss": 1.959267807006836, "memory(GiB)": 41.25, "step": 1380, "token_acc": 0.5316901408450704, "train_speed(iter/s)": 0.58891 }, { "epoch": 0.8404126213592233, "grad_norm": 8.333083152770996, "learning_rate": 8.185454972519213e-05, "loss": 2.2188604354858397, "memory(GiB)": 41.25, "step": 1385, "token_acc": 0.5415282392026578, "train_speed(iter/s)": 0.589226 }, { "epoch": 0.8434466019417476, "grad_norm": 5.235838413238525, "learning_rate": 8.173194256231884e-05, "loss": 2.312948226928711, "memory(GiB)": 41.25, "step": 1390, "token_acc": 0.48546511627906974, "train_speed(iter/s)": 0.589378 }, { "epoch": 0.8464805825242718, "grad_norm": 9.581235885620117, "learning_rate": 8.1609015083083e-05, "loss": 2.3604787826538085, "memory(GiB)": 41.25, "step": 1395, "token_acc": 0.4927007299270073, "train_speed(iter/s)": 0.589384 }, { "epoch": 0.8495145631067961, "grad_norm": 6.8221611976623535, "learning_rate": 8.148576852836933e-05, "loss": 2.0327474594116213, "memory(GiB)": 41.25, "step": 1400, "token_acc": 0.569620253164557, "train_speed(iter/s)": 0.589095 }, { "epoch": 0.8525485436893204, "grad_norm": 7.140889644622803, "learning_rate": 8.136220414228347e-05, "loss": 2.5129384994506836, "memory(GiB)": 41.25, "step": 1405, "token_acc": 0.4952076677316294, "train_speed(iter/s)": 0.589242 }, { "epoch": 0.8555825242718447, "grad_norm": 5.594088077545166, "learning_rate": 8.123832317213933e-05, "loss": 2.288181686401367, "memory(GiB)": 41.25, "step": 1410, "token_acc": 0.5228758169934641, "train_speed(iter/s)": 0.589415 }, { "epoch": 0.8586165048543689, "grad_norm": 5.7525811195373535, "learning_rate": 8.111412686844664e-05, "loss": 2.288965606689453, "memory(GiB)": 41.25, "step": 1415, "token_acc": 0.5157068062827225, "train_speed(iter/s)": 0.589323 }, { "epoch": 0.8616504854368932, "grad_norm": 9.362752914428711, "learning_rate": 8.098961648489821e-05, "loss": 1.9993032455444335, "memory(GiB)": 41.25, "step": 1420, "token_acc": 0.5448275862068965, "train_speed(iter/s)": 0.589162 }, { "epoch": 0.8646844660194175, "grad_norm": 6.3312764167785645, "learning_rate": 8.08647932783573e-05, "loss": 2.4338268280029296, "memory(GiB)": 41.25, "step": 1425, "token_acc": 0.4863013698630137, "train_speed(iter/s)": 0.589226 }, { "epoch": 0.8677184466019418, "grad_norm": 5.9260172843933105, "learning_rate": 8.073965850884496e-05, "loss": 2.2075326919555662, "memory(GiB)": 41.25, "step": 1430, "token_acc": 0.5230769230769231, "train_speed(iter/s)": 0.589205 }, { "epoch": 0.870752427184466, "grad_norm": 4.957935810089111, "learning_rate": 8.061421343952731e-05, "loss": 2.123280334472656, "memory(GiB)": 41.25, "step": 1435, "token_acc": 0.5446927374301676, "train_speed(iter/s)": 0.589414 }, { "epoch": 0.8737864077669902, "grad_norm": 5.678249359130859, "learning_rate": 8.048845933670273e-05, "loss": 1.9449834823608398, "memory(GiB)": 41.25, "step": 1440, "token_acc": 0.5700934579439252, "train_speed(iter/s)": 0.589581 }, { "epoch": 0.8768203883495146, "grad_norm": 7.655086040496826, "learning_rate": 8.036239746978914e-05, "loss": 2.4002641677856444, "memory(GiB)": 41.25, "step": 1445, "token_acc": 0.4842105263157895, "train_speed(iter/s)": 0.58949 }, { "epoch": 0.8798543689320388, "grad_norm": 6.851123332977295, "learning_rate": 8.02360291113112e-05, "loss": 2.103730392456055, "memory(GiB)": 41.25, "step": 1450, "token_acc": 0.5627009646302251, "train_speed(iter/s)": 0.589624 }, { "epoch": 0.8828883495145631, "grad_norm": 7.437098979949951, "learning_rate": 8.010935553688741e-05, "loss": 2.1862071990966796, "memory(GiB)": 41.25, "step": 1455, "token_acc": 0.5364238410596026, "train_speed(iter/s)": 0.58987 }, { "epoch": 0.8859223300970874, "grad_norm": 7.451559066772461, "learning_rate": 7.998237802521726e-05, "loss": 2.167529296875, "memory(GiB)": 41.25, "step": 1460, "token_acc": 0.5220338983050847, "train_speed(iter/s)": 0.589733 }, { "epoch": 0.8889563106796117, "grad_norm": 5.745720863342285, "learning_rate": 7.985509785806827e-05, "loss": 1.7958356857299804, "memory(GiB)": 41.25, "step": 1465, "token_acc": 0.6163793103448276, "train_speed(iter/s)": 0.589826 }, { "epoch": 0.8919902912621359, "grad_norm": 5.106093406677246, "learning_rate": 7.97275163202632e-05, "loss": 1.7782585144042968, "memory(GiB)": 41.25, "step": 1470, "token_acc": 0.5852842809364549, "train_speed(iter/s)": 0.589876 }, { "epoch": 0.8950242718446602, "grad_norm": 7.241384506225586, "learning_rate": 7.959963469966687e-05, "loss": 2.27147216796875, "memory(GiB)": 41.25, "step": 1475, "token_acc": 0.52, "train_speed(iter/s)": 0.590043 }, { "epoch": 0.8980582524271845, "grad_norm": 7.773332595825195, "learning_rate": 7.947145428717335e-05, "loss": 2.3339469909667967, "memory(GiB)": 41.25, "step": 1480, "token_acc": 0.4868035190615836, "train_speed(iter/s)": 0.59013 }, { "epoch": 0.9010922330097088, "grad_norm": 6.2095866203308105, "learning_rate": 7.934297637669281e-05, "loss": 2.15749568939209, "memory(GiB)": 41.25, "step": 1485, "token_acc": 0.5335120643431636, "train_speed(iter/s)": 0.590232 }, { "epoch": 0.904126213592233, "grad_norm": 9.049623489379883, "learning_rate": 7.921420226513852e-05, "loss": 2.2805938720703125, "memory(GiB)": 41.25, "step": 1490, "token_acc": 0.48771929824561405, "train_speed(iter/s)": 0.590181 }, { "epoch": 0.9071601941747572, "grad_norm": 5.86360502243042, "learning_rate": 7.90851332524137e-05, "loss": 2.204097557067871, "memory(GiB)": 41.25, "step": 1495, "token_acc": 0.5291970802919708, "train_speed(iter/s)": 0.589939 }, { "epoch": 0.9101941747572816, "grad_norm": 6.702127456665039, "learning_rate": 7.895577064139848e-05, "loss": 2.099565124511719, "memory(GiB)": 41.25, "step": 1500, "token_acc": 0.5468164794007491, "train_speed(iter/s)": 0.590107 }, { "epoch": 0.9101941747572816, "eval_loss": 1.9851206541061401, "eval_runtime": 12.4849, "eval_samples_per_second": 8.01, "eval_steps_per_second": 8.01, "eval_token_acc": 0.5260196905766527, "step": 1500 }, { "epoch": 0.9132281553398058, "grad_norm": 7.588992118835449, "learning_rate": 7.882611573793663e-05, "loss": 2.118764877319336, "memory(GiB)": 41.25, "step": 1505, "token_acc": 0.5204795204795205, "train_speed(iter/s)": 0.586984 }, { "epoch": 0.9162621359223301, "grad_norm": 5.986236572265625, "learning_rate": 7.869616985082255e-05, "loss": 2.0279298782348634, "memory(GiB)": 41.25, "step": 1510, "token_acc": 0.5660377358490566, "train_speed(iter/s)": 0.586663 }, { "epoch": 0.9192961165048543, "grad_norm": 7.583939075469971, "learning_rate": 7.856593429178789e-05, "loss": 2.0275857925415037, "memory(GiB)": 41.25, "step": 1515, "token_acc": 0.5351170568561873, "train_speed(iter/s)": 0.586556 }, { "epoch": 0.9223300970873787, "grad_norm": 7.145445823669434, "learning_rate": 7.843541037548838e-05, "loss": 2.181304168701172, "memory(GiB)": 41.25, "step": 1520, "token_acc": 0.5451612903225806, "train_speed(iter/s)": 0.58615 }, { "epoch": 0.9253640776699029, "grad_norm": 9.427350997924805, "learning_rate": 7.830459941949058e-05, "loss": 1.9623226165771483, "memory(GiB)": 41.25, "step": 1525, "token_acc": 0.5575539568345323, "train_speed(iter/s)": 0.585779 }, { "epoch": 0.9283980582524272, "grad_norm": 10.541104316711426, "learning_rate": 7.817350274425856e-05, "loss": 2.2855878829956056, "memory(GiB)": 41.25, "step": 1530, "token_acc": 0.518796992481203, "train_speed(iter/s)": 0.585757 }, { "epoch": 0.9314320388349514, "grad_norm": 8.254549980163574, "learning_rate": 7.804212167314054e-05, "loss": 2.3625198364257813, "memory(GiB)": 41.25, "step": 1535, "token_acc": 0.45938375350140054, "train_speed(iter/s)": 0.585525 }, { "epoch": 0.9344660194174758, "grad_norm": 5.327072620391846, "learning_rate": 7.791045753235555e-05, "loss": 2.1574447631835936, "memory(GiB)": 41.25, "step": 1540, "token_acc": 0.5401234567901234, "train_speed(iter/s)": 0.585161 }, { "epoch": 0.9375, "grad_norm": 6.956089496612549, "learning_rate": 7.777851165098012e-05, "loss": 2.220409965515137, "memory(GiB)": 41.25, "step": 1545, "token_acc": 0.5068027210884354, "train_speed(iter/s)": 0.58532 }, { "epoch": 0.9405339805825242, "grad_norm": 8.668743133544922, "learning_rate": 7.76462853609347e-05, "loss": 2.2191883087158204, "memory(GiB)": 41.25, "step": 1550, "token_acc": 0.5181159420289855, "train_speed(iter/s)": 0.58508 }, { "epoch": 0.9435679611650486, "grad_norm": 6.6736063957214355, "learning_rate": 7.751377999697043e-05, "loss": 2.111481857299805, "memory(GiB)": 41.25, "step": 1555, "token_acc": 0.5555555555555556, "train_speed(iter/s)": 0.585013 }, { "epoch": 0.9466019417475728, "grad_norm": 8.943532943725586, "learning_rate": 7.73809968966554e-05, "loss": 2.2334514617919923, "memory(GiB)": 41.25, "step": 1560, "token_acc": 0.5, "train_speed(iter/s)": 0.585004 }, { "epoch": 0.9496359223300971, "grad_norm": 6.392602443695068, "learning_rate": 7.724793740036142e-05, "loss": 2.3538848876953127, "memory(GiB)": 41.25, "step": 1565, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 0.585018 }, { "epoch": 0.9526699029126213, "grad_norm": 8.349867820739746, "learning_rate": 7.711460285125028e-05, "loss": 1.9792165756225586, "memory(GiB)": 41.25, "step": 1570, "token_acc": 0.5506756756756757, "train_speed(iter/s)": 0.584841 }, { "epoch": 0.9557038834951457, "grad_norm": 6.740106582641602, "learning_rate": 7.698099459526034e-05, "loss": 2.2277217864990235, "memory(GiB)": 41.25, "step": 1575, "token_acc": 0.5065359477124183, "train_speed(iter/s)": 0.584644 }, { "epoch": 0.9587378640776699, "grad_norm": 7.618457317352295, "learning_rate": 7.684711398109284e-05, "loss": 2.152913284301758, "memory(GiB)": 41.25, "step": 1580, "token_acc": 0.5343283582089552, "train_speed(iter/s)": 0.584502 }, { "epoch": 0.9617718446601942, "grad_norm": 5.828027248382568, "learning_rate": 7.67129623601983e-05, "loss": 2.1841548919677733, "memory(GiB)": 41.25, "step": 1585, "token_acc": 0.509493670886076, "train_speed(iter/s)": 0.584655 }, { "epoch": 0.9648058252427184, "grad_norm": 8.393068313598633, "learning_rate": 7.657854108676299e-05, "loss": 2.4885177612304688, "memory(GiB)": 41.25, "step": 1590, "token_acc": 0.48773006134969327, "train_speed(iter/s)": 0.584201 }, { "epoch": 0.9678398058252428, "grad_norm": 6.520992755889893, "learning_rate": 7.644385151769509e-05, "loss": 2.489660453796387, "memory(GiB)": 41.25, "step": 1595, "token_acc": 0.49107142857142855, "train_speed(iter/s)": 0.584289 }, { "epoch": 0.970873786407767, "grad_norm": 5.243824481964111, "learning_rate": 7.630889501261109e-05, "loss": 2.0495643615722656, "memory(GiB)": 41.25, "step": 1600, "token_acc": 0.5572289156626506, "train_speed(iter/s)": 0.584189 }, { "epoch": 0.9739077669902912, "grad_norm": 8.216861724853516, "learning_rate": 7.617367293382211e-05, "loss": 2.7457176208496095, "memory(GiB)": 41.25, "step": 1605, "token_acc": 0.4244791666666667, "train_speed(iter/s)": 0.584051 }, { "epoch": 0.9769417475728155, "grad_norm": 6.738630771636963, "learning_rate": 7.603818664632001e-05, "loss": 2.252565383911133, "memory(GiB)": 41.25, "step": 1610, "token_acc": 0.48986486486486486, "train_speed(iter/s)": 0.584192 }, { "epoch": 0.9799757281553398, "grad_norm": 6.404202938079834, "learning_rate": 7.590243751776374e-05, "loss": 2.2700517654418944, "memory(GiB)": 41.25, "step": 1615, "token_acc": 0.4858757062146893, "train_speed(iter/s)": 0.584132 }, { "epoch": 0.9830097087378641, "grad_norm": 6.124429702758789, "learning_rate": 7.576642691846546e-05, "loss": 2.3936836242675783, "memory(GiB)": 41.25, "step": 1620, "token_acc": 0.5235294117647059, "train_speed(iter/s)": 0.58398 }, { "epoch": 0.9860436893203883, "grad_norm": 7.0240044593811035, "learning_rate": 7.563015622137674e-05, "loss": 2.3892589569091798, "memory(GiB)": 41.25, "step": 1625, "token_acc": 0.46688741721854304, "train_speed(iter/s)": 0.583754 }, { "epoch": 0.9890776699029126, "grad_norm": 6.437112331390381, "learning_rate": 7.549362680207472e-05, "loss": 2.232225036621094, "memory(GiB)": 41.25, "step": 1630, "token_acc": 0.4984025559105431, "train_speed(iter/s)": 0.58366 }, { "epoch": 0.9921116504854369, "grad_norm": 6.010834217071533, "learning_rate": 7.535684003874816e-05, "loss": 2.146392822265625, "memory(GiB)": 41.25, "step": 1635, "token_acc": 0.5261538461538462, "train_speed(iter/s)": 0.58354 }, { "epoch": 0.9951456310679612, "grad_norm": 6.317235946655273, "learning_rate": 7.521979731218356e-05, "loss": 2.3056121826171876, "memory(GiB)": 41.25, "step": 1640, "token_acc": 0.48264984227129337, "train_speed(iter/s)": 0.583292 }, { "epoch": 0.9981796116504854, "grad_norm": 7.453293800354004, "learning_rate": 7.508250000575125e-05, "loss": 2.188218688964844, "memory(GiB)": 41.25, "step": 1645, "token_acc": 0.5512367491166078, "train_speed(iter/s)": 0.583245 }, { "epoch": 1.0012135922330097, "grad_norm": 8.073345184326172, "learning_rate": 7.494494950539143e-05, "loss": 1.7200986862182617, "memory(GiB)": 41.25, "step": 1650, "token_acc": 0.6014760147601476, "train_speed(iter/s)": 0.583281 }, { "epoch": 1.004247572815534, "grad_norm": 6.676420211791992, "learning_rate": 7.480714719960007e-05, "loss": 2.1127391815185548, "memory(GiB)": 41.25, "step": 1655, "token_acc": 0.5288461538461539, "train_speed(iter/s)": 0.583366 }, { "epoch": 1.0072815533980584, "grad_norm": 6.307994842529297, "learning_rate": 7.466909447941508e-05, "loss": 1.8806413650512694, "memory(GiB)": 41.25, "step": 1660, "token_acc": 0.5547703180212014, "train_speed(iter/s)": 0.583445 }, { "epoch": 1.0103155339805825, "grad_norm": 6.221712589263916, "learning_rate": 7.453079273840207e-05, "loss": 2.276551055908203, "memory(GiB)": 41.25, "step": 1665, "token_acc": 0.5133333333333333, "train_speed(iter/s)": 0.583162 }, { "epoch": 1.0133495145631068, "grad_norm": 5.912354469299316, "learning_rate": 7.439224337264043e-05, "loss": 1.9514554977416991, "memory(GiB)": 41.25, "step": 1670, "token_acc": 0.5527950310559007, "train_speed(iter/s)": 0.583074 }, { "epoch": 1.016383495145631, "grad_norm": 7.461360931396484, "learning_rate": 7.425344778070917e-05, "loss": 2.087990951538086, "memory(GiB)": 41.25, "step": 1675, "token_acc": 0.5451713395638629, "train_speed(iter/s)": 0.583019 }, { "epoch": 1.0194174757281553, "grad_norm": 6.206568241119385, "learning_rate": 7.411440736367281e-05, "loss": 2.088376045227051, "memory(GiB)": 41.25, "step": 1680, "token_acc": 0.5496688741721855, "train_speed(iter/s)": 0.58291 }, { "epoch": 1.0224514563106797, "grad_norm": 6.608606338500977, "learning_rate": 7.397512352506727e-05, "loss": 1.6116622924804687, "memory(GiB)": 41.25, "step": 1685, "token_acc": 0.5833333333333334, "train_speed(iter/s)": 0.582982 }, { "epoch": 1.0254854368932038, "grad_norm": 7.508535385131836, "learning_rate": 7.383559767088566e-05, "loss": 1.8518999099731446, "memory(GiB)": 41.25, "step": 1690, "token_acc": 0.5867158671586716, "train_speed(iter/s)": 0.583052 }, { "epoch": 1.0285194174757282, "grad_norm": 6.2956318855285645, "learning_rate": 7.369583120956407e-05, "loss": 2.077930450439453, "memory(GiB)": 41.25, "step": 1695, "token_acc": 0.5295950155763239, "train_speed(iter/s)": 0.583022 }, { "epoch": 1.0315533980582525, "grad_norm": 6.229779243469238, "learning_rate": 7.355582555196745e-05, "loss": 1.6506580352783202, "memory(GiB)": 41.25, "step": 1700, "token_acc": 0.6342182890855457, "train_speed(iter/s)": 0.582892 }, { "epoch": 1.0345873786407767, "grad_norm": 7.167182445526123, "learning_rate": 7.341558211137526e-05, "loss": 2.1481195449829102, "memory(GiB)": 41.25, "step": 1705, "token_acc": 0.49226006191950467, "train_speed(iter/s)": 0.582852 }, { "epoch": 1.037621359223301, "grad_norm": 7.526867866516113, "learning_rate": 7.327510230346726e-05, "loss": 2.0346538543701174, "memory(GiB)": 41.25, "step": 1710, "token_acc": 0.5077399380804953, "train_speed(iter/s)": 0.582708 }, { "epoch": 1.0406553398058251, "grad_norm": 6.285158634185791, "learning_rate": 7.313438754630918e-05, "loss": 2.084914779663086, "memory(GiB)": 41.25, "step": 1715, "token_acc": 0.5326797385620915, "train_speed(iter/s)": 0.58263 }, { "epoch": 1.0436893203883495, "grad_norm": 5.3016252517700195, "learning_rate": 7.299343926033851e-05, "loss": 1.8931154251098632, "memory(GiB)": 41.25, "step": 1720, "token_acc": 0.5520504731861199, "train_speed(iter/s)": 0.582235 }, { "epoch": 1.0467233009708738, "grad_norm": 6.363744258880615, "learning_rate": 7.285225886834997e-05, "loss": 2.1936279296875, "memory(GiB)": 41.25, "step": 1725, "token_acc": 0.49683544303797467, "train_speed(iter/s)": 0.582259 }, { "epoch": 1.049757281553398, "grad_norm": 6.571318626403809, "learning_rate": 7.271084779548136e-05, "loss": 2.0733669281005858, "memory(GiB)": 41.25, "step": 1730, "token_acc": 0.5579937304075235, "train_speed(iter/s)": 0.582202 }, { "epoch": 1.0527912621359223, "grad_norm": 7.151698589324951, "learning_rate": 7.256920746919904e-05, "loss": 2.2026699066162108, "memory(GiB)": 41.25, "step": 1735, "token_acc": 0.5150375939849624, "train_speed(iter/s)": 0.58212 }, { "epoch": 1.0558252427184467, "grad_norm": 6.636294364929199, "learning_rate": 7.242733931928352e-05, "loss": 2.145404052734375, "memory(GiB)": 41.25, "step": 1740, "token_acc": 0.49221183800623053, "train_speed(iter/s)": 0.582037 }, { "epoch": 1.0588592233009708, "grad_norm": 6.21516227722168, "learning_rate": 7.228524477781514e-05, "loss": 1.6696731567382812, "memory(GiB)": 41.25, "step": 1745, "token_acc": 0.6295081967213115, "train_speed(iter/s)": 0.581842 }, { "epoch": 1.0618932038834952, "grad_norm": 6.904699802398682, "learning_rate": 7.214292527915949e-05, "loss": 1.995549201965332, "memory(GiB)": 41.25, "step": 1750, "token_acc": 0.5806451612903226, "train_speed(iter/s)": 0.581695 }, { "epoch": 1.0649271844660193, "grad_norm": 4.713315963745117, "learning_rate": 7.200038225995294e-05, "loss": 2.3474475860595705, "memory(GiB)": 41.25, "step": 1755, "token_acc": 0.4887005649717514, "train_speed(iter/s)": 0.581706 }, { "epoch": 1.0679611650485437, "grad_norm": 8.901693344116211, "learning_rate": 7.185761715908825e-05, "loss": 2.004246139526367, "memory(GiB)": 41.25, "step": 1760, "token_acc": 0.5867158671586716, "train_speed(iter/s)": 0.581409 }, { "epoch": 1.070995145631068, "grad_norm": 6.650726318359375, "learning_rate": 7.171463141769994e-05, "loss": 2.21859130859375, "memory(GiB)": 41.25, "step": 1765, "token_acc": 0.5466666666666666, "train_speed(iter/s)": 0.581411 }, { "epoch": 1.0740291262135921, "grad_norm": 7.826591968536377, "learning_rate": 7.157142647914979e-05, "loss": 2.0319658279418946, "memory(GiB)": 41.25, "step": 1770, "token_acc": 0.5594202898550724, "train_speed(iter/s)": 0.581305 }, { "epoch": 1.0770631067961165, "grad_norm": 6.98701286315918, "learning_rate": 7.14280037890122e-05, "loss": 1.9901140213012696, "memory(GiB)": 41.25, "step": 1775, "token_acc": 0.551829268292683, "train_speed(iter/s)": 0.581264 }, { "epoch": 1.0800970873786409, "grad_norm": 6.480953693389893, "learning_rate": 7.128436479505971e-05, "loss": 2.1239852905273438, "memory(GiB)": 41.25, "step": 1780, "token_acc": 0.5121359223300971, "train_speed(iter/s)": 0.581217 }, { "epoch": 1.083131067961165, "grad_norm": 5.683126449584961, "learning_rate": 7.114051094724831e-05, "loss": 2.0841569900512695, "memory(GiB)": 41.25, "step": 1785, "token_acc": 0.5318352059925093, "train_speed(iter/s)": 0.581099 }, { "epoch": 1.0861650485436893, "grad_norm": 5.394412517547607, "learning_rate": 7.09964436977028e-05, "loss": 1.9973236083984376, "memory(GiB)": 41.25, "step": 1790, "token_acc": 0.541095890410959, "train_speed(iter/s)": 0.580947 }, { "epoch": 1.0891990291262137, "grad_norm": 5.046519756317139, "learning_rate": 7.085216450070218e-05, "loss": 2.029042053222656, "memory(GiB)": 41.25, "step": 1795, "token_acc": 0.5591054313099042, "train_speed(iter/s)": 0.580934 }, { "epoch": 1.0922330097087378, "grad_norm": 6.593071460723877, "learning_rate": 7.070767481266492e-05, "loss": 1.8102890014648438, "memory(GiB)": 41.25, "step": 1800, "token_acc": 0.5718654434250765, "train_speed(iter/s)": 0.581028 }, { "epoch": 1.0952669902912622, "grad_norm": 7.305717945098877, "learning_rate": 7.056297609213432e-05, "loss": 1.9902324676513672, "memory(GiB)": 41.25, "step": 1805, "token_acc": 0.5699300699300699, "train_speed(iter/s)": 0.581249 }, { "epoch": 1.0983009708737863, "grad_norm": 7.886199474334717, "learning_rate": 7.041806979976368e-05, "loss": 2.2953224182128906, "memory(GiB)": 41.25, "step": 1810, "token_acc": 0.5, "train_speed(iter/s)": 0.581276 }, { "epoch": 1.1013349514563107, "grad_norm": 10.443878173828125, "learning_rate": 7.027295739830169e-05, "loss": 2.220531463623047, "memory(GiB)": 41.25, "step": 1815, "token_acc": 0.5054945054945055, "train_speed(iter/s)": 0.581467 }, { "epoch": 1.104368932038835, "grad_norm": 8.019064903259277, "learning_rate": 7.012764035257756e-05, "loss": 2.4718793869018554, "memory(GiB)": 41.25, "step": 1820, "token_acc": 0.47619047619047616, "train_speed(iter/s)": 0.581659 }, { "epoch": 1.1074029126213591, "grad_norm": 7.334555625915527, "learning_rate": 6.998212012948626e-05, "loss": 1.9244306564331055, "memory(GiB)": 41.25, "step": 1825, "token_acc": 0.5625, "train_speed(iter/s)": 0.581498 }, { "epoch": 1.1104368932038835, "grad_norm": 10.03096866607666, "learning_rate": 6.983639819797377e-05, "loss": 2.2340341567993165, "memory(GiB)": 41.25, "step": 1830, "token_acc": 0.5136986301369864, "train_speed(iter/s)": 0.581297 }, { "epoch": 1.1134708737864079, "grad_norm": 8.886280059814453, "learning_rate": 6.969047602902213e-05, "loss": 2.0593013763427734, "memory(GiB)": 41.25, "step": 1835, "token_acc": 0.5460526315789473, "train_speed(iter/s)": 0.581181 }, { "epoch": 1.116504854368932, "grad_norm": 7.363580703735352, "learning_rate": 6.954435509563478e-05, "loss": 1.8324342727661134, "memory(GiB)": 41.25, "step": 1840, "token_acc": 0.5607142857142857, "train_speed(iter/s)": 0.581259 }, { "epoch": 1.1195388349514563, "grad_norm": 8.011999130249023, "learning_rate": 6.939803687282146e-05, "loss": 2.3135982513427735, "memory(GiB)": 41.25, "step": 1845, "token_acc": 0.4666666666666667, "train_speed(iter/s)": 0.581014 }, { "epoch": 1.1225728155339807, "grad_norm": 5.767248630523682, "learning_rate": 6.925152283758348e-05, "loss": 1.8407760620117188, "memory(GiB)": 41.25, "step": 1850, "token_acc": 0.5792880258899676, "train_speed(iter/s)": 0.58124 }, { "epoch": 1.1256067961165048, "grad_norm": 6.498402118682861, "learning_rate": 6.91048144688988e-05, "loss": 2.21679573059082, "memory(GiB)": 41.25, "step": 1855, "token_acc": 0.5419354838709678, "train_speed(iter/s)": 0.581462 }, { "epoch": 1.1286407766990292, "grad_norm": 6.394837379455566, "learning_rate": 6.895791324770701e-05, "loss": 1.947611427307129, "memory(GiB)": 41.25, "step": 1860, "token_acc": 0.534375, "train_speed(iter/s)": 0.581612 }, { "epoch": 1.1316747572815533, "grad_norm": 11.657426834106445, "learning_rate": 6.881082065689453e-05, "loss": 2.234457015991211, "memory(GiB)": 41.25, "step": 1865, "token_acc": 0.543859649122807, "train_speed(iter/s)": 0.581654 }, { "epoch": 1.1347087378640777, "grad_norm": 7.383030414581299, "learning_rate": 6.866353818127942e-05, "loss": 2.1067886352539062, "memory(GiB)": 41.25, "step": 1870, "token_acc": 0.5800711743772242, "train_speed(iter/s)": 0.581704 }, { "epoch": 1.137742718446602, "grad_norm": 6.467532157897949, "learning_rate": 6.851606730759664e-05, "loss": 2.12357234954834, "memory(GiB)": 41.25, "step": 1875, "token_acc": 0.5160256410256411, "train_speed(iter/s)": 0.581655 }, { "epoch": 1.1407766990291262, "grad_norm": 7.949151992797852, "learning_rate": 6.836840952448285e-05, "loss": 2.1536586761474608, "memory(GiB)": 41.25, "step": 1880, "token_acc": 0.5520833333333334, "train_speed(iter/s)": 0.581744 }, { "epoch": 1.1438106796116505, "grad_norm": 7.234400749206543, "learning_rate": 6.82205663224615e-05, "loss": 2.2570121765136717, "memory(GiB)": 41.25, "step": 1885, "token_acc": 0.5168195718654435, "train_speed(iter/s)": 0.581831 }, { "epoch": 1.1468446601941746, "grad_norm": 6.600982189178467, "learning_rate": 6.807253919392773e-05, "loss": 1.9843761444091796, "memory(GiB)": 41.25, "step": 1890, "token_acc": 0.5444839857651246, "train_speed(iter/s)": 0.581694 }, { "epoch": 1.149878640776699, "grad_norm": 7.820127010345459, "learning_rate": 6.792432963313328e-05, "loss": 2.096297836303711, "memory(GiB)": 41.25, "step": 1895, "token_acc": 0.5566037735849056, "train_speed(iter/s)": 0.581753 }, { "epoch": 1.1529126213592233, "grad_norm": 6.915624618530273, "learning_rate": 6.777593913617152e-05, "loss": 2.108437156677246, "memory(GiB)": 41.25, "step": 1900, "token_acc": 0.5073313782991202, "train_speed(iter/s)": 0.581827 }, { "epoch": 1.1559466019417475, "grad_norm": 7.475584030151367, "learning_rate": 6.762736920096218e-05, "loss": 2.277429389953613, "memory(GiB)": 41.25, "step": 1905, "token_acc": 0.5014577259475219, "train_speed(iter/s)": 0.581845 }, { "epoch": 1.1589805825242718, "grad_norm": 7.104306221008301, "learning_rate": 6.747862132723641e-05, "loss": 2.067903518676758, "memory(GiB)": 41.25, "step": 1910, "token_acc": 0.5628930817610063, "train_speed(iter/s)": 0.581719 }, { "epoch": 1.1620145631067962, "grad_norm": 8.869878768920898, "learning_rate": 6.732969701652145e-05, "loss": 2.2940914154052736, "memory(GiB)": 41.25, "step": 1915, "token_acc": 0.5511551155115512, "train_speed(iter/s)": 0.58164 }, { "epoch": 1.1650485436893203, "grad_norm": 7.5197248458862305, "learning_rate": 6.718059777212567e-05, "loss": 2.0857444763183595, "memory(GiB)": 41.25, "step": 1920, "token_acc": 0.5338645418326693, "train_speed(iter/s)": 0.581495 }, { "epoch": 1.1680825242718447, "grad_norm": 6.92659854888916, "learning_rate": 6.703132509912322e-05, "loss": 1.807958221435547, "memory(GiB)": 41.25, "step": 1925, "token_acc": 0.5786350148367952, "train_speed(iter/s)": 0.581481 }, { "epoch": 1.171116504854369, "grad_norm": 7.253981113433838, "learning_rate": 6.688188050433897e-05, "loss": 1.9212162017822265, "memory(GiB)": 41.25, "step": 1930, "token_acc": 0.5470383275261324, "train_speed(iter/s)": 0.581673 }, { "epoch": 1.1741504854368932, "grad_norm": 7.32392692565918, "learning_rate": 6.673226549633325e-05, "loss": 2.0752506256103516, "memory(GiB)": 41.25, "step": 1935, "token_acc": 0.5434782608695652, "train_speed(iter/s)": 0.581879 }, { "epoch": 1.1771844660194175, "grad_norm": 6.774953842163086, "learning_rate": 6.658248158538655e-05, "loss": 2.022067832946777, "memory(GiB)": 41.25, "step": 1940, "token_acc": 0.5303430079155673, "train_speed(iter/s)": 0.582004 }, { "epoch": 1.1802184466019416, "grad_norm": 8.567710876464844, "learning_rate": 6.643253028348443e-05, "loss": 1.9163774490356444, "memory(GiB)": 41.25, "step": 1945, "token_acc": 0.5769230769230769, "train_speed(iter/s)": 0.581884 }, { "epoch": 1.183252427184466, "grad_norm": 7.197096347808838, "learning_rate": 6.628241310430208e-05, "loss": 1.9915233612060548, "memory(GiB)": 41.25, "step": 1950, "token_acc": 0.5397923875432526, "train_speed(iter/s)": 0.58194 }, { "epoch": 1.1862864077669903, "grad_norm": 7.874612808227539, "learning_rate": 6.613213156318921e-05, "loss": 2.039535331726074, "memory(GiB)": 41.25, "step": 1955, "token_acc": 0.5563380281690141, "train_speed(iter/s)": 0.58199 }, { "epoch": 1.1893203883495145, "grad_norm": 6.794829368591309, "learning_rate": 6.598168717715462e-05, "loss": 2.182103729248047, "memory(GiB)": 41.25, "step": 1960, "token_acc": 0.48936170212765956, "train_speed(iter/s)": 0.581769 }, { "epoch": 1.1923543689320388, "grad_norm": 8.138648986816406, "learning_rate": 6.583108146485092e-05, "loss": 2.205635833740234, "memory(GiB)": 41.25, "step": 1965, "token_acc": 0.5301507537688442, "train_speed(iter/s)": 0.581594 }, { "epoch": 1.1953883495145632, "grad_norm": 5.8334197998046875, "learning_rate": 6.568031594655933e-05, "loss": 2.1141899108886717, "memory(GiB)": 41.25, "step": 1970, "token_acc": 0.5442622950819672, "train_speed(iter/s)": 0.581474 }, { "epoch": 1.1984223300970873, "grad_norm": 6.450995922088623, "learning_rate": 6.552939214417411e-05, "loss": 2.0908193588256836, "memory(GiB)": 41.25, "step": 1975, "token_acc": 0.5270758122743683, "train_speed(iter/s)": 0.581538 }, { "epoch": 1.2014563106796117, "grad_norm": 5.936161041259766, "learning_rate": 6.537831158118732e-05, "loss": 2.2035654067993162, "memory(GiB)": 41.25, "step": 1980, "token_acc": 0.5281899109792285, "train_speed(iter/s)": 0.581735 }, { "epoch": 1.204490291262136, "grad_norm": 6.0731282234191895, "learning_rate": 6.522707578267349e-05, "loss": 2.015408515930176, "memory(GiB)": 41.25, "step": 1985, "token_acc": 0.5625, "train_speed(iter/s)": 0.581728 }, { "epoch": 1.2075242718446602, "grad_norm": 8.126087188720703, "learning_rate": 6.507568627527411e-05, "loss": 2.233916091918945, "memory(GiB)": 41.25, "step": 1990, "token_acc": 0.5214899713467048, "train_speed(iter/s)": 0.581678 }, { "epoch": 1.2105582524271845, "grad_norm": 5.780952453613281, "learning_rate": 6.492414458718235e-05, "loss": 2.153764533996582, "memory(GiB)": 41.25, "step": 1995, "token_acc": 0.5223880597014925, "train_speed(iter/s)": 0.581876 }, { "epoch": 1.2135922330097086, "grad_norm": 6.646781921386719, "learning_rate": 6.477245224812745e-05, "loss": 2.137297439575195, "memory(GiB)": 41.25, "step": 2000, "token_acc": 0.5310880829015544, "train_speed(iter/s)": 0.581927 }, { "epoch": 1.2135922330097086, "eval_loss": 2.180868148803711, "eval_runtime": 12.0025, "eval_samples_per_second": 8.332, "eval_steps_per_second": 8.332, "eval_token_acc": 0.5036284470246735, "step": 2000 }, { "epoch": 1.216626213592233, "grad_norm": 6.332268238067627, "learning_rate": 6.462061078935951e-05, "loss": 2.0248859405517576, "memory(GiB)": 41.25, "step": 2005, "token_acc": 0.5138888888888888, "train_speed(iter/s)": 0.579819 }, { "epoch": 1.2196601941747574, "grad_norm": 9.277915954589844, "learning_rate": 6.446862174363378e-05, "loss": 2.223433494567871, "memory(GiB)": 41.25, "step": 2010, "token_acc": 0.5347985347985348, "train_speed(iter/s)": 0.579723 }, { "epoch": 1.2226941747572815, "grad_norm": 6.857091903686523, "learning_rate": 6.431648664519544e-05, "loss": 2.093130111694336, "memory(GiB)": 41.25, "step": 2015, "token_acc": 0.5523809523809524, "train_speed(iter/s)": 0.579662 }, { "epoch": 1.2257281553398058, "grad_norm": 7.251791000366211, "learning_rate": 6.416420702976393e-05, "loss": 2.4163230895996093, "memory(GiB)": 41.25, "step": 2020, "token_acc": 0.5063291139240507, "train_speed(iter/s)": 0.579631 }, { "epoch": 1.2287621359223302, "grad_norm": 6.369975566864014, "learning_rate": 6.401178443451751e-05, "loss": 1.8332990646362304, "memory(GiB)": 41.25, "step": 2025, "token_acc": 0.5765124555160143, "train_speed(iter/s)": 0.579685 }, { "epoch": 1.2317961165048543, "grad_norm": 12.884454727172852, "learning_rate": 6.385922039807773e-05, "loss": 1.9554672241210938, "memory(GiB)": 41.25, "step": 2030, "token_acc": 0.5648148148148148, "train_speed(iter/s)": 0.579632 }, { "epoch": 1.2348300970873787, "grad_norm": 9.875422477722168, "learning_rate": 6.370651646049398e-05, "loss": 2.229812812805176, "memory(GiB)": 41.25, "step": 2035, "token_acc": 0.49691358024691357, "train_speed(iter/s)": 0.579766 }, { "epoch": 1.237864077669903, "grad_norm": 5.669778823852539, "learning_rate": 6.355367416322779e-05, "loss": 1.7003231048583984, "memory(GiB)": 41.25, "step": 2040, "token_acc": 0.5830508474576271, "train_speed(iter/s)": 0.579765 }, { "epoch": 1.2408980582524272, "grad_norm": 6.894186019897461, "learning_rate": 6.340069504913737e-05, "loss": 2.091649627685547, "memory(GiB)": 41.25, "step": 2045, "token_acc": 0.5504885993485342, "train_speed(iter/s)": 0.579827 }, { "epoch": 1.2439320388349515, "grad_norm": 8.025986671447754, "learning_rate": 6.324758066246211e-05, "loss": 2.0427883148193358, "memory(GiB)": 41.25, "step": 2050, "token_acc": 0.5252225519287834, "train_speed(iter/s)": 0.579952 }, { "epoch": 1.2469660194174756, "grad_norm": 6.996369361877441, "learning_rate": 6.309433254880675e-05, "loss": 2.1355659484863283, "memory(GiB)": 41.25, "step": 2055, "token_acc": 0.49606299212598426, "train_speed(iter/s)": 0.580029 }, { "epoch": 1.25, "grad_norm": 9.216190338134766, "learning_rate": 6.294095225512603e-05, "loss": 2.045370101928711, "memory(GiB)": 41.25, "step": 2060, "token_acc": 0.5151515151515151, "train_speed(iter/s)": 0.579931 }, { "epoch": 1.2530339805825244, "grad_norm": 8.278094291687012, "learning_rate": 6.278744132970899e-05, "loss": 1.7628780364990235, "memory(GiB)": 41.25, "step": 2065, "token_acc": 0.6045016077170418, "train_speed(iter/s)": 0.57996 }, { "epoch": 1.2560679611650485, "grad_norm": 6.4922027587890625, "learning_rate": 6.263380132216328e-05, "loss": 2.0872188568115235, "memory(GiB)": 41.25, "step": 2070, "token_acc": 0.5318471337579618, "train_speed(iter/s)": 0.580006 }, { "epoch": 1.2591019417475728, "grad_norm": 9.755973815917969, "learning_rate": 6.248003378339958e-05, "loss": 2.043658638000488, "memory(GiB)": 41.25, "step": 2075, "token_acc": 0.521594684385382, "train_speed(iter/s)": 0.579989 }, { "epoch": 1.262135922330097, "grad_norm": 9.78760051727295, "learning_rate": 6.232614026561587e-05, "loss": 2.1496110916137696, "memory(GiB)": 41.25, "step": 2080, "token_acc": 0.5136054421768708, "train_speed(iter/s)": 0.579983 }, { "epoch": 1.2651699029126213, "grad_norm": 6.134158611297607, "learning_rate": 6.217212232228189e-05, "loss": 1.965431022644043, "memory(GiB)": 41.25, "step": 2085, "token_acc": 0.5578231292517006, "train_speed(iter/s)": 0.579816 }, { "epoch": 1.2682038834951457, "grad_norm": 6.624486446380615, "learning_rate": 6.201798150812338e-05, "loss": 2.282021713256836, "memory(GiB)": 41.25, "step": 2090, "token_acc": 0.48556430446194226, "train_speed(iter/s)": 0.579749 }, { "epoch": 1.27123786407767, "grad_norm": 7.1900739669799805, "learning_rate": 6.186371937910637e-05, "loss": 2.047537994384766, "memory(GiB)": 41.25, "step": 2095, "token_acc": 0.4966442953020134, "train_speed(iter/s)": 0.579939 }, { "epoch": 1.2742718446601942, "grad_norm": 6.147539138793945, "learning_rate": 6.170933749242152e-05, "loss": 2.319692039489746, "memory(GiB)": 41.25, "step": 2100, "token_acc": 0.5370370370370371, "train_speed(iter/s)": 0.580086 }, { "epoch": 1.2773058252427185, "grad_norm": 7.209454536437988, "learning_rate": 6.155483740646832e-05, "loss": 2.322870445251465, "memory(GiB)": 41.25, "step": 2105, "token_acc": 0.521865889212828, "train_speed(iter/s)": 0.580276 }, { "epoch": 1.2803398058252426, "grad_norm": 6.588000297546387, "learning_rate": 6.140022068083948e-05, "loss": 2.015561103820801, "memory(GiB)": 41.25, "step": 2110, "token_acc": 0.5344129554655871, "train_speed(iter/s)": 0.5802 }, { "epoch": 1.283373786407767, "grad_norm": 9.121885299682617, "learning_rate": 6.124548887630508e-05, "loss": 2.019037628173828, "memory(GiB)": 41.25, "step": 2115, "token_acc": 0.5336927223719676, "train_speed(iter/s)": 0.580255 }, { "epoch": 1.2864077669902914, "grad_norm": 10.898550987243652, "learning_rate": 6.109064355479692e-05, "loss": 1.740947151184082, "memory(GiB)": 41.25, "step": 2120, "token_acc": 0.5368421052631579, "train_speed(iter/s)": 0.580136 }, { "epoch": 1.2894417475728155, "grad_norm": 6.376506328582764, "learning_rate": 6.093568627939261e-05, "loss": 1.9328853607177734, "memory(GiB)": 41.34, "step": 2125, "token_acc": 0.55, "train_speed(iter/s)": 0.579925 }, { "epoch": 1.2924757281553398, "grad_norm": 7.9046525955200195, "learning_rate": 6.078061861429995e-05, "loss": 2.187295913696289, "memory(GiB)": 41.34, "step": 2130, "token_acc": 0.5116279069767442, "train_speed(iter/s)": 0.580068 }, { "epoch": 1.295509708737864, "grad_norm": 6.604916095733643, "learning_rate": 6.062544212484096e-05, "loss": 2.0762821197509767, "memory(GiB)": 41.34, "step": 2135, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 0.58009 }, { "epoch": 1.2985436893203883, "grad_norm": 7.367359638214111, "learning_rate": 6.047015837743629e-05, "loss": 2.126904106140137, "memory(GiB)": 41.34, "step": 2140, "token_acc": 0.5333333333333333, "train_speed(iter/s)": 0.580247 }, { "epoch": 1.3015776699029127, "grad_norm": 5.810800552368164, "learning_rate": 6.031476893958926e-05, "loss": 1.7963878631591796, "memory(GiB)": 41.34, "step": 2145, "token_acc": 0.5631399317406144, "train_speed(iter/s)": 0.580352 }, { "epoch": 1.3046116504854368, "grad_norm": 6.407706260681152, "learning_rate": 6.015927537987004e-05, "loss": 2.1866846084594727, "memory(GiB)": 41.34, "step": 2150, "token_acc": 0.5258855585831063, "train_speed(iter/s)": 0.5802 }, { "epoch": 1.3076456310679612, "grad_norm": 7.020833969116211, "learning_rate": 6.0003679267899904e-05, "loss": 1.8915981292724608, "memory(GiB)": 41.34, "step": 2155, "token_acc": 0.5571428571428572, "train_speed(iter/s)": 0.580155 }, { "epoch": 1.3106796116504853, "grad_norm": 8.229516983032227, "learning_rate": 5.9847982174335316e-05, "loss": 1.890799331665039, "memory(GiB)": 41.34, "step": 2160, "token_acc": 0.5424836601307189, "train_speed(iter/s)": 0.579946 }, { "epoch": 1.3137135922330097, "grad_norm": 6.056339263916016, "learning_rate": 5.969218567085206e-05, "loss": 2.39956111907959, "memory(GiB)": 41.34, "step": 2165, "token_acc": 0.49453551912568305, "train_speed(iter/s)": 0.580006 }, { "epoch": 1.316747572815534, "grad_norm": 7.4000468254089355, "learning_rate": 5.953629133012949e-05, "loss": 2.256308937072754, "memory(GiB)": 41.34, "step": 2170, "token_acc": 0.5133689839572193, "train_speed(iter/s)": 0.579824 }, { "epoch": 1.3197815533980584, "grad_norm": 6.835947513580322, "learning_rate": 5.938030072583447e-05, "loss": 1.8971139907836914, "memory(GiB)": 41.34, "step": 2175, "token_acc": 0.542319749216301, "train_speed(iter/s)": 0.57985 }, { "epoch": 1.3228155339805825, "grad_norm": 8.275431632995605, "learning_rate": 5.922421543260567e-05, "loss": 1.7686073303222656, "memory(GiB)": 41.34, "step": 2180, "token_acc": 0.5703971119133574, "train_speed(iter/s)": 0.579752 }, { "epoch": 1.3258495145631068, "grad_norm": 7.795175552368164, "learning_rate": 5.906803702603755e-05, "loss": 1.9470417022705078, "memory(GiB)": 41.34, "step": 2185, "token_acc": 0.5381944444444444, "train_speed(iter/s)": 0.57989 }, { "epoch": 1.328883495145631, "grad_norm": 5.923962593078613, "learning_rate": 5.891176708266454e-05, "loss": 2.17016716003418, "memory(GiB)": 41.34, "step": 2190, "token_acc": 0.5444126074498568, "train_speed(iter/s)": 0.579998 }, { "epoch": 1.3319174757281553, "grad_norm": 7.121251106262207, "learning_rate": 5.875540717994503e-05, "loss": 1.586796760559082, "memory(GiB)": 41.34, "step": 2195, "token_acc": 0.6234817813765182, "train_speed(iter/s)": 0.579935 }, { "epoch": 1.3349514563106797, "grad_norm": 7.5099921226501465, "learning_rate": 5.859895889624554e-05, "loss": 1.777475357055664, "memory(GiB)": 41.34, "step": 2200, "token_acc": 0.5938566552901023, "train_speed(iter/s)": 0.580098 }, { "epoch": 1.3379854368932038, "grad_norm": 8.970749855041504, "learning_rate": 5.84424238108247e-05, "loss": 1.670484733581543, "memory(GiB)": 41.34, "step": 2205, "token_acc": 0.5873015873015873, "train_speed(iter/s)": 0.580043 }, { "epoch": 1.3410194174757282, "grad_norm": 6.932069778442383, "learning_rate": 5.8285803503817425e-05, "loss": 2.056923675537109, "memory(GiB)": 41.34, "step": 2210, "token_acc": 0.5124653739612188, "train_speed(iter/s)": 0.580004 }, { "epoch": 1.3440533980582523, "grad_norm": 7.549715518951416, "learning_rate": 5.812909955621886e-05, "loss": 1.9996042251586914, "memory(GiB)": 41.34, "step": 2215, "token_acc": 0.5565749235474006, "train_speed(iter/s)": 0.579986 }, { "epoch": 1.3470873786407767, "grad_norm": 8.340503692626953, "learning_rate": 5.7972313549868415e-05, "loss": 2.207027816772461, "memory(GiB)": 41.34, "step": 2220, "token_acc": 0.4897959183673469, "train_speed(iter/s)": 0.579916 }, { "epoch": 1.350121359223301, "grad_norm": 6.941786766052246, "learning_rate": 5.7815447067433917e-05, "loss": 1.7856271743774415, "memory(GiB)": 41.34, "step": 2225, "token_acc": 0.5862068965517241, "train_speed(iter/s)": 0.579928 }, { "epoch": 1.3531553398058254, "grad_norm": 5.413527488708496, "learning_rate": 5.7658501692395475e-05, "loss": 1.8429689407348633, "memory(GiB)": 41.34, "step": 2230, "token_acc": 0.6061643835616438, "train_speed(iter/s)": 0.579742 }, { "epoch": 1.3561893203883495, "grad_norm": 6.279661655426025, "learning_rate": 5.7501479009029636e-05, "loss": 1.8153335571289062, "memory(GiB)": 41.34, "step": 2235, "token_acc": 0.5572289156626506, "train_speed(iter/s)": 0.57984 }, { "epoch": 1.3592233009708738, "grad_norm": 7.204460620880127, "learning_rate": 5.734438060239331e-05, "loss": 2.255967712402344, "memory(GiB)": 41.34, "step": 2240, "token_acc": 0.513595166163142, "train_speed(iter/s)": 0.579902 }, { "epoch": 1.362257281553398, "grad_norm": 7.191935062408447, "learning_rate": 5.718720805830777e-05, "loss": 2.1052494049072266, "memory(GiB)": 41.34, "step": 2245, "token_acc": 0.533724340175953, "train_speed(iter/s)": 0.579865 }, { "epoch": 1.3652912621359223, "grad_norm": 9.75123119354248, "learning_rate": 5.70299629633427e-05, "loss": 2.176554489135742, "memory(GiB)": 41.34, "step": 2250, "token_acc": 0.5303514376996805, "train_speed(iter/s)": 0.579795 }, { "epoch": 1.3683252427184467, "grad_norm": 8.081015586853027, "learning_rate": 5.687264690480014e-05, "loss": 2.253178024291992, "memory(GiB)": 41.34, "step": 2255, "token_acc": 0.5040431266846361, "train_speed(iter/s)": 0.579802 }, { "epoch": 1.3713592233009708, "grad_norm": 5.86273193359375, "learning_rate": 5.6715261470698434e-05, "loss": 2.2541793823242187, "memory(GiB)": 41.34, "step": 2260, "token_acc": 0.5361842105263158, "train_speed(iter/s)": 0.57974 }, { "epoch": 1.3743932038834952, "grad_norm": 6.653288841247559, "learning_rate": 5.655780824975628e-05, "loss": 2.219985008239746, "memory(GiB)": 41.34, "step": 2265, "token_acc": 0.5471014492753623, "train_speed(iter/s)": 0.579644 }, { "epoch": 1.3774271844660193, "grad_norm": 9.517049789428711, "learning_rate": 5.6400288831376604e-05, "loss": 2.2441757202148436, "memory(GiB)": 41.34, "step": 2270, "token_acc": 0.5112359550561798, "train_speed(iter/s)": 0.579692 }, { "epoch": 1.3804611650485437, "grad_norm": 6.048003673553467, "learning_rate": 5.624270480563059e-05, "loss": 2.186481475830078, "memory(GiB)": 41.34, "step": 2275, "token_acc": 0.5155875299760192, "train_speed(iter/s)": 0.57971 }, { "epoch": 1.383495145631068, "grad_norm": 7.275609970092773, "learning_rate": 5.608505776324158e-05, "loss": 2.20775146484375, "memory(GiB)": 41.34, "step": 2280, "token_acc": 0.5342465753424658, "train_speed(iter/s)": 0.57976 }, { "epoch": 1.3865291262135924, "grad_norm": 7.088268280029297, "learning_rate": 5.592734929556907e-05, "loss": 1.7822921752929688, "memory(GiB)": 41.34, "step": 2285, "token_acc": 0.610223642172524, "train_speed(iter/s)": 0.57981 }, { "epoch": 1.3895631067961165, "grad_norm": 6.6104207038879395, "learning_rate": 5.576958099459254e-05, "loss": 2.022065353393555, "memory(GiB)": 41.34, "step": 2290, "token_acc": 0.5352941176470588, "train_speed(iter/s)": 0.579726 }, { "epoch": 1.3925970873786409, "grad_norm": 7.773556709289551, "learning_rate": 5.5611754452895516e-05, "loss": 1.8300546646118163, "memory(GiB)": 41.34, "step": 2295, "token_acc": 0.577922077922078, "train_speed(iter/s)": 0.579707 }, { "epoch": 1.395631067961165, "grad_norm": 7.439202785491943, "learning_rate": 5.5453871263649395e-05, "loss": 1.970297622680664, "memory(GiB)": 41.84, "step": 2300, "token_acc": 0.6112852664576802, "train_speed(iter/s)": 0.579403 }, { "epoch": 1.3986650485436893, "grad_norm": 9.190638542175293, "learning_rate": 5.5295933020597426e-05, "loss": 2.140420913696289, "memory(GiB)": 41.84, "step": 2305, "token_acc": 0.5065359477124183, "train_speed(iter/s)": 0.579349 }, { "epoch": 1.4016990291262137, "grad_norm": 5.690435409545898, "learning_rate": 5.5137941318038596e-05, "loss": 1.893089485168457, "memory(GiB)": 41.84, "step": 2310, "token_acc": 0.6, "train_speed(iter/s)": 0.579324 }, { "epoch": 1.4047330097087378, "grad_norm": 7.719916343688965, "learning_rate": 5.4979897750811506e-05, "loss": 2.3775409698486327, "memory(GiB)": 41.84, "step": 2315, "token_acc": 0.5171102661596958, "train_speed(iter/s)": 0.579192 }, { "epoch": 1.4077669902912622, "grad_norm": 7.299395561218262, "learning_rate": 5.4821803914278336e-05, "loss": 1.9694931030273437, "memory(GiB)": 41.84, "step": 2320, "token_acc": 0.5427728613569321, "train_speed(iter/s)": 0.579086 }, { "epoch": 1.4108009708737863, "grad_norm": 6.726255893707275, "learning_rate": 5.4663661404308677e-05, "loss": 2.0492481231689452, "memory(GiB)": 41.84, "step": 2325, "token_acc": 0.5476923076923077, "train_speed(iter/s)": 0.579024 }, { "epoch": 1.4138349514563107, "grad_norm": 9.350031852722168, "learning_rate": 5.4505471817263475e-05, "loss": 2.0813602447509765, "memory(GiB)": 41.84, "step": 2330, "token_acc": 0.5481727574750831, "train_speed(iter/s)": 0.578975 }, { "epoch": 1.416868932038835, "grad_norm": 6.127203464508057, "learning_rate": 5.434723674997888e-05, "loss": 1.884780502319336, "memory(GiB)": 41.84, "step": 2335, "token_acc": 0.5686900958466453, "train_speed(iter/s)": 0.579032 }, { "epoch": 1.4199029126213591, "grad_norm": 6.9619646072387695, "learning_rate": 5.418895779975014e-05, "loss": 1.7420536041259767, "memory(GiB)": 41.84, "step": 2340, "token_acc": 0.552901023890785, "train_speed(iter/s)": 0.578878 }, { "epoch": 1.4229368932038835, "grad_norm": 8.211845397949219, "learning_rate": 5.403063656431548e-05, "loss": 1.926046371459961, "memory(GiB)": 41.84, "step": 2345, "token_acc": 0.5566666666666666, "train_speed(iter/s)": 0.578768 }, { "epoch": 1.4259708737864076, "grad_norm": 8.615828514099121, "learning_rate": 5.387227464183999e-05, "loss": 1.8713953018188476, "memory(GiB)": 41.84, "step": 2350, "token_acc": 0.5667870036101083, "train_speed(iter/s)": 0.578908 }, { "epoch": 1.429004854368932, "grad_norm": 8.677647590637207, "learning_rate": 5.371387363089945e-05, "loss": 2.0104761123657227, "memory(GiB)": 41.84, "step": 2355, "token_acc": 0.5653710247349824, "train_speed(iter/s)": 0.578973 }, { "epoch": 1.4320388349514563, "grad_norm": 8.752043724060059, "learning_rate": 5.355543513046419e-05, "loss": 2.0104990005493164, "memory(GiB)": 41.84, "step": 2360, "token_acc": 0.5486111111111112, "train_speed(iter/s)": 0.579051 }, { "epoch": 1.4350728155339807, "grad_norm": 6.938195705413818, "learning_rate": 5.3396960739883037e-05, "loss": 1.974110984802246, "memory(GiB)": 41.84, "step": 2365, "token_acc": 0.5476190476190477, "train_speed(iter/s)": 0.579115 }, { "epoch": 1.4381067961165048, "grad_norm": 6.470673561096191, "learning_rate": 5.323845205886707e-05, "loss": 2.092882537841797, "memory(GiB)": 41.84, "step": 2370, "token_acc": 0.5299684542586751, "train_speed(iter/s)": 0.57924 }, { "epoch": 1.4411407766990292, "grad_norm": 6.7543206214904785, "learning_rate": 5.307991068747353e-05, "loss": 2.317662811279297, "memory(GiB)": 41.84, "step": 2375, "token_acc": 0.5239616613418531, "train_speed(iter/s)": 0.579126 }, { "epoch": 1.4441747572815533, "grad_norm": 7.441592216491699, "learning_rate": 5.292133822608961e-05, "loss": 2.0434192657470702, "memory(GiB)": 41.84, "step": 2380, "token_acc": 0.547945205479452, "train_speed(iter/s)": 0.579147 }, { "epoch": 1.4472087378640777, "grad_norm": 7.122344970703125, "learning_rate": 5.2762736275416416e-05, "loss": 2.2737056732177736, "memory(GiB)": 41.84, "step": 2385, "token_acc": 0.540785498489426, "train_speed(iter/s)": 0.579209 }, { "epoch": 1.450242718446602, "grad_norm": 6.282622337341309, "learning_rate": 5.260410643645263e-05, "loss": 2.0695510864257813, "memory(GiB)": 41.84, "step": 2390, "token_acc": 0.5391849529780565, "train_speed(iter/s)": 0.579338 }, { "epoch": 1.4532766990291262, "grad_norm": 6.010311603546143, "learning_rate": 5.2445450310478525e-05, "loss": 1.819678497314453, "memory(GiB)": 41.84, "step": 2395, "token_acc": 0.5876288659793815, "train_speed(iter/s)": 0.579301 }, { "epoch": 1.4563106796116505, "grad_norm": 8.786865234375, "learning_rate": 5.228676949903973e-05, "loss": 1.9962085723876952, "memory(GiB)": 41.84, "step": 2400, "token_acc": 0.543046357615894, "train_speed(iter/s)": 0.579291 }, { "epoch": 1.4593446601941746, "grad_norm": 6.772591590881348, "learning_rate": 5.2128065603931006e-05, "loss": 1.931478500366211, "memory(GiB)": 41.84, "step": 2405, "token_acc": 0.584717607973422, "train_speed(iter/s)": 0.579183 }, { "epoch": 1.462378640776699, "grad_norm": 7.0186357498168945, "learning_rate": 5.196934022718017e-05, "loss": 1.8834335327148437, "memory(GiB)": 41.84, "step": 2410, "token_acc": 0.5857142857142857, "train_speed(iter/s)": 0.579263 }, { "epoch": 1.4654126213592233, "grad_norm": 7.649616241455078, "learning_rate": 5.18105949710319e-05, "loss": 2.1677167892456053, "memory(GiB)": 41.84, "step": 2415, "token_acc": 0.5331010452961672, "train_speed(iter/s)": 0.579501 }, { "epoch": 1.4684466019417477, "grad_norm": 7.913327693939209, "learning_rate": 5.165183143793149e-05, "loss": 2.4113887786865233, "memory(GiB)": 41.84, "step": 2420, "token_acc": 0.47790055248618785, "train_speed(iter/s)": 0.579573 }, { "epoch": 1.4714805825242718, "grad_norm": 8.196721076965332, "learning_rate": 5.149305123050877e-05, "loss": 1.6590158462524414, "memory(GiB)": 41.84, "step": 2425, "token_acc": 0.5425531914893617, "train_speed(iter/s)": 0.579678 }, { "epoch": 1.4745145631067962, "grad_norm": 5.6772637367248535, "learning_rate": 5.133425595156187e-05, "loss": 2.0934783935546877, "memory(GiB)": 41.84, "step": 2430, "token_acc": 0.49586776859504134, "train_speed(iter/s)": 0.579607 }, { "epoch": 1.4775485436893203, "grad_norm": 9.212677955627441, "learning_rate": 5.1175447204041096e-05, "loss": 2.0111692428588865, "memory(GiB)": 41.84, "step": 2435, "token_acc": 0.5536912751677853, "train_speed(iter/s)": 0.579662 }, { "epoch": 1.4805825242718447, "grad_norm": 6.798145771026611, "learning_rate": 5.101662659103265e-05, "loss": 1.8395654678344726, "memory(GiB)": 41.84, "step": 2440, "token_acc": 0.597972972972973, "train_speed(iter/s)": 0.579718 }, { "epoch": 1.483616504854369, "grad_norm": 5.608346462249756, "learning_rate": 5.0857795715742575e-05, "loss": 2.0497175216674806, "memory(GiB)": 41.84, "step": 2445, "token_acc": 0.5542168674698795, "train_speed(iter/s)": 0.579756 }, { "epoch": 1.4866504854368932, "grad_norm": 7.392420291900635, "learning_rate": 5.0698956181480465e-05, "loss": 2.040939521789551, "memory(GiB)": 41.84, "step": 2450, "token_acc": 0.516728624535316, "train_speed(iter/s)": 0.57972 }, { "epoch": 1.4896844660194175, "grad_norm": 5.091887474060059, "learning_rate": 5.054010959164329e-05, "loss": 2.256111907958984, "memory(GiB)": 41.84, "step": 2455, "token_acc": 0.5181347150259067, "train_speed(iter/s)": 0.57971 }, { "epoch": 1.4927184466019416, "grad_norm": 8.56528091430664, "learning_rate": 5.038125754969933e-05, "loss": 2.1345645904541017, "memory(GiB)": 41.84, "step": 2460, "token_acc": 0.524390243902439, "train_speed(iter/s)": 0.579785 }, { "epoch": 1.495752427184466, "grad_norm": 8.425841331481934, "learning_rate": 5.0222401659171846e-05, "loss": 1.8225021362304688, "memory(GiB)": 41.84, "step": 2465, "token_acc": 0.6041666666666666, "train_speed(iter/s)": 0.579851 }, { "epoch": 1.4987864077669903, "grad_norm": 7.502073287963867, "learning_rate": 5.006354352362296e-05, "loss": 2.2287876129150392, "memory(GiB)": 41.84, "step": 2470, "token_acc": 0.5451505016722408, "train_speed(iter/s)": 0.579885 }, { "epoch": 1.5018203883495147, "grad_norm": 14.120893478393555, "learning_rate": 4.9904684746637445e-05, "loss": 2.1780731201171877, "memory(GiB)": 41.84, "step": 2475, "token_acc": 0.5900621118012422, "train_speed(iter/s)": 0.580025 }, { "epoch": 1.5048543689320388, "grad_norm": 6.581485271453857, "learning_rate": 4.9745826931806524e-05, "loss": 2.466159439086914, "memory(GiB)": 41.84, "step": 2480, "token_acc": 0.4410958904109589, "train_speed(iter/s)": 0.580064 }, { "epoch": 1.507888349514563, "grad_norm": 6.508731365203857, "learning_rate": 4.958697168271179e-05, "loss": 1.8887645721435546, "memory(GiB)": 41.84, "step": 2485, "token_acc": 0.5559440559440559, "train_speed(iter/s)": 0.58009 }, { "epoch": 1.5109223300970873, "grad_norm": 5.886694431304932, "learning_rate": 4.942812060290886e-05, "loss": 2.1457874298095705, "memory(GiB)": 41.84, "step": 2490, "token_acc": 0.5476190476190477, "train_speed(iter/s)": 0.580188 }, { "epoch": 1.5139563106796117, "grad_norm": 5.6448655128479, "learning_rate": 4.92692752959113e-05, "loss": 1.9578502655029297, "memory(GiB)": 41.84, "step": 2495, "token_acc": 0.5710227272727273, "train_speed(iter/s)": 0.58031 }, { "epoch": 1.516990291262136, "grad_norm": 9.438764572143555, "learning_rate": 4.91104373651744e-05, "loss": 2.124725341796875, "memory(GiB)": 41.84, "step": 2500, "token_acc": 0.5164179104477612, "train_speed(iter/s)": 0.580368 }, { "epoch": 1.516990291262136, "eval_loss": 2.0256900787353516, "eval_runtime": 12.7025, "eval_samples_per_second": 7.872, "eval_steps_per_second": 7.872, "eval_token_acc": 0.5185185185185185, "step": 2500 }, { "epoch": 1.5200242718446602, "grad_norm": 7.8130106925964355, "learning_rate": 4.8951608414078944e-05, "loss": 2.377336311340332, "memory(GiB)": 41.84, "step": 2505, "token_acc": 0.509090909090909, "train_speed(iter/s)": 0.578557 }, { "epoch": 1.5230582524271845, "grad_norm": 7.16809606552124, "learning_rate": 4.8792790045915167e-05, "loss": 1.6067583084106445, "memory(GiB)": 41.84, "step": 2510, "token_acc": 0.6186770428015564, "train_speed(iter/s)": 0.578509 }, { "epoch": 1.5260922330097086, "grad_norm": 6.225858688354492, "learning_rate": 4.863398386386638e-05, "loss": 1.8492023468017578, "memory(GiB)": 41.84, "step": 2515, "token_acc": 0.5787965616045845, "train_speed(iter/s)": 0.578454 }, { "epoch": 1.529126213592233, "grad_norm": 8.595073699951172, "learning_rate": 4.847519147099294e-05, "loss": 1.9532032012939453, "memory(GiB)": 41.84, "step": 2520, "token_acc": 0.535593220338983, "train_speed(iter/s)": 0.578603 }, { "epoch": 1.5321601941747574, "grad_norm": 7.294178009033203, "learning_rate": 4.831641447021599e-05, "loss": 1.7893003463745116, "memory(GiB)": 41.84, "step": 2525, "token_acc": 0.6137184115523465, "train_speed(iter/s)": 0.57858 }, { "epoch": 1.5351941747572817, "grad_norm": 7.821887969970703, "learning_rate": 4.8157654464301275e-05, "loss": 2.2367401123046875, "memory(GiB)": 41.84, "step": 2530, "token_acc": 0.5182072829131653, "train_speed(iter/s)": 0.578758 }, { "epoch": 1.5382281553398058, "grad_norm": 7.00529670715332, "learning_rate": 4.7998913055843054e-05, "loss": 2.1124399185180662, "memory(GiB)": 41.84, "step": 2535, "token_acc": 0.5432835820895522, "train_speed(iter/s)": 0.578828 }, { "epoch": 1.54126213592233, "grad_norm": 5.952232837677002, "learning_rate": 4.7840191847247774e-05, "loss": 2.0016332626342774, "memory(GiB)": 41.84, "step": 2540, "token_acc": 0.5930232558139535, "train_speed(iter/s)": 0.578959 }, { "epoch": 1.5442961165048543, "grad_norm": 7.779722213745117, "learning_rate": 4.7681492440718045e-05, "loss": 1.982724952697754, "memory(GiB)": 41.84, "step": 2545, "token_acc": 0.5338345864661654, "train_speed(iter/s)": 0.579046 }, { "epoch": 1.5473300970873787, "grad_norm": 7.770874977111816, "learning_rate": 4.752281643823633e-05, "loss": 2.032842254638672, "memory(GiB)": 41.84, "step": 2550, "token_acc": 0.5749235474006116, "train_speed(iter/s)": 0.579014 }, { "epoch": 1.550364077669903, "grad_norm": 6.972710609436035, "learning_rate": 4.736416544154891e-05, "loss": 1.9030048370361328, "memory(GiB)": 41.84, "step": 2555, "token_acc": 0.5656565656565656, "train_speed(iter/s)": 0.579128 }, { "epoch": 1.5533980582524272, "grad_norm": 7.349340915679932, "learning_rate": 4.720554105214961e-05, "loss": 1.903385543823242, "memory(GiB)": 41.84, "step": 2560, "token_acc": 0.5469798657718121, "train_speed(iter/s)": 0.579119 }, { "epoch": 1.5564320388349513, "grad_norm": 7.2185444831848145, "learning_rate": 4.704694487126365e-05, "loss": 1.8204626083374023, "memory(GiB)": 41.84, "step": 2565, "token_acc": 0.5747126436781609, "train_speed(iter/s)": 0.579258 }, { "epoch": 1.5594660194174756, "grad_norm": 7.047289848327637, "learning_rate": 4.688837849983154e-05, "loss": 2.169702339172363, "memory(GiB)": 41.84, "step": 2570, "token_acc": 0.509493670886076, "train_speed(iter/s)": 0.579358 }, { "epoch": 1.5625, "grad_norm": 10.583885192871094, "learning_rate": 4.6729843538492847e-05, "loss": 1.8666536331176757, "memory(GiB)": 41.84, "step": 2575, "token_acc": 0.5373134328358209, "train_speed(iter/s)": 0.579491 }, { "epoch": 1.5655339805825244, "grad_norm": 7.884814262390137, "learning_rate": 4.657134158757012e-05, "loss": 2.1705270767211915, "memory(GiB)": 41.84, "step": 2580, "token_acc": 0.5142857142857142, "train_speed(iter/s)": 0.579527 }, { "epoch": 1.5685679611650487, "grad_norm": 7.872768402099609, "learning_rate": 4.6412874247052615e-05, "loss": 2.2928442001342773, "memory(GiB)": 41.84, "step": 2585, "token_acc": 0.5105105105105106, "train_speed(iter/s)": 0.579593 }, { "epoch": 1.5716019417475728, "grad_norm": 9.023248672485352, "learning_rate": 4.625444311658028e-05, "loss": 1.8835826873779298, "memory(GiB)": 41.84, "step": 2590, "token_acc": 0.5552147239263804, "train_speed(iter/s)": 0.579652 }, { "epoch": 1.574635922330097, "grad_norm": 7.943882942199707, "learning_rate": 4.6096049795427514e-05, "loss": 2.0815145492553713, "memory(GiB)": 41.84, "step": 2595, "token_acc": 0.5218855218855218, "train_speed(iter/s)": 0.579716 }, { "epoch": 1.5776699029126213, "grad_norm": 7.587296009063721, "learning_rate": 4.593769588248702e-05, "loss": 1.6165863037109376, "memory(GiB)": 41.84, "step": 2600, "token_acc": 0.6129032258064516, "train_speed(iter/s)": 0.579694 }, { "epoch": 1.5807038834951457, "grad_norm": 8.291844367980957, "learning_rate": 4.577938297625378e-05, "loss": 2.093304443359375, "memory(GiB)": 41.84, "step": 2605, "token_acc": 0.5827814569536424, "train_speed(iter/s)": 0.579729 }, { "epoch": 1.58373786407767, "grad_norm": 6.745671272277832, "learning_rate": 4.5621112674808756e-05, "loss": 1.9251686096191407, "memory(GiB)": 41.84, "step": 2610, "token_acc": 0.5833333333333334, "train_speed(iter/s)": 0.579877 }, { "epoch": 1.5867718446601942, "grad_norm": 8.493294715881348, "learning_rate": 4.5462886575802884e-05, "loss": 1.971460723876953, "memory(GiB)": 41.84, "step": 2615, "token_acc": 0.5821428571428572, "train_speed(iter/s)": 0.579847 }, { "epoch": 1.5898058252427183, "grad_norm": 13.71259593963623, "learning_rate": 4.530470627644088e-05, "loss": 2.0272783279418944, "memory(GiB)": 41.84, "step": 2620, "token_acc": 0.5578231292517006, "train_speed(iter/s)": 0.579923 }, { "epoch": 1.5928398058252426, "grad_norm": 6.396689414978027, "learning_rate": 4.514657337346512e-05, "loss": 1.958717155456543, "memory(GiB)": 41.84, "step": 2625, "token_acc": 0.5413333333333333, "train_speed(iter/s)": 0.579933 }, { "epoch": 1.595873786407767, "grad_norm": 8.41101360321045, "learning_rate": 4.4988489463139605e-05, "loss": 1.8024402618408204, "memory(GiB)": 41.84, "step": 2630, "token_acc": 0.574468085106383, "train_speed(iter/s)": 0.580054 }, { "epoch": 1.5989077669902914, "grad_norm": 6.545622825622559, "learning_rate": 4.483045614123371e-05, "loss": 2.081429862976074, "memory(GiB)": 41.84, "step": 2635, "token_acc": 0.5523809523809524, "train_speed(iter/s)": 0.580079 }, { "epoch": 1.6019417475728155, "grad_norm": 7.194870471954346, "learning_rate": 4.46724750030062e-05, "loss": 1.9362052917480468, "memory(GiB)": 41.84, "step": 2640, "token_acc": 0.5756578947368421, "train_speed(iter/s)": 0.580221 }, { "epoch": 1.6049757281553398, "grad_norm": 6.871307849884033, "learning_rate": 4.451454764318903e-05, "loss": 2.0093603134155273, "memory(GiB)": 41.84, "step": 2645, "token_acc": 0.5370370370370371, "train_speed(iter/s)": 0.580085 }, { "epoch": 1.608009708737864, "grad_norm": 6.45038366317749, "learning_rate": 4.4356675655971344e-05, "loss": 1.9990568161010742, "memory(GiB)": 41.84, "step": 2650, "token_acc": 0.5369774919614148, "train_speed(iter/s)": 0.580256 }, { "epoch": 1.6110436893203883, "grad_norm": 10.047187805175781, "learning_rate": 4.419886063498329e-05, "loss": 2.281326103210449, "memory(GiB)": 41.84, "step": 2655, "token_acc": 0.4965034965034965, "train_speed(iter/s)": 0.580351 }, { "epoch": 1.6140776699029127, "grad_norm": 8.295970916748047, "learning_rate": 4.404110417327998e-05, "loss": 2.0824228286743165, "memory(GiB)": 41.84, "step": 2660, "token_acc": 0.519434628975265, "train_speed(iter/s)": 0.580351 }, { "epoch": 1.617111650485437, "grad_norm": 8.373644828796387, "learning_rate": 4.388340786332541e-05, "loss": 1.9413429260253907, "memory(GiB)": 41.84, "step": 2665, "token_acc": 0.580110497237569, "train_speed(iter/s)": 0.58041 }, { "epoch": 1.6201456310679612, "grad_norm": 6.771739482879639, "learning_rate": 4.372577329697636e-05, "loss": 2.1314056396484373, "memory(GiB)": 41.84, "step": 2670, "token_acc": 0.5014005602240896, "train_speed(iter/s)": 0.580318 }, { "epoch": 1.6231796116504853, "grad_norm": 6.547637462615967, "learning_rate": 4.35682020654664e-05, "loss": 1.8196992874145508, "memory(GiB)": 41.84, "step": 2675, "token_acc": 0.5973154362416108, "train_speed(iter/s)": 0.580398 }, { "epoch": 1.6262135922330097, "grad_norm": 7.0243449211120605, "learning_rate": 4.341069575938968e-05, "loss": 2.0443634033203124, "memory(GiB)": 41.84, "step": 2680, "token_acc": 0.5777027027027027, "train_speed(iter/s)": 0.580433 }, { "epoch": 1.629247572815534, "grad_norm": 7.968044281005859, "learning_rate": 4.3253255968685044e-05, "loss": 2.372605323791504, "memory(GiB)": 41.84, "step": 2685, "token_acc": 0.5537459283387622, "train_speed(iter/s)": 0.580421 }, { "epoch": 1.6322815533980584, "grad_norm": 7.074746608734131, "learning_rate": 4.3095884282619866e-05, "loss": 1.9867733001708985, "memory(GiB)": 41.84, "step": 2690, "token_acc": 0.5676691729323309, "train_speed(iter/s)": 0.580481 }, { "epoch": 1.6353155339805825, "grad_norm": 6.959107398986816, "learning_rate": 4.2938582289774e-05, "loss": 1.9854732513427735, "memory(GiB)": 41.84, "step": 2695, "token_acc": 0.5686813186813187, "train_speed(iter/s)": 0.58059 }, { "epoch": 1.6383495145631068, "grad_norm": 6.535874843597412, "learning_rate": 4.278135157802389e-05, "loss": 2.186625289916992, "memory(GiB)": 41.84, "step": 2700, "token_acc": 0.5300859598853869, "train_speed(iter/s)": 0.5806 }, { "epoch": 1.641383495145631, "grad_norm": 6.670753002166748, "learning_rate": 4.262419373452634e-05, "loss": 2.415786361694336, "memory(GiB)": 41.84, "step": 2705, "token_acc": 0.4827586206896552, "train_speed(iter/s)": 0.580602 }, { "epoch": 1.6444174757281553, "grad_norm": 11.83166790008545, "learning_rate": 4.246711034570264e-05, "loss": 2.008403015136719, "memory(GiB)": 41.84, "step": 2710, "token_acc": 0.5294117647058824, "train_speed(iter/s)": 0.580751 }, { "epoch": 1.6474514563106797, "grad_norm": 7.605556964874268, "learning_rate": 4.231010299722248e-05, "loss": 2.3934700012207033, "memory(GiB)": 41.84, "step": 2715, "token_acc": 0.4915254237288136, "train_speed(iter/s)": 0.580846 }, { "epoch": 1.650485436893204, "grad_norm": 6.8486504554748535, "learning_rate": 4.2153173273987946e-05, "loss": 1.9181827545166015, "memory(GiB)": 41.84, "step": 2720, "token_acc": 0.5562913907284768, "train_speed(iter/s)": 0.580869 }, { "epoch": 1.6535194174757282, "grad_norm": 8.30029296875, "learning_rate": 4.199632276011761e-05, "loss": 2.099735641479492, "memory(GiB)": 41.84, "step": 2725, "token_acc": 0.5529100529100529, "train_speed(iter/s)": 0.580925 }, { "epoch": 1.6565533980582523, "grad_norm": 6.734464168548584, "learning_rate": 4.1839553038930396e-05, "loss": 1.9709980010986328, "memory(GiB)": 41.84, "step": 2730, "token_acc": 0.5331230283911672, "train_speed(iter/s)": 0.580952 }, { "epoch": 1.6595873786407767, "grad_norm": 6.3508710861206055, "learning_rate": 4.168286569292972e-05, "loss": 2.039066123962402, "memory(GiB)": 41.84, "step": 2735, "token_acc": 0.5649717514124294, "train_speed(iter/s)": 0.581109 }, { "epoch": 1.662621359223301, "grad_norm": 6.782240867614746, "learning_rate": 4.152626230378741e-05, "loss": 1.832118606567383, "memory(GiB)": 41.84, "step": 2740, "token_acc": 0.6, "train_speed(iter/s)": 0.581153 }, { "epoch": 1.6656553398058254, "grad_norm": 8.437490463256836, "learning_rate": 4.136974445232788e-05, "loss": 1.9984106063842773, "memory(GiB)": 41.84, "step": 2745, "token_acc": 0.5113636363636364, "train_speed(iter/s)": 0.581248 }, { "epoch": 1.6686893203883495, "grad_norm": 8.64138126373291, "learning_rate": 4.121331371851201e-05, "loss": 1.9429035186767578, "memory(GiB)": 41.84, "step": 2750, "token_acc": 0.574468085106383, "train_speed(iter/s)": 0.581216 }, { "epoch": 1.6717233009708736, "grad_norm": 7.808033466339111, "learning_rate": 4.10569716814213e-05, "loss": 2.069664192199707, "memory(GiB)": 41.84, "step": 2755, "token_acc": 0.546583850931677, "train_speed(iter/s)": 0.581204 }, { "epoch": 1.674757281553398, "grad_norm": 7.158506393432617, "learning_rate": 4.0900719919241935e-05, "loss": 2.2129743576049803, "memory(GiB)": 41.84, "step": 2760, "token_acc": 0.5330882352941176, "train_speed(iter/s)": 0.581324 }, { "epoch": 1.6777912621359223, "grad_norm": 6.141445636749268, "learning_rate": 4.0744560009248766e-05, "loss": 2.1222957611083983, "memory(GiB)": 41.84, "step": 2765, "token_acc": 0.5301204819277109, "train_speed(iter/s)": 0.581344 }, { "epoch": 1.6808252427184467, "grad_norm": 9.04359245300293, "learning_rate": 4.0588493527789537e-05, "loss": 2.0622652053833006, "memory(GiB)": 41.84, "step": 2770, "token_acc": 0.5793103448275863, "train_speed(iter/s)": 0.581484 }, { "epoch": 1.6838592233009708, "grad_norm": 7.4207892417907715, "learning_rate": 4.043252205026879e-05, "loss": 1.9703941345214844, "memory(GiB)": 41.84, "step": 2775, "token_acc": 0.5451807228915663, "train_speed(iter/s)": 0.581551 }, { "epoch": 1.6868932038834952, "grad_norm": 6.962371826171875, "learning_rate": 4.027664715113209e-05, "loss": 2.0751678466796877, "memory(GiB)": 41.84, "step": 2780, "token_acc": 0.533724340175953, "train_speed(iter/s)": 0.58165 }, { "epoch": 1.6899271844660193, "grad_norm": 6.551590919494629, "learning_rate": 4.012087040385012e-05, "loss": 1.9780982971191405, "memory(GiB)": 41.84, "step": 2785, "token_acc": 0.564625850340136, "train_speed(iter/s)": 0.581595 }, { "epoch": 1.6929611650485437, "grad_norm": 8.19705867767334, "learning_rate": 3.996519338090273e-05, "loss": 1.9075267791748047, "memory(GiB)": 41.84, "step": 2790, "token_acc": 0.5729537366548043, "train_speed(iter/s)": 0.58155 }, { "epoch": 1.695995145631068, "grad_norm": 6.0668206214904785, "learning_rate": 3.980961765376316e-05, "loss": 2.269983100891113, "memory(GiB)": 41.84, "step": 2795, "token_acc": 0.5031446540880503, "train_speed(iter/s)": 0.581377 }, { "epoch": 1.6990291262135924, "grad_norm": 7.507983684539795, "learning_rate": 3.965414479288209e-05, "loss": 2.1596681594848635, "memory(GiB)": 41.84, "step": 2800, "token_acc": 0.5704225352112676, "train_speed(iter/s)": 0.581409 }, { "epoch": 1.7020631067961165, "grad_norm": 9.827066421508789, "learning_rate": 3.9498776367671825e-05, "loss": 2.028460884094238, "memory(GiB)": 41.84, "step": 2805, "token_acc": 0.5544871794871795, "train_speed(iter/s)": 0.581541 }, { "epoch": 1.7050970873786406, "grad_norm": 7.970204830169678, "learning_rate": 3.9343513946490454e-05, "loss": 2.2608503341674804, "memory(GiB)": 41.84, "step": 2810, "token_acc": 0.532871972318339, "train_speed(iter/s)": 0.5816 }, { "epoch": 1.708131067961165, "grad_norm": 8.01364517211914, "learning_rate": 3.9188359096626e-05, "loss": 1.965842056274414, "memory(GiB)": 41.84, "step": 2815, "token_acc": 0.5447154471544715, "train_speed(iter/s)": 0.581736 }, { "epoch": 1.7111650485436893, "grad_norm": 7.19758939743042, "learning_rate": 3.903331338428067e-05, "loss": 2.0728851318359376, "memory(GiB)": 41.84, "step": 2820, "token_acc": 0.5568862275449101, "train_speed(iter/s)": 0.581956 }, { "epoch": 1.7141990291262137, "grad_norm": 6.977797508239746, "learning_rate": 3.88783783745549e-05, "loss": 1.7800270080566407, "memory(GiB)": 41.84, "step": 2825, "token_acc": 0.565359477124183, "train_speed(iter/s)": 0.581974 }, { "epoch": 1.7172330097087378, "grad_norm": 8.389069557189941, "learning_rate": 3.872355563143173e-05, "loss": 1.479856300354004, "memory(GiB)": 41.84, "step": 2830, "token_acc": 0.6463878326996197, "train_speed(iter/s)": 0.582021 }, { "epoch": 1.7202669902912622, "grad_norm": 8.598016738891602, "learning_rate": 3.856884671776085e-05, "loss": 1.9001766204833985, "memory(GiB)": 41.84, "step": 2835, "token_acc": 0.5427509293680297, "train_speed(iter/s)": 0.582021 }, { "epoch": 1.7233009708737863, "grad_norm": 7.339463233947754, "learning_rate": 3.8414253195242986e-05, "loss": 2.0311508178710938, "memory(GiB)": 41.84, "step": 2840, "token_acc": 0.5960912052117264, "train_speed(iter/s)": 0.582075 }, { "epoch": 1.7263349514563107, "grad_norm": 6.700257778167725, "learning_rate": 3.8259776624414e-05, "loss": 1.824915313720703, "memory(GiB)": 41.84, "step": 2845, "token_acc": 0.5838709677419355, "train_speed(iter/s)": 0.582141 }, { "epoch": 1.729368932038835, "grad_norm": 7.298790454864502, "learning_rate": 3.81054185646292e-05, "loss": 2.0110477447509765, "memory(GiB)": 41.84, "step": 2850, "token_acc": 0.5802047781569966, "train_speed(iter/s)": 0.581997 }, { "epoch": 1.7324029126213594, "grad_norm": 7.2910332679748535, "learning_rate": 3.795118057404761e-05, "loss": 1.9101539611816407, "memory(GiB)": 41.84, "step": 2855, "token_acc": 0.5787545787545788, "train_speed(iter/s)": 0.582142 }, { "epoch": 1.7354368932038835, "grad_norm": 5.262487411499023, "learning_rate": 3.779706420961617e-05, "loss": 1.8585384368896485, "memory(GiB)": 41.84, "step": 2860, "token_acc": 0.5941176470588235, "train_speed(iter/s)": 0.5821 }, { "epoch": 1.7384708737864076, "grad_norm": 10.52902603149414, "learning_rate": 3.764307102705417e-05, "loss": 2.2284523010253907, "memory(GiB)": 41.84, "step": 2865, "token_acc": 0.5323076923076923, "train_speed(iter/s)": 0.582044 }, { "epoch": 1.741504854368932, "grad_norm": 7.36726188659668, "learning_rate": 3.748920258083736e-05, "loss": 2.3935964584350584, "memory(GiB)": 41.84, "step": 2870, "token_acc": 0.5157593123209169, "train_speed(iter/s)": 0.582023 }, { "epoch": 1.7445388349514563, "grad_norm": 9.515303611755371, "learning_rate": 3.7335460424182356e-05, "loss": 2.0206344604492186, "memory(GiB)": 41.84, "step": 2875, "token_acc": 0.5436241610738255, "train_speed(iter/s)": 0.582136 }, { "epoch": 1.7475728155339807, "grad_norm": 7.746051788330078, "learning_rate": 3.7181846109031005e-05, "loss": 1.9893791198730468, "memory(GiB)": 41.84, "step": 2880, "token_acc": 0.5664335664335665, "train_speed(iter/s)": 0.582034 }, { "epoch": 1.7506067961165048, "grad_norm": 7.868143081665039, "learning_rate": 3.702836118603458e-05, "loss": 2.369589614868164, "memory(GiB)": 41.84, "step": 2885, "token_acc": 0.5084745762711864, "train_speed(iter/s)": 0.581894 }, { "epoch": 1.7536407766990292, "grad_norm": 6.672244071960449, "learning_rate": 3.687500720453831e-05, "loss": 1.9809467315673828, "memory(GiB)": 41.84, "step": 2890, "token_acc": 0.5498489425981873, "train_speed(iter/s)": 0.58182 }, { "epoch": 1.7566747572815533, "grad_norm": 5.8379011154174805, "learning_rate": 3.672178571256556e-05, "loss": 2.137996864318848, "memory(GiB)": 41.84, "step": 2895, "token_acc": 0.5470588235294118, "train_speed(iter/s)": 0.581917 }, { "epoch": 1.7597087378640777, "grad_norm": 5.696329593658447, "learning_rate": 3.656869825680234e-05, "loss": 1.7796316146850586, "memory(GiB)": 41.84, "step": 2900, "token_acc": 0.6054421768707483, "train_speed(iter/s)": 0.581974 }, { "epoch": 1.762742718446602, "grad_norm": 7.160623550415039, "learning_rate": 3.641574638258162e-05, "loss": 2.0094619750976563, "memory(GiB)": 41.84, "step": 2905, "token_acc": 0.5428571428571428, "train_speed(iter/s)": 0.58194 }, { "epoch": 1.7657766990291264, "grad_norm": 5.733323097229004, "learning_rate": 3.62629316338677e-05, "loss": 2.0144931793212892, "memory(GiB)": 41.84, "step": 2910, "token_acc": 0.5308988764044944, "train_speed(iter/s)": 0.581796 }, { "epoch": 1.7688106796116505, "grad_norm": 6.644180774688721, "learning_rate": 3.611025555324079e-05, "loss": 1.9589729309082031, "memory(GiB)": 41.84, "step": 2915, "token_acc": 0.5672727272727273, "train_speed(iter/s)": 0.581878 }, { "epoch": 1.7718446601941746, "grad_norm": 13.900938034057617, "learning_rate": 3.595771968188121e-05, "loss": 1.9292577743530273, "memory(GiB)": 41.84, "step": 2920, "token_acc": 0.59, "train_speed(iter/s)": 0.582002 }, { "epoch": 1.774878640776699, "grad_norm": 9.342930793762207, "learning_rate": 3.5805325559554006e-05, "loss": 1.8789905548095702, "memory(GiB)": 41.84, "step": 2925, "token_acc": 0.556, "train_speed(iter/s)": 0.582068 }, { "epoch": 1.7779126213592233, "grad_norm": 10.121810913085938, "learning_rate": 3.5653074724593306e-05, "loss": 2.171294593811035, "memory(GiB)": 41.84, "step": 2930, "token_acc": 0.5441176470588235, "train_speed(iter/s)": 0.582172 }, { "epoch": 1.7809466019417477, "grad_norm": 8.192787170410156, "learning_rate": 3.550096871388689e-05, "loss": 1.9008895874023437, "memory(GiB)": 41.84, "step": 2935, "token_acc": 0.5387205387205387, "train_speed(iter/s)": 0.582287 }, { "epoch": 1.7839805825242718, "grad_norm": 9.528207778930664, "learning_rate": 3.5349009062860586e-05, "loss": 2.1617660522460938, "memory(GiB)": 41.84, "step": 2940, "token_acc": 0.5551601423487544, "train_speed(iter/s)": 0.582322 }, { "epoch": 1.787014563106796, "grad_norm": 11.588967323303223, "learning_rate": 3.519719730546275e-05, "loss": 1.679486083984375, "memory(GiB)": 41.84, "step": 2945, "token_acc": 0.6188679245283019, "train_speed(iter/s)": 0.582412 }, { "epoch": 1.7900485436893203, "grad_norm": 8.055891990661621, "learning_rate": 3.504553497414893e-05, "loss": 1.960872268676758, "memory(GiB)": 41.84, "step": 2950, "token_acc": 0.6287878787878788, "train_speed(iter/s)": 0.582489 }, { "epoch": 1.7930825242718447, "grad_norm": 6.05890417098999, "learning_rate": 3.489402359986621e-05, "loss": 1.9190954208374023, "memory(GiB)": 41.84, "step": 2955, "token_acc": 0.5660377358490566, "train_speed(iter/s)": 0.582598 }, { "epoch": 1.796116504854369, "grad_norm": 10.20227336883545, "learning_rate": 3.474266471203794e-05, "loss": 1.7310752868652344, "memory(GiB)": 41.84, "step": 2960, "token_acc": 0.6046511627906976, "train_speed(iter/s)": 0.582734 }, { "epoch": 1.7991504854368932, "grad_norm": 8.176021575927734, "learning_rate": 3.459145983854813e-05, "loss": 1.9539764404296875, "memory(GiB)": 41.84, "step": 2965, "token_acc": 0.5962962962962963, "train_speed(iter/s)": 0.582861 }, { "epoch": 1.8021844660194175, "grad_norm": 7.691636085510254, "learning_rate": 3.444041050572611e-05, "loss": 2.0364006042480467, "memory(GiB)": 41.84, "step": 2970, "token_acc": 0.5605536332179931, "train_speed(iter/s)": 0.582943 }, { "epoch": 1.8052184466019416, "grad_norm": 8.828807830810547, "learning_rate": 3.4289518238331145e-05, "loss": 1.7169891357421876, "memory(GiB)": 41.84, "step": 2975, "token_acc": 0.654275092936803, "train_speed(iter/s)": 0.583015 }, { "epoch": 1.808252427184466, "grad_norm": 6.20446252822876, "learning_rate": 3.413878455953698e-05, "loss": 2.094204902648926, "memory(GiB)": 41.84, "step": 2980, "token_acc": 0.5351170568561873, "train_speed(iter/s)": 0.583072 }, { "epoch": 1.8112864077669903, "grad_norm": 7.542689800262451, "learning_rate": 3.398821099091652e-05, "loss": 1.8194765090942382, "memory(GiB)": 41.84, "step": 2985, "token_acc": 0.5900621118012422, "train_speed(iter/s)": 0.583267 }, { "epoch": 1.8143203883495147, "grad_norm": 5.989041328430176, "learning_rate": 3.3837799052426434e-05, "loss": 2.085628128051758, "memory(GiB)": 41.84, "step": 2990, "token_acc": 0.5573770491803278, "train_speed(iter/s)": 0.58343 }, { "epoch": 1.8173543689320388, "grad_norm": 8.956052780151367, "learning_rate": 3.3687550262391836e-05, "loss": 2.0220142364501954, "memory(GiB)": 41.84, "step": 2995, "token_acc": 0.563076923076923, "train_speed(iter/s)": 0.583454 }, { "epoch": 1.820388349514563, "grad_norm": 9.703901290893555, "learning_rate": 3.353746613749094e-05, "loss": 1.7758405685424805, "memory(GiB)": 41.84, "step": 3000, "token_acc": 0.5978260869565217, "train_speed(iter/s)": 0.583443 }, { "epoch": 1.820388349514563, "eval_loss": 2.155855655670166, "eval_runtime": 12.3446, "eval_samples_per_second": 8.101, "eval_steps_per_second": 8.101, "eval_token_acc": 0.5071335927367056, "step": 3000 }, { "epoch": 1.8234223300970873, "grad_norm": 10.289823532104492, "learning_rate": 3.33875481927397e-05, "loss": 1.9597461700439454, "memory(GiB)": 41.84, "step": 3005, "token_acc": 0.5239887111947319, "train_speed(iter/s)": 0.581978 }, { "epoch": 1.8264563106796117, "grad_norm": 8.281176567077637, "learning_rate": 3.3237797941476715e-05, "loss": 1.7820388793945312, "memory(GiB)": 41.84, "step": 3010, "token_acc": 0.5830258302583026, "train_speed(iter/s)": 0.582021 }, { "epoch": 1.829490291262136, "grad_norm": 6.512312889099121, "learning_rate": 3.308821689534766e-05, "loss": 1.9633775711059571, "memory(GiB)": 41.84, "step": 3015, "token_acc": 0.526984126984127, "train_speed(iter/s)": 0.582048 }, { "epoch": 1.8325242718446602, "grad_norm": 6.695690631866455, "learning_rate": 3.293880656429028e-05, "loss": 1.9059555053710937, "memory(GiB)": 41.84, "step": 3020, "token_acc": 0.558282208588957, "train_speed(iter/s)": 0.582014 }, { "epoch": 1.8355582524271845, "grad_norm": 8.261147499084473, "learning_rate": 3.278956845651897e-05, "loss": 1.9743257522583009, "memory(GiB)": 41.84, "step": 3025, "token_acc": 0.5537974683544303, "train_speed(iter/s)": 0.582071 }, { "epoch": 1.8385922330097086, "grad_norm": 8.489652633666992, "learning_rate": 3.2640504078509706e-05, "loss": 2.0056623458862304, "memory(GiB)": 41.84, "step": 3030, "token_acc": 0.5425219941348973, "train_speed(iter/s)": 0.582011 }, { "epoch": 1.841626213592233, "grad_norm": 5.517820835113525, "learning_rate": 3.2491614934984706e-05, "loss": 2.0196483612060545, "memory(GiB)": 41.84, "step": 3035, "token_acc": 0.5681159420289855, "train_speed(iter/s)": 0.58204 }, { "epoch": 1.8446601941747574, "grad_norm": 11.377049446105957, "learning_rate": 3.2342902528897276e-05, "loss": 2.4981143951416014, "memory(GiB)": 41.84, "step": 3040, "token_acc": 0.4857142857142857, "train_speed(iter/s)": 0.582056 }, { "epoch": 1.8476941747572817, "grad_norm": 9.072402954101562, "learning_rate": 3.219436836141672e-05, "loss": 1.7939895629882812, "memory(GiB)": 41.84, "step": 3045, "token_acc": 0.5458015267175572, "train_speed(iter/s)": 0.582115 }, { "epoch": 1.8507281553398058, "grad_norm": 8.273455619812012, "learning_rate": 3.204601393191305e-05, "loss": 2.0849941253662108, "memory(GiB)": 41.84, "step": 3050, "token_acc": 0.5551948051948052, "train_speed(iter/s)": 0.582202 }, { "epoch": 1.85376213592233, "grad_norm": 6.509883880615234, "learning_rate": 3.1897840737941996e-05, "loss": 1.894825553894043, "memory(GiB)": 41.84, "step": 3055, "token_acc": 0.5211726384364821, "train_speed(iter/s)": 0.582142 }, { "epoch": 1.8567961165048543, "grad_norm": 8.81839370727539, "learning_rate": 3.174985027522978e-05, "loss": 1.9194953918457032, "memory(GiB)": 41.84, "step": 3060, "token_acc": 0.5727554179566563, "train_speed(iter/s)": 0.582189 }, { "epoch": 1.8598300970873787, "grad_norm": 7.000573635101318, "learning_rate": 3.1602044037657994e-05, "loss": 1.977131462097168, "memory(GiB)": 41.84, "step": 3065, "token_acc": 0.543046357615894, "train_speed(iter/s)": 0.582179 }, { "epoch": 1.862864077669903, "grad_norm": 8.45114803314209, "learning_rate": 3.1454423517248704e-05, "loss": 2.187137985229492, "memory(GiB)": 41.84, "step": 3070, "token_acc": 0.5319767441860465, "train_speed(iter/s)": 0.582204 }, { "epoch": 1.8658980582524272, "grad_norm": 11.056445121765137, "learning_rate": 3.1306990204149146e-05, "loss": 1.8925033569335938, "memory(GiB)": 41.84, "step": 3075, "token_acc": 0.568, "train_speed(iter/s)": 0.582108 }, { "epoch": 1.8689320388349513, "grad_norm": 7.232324123382568, "learning_rate": 3.115974558661691e-05, "loss": 2.050203323364258, "memory(GiB)": 41.84, "step": 3080, "token_acc": 0.5853658536585366, "train_speed(iter/s)": 0.582179 }, { "epoch": 1.8719660194174756, "grad_norm": 6.1433024406433105, "learning_rate": 3.1012691151004694e-05, "loss": 1.7500345230102539, "memory(GiB)": 41.84, "step": 3085, "token_acc": 0.5871886120996441, "train_speed(iter/s)": 0.582301 }, { "epoch": 1.875, "grad_norm": 5.706048488616943, "learning_rate": 3.086582838174551e-05, "loss": 1.8604692459106444, "memory(GiB)": 41.84, "step": 3090, "token_acc": 0.5847953216374269, "train_speed(iter/s)": 0.582285 }, { "epoch": 1.8780339805825244, "grad_norm": 7.604012489318848, "learning_rate": 3.0719158761337574e-05, "loss": 1.8550039291381837, "memory(GiB)": 41.84, "step": 3095, "token_acc": 0.558641975308642, "train_speed(iter/s)": 0.582099 }, { "epoch": 1.8810679611650487, "grad_norm": 7.333124160766602, "learning_rate": 3.0572683770329316e-05, "loss": 2.143758010864258, "memory(GiB)": 41.84, "step": 3100, "token_acc": 0.5300353356890459, "train_speed(iter/s)": 0.581945 }, { "epoch": 1.8841019417475728, "grad_norm": 6.552914142608643, "learning_rate": 3.0426404887304605e-05, "loss": 1.7599102020263673, "memory(GiB)": 41.84, "step": 3105, "token_acc": 0.5617283950617284, "train_speed(iter/s)": 0.58193 }, { "epoch": 1.887135922330097, "grad_norm": 7.146379470825195, "learning_rate": 3.0280323588867586e-05, "loss": 1.814478302001953, "memory(GiB)": 41.84, "step": 3110, "token_acc": 0.5836177474402731, "train_speed(iter/s)": 0.582031 }, { "epoch": 1.8901699029126213, "grad_norm": 4.842132568359375, "learning_rate": 3.0134441349627997e-05, "loss": 2.0125823974609376, "memory(GiB)": 41.84, "step": 3115, "token_acc": 0.575, "train_speed(iter/s)": 0.581976 }, { "epoch": 1.8932038834951457, "grad_norm": 7.434795379638672, "learning_rate": 2.9988759642186097e-05, "loss": 2.0929500579833986, "memory(GiB)": 41.84, "step": 3120, "token_acc": 0.5663956639566395, "train_speed(iter/s)": 0.582017 }, { "epoch": 1.89623786407767, "grad_norm": 15.827396392822266, "learning_rate": 2.9843279937117997e-05, "loss": 2.314325141906738, "memory(GiB)": 41.84, "step": 3125, "token_acc": 0.5579399141630901, "train_speed(iter/s)": 0.582137 }, { "epoch": 1.8992718446601942, "grad_norm": 7.544915199279785, "learning_rate": 2.9698003702960586e-05, "loss": 2.055324745178223, "memory(GiB)": 41.84, "step": 3130, "token_acc": 0.5213903743315508, "train_speed(iter/s)": 0.5821 }, { "epoch": 1.9023058252427183, "grad_norm": 6.31001091003418, "learning_rate": 2.9552932406196876e-05, "loss": 1.8344003677368164, "memory(GiB)": 41.84, "step": 3135, "token_acc": 0.5980707395498392, "train_speed(iter/s)": 0.582162 }, { "epoch": 1.9053398058252426, "grad_norm": 9.230671882629395, "learning_rate": 2.94080675112412e-05, "loss": 1.9021150588989257, "memory(GiB)": 41.84, "step": 3140, "token_acc": 0.5845070422535211, "train_speed(iter/s)": 0.582193 }, { "epoch": 1.908373786407767, "grad_norm": 7.505317211151123, "learning_rate": 2.9263410480424303e-05, "loss": 2.2937973022460936, "memory(GiB)": 41.84, "step": 3145, "token_acc": 0.5370370370370371, "train_speed(iter/s)": 0.582135 }, { "epoch": 1.9114077669902914, "grad_norm": 11.365267753601074, "learning_rate": 2.9118962773978693e-05, "loss": 2.124867057800293, "memory(GiB)": 41.84, "step": 3150, "token_acc": 0.5379310344827586, "train_speed(iter/s)": 0.582048 }, { "epoch": 1.9144417475728155, "grad_norm": 6.946807861328125, "learning_rate": 2.8974725850023886e-05, "loss": 1.7865402221679687, "memory(GiB)": 41.84, "step": 3155, "token_acc": 0.6114649681528662, "train_speed(iter/s)": 0.582042 }, { "epoch": 1.9174757281553398, "grad_norm": 7.430286884307861, "learning_rate": 2.8830701164551598e-05, "loss": 2.096043014526367, "memory(GiB)": 41.84, "step": 3160, "token_acc": 0.5474006116207951, "train_speed(iter/s)": 0.581966 }, { "epoch": 1.920509708737864, "grad_norm": 5.716464996337891, "learning_rate": 2.8686890171411175e-05, "loss": 1.8883914947509766, "memory(GiB)": 41.84, "step": 3165, "token_acc": 0.55, "train_speed(iter/s)": 0.582002 }, { "epoch": 1.9235436893203883, "grad_norm": 6.345276832580566, "learning_rate": 2.8543294322294846e-05, "loss": 1.888068962097168, "memory(GiB)": 41.84, "step": 3170, "token_acc": 0.5714285714285714, "train_speed(iter/s)": 0.581965 }, { "epoch": 1.9265776699029127, "grad_norm": 8.231746673583984, "learning_rate": 2.8399915066723072e-05, "loss": 2.047636795043945, "memory(GiB)": 41.84, "step": 3175, "token_acc": 0.5, "train_speed(iter/s)": 0.581961 }, { "epoch": 1.929611650485437, "grad_norm": 7.52333927154541, "learning_rate": 2.8256753852029915e-05, "loss": 1.964263916015625, "memory(GiB)": 41.84, "step": 3180, "token_acc": 0.5566666666666666, "train_speed(iter/s)": 0.581957 }, { "epoch": 1.9326456310679612, "grad_norm": 8.115636825561523, "learning_rate": 2.811381212334847e-05, "loss": 1.974155807495117, "memory(GiB)": 41.84, "step": 3185, "token_acc": 0.5273775216138329, "train_speed(iter/s)": 0.58201 }, { "epoch": 1.9356796116504853, "grad_norm": 10.928778648376465, "learning_rate": 2.7971091323596177e-05, "loss": 1.7765790939331054, "memory(GiB)": 41.84, "step": 3190, "token_acc": 0.5967078189300411, "train_speed(iter/s)": 0.581997 }, { "epoch": 1.9387135922330097, "grad_norm": 10.940017700195312, "learning_rate": 2.782859289346038e-05, "loss": 2.00123291015625, "memory(GiB)": 41.84, "step": 3195, "token_acc": 0.5628930817610063, "train_speed(iter/s)": 0.58186 }, { "epoch": 1.941747572815534, "grad_norm": 5.479226112365723, "learning_rate": 2.7686318271383714e-05, "loss": 1.7830612182617187, "memory(GiB)": 41.84, "step": 3200, "token_acc": 0.5899705014749262, "train_speed(iter/s)": 0.581899 }, { "epoch": 1.9447815533980584, "grad_norm": 9.235628128051758, "learning_rate": 2.7544268893549573e-05, "loss": 2.1630695343017576, "memory(GiB)": 41.84, "step": 3205, "token_acc": 0.5141843971631206, "train_speed(iter/s)": 0.581774 }, { "epoch": 1.9478155339805825, "grad_norm": 9.116209030151367, "learning_rate": 2.740244619386768e-05, "loss": 1.9152229309082032, "memory(GiB)": 41.84, "step": 3210, "token_acc": 0.5373134328358209, "train_speed(iter/s)": 0.581685 }, { "epoch": 1.9508495145631068, "grad_norm": 8.476284980773926, "learning_rate": 2.726085160395948e-05, "loss": 1.9020435333251953, "memory(GiB)": 41.84, "step": 3215, "token_acc": 0.6095238095238096, "train_speed(iter/s)": 0.581688 }, { "epoch": 1.953883495145631, "grad_norm": 6.1975226402282715, "learning_rate": 2.7119486553143904e-05, "loss": 1.6950944900512694, "memory(GiB)": 41.84, "step": 3220, "token_acc": 0.5774193548387097, "train_speed(iter/s)": 0.58168 }, { "epoch": 1.9569174757281553, "grad_norm": 8.92437744140625, "learning_rate": 2.6978352468422685e-05, "loss": 1.9295099258422852, "memory(GiB)": 41.84, "step": 3225, "token_acc": 0.5393586005830904, "train_speed(iter/s)": 0.581539 }, { "epoch": 1.9599514563106797, "grad_norm": 7.443687438964844, "learning_rate": 2.683745077446616e-05, "loss": 1.8496671676635743, "memory(GiB)": 41.84, "step": 3230, "token_acc": 0.6013289036544851, "train_speed(iter/s)": 0.581521 }, { "epoch": 1.962985436893204, "grad_norm": 8.71033763885498, "learning_rate": 2.6696782893598816e-05, "loss": 1.8758098602294921, "memory(GiB)": 41.84, "step": 3235, "token_acc": 0.5804195804195804, "train_speed(iter/s)": 0.581603 }, { "epoch": 1.9660194174757282, "grad_norm": 9.311905860900879, "learning_rate": 2.6556350245784833e-05, "loss": 2.088191795349121, "memory(GiB)": 41.84, "step": 3240, "token_acc": 0.5805626598465473, "train_speed(iter/s)": 0.581562 }, { "epoch": 1.9690533980582523, "grad_norm": 7.559510707855225, "learning_rate": 2.641615424861399e-05, "loss": 2.090311050415039, "memory(GiB)": 41.84, "step": 3245, "token_acc": 0.5533980582524272, "train_speed(iter/s)": 0.58146 }, { "epoch": 1.9720873786407767, "grad_norm": 9.421564102172852, "learning_rate": 2.6276196317287083e-05, "loss": 2.2272558212280273, "memory(GiB)": 41.84, "step": 3250, "token_acc": 0.5223463687150838, "train_speed(iter/s)": 0.581307 }, { "epoch": 1.975121359223301, "grad_norm": 6.799111843109131, "learning_rate": 2.6136477864601817e-05, "loss": 2.049495887756348, "memory(GiB)": 41.84, "step": 3255, "token_acc": 0.5488215488215489, "train_speed(iter/s)": 0.581288 }, { "epoch": 1.9781553398058254, "grad_norm": 6.001493453979492, "learning_rate": 2.5997000300938506e-05, "loss": 1.8592962265014648, "memory(GiB)": 41.84, "step": 3260, "token_acc": 0.5870206489675516, "train_speed(iter/s)": 0.581194 }, { "epoch": 1.9811893203883495, "grad_norm": 8.738608360290527, "learning_rate": 2.585776503424576e-05, "loss": 2.017384719848633, "memory(GiB)": 41.84, "step": 3265, "token_acc": 0.5529801324503312, "train_speed(iter/s)": 0.581142 }, { "epoch": 1.9842233009708736, "grad_norm": 9.666224479675293, "learning_rate": 2.5718773470026448e-05, "loss": 1.999835205078125, "memory(GiB)": 41.84, "step": 3270, "token_acc": 0.5418060200668896, "train_speed(iter/s)": 0.581015 }, { "epoch": 1.987257281553398, "grad_norm": 10.135787963867188, "learning_rate": 2.5580027011323282e-05, "loss": 1.6806678771972656, "memory(GiB)": 41.84, "step": 3275, "token_acc": 0.5893536121673004, "train_speed(iter/s)": 0.580994 }, { "epoch": 1.9902912621359223, "grad_norm": 7.922843933105469, "learning_rate": 2.544152705870483e-05, "loss": 2.177354431152344, "memory(GiB)": 41.84, "step": 3280, "token_acc": 0.5117056856187291, "train_speed(iter/s)": 0.581002 }, { "epoch": 1.9933252427184467, "grad_norm": 6.94931697845459, "learning_rate": 2.5303275010251315e-05, "loss": 2.506937026977539, "memory(GiB)": 41.84, "step": 3285, "token_acc": 0.4811594202898551, "train_speed(iter/s)": 0.581043 }, { "epoch": 1.9963592233009708, "grad_norm": 10.3767728805542, "learning_rate": 2.5165272261540458e-05, "loss": 2.0383968353271484, "memory(GiB)": 41.84, "step": 3290, "token_acc": 0.5487364620938628, "train_speed(iter/s)": 0.581 }, { "epoch": 1.9993932038834952, "grad_norm": 9.176785469055176, "learning_rate": 2.5027520205633537e-05, "loss": 2.0018213272094725, "memory(GiB)": 41.84, "step": 3295, "token_acc": 0.5522875816993464, "train_speed(iter/s)": 0.580883 }, { "epoch": 2.0024271844660193, "grad_norm": 6.717225551605225, "learning_rate": 2.4890020233061117e-05, "loss": 1.7098587036132813, "memory(GiB)": 41.84, "step": 3300, "token_acc": 0.5948905109489051, "train_speed(iter/s)": 0.580883 }, { "epoch": 2.0054611650485437, "grad_norm": 5.7973246574401855, "learning_rate": 2.4752773731809176e-05, "loss": 2.0262834548950197, "memory(GiB)": 41.84, "step": 3305, "token_acc": 0.558641975308642, "train_speed(iter/s)": 0.580856 }, { "epoch": 2.008495145631068, "grad_norm": 7.4671831130981445, "learning_rate": 2.461578208730504e-05, "loss": 1.7233488082885742, "memory(GiB)": 41.84, "step": 3310, "token_acc": 0.6162790697674418, "train_speed(iter/s)": 0.580794 }, { "epoch": 2.0115291262135924, "grad_norm": 12.061534881591797, "learning_rate": 2.447904668240338e-05, "loss": 1.8241962432861327, "memory(GiB)": 41.84, "step": 3315, "token_acc": 0.6076923076923076, "train_speed(iter/s)": 0.580734 }, { "epoch": 2.0145631067961167, "grad_norm": 8.090734481811523, "learning_rate": 2.4342568897372304e-05, "loss": 1.7618919372558595, "memory(GiB)": 41.84, "step": 3320, "token_acc": 0.5701219512195121, "train_speed(iter/s)": 0.58063 }, { "epoch": 2.0175970873786406, "grad_norm": 9.886768341064453, "learning_rate": 2.4206350109879322e-05, "loss": 2.333799362182617, "memory(GiB)": 41.84, "step": 3325, "token_acc": 0.5303514376996805, "train_speed(iter/s)": 0.58057 }, { "epoch": 2.020631067961165, "grad_norm": 9.405782699584961, "learning_rate": 2.4070391694977578e-05, "loss": 1.9533946990966797, "memory(GiB)": 41.84, "step": 3330, "token_acc": 0.5647840531561462, "train_speed(iter/s)": 0.580546 }, { "epoch": 2.0236650485436893, "grad_norm": 8.449411392211914, "learning_rate": 2.3934695025091863e-05, "loss": 1.9143606185913087, "memory(GiB)": 41.84, "step": 3335, "token_acc": 0.5501618122977346, "train_speed(iter/s)": 0.580463 }, { "epoch": 2.0266990291262137, "grad_norm": 9.61319351196289, "learning_rate": 2.3799261470004817e-05, "loss": 1.825465202331543, "memory(GiB)": 41.84, "step": 3340, "token_acc": 0.5772357723577236, "train_speed(iter/s)": 0.580476 }, { "epoch": 2.029733009708738, "grad_norm": 10.004016876220703, "learning_rate": 2.3664092396843078e-05, "loss": 2.128991889953613, "memory(GiB)": 41.84, "step": 3345, "token_acc": 0.5173501577287066, "train_speed(iter/s)": 0.58049 }, { "epoch": 2.032766990291262, "grad_norm": 8.138049125671387, "learning_rate": 2.3529189170063448e-05, "loss": 2.3146188735961912, "memory(GiB)": 41.84, "step": 3350, "token_acc": 0.5306122448979592, "train_speed(iter/s)": 0.580442 }, { "epoch": 2.0358009708737863, "grad_norm": 8.229063987731934, "learning_rate": 2.3394553151439207e-05, "loss": 1.8358327865600585, "memory(GiB)": 41.84, "step": 3355, "token_acc": 0.583941605839416, "train_speed(iter/s)": 0.580463 }, { "epoch": 2.0388349514563107, "grad_norm": 7.304425239562988, "learning_rate": 2.3260185700046294e-05, "loss": 1.8064495086669923, "memory(GiB)": 41.84, "step": 3360, "token_acc": 0.5791245791245792, "train_speed(iter/s)": 0.58043 }, { "epoch": 2.041868932038835, "grad_norm": 9.741589546203613, "learning_rate": 2.3126088172249617e-05, "loss": 1.8935234069824218, "memory(GiB)": 41.84, "step": 3365, "token_acc": 0.5535055350553506, "train_speed(iter/s)": 0.580243 }, { "epoch": 2.0449029126213594, "grad_norm": 11.936101913452148, "learning_rate": 2.299226192168935e-05, "loss": 1.8312896728515624, "memory(GiB)": 41.84, "step": 3370, "token_acc": 0.5755627009646302, "train_speed(iter/s)": 0.580241 }, { "epoch": 2.0479368932038833, "grad_norm": 8.954520225524902, "learning_rate": 2.28587082992673e-05, "loss": 1.9918130874633788, "memory(GiB)": 41.84, "step": 3375, "token_acc": 0.5692307692307692, "train_speed(iter/s)": 0.580236 }, { "epoch": 2.0509708737864076, "grad_norm": 7.279824256896973, "learning_rate": 2.2725428653133178e-05, "loss": 2.056449317932129, "memory(GiB)": 41.84, "step": 3380, "token_acc": 0.5582655826558266, "train_speed(iter/s)": 0.580216 }, { "epoch": 2.054004854368932, "grad_norm": 8.318132400512695, "learning_rate": 2.2592424328671125e-05, "loss": 1.845474624633789, "memory(GiB)": 41.84, "step": 3385, "token_acc": 0.5753846153846154, "train_speed(iter/s)": 0.580119 }, { "epoch": 2.0570388349514563, "grad_norm": 8.473575592041016, "learning_rate": 2.2459696668486025e-05, "loss": 2.0317916870117188, "memory(GiB)": 41.84, "step": 3390, "token_acc": 0.5693950177935944, "train_speed(iter/s)": 0.580048 }, { "epoch": 2.0600728155339807, "grad_norm": 6.581578254699707, "learning_rate": 2.2327247012390005e-05, "loss": 1.8874988555908203, "memory(GiB)": 41.84, "step": 3395, "token_acc": 0.5551839464882943, "train_speed(iter/s)": 0.579895 }, { "epoch": 2.063106796116505, "grad_norm": 9.253079414367676, "learning_rate": 2.2195076697388915e-05, "loss": 1.6856924057006837, "memory(GiB)": 41.84, "step": 3400, "token_acc": 0.6493506493506493, "train_speed(iter/s)": 0.57988 }, { "epoch": 2.066140776699029, "grad_norm": 8.945847511291504, "learning_rate": 2.2063187057668727e-05, "loss": 1.6917535781860351, "memory(GiB)": 41.84, "step": 3405, "token_acc": 0.5947712418300654, "train_speed(iter/s)": 0.57992 }, { "epoch": 2.0691747572815533, "grad_norm": 9.185718536376953, "learning_rate": 2.1931579424582283e-05, "loss": 1.7603189468383789, "memory(GiB)": 41.84, "step": 3410, "token_acc": 0.5736434108527132, "train_speed(iter/s)": 0.579988 }, { "epoch": 2.0722087378640777, "grad_norm": 6.9922332763671875, "learning_rate": 2.18002551266356e-05, "loss": 2.1215755462646486, "memory(GiB)": 41.84, "step": 3415, "token_acc": 0.5202312138728323, "train_speed(iter/s)": 0.579988 }, { "epoch": 2.075242718446602, "grad_norm": 8.512064933776855, "learning_rate": 2.166921548947466e-05, "loss": 1.720651626586914, "memory(GiB)": 41.84, "step": 3420, "token_acc": 0.5985915492957746, "train_speed(iter/s)": 0.579814 }, { "epoch": 2.0782766990291264, "grad_norm": 8.933260917663574, "learning_rate": 2.1538461835871937e-05, "loss": 1.8302701950073241, "memory(GiB)": 41.84, "step": 3425, "token_acc": 0.5993975903614458, "train_speed(iter/s)": 0.579754 }, { "epoch": 2.0813106796116503, "grad_norm": 7.324397087097168, "learning_rate": 2.1407995485713007e-05, "loss": 1.9634611129760742, "memory(GiB)": 41.84, "step": 3430, "token_acc": 0.5775075987841946, "train_speed(iter/s)": 0.579633 }, { "epoch": 2.0843446601941746, "grad_norm": 6.617276191711426, "learning_rate": 2.127781775598339e-05, "loss": 1.535646343231201, "memory(GiB)": 41.84, "step": 3435, "token_acc": 0.62, "train_speed(iter/s)": 0.579668 }, { "epoch": 2.087378640776699, "grad_norm": 8.722604751586914, "learning_rate": 2.1147929960755032e-05, "loss": 1.8054920196533204, "memory(GiB)": 41.84, "step": 3440, "token_acc": 0.5772058823529411, "train_speed(iter/s)": 0.579792 }, { "epoch": 2.0904126213592233, "grad_norm": 7.9137043952941895, "learning_rate": 2.101833341117319e-05, "loss": 1.9117881774902343, "memory(GiB)": 41.84, "step": 3445, "token_acc": 0.5891238670694864, "train_speed(iter/s)": 0.579887 }, { "epoch": 2.0934466019417477, "grad_norm": 8.221436500549316, "learning_rate": 2.08890294154432e-05, "loss": 2.002272033691406, "memory(GiB)": 41.84, "step": 3450, "token_acc": 0.5616883116883117, "train_speed(iter/s)": 0.580016 }, { "epoch": 2.096480582524272, "grad_norm": 8.50936222076416, "learning_rate": 2.0760019278817123e-05, "loss": 1.9437885284423828, "memory(GiB)": 44.28, "step": 3455, "token_acc": 0.6167247386759582, "train_speed(iter/s)": 0.580014 }, { "epoch": 2.099514563106796, "grad_norm": 8.858839988708496, "learning_rate": 2.0631304303580824e-05, "loss": 1.8394168853759765, "memory(GiB)": 44.28, "step": 3460, "token_acc": 0.5693430656934306, "train_speed(iter/s)": 0.580039 }, { "epoch": 2.1025485436893203, "grad_norm": 7.461985111236572, "learning_rate": 2.0502885789040537e-05, "loss": 2.222452163696289, "memory(GiB)": 44.28, "step": 3465, "token_acc": 0.5231607629427792, "train_speed(iter/s)": 0.580161 }, { "epoch": 2.1055825242718447, "grad_norm": 6.139802932739258, "learning_rate": 2.037476503150997e-05, "loss": 1.6303333282470702, "memory(GiB)": 44.28, "step": 3470, "token_acc": 0.6225165562913907, "train_speed(iter/s)": 0.580274 }, { "epoch": 2.108616504854369, "grad_norm": 9.019342422485352, "learning_rate": 2.024694332429713e-05, "loss": 2.1092754364013673, "memory(GiB)": 44.28, "step": 3475, "token_acc": 0.5068493150684932, "train_speed(iter/s)": 0.580406 }, { "epoch": 2.1116504854368934, "grad_norm": 10.167961120605469, "learning_rate": 2.011942195769122e-05, "loss": 1.965473747253418, "memory(GiB)": 44.28, "step": 3480, "token_acc": 0.5733788395904437, "train_speed(iter/s)": 0.580421 }, { "epoch": 2.1146844660194173, "grad_norm": 11.388608932495117, "learning_rate": 1.9992202218949784e-05, "loss": 1.9142690658569337, "memory(GiB)": 44.28, "step": 3485, "token_acc": 0.5517241379310345, "train_speed(iter/s)": 0.58051 }, { "epoch": 2.1177184466019416, "grad_norm": 6.913421154022217, "learning_rate": 1.986528539228548e-05, "loss": 1.9621810913085938, "memory(GiB)": 44.28, "step": 3490, "token_acc": 0.55, "train_speed(iter/s)": 0.580592 }, { "epoch": 2.120752427184466, "grad_norm": 7.60167121887207, "learning_rate": 1.9738672758853305e-05, "loss": 1.8437973022460938, "memory(GiB)": 44.28, "step": 3495, "token_acc": 0.5822368421052632, "train_speed(iter/s)": 0.580641 }, { "epoch": 2.1237864077669903, "grad_norm": 7.163271427154541, "learning_rate": 1.9612365596737598e-05, "loss": 1.6543169021606445, "memory(GiB)": 44.28, "step": 3500, "token_acc": 0.6292134831460674, "train_speed(iter/s)": 0.58071 }, { "epoch": 2.1237864077669903, "eval_loss": 1.866715431213379, "eval_runtime": 11.5427, "eval_samples_per_second": 8.663, "eval_steps_per_second": 8.663, "eval_token_acc": 0.5221745350500715, "step": 3500 }, { "epoch": 2.1268203883495147, "grad_norm": 8.629687309265137, "learning_rate": 1.948636518093906e-05, "loss": 2.077587127685547, "memory(GiB)": 44.29, "step": 3505, "token_acc": 0.5247895229186156, "train_speed(iter/s)": 0.579649 }, { "epoch": 2.1298543689320386, "grad_norm": 9.276599884033203, "learning_rate": 1.9360672783362076e-05, "loss": 1.759820556640625, "memory(GiB)": 44.29, "step": 3510, "token_acc": 0.5964912280701754, "train_speed(iter/s)": 0.579656 }, { "epoch": 2.132888349514563, "grad_norm": 7.651179790496826, "learning_rate": 1.9235289672801653e-05, "loss": 2.0451793670654297, "memory(GiB)": 44.29, "step": 3515, "token_acc": 0.4962025316455696, "train_speed(iter/s)": 0.579702 }, { "epoch": 2.1359223300970873, "grad_norm": 8.811480522155762, "learning_rate": 1.911021711493077e-05, "loss": 2.1489105224609375, "memory(GiB)": 44.29, "step": 3520, "token_acc": 0.5483870967741935, "train_speed(iter/s)": 0.579789 }, { "epoch": 2.1389563106796117, "grad_norm": 7.404130935668945, "learning_rate": 1.8985456372287534e-05, "loss": 1.7454706192016602, "memory(GiB)": 44.29, "step": 3525, "token_acc": 0.6334405144694534, "train_speed(iter/s)": 0.579816 }, { "epoch": 2.141990291262136, "grad_norm": 10.67794132232666, "learning_rate": 1.8861008704262457e-05, "loss": 1.8724552154541017, "memory(GiB)": 44.29, "step": 3530, "token_acc": 0.5796610169491525, "train_speed(iter/s)": 0.579807 }, { "epoch": 2.1450242718446604, "grad_norm": 8.964271545410156, "learning_rate": 1.8736875367085755e-05, "loss": 1.8260086059570313, "memory(GiB)": 44.29, "step": 3535, "token_acc": 0.5640138408304498, "train_speed(iter/s)": 0.579826 }, { "epoch": 2.1480582524271843, "grad_norm": 8.768270492553711, "learning_rate": 1.8613057613814584e-05, "loss": 1.9611518859863282, "memory(GiB)": 44.29, "step": 3540, "token_acc": 0.552901023890785, "train_speed(iter/s)": 0.579777 }, { "epoch": 2.1510922330097086, "grad_norm": 9.284390449523926, "learning_rate": 1.8489556694320513e-05, "loss": 2.0381515502929686, "memory(GiB)": 44.29, "step": 3545, "token_acc": 0.5503875968992248, "train_speed(iter/s)": 0.579814 }, { "epoch": 2.154126213592233, "grad_norm": 11.225659370422363, "learning_rate": 1.836637385527684e-05, "loss": 2.1475587844848634, "memory(GiB)": 44.29, "step": 3550, "token_acc": 0.5677233429394812, "train_speed(iter/s)": 0.579745 }, { "epoch": 2.1571601941747574, "grad_norm": 7.625835418701172, "learning_rate": 1.8243510340146015e-05, "loss": 1.9312858581542969, "memory(GiB)": 44.29, "step": 3555, "token_acc": 0.5634328358208955, "train_speed(iter/s)": 0.579783 }, { "epoch": 2.1601941747572817, "grad_norm": 9.190287590026855, "learning_rate": 1.8120967389167076e-05, "loss": 1.5170929908752442, "memory(GiB)": 44.29, "step": 3560, "token_acc": 0.6521739130434783, "train_speed(iter/s)": 0.579754 }, { "epoch": 2.163228155339806, "grad_norm": 11.153077125549316, "learning_rate": 1.799874623934318e-05, "loss": 1.931208610534668, "memory(GiB)": 44.29, "step": 3565, "token_acc": 0.5909090909090909, "train_speed(iter/s)": 0.579843 }, { "epoch": 2.16626213592233, "grad_norm": 6.920065879821777, "learning_rate": 1.7876848124429014e-05, "loss": 1.7487638473510743, "memory(GiB)": 44.29, "step": 3570, "token_acc": 0.5811209439528023, "train_speed(iter/s)": 0.579783 }, { "epoch": 2.1692961165048543, "grad_norm": 9.68315601348877, "learning_rate": 1.775527427491847e-05, "loss": 1.9304796218872071, "memory(GiB)": 44.29, "step": 3575, "token_acc": 0.5424354243542435, "train_speed(iter/s)": 0.579873 }, { "epoch": 2.1723300970873787, "grad_norm": 8.648472785949707, "learning_rate": 1.7634025918032132e-05, "loss": 1.822089385986328, "memory(GiB)": 44.29, "step": 3580, "token_acc": 0.615625, "train_speed(iter/s)": 0.579913 }, { "epoch": 2.175364077669903, "grad_norm": 5.941222190856934, "learning_rate": 1.7513104277704926e-05, "loss": 1.5487011909484862, "memory(GiB)": 44.29, "step": 3585, "token_acc": 0.6267123287671232, "train_speed(iter/s)": 0.579944 }, { "epoch": 2.1783980582524274, "grad_norm": 8.311307907104492, "learning_rate": 1.739251057457377e-05, "loss": 1.876582145690918, "memory(GiB)": 44.29, "step": 3590, "token_acc": 0.5734463276836158, "train_speed(iter/s)": 0.579958 }, { "epoch": 2.1814320388349513, "grad_norm": 9.144810676574707, "learning_rate": 1.7272246025965178e-05, "loss": 2.155200386047363, "memory(GiB)": 44.29, "step": 3595, "token_acc": 0.5140845070422535, "train_speed(iter/s)": 0.579869 }, { "epoch": 2.1844660194174756, "grad_norm": 7.681180953979492, "learning_rate": 1.7152311845883095e-05, "loss": 1.7877147674560547, "memory(GiB)": 44.29, "step": 3600, "token_acc": 0.5666666666666667, "train_speed(iter/s)": 0.57981 }, { "epoch": 2.1875, "grad_norm": 8.98862361907959, "learning_rate": 1.703270924499656e-05, "loss": 1.7724479675292968, "memory(GiB)": 44.29, "step": 3605, "token_acc": 0.5951557093425606, "train_speed(iter/s)": 0.57974 }, { "epoch": 2.1905339805825244, "grad_norm": 6.949456214904785, "learning_rate": 1.691343943062749e-05, "loss": 1.7420495986938476, "memory(GiB)": 44.29, "step": 3610, "token_acc": 0.5741935483870968, "train_speed(iter/s)": 0.579755 }, { "epoch": 2.1935679611650487, "grad_norm": 7.481090545654297, "learning_rate": 1.6794503606738548e-05, "loss": 2.0047124862670898, "memory(GiB)": 44.29, "step": 3615, "token_acc": 0.5398230088495575, "train_speed(iter/s)": 0.57981 }, { "epoch": 2.1966019417475726, "grad_norm": 7.942904472351074, "learning_rate": 1.667590297392086e-05, "loss": 2.1652708053588867, "memory(GiB)": 44.29, "step": 3620, "token_acc": 0.5389048991354467, "train_speed(iter/s)": 0.57981 }, { "epoch": 2.199635922330097, "grad_norm": 7.470623016357422, "learning_rate": 1.6557638729382107e-05, "loss": 1.7064685821533203, "memory(GiB)": 44.29, "step": 3625, "token_acc": 0.6104651162790697, "train_speed(iter/s)": 0.57981 }, { "epoch": 2.2026699029126213, "grad_norm": 6.908362865447998, "learning_rate": 1.6439712066934204e-05, "loss": 1.8296821594238282, "memory(GiB)": 44.29, "step": 3630, "token_acc": 0.5864022662889519, "train_speed(iter/s)": 0.579781 }, { "epoch": 2.2057038834951457, "grad_norm": 7.870819568634033, "learning_rate": 1.632212417698143e-05, "loss": 1.9550270080566405, "memory(GiB)": 44.29, "step": 3635, "token_acc": 0.5835777126099707, "train_speed(iter/s)": 0.579726 }, { "epoch": 2.20873786407767, "grad_norm": 8.01059627532959, "learning_rate": 1.620487624650834e-05, "loss": 1.8678318023681642, "memory(GiB)": 44.29, "step": 3640, "token_acc": 0.61875, "train_speed(iter/s)": 0.579706 }, { "epoch": 2.211771844660194, "grad_norm": 7.753682613372803, "learning_rate": 1.6087969459067708e-05, "loss": 1.5739126205444336, "memory(GiB)": 44.29, "step": 3645, "token_acc": 0.6141479099678456, "train_speed(iter/s)": 0.57974 }, { "epoch": 2.2148058252427183, "grad_norm": 8.250489234924316, "learning_rate": 1.5971404994768797e-05, "loss": 1.9059646606445313, "memory(GiB)": 44.29, "step": 3650, "token_acc": 0.5551470588235294, "train_speed(iter/s)": 0.579782 }, { "epoch": 2.2178398058252426, "grad_norm": 8.499149322509766, "learning_rate": 1.585518403026518e-05, "loss": 2.0898170471191406, "memory(GiB)": 44.29, "step": 3655, "token_acc": 0.59, "train_speed(iter/s)": 0.579868 }, { "epoch": 2.220873786407767, "grad_norm": 9.44747543334961, "learning_rate": 1.5739307738743057e-05, "loss": 1.9359277725219726, "memory(GiB)": 44.29, "step": 3660, "token_acc": 0.5628930817610063, "train_speed(iter/s)": 0.579972 }, { "epoch": 2.2239077669902914, "grad_norm": 6.627506256103516, "learning_rate": 1.5623777289909347e-05, "loss": 1.749598503112793, "memory(GiB)": 44.29, "step": 3665, "token_acc": 0.621160409556314, "train_speed(iter/s)": 0.579956 }, { "epoch": 2.2269417475728157, "grad_norm": 9.652698516845703, "learning_rate": 1.5508593849979812e-05, "loss": 1.946786117553711, "memory(GiB)": 44.29, "step": 3670, "token_acc": 0.5962732919254659, "train_speed(iter/s)": 0.580051 }, { "epoch": 2.2299757281553396, "grad_norm": 11.728522300720215, "learning_rate": 1.5393758581667462e-05, "loss": 1.8440595626831056, "memory(GiB)": 44.29, "step": 3675, "token_acc": 0.5598455598455598, "train_speed(iter/s)": 0.580023 }, { "epoch": 2.233009708737864, "grad_norm": 9.42689323425293, "learning_rate": 1.52792726441706e-05, "loss": 2.040317916870117, "memory(GiB)": 44.29, "step": 3680, "token_acc": 0.5699658703071673, "train_speed(iter/s)": 0.580045 }, { "epoch": 2.2360436893203883, "grad_norm": 9.370969772338867, "learning_rate": 1.5165137193161289e-05, "loss": 1.9046701431274413, "memory(GiB)": 44.29, "step": 3685, "token_acc": 0.5689149560117303, "train_speed(iter/s)": 0.580056 }, { "epoch": 2.2390776699029127, "grad_norm": 9.691226959228516, "learning_rate": 1.505135338077363e-05, "loss": 2.0255931854248046, "memory(GiB)": 44.29, "step": 3690, "token_acc": 0.5156695156695157, "train_speed(iter/s)": 0.579994 }, { "epoch": 2.242111650485437, "grad_norm": 7.089369773864746, "learning_rate": 1.4937922355592054e-05, "loss": 1.856874656677246, "memory(GiB)": 44.29, "step": 3695, "token_acc": 0.5548961424332344, "train_speed(iter/s)": 0.580088 }, { "epoch": 2.2451456310679614, "grad_norm": 8.272523880004883, "learning_rate": 1.482484526263993e-05, "loss": 1.9418399810791016, "memory(GiB)": 44.29, "step": 3700, "token_acc": 0.5479041916167665, "train_speed(iter/s)": 0.580039 }, { "epoch": 2.2481796116504853, "grad_norm": 19.949644088745117, "learning_rate": 1.4712123243367742e-05, "loss": 2.0299962997436523, "memory(GiB)": 44.29, "step": 3705, "token_acc": 0.5658362989323843, "train_speed(iter/s)": 0.580004 }, { "epoch": 2.2512135922330097, "grad_norm": 12.743327140808105, "learning_rate": 1.459975743564178e-05, "loss": 1.9635414123535155, "memory(GiB)": 44.29, "step": 3710, "token_acc": 0.5590277777777778, "train_speed(iter/s)": 0.579959 }, { "epoch": 2.254247572815534, "grad_norm": 6.324910640716553, "learning_rate": 1.4487748973732567e-05, "loss": 2.068693733215332, "memory(GiB)": 44.29, "step": 3715, "token_acc": 0.5710382513661202, "train_speed(iter/s)": 0.579955 }, { "epoch": 2.2572815533980584, "grad_norm": 8.611750602722168, "learning_rate": 1.4376098988303405e-05, "loss": 1.7477828979492187, "memory(GiB)": 44.29, "step": 3720, "token_acc": 0.5627118644067797, "train_speed(iter/s)": 0.579921 }, { "epoch": 2.2603155339805827, "grad_norm": 8.731199264526367, "learning_rate": 1.4264808606398988e-05, "loss": 1.9445646286010743, "memory(GiB)": 44.29, "step": 3725, "token_acc": 0.5650969529085873, "train_speed(iter/s)": 0.579908 }, { "epoch": 2.2633495145631066, "grad_norm": 8.617072105407715, "learning_rate": 1.4153878951433985e-05, "loss": 1.764409065246582, "memory(GiB)": 44.29, "step": 3730, "token_acc": 0.6271186440677966, "train_speed(iter/s)": 0.579952 }, { "epoch": 2.266383495145631, "grad_norm": 6.622957706451416, "learning_rate": 1.4043311143181743e-05, "loss": 1.8772661209106445, "memory(GiB)": 44.29, "step": 3735, "token_acc": 0.5902578796561605, "train_speed(iter/s)": 0.579925 }, { "epoch": 2.2694174757281553, "grad_norm": 7.272273063659668, "learning_rate": 1.3933106297762983e-05, "loss": 1.6700300216674804, "memory(GiB)": 44.29, "step": 3740, "token_acc": 0.6431095406360424, "train_speed(iter/s)": 0.579928 }, { "epoch": 2.2724514563106797, "grad_norm": 8.500160217285156, "learning_rate": 1.38232655276345e-05, "loss": 1.9523941040039063, "memory(GiB)": 44.29, "step": 3745, "token_acc": 0.5574324324324325, "train_speed(iter/s)": 0.579866 }, { "epoch": 2.275485436893204, "grad_norm": 10.481255531311035, "learning_rate": 1.3713789941577947e-05, "loss": 1.935152816772461, "memory(GiB)": 44.29, "step": 3750, "token_acc": 0.5851851851851851, "train_speed(iter/s)": 0.579728 }, { "epoch": 2.278519417475728, "grad_norm": 8.817157745361328, "learning_rate": 1.3604680644688673e-05, "loss": 2.029979705810547, "memory(GiB)": 44.29, "step": 3755, "token_acc": 0.5822784810126582, "train_speed(iter/s)": 0.579834 }, { "epoch": 2.2815533980582523, "grad_norm": 8.694374084472656, "learning_rate": 1.3495938738364495e-05, "loss": 1.8262203216552735, "memory(GiB)": 44.29, "step": 3760, "token_acc": 0.6044776119402985, "train_speed(iter/s)": 0.57994 }, { "epoch": 2.2845873786407767, "grad_norm": 8.665304183959961, "learning_rate": 1.338756532029466e-05, "loss": 1.8623455047607422, "memory(GiB)": 44.29, "step": 3765, "token_acc": 0.5573122529644269, "train_speed(iter/s)": 0.579929 }, { "epoch": 2.287621359223301, "grad_norm": 8.049120903015137, "learning_rate": 1.3279561484448726e-05, "loss": 1.8126539230346679, "memory(GiB)": 44.29, "step": 3770, "token_acc": 0.6254416961130742, "train_speed(iter/s)": 0.579935 }, { "epoch": 2.2906553398058254, "grad_norm": 5.988779544830322, "learning_rate": 1.3171928321065525e-05, "loss": 1.5385218620300294, "memory(GiB)": 44.29, "step": 3775, "token_acc": 0.657243816254417, "train_speed(iter/s)": 0.580025 }, { "epoch": 2.2936893203883493, "grad_norm": 5.922063827514648, "learning_rate": 1.306466691664216e-05, "loss": 1.7553050994873047, "memory(GiB)": 44.29, "step": 3780, "token_acc": 0.5925925925925926, "train_speed(iter/s)": 0.579998 }, { "epoch": 2.2967233009708736, "grad_norm": 6.754926681518555, "learning_rate": 1.2957778353922994e-05, "loss": 1.6977853775024414, "memory(GiB)": 44.29, "step": 3785, "token_acc": 0.5875912408759124, "train_speed(iter/s)": 0.580116 }, { "epoch": 2.299757281553398, "grad_norm": 6.817199230194092, "learning_rate": 1.285126371188881e-05, "loss": 1.9571613311767577, "memory(GiB)": 44.29, "step": 3790, "token_acc": 0.5661971830985916, "train_speed(iter/s)": 0.580194 }, { "epoch": 2.3027912621359223, "grad_norm": 10.896566390991211, "learning_rate": 1.2745124065745845e-05, "loss": 1.7496770858764648, "memory(GiB)": 44.29, "step": 3795, "token_acc": 0.6095238095238096, "train_speed(iter/s)": 0.580115 }, { "epoch": 2.3058252427184467, "grad_norm": 7.805569171905518, "learning_rate": 1.2639360486914964e-05, "loss": 2.1383758544921876, "memory(GiB)": 44.29, "step": 3800, "token_acc": 0.5325779036827195, "train_speed(iter/s)": 0.580145 }, { "epoch": 2.308859223300971, "grad_norm": 8.069032669067383, "learning_rate": 1.2533974043020862e-05, "loss": 1.7861778259277343, "memory(GiB)": 44.29, "step": 3805, "token_acc": 0.594855305466238, "train_speed(iter/s)": 0.580121 }, { "epoch": 2.311893203883495, "grad_norm": 10.002004623413086, "learning_rate": 1.2428965797881204e-05, "loss": 1.8549165725708008, "memory(GiB)": 44.29, "step": 3810, "token_acc": 0.559375, "train_speed(iter/s)": 0.580167 }, { "epoch": 2.3149271844660193, "grad_norm": 10.041362762451172, "learning_rate": 1.232433681149604e-05, "loss": 1.9269153594970703, "memory(GiB)": 44.29, "step": 3815, "token_acc": 0.5762195121951219, "train_speed(iter/s)": 0.580098 }, { "epoch": 2.3179611650485437, "grad_norm": 7.903229236602783, "learning_rate": 1.2220088140036934e-05, "loss": 1.8197761535644532, "memory(GiB)": 44.29, "step": 3820, "token_acc": 0.6220735785953178, "train_speed(iter/s)": 0.580156 }, { "epoch": 2.320995145631068, "grad_norm": 7.331014156341553, "learning_rate": 1.2116220835836389e-05, "loss": 2.0878772735595703, "memory(GiB)": 44.29, "step": 3825, "token_acc": 0.5467128027681661, "train_speed(iter/s)": 0.580167 }, { "epoch": 2.3240291262135924, "grad_norm": 11.29516315460205, "learning_rate": 1.2012735947377297e-05, "loss": 1.9641210556030273, "memory(GiB)": 44.29, "step": 3830, "token_acc": 0.5759493670886076, "train_speed(iter/s)": 0.580195 }, { "epoch": 2.3270631067961167, "grad_norm": 9.621826171875, "learning_rate": 1.1909634519282154e-05, "loss": 1.9087528228759765, "memory(GiB)": 44.29, "step": 3835, "token_acc": 0.5802047781569966, "train_speed(iter/s)": 0.580189 }, { "epoch": 2.3300970873786406, "grad_norm": 7.312023162841797, "learning_rate": 1.1806917592302762e-05, "loss": 1.5428638458251953, "memory(GiB)": 44.29, "step": 3840, "token_acc": 0.6550522648083623, "train_speed(iter/s)": 0.580258 }, { "epoch": 2.333131067961165, "grad_norm": 12.322574615478516, "learning_rate": 1.1704586203309486e-05, "loss": 2.2512718200683595, "memory(GiB)": 44.29, "step": 3845, "token_acc": 0.5261627906976745, "train_speed(iter/s)": 0.580196 }, { "epoch": 2.3361650485436893, "grad_norm": 7.682923316955566, "learning_rate": 1.1602641385280971e-05, "loss": 2.14353084564209, "memory(GiB)": 44.29, "step": 3850, "token_acc": 0.5100502512562815, "train_speed(iter/s)": 0.580288 }, { "epoch": 2.3391990291262137, "grad_norm": 8.914677619934082, "learning_rate": 1.1501084167293624e-05, "loss": 1.8753440856933594, "memory(GiB)": 44.29, "step": 3855, "token_acc": 0.587248322147651, "train_speed(iter/s)": 0.58029 }, { "epoch": 2.342233009708738, "grad_norm": 8.7797212600708, "learning_rate": 1.1399915574511205e-05, "loss": 1.93109130859375, "memory(GiB)": 44.29, "step": 3860, "token_acc": 0.5684931506849316, "train_speed(iter/s)": 0.58018 }, { "epoch": 2.345266990291262, "grad_norm": 6.757023334503174, "learning_rate": 1.1299136628174606e-05, "loss": 1.959303855895996, "memory(GiB)": 44.29, "step": 3865, "token_acc": 0.573134328358209, "train_speed(iter/s)": 0.580123 }, { "epoch": 2.3483009708737863, "grad_norm": 7.724388599395752, "learning_rate": 1.1198748345591358e-05, "loss": 1.923073959350586, "memory(GiB)": 44.29, "step": 3870, "token_acc": 0.5460122699386503, "train_speed(iter/s)": 0.580153 }, { "epoch": 2.3513349514563107, "grad_norm": 8.733378410339355, "learning_rate": 1.1098751740125518e-05, "loss": 1.9303054809570312, "memory(GiB)": 44.29, "step": 3875, "token_acc": 0.5620437956204379, "train_speed(iter/s)": 0.580197 }, { "epoch": 2.354368932038835, "grad_norm": 7.248959541320801, "learning_rate": 1.0999147821187378e-05, "loss": 1.9763971328735352, "memory(GiB)": 44.29, "step": 3880, "token_acc": 0.528052805280528, "train_speed(iter/s)": 0.580278 }, { "epoch": 2.3574029126213594, "grad_norm": 7.560742378234863, "learning_rate": 1.0899937594223225e-05, "loss": 2.138459014892578, "memory(GiB)": 44.29, "step": 3885, "token_acc": 0.5240793201133145, "train_speed(iter/s)": 0.580203 }, { "epoch": 2.3604368932038833, "grad_norm": 7.769505023956299, "learning_rate": 1.080112206070531e-05, "loss": 1.8142425537109375, "memory(GiB)": 44.29, "step": 3890, "token_acc": 0.5935483870967742, "train_speed(iter/s)": 0.580071 }, { "epoch": 2.3634708737864076, "grad_norm": 8.150938987731934, "learning_rate": 1.070270221812163e-05, "loss": 2.216781234741211, "memory(GiB)": 44.29, "step": 3895, "token_acc": 0.49122807017543857, "train_speed(iter/s)": 0.580092 }, { "epoch": 2.366504854368932, "grad_norm": 6.342752456665039, "learning_rate": 1.0604679059965922e-05, "loss": 1.5916692733764648, "memory(GiB)": 44.29, "step": 3900, "token_acc": 0.6594982078853047, "train_speed(iter/s)": 0.580113 }, { "epoch": 2.3695388349514563, "grad_norm": 8.572466850280762, "learning_rate": 1.050705357572761e-05, "loss": 1.6800006866455077, "memory(GiB)": 44.29, "step": 3905, "token_acc": 0.5981873111782477, "train_speed(iter/s)": 0.580211 }, { "epoch": 2.3725728155339807, "grad_norm": 10.74704360961914, "learning_rate": 1.0409826750881824e-05, "loss": 2.0315380096435547, "memory(GiB)": 44.29, "step": 3910, "token_acc": 0.5259067357512953, "train_speed(iter/s)": 0.580153 }, { "epoch": 2.375606796116505, "grad_norm": 10.060522079467773, "learning_rate": 1.031299956687941e-05, "loss": 1.925653839111328, "memory(GiB)": 44.29, "step": 3915, "token_acc": 0.5745454545454546, "train_speed(iter/s)": 0.580105 }, { "epoch": 2.378640776699029, "grad_norm": 6.9279704093933105, "learning_rate": 1.0216573001137126e-05, "loss": 1.7791040420532227, "memory(GiB)": 44.29, "step": 3920, "token_acc": 0.6091954022988506, "train_speed(iter/s)": 0.580094 }, { "epoch": 2.3816747572815533, "grad_norm": 8.384385108947754, "learning_rate": 1.0120548027027655e-05, "loss": 1.839115524291992, "memory(GiB)": 44.29, "step": 3925, "token_acc": 0.5759493670886076, "train_speed(iter/s)": 0.580033 }, { "epoch": 2.3847087378640777, "grad_norm": 6.519843578338623, "learning_rate": 1.0024925613869874e-05, "loss": 2.303724670410156, "memory(GiB)": 44.29, "step": 3930, "token_acc": 0.5181818181818182, "train_speed(iter/s)": 0.580098 }, { "epoch": 2.387742718446602, "grad_norm": 6.988163948059082, "learning_rate": 9.929706726919019e-06, "loss": 2.0136226654052733, "memory(GiB)": 44.29, "step": 3935, "token_acc": 0.5746031746031746, "train_speed(iter/s)": 0.580148 }, { "epoch": 2.3907766990291264, "grad_norm": 8.978435516357422, "learning_rate": 9.834892327356909e-06, "loss": 2.091661262512207, "memory(GiB)": 44.29, "step": 3940, "token_acc": 0.5614035087719298, "train_speed(iter/s)": 0.580167 }, { "epoch": 2.3938106796116507, "grad_norm": 8.229738235473633, "learning_rate": 9.740483372282383e-06, "loss": 1.8495658874511718, "memory(GiB)": 44.29, "step": 3945, "token_acc": 0.6066176470588235, "train_speed(iter/s)": 0.580188 }, { "epoch": 2.3968446601941746, "grad_norm": 9.419842720031738, "learning_rate": 9.646480814701447e-06, "loss": 1.9571540832519532, "memory(GiB)": 44.29, "step": 3950, "token_acc": 0.5364431486880467, "train_speed(iter/s)": 0.580215 }, { "epoch": 2.399878640776699, "grad_norm": 9.840128898620605, "learning_rate": 9.552885603517797e-06, "loss": 1.9348846435546876, "memory(GiB)": 44.29, "step": 3955, "token_acc": 0.570446735395189, "train_speed(iter/s)": 0.580228 }, { "epoch": 2.4029126213592233, "grad_norm": 11.587018013000488, "learning_rate": 9.459698683523204e-06, "loss": 2.1948358535766603, "memory(GiB)": 44.29, "step": 3960, "token_acc": 0.5457413249211357, "train_speed(iter/s)": 0.580218 }, { "epoch": 2.4059466019417477, "grad_norm": 7.861437797546387, "learning_rate": 9.366920995387901e-06, "loss": 2.0211660385131838, "memory(GiB)": 44.29, "step": 3965, "token_acc": 0.5498489425981873, "train_speed(iter/s)": 0.580257 }, { "epoch": 2.408980582524272, "grad_norm": 10.794283866882324, "learning_rate": 9.274553475651254e-06, "loss": 1.9600090026855468, "memory(GiB)": 44.29, "step": 3970, "token_acc": 0.5950413223140496, "train_speed(iter/s)": 0.580279 }, { "epoch": 2.412014563106796, "grad_norm": 7.574179649353027, "learning_rate": 9.182597056712111e-06, "loss": 1.962773895263672, "memory(GiB)": 44.29, "step": 3975, "token_acc": 0.5454545454545454, "train_speed(iter/s)": 0.580177 }, { "epoch": 2.4150485436893203, "grad_norm": 8.261923789978027, "learning_rate": 9.09105266681954e-06, "loss": 2.138422393798828, "memory(GiB)": 44.29, "step": 3980, "token_acc": 0.518796992481203, "train_speed(iter/s)": 0.580227 }, { "epoch": 2.4180825242718447, "grad_norm": 9.629799842834473, "learning_rate": 8.99992123006339e-06, "loss": 1.972011184692383, "memory(GiB)": 44.29, "step": 3985, "token_acc": 0.5466237942122186, "train_speed(iter/s)": 0.580297 }, { "epoch": 2.421116504854369, "grad_norm": 6.383166313171387, "learning_rate": 8.909203666364957e-06, "loss": 1.874557113647461, "memory(GiB)": 44.29, "step": 3990, "token_acc": 0.583941605839416, "train_speed(iter/s)": 0.580373 }, { "epoch": 2.4241504854368934, "grad_norm": 7.899206161499023, "learning_rate": 8.818900891467773e-06, "loss": 2.0880853652954103, "memory(GiB)": 44.29, "step": 3995, "token_acc": 0.54, "train_speed(iter/s)": 0.580411 }, { "epoch": 2.4271844660194173, "grad_norm": 8.335851669311523, "learning_rate": 8.729013816928239e-06, "loss": 1.8050338745117187, "memory(GiB)": 44.29, "step": 4000, "token_acc": 0.6138613861386139, "train_speed(iter/s)": 0.580506 }, { "epoch": 2.4271844660194173, "eval_loss": 1.988856554031372, "eval_runtime": 12.22, "eval_samples_per_second": 8.183, "eval_steps_per_second": 8.183, "eval_token_acc": 0.5174337517433751, "step": 4000 }, { "epoch": 2.4302184466019416, "grad_norm": 6.326101303100586, "learning_rate": 8.639543350106532e-06, "loss": 1.6620052337646485, "memory(GiB)": 44.29, "step": 4005, "token_acc": 0.5577651515151515, "train_speed(iter/s)": 0.579332 }, { "epoch": 2.433252427184466, "grad_norm": 5.722497463226318, "learning_rate": 8.550490394157417e-06, "loss": 2.129566192626953, "memory(GiB)": 44.29, "step": 4010, "token_acc": 0.5157593123209169, "train_speed(iter/s)": 0.57938 }, { "epoch": 2.4362864077669903, "grad_norm": 7.888674736022949, "learning_rate": 8.46185584802106e-06, "loss": 1.7735406875610351, "memory(GiB)": 44.29, "step": 4015, "token_acc": 0.6041666666666666, "train_speed(iter/s)": 0.579409 }, { "epoch": 2.4393203883495147, "grad_norm": 9.027255058288574, "learning_rate": 8.373640606414096e-06, "loss": 2.2499406814575194, "memory(GiB)": 44.29, "step": 4020, "token_acc": 0.5216049382716049, "train_speed(iter/s)": 0.579437 }, { "epoch": 2.4423543689320386, "grad_norm": 6.910282611846924, "learning_rate": 8.285845559820427e-06, "loss": 1.820733642578125, "memory(GiB)": 44.29, "step": 4025, "token_acc": 0.5671641791044776, "train_speed(iter/s)": 0.579456 }, { "epoch": 2.445388349514563, "grad_norm": 8.852483749389648, "learning_rate": 8.198471594482376e-06, "loss": 2.3667272567749023, "memory(GiB)": 44.29, "step": 4030, "token_acc": 0.5066666666666667, "train_speed(iter/s)": 0.579474 }, { "epoch": 2.4484223300970873, "grad_norm": 7.324892520904541, "learning_rate": 8.111519592391669e-06, "loss": 1.7319637298583985, "memory(GiB)": 44.29, "step": 4035, "token_acc": 0.6045751633986928, "train_speed(iter/s)": 0.579496 }, { "epoch": 2.4514563106796117, "grad_norm": 7.501872539520264, "learning_rate": 8.024990431280543e-06, "loss": 2.2290987014770507, "memory(GiB)": 44.29, "step": 4040, "token_acc": 0.5184049079754601, "train_speed(iter/s)": 0.57949 }, { "epoch": 2.454490291262136, "grad_norm": 11.032537460327148, "learning_rate": 7.93888498461291e-06, "loss": 1.7387943267822266, "memory(GiB)": 44.29, "step": 4045, "token_acc": 0.5924657534246576, "train_speed(iter/s)": 0.579449 }, { "epoch": 2.4575242718446604, "grad_norm": 7.408664226531982, "learning_rate": 7.853204121575475e-06, "loss": 2.0464914321899412, "memory(GiB)": 44.29, "step": 4050, "token_acc": 0.5613496932515337, "train_speed(iter/s)": 0.579515 }, { "epoch": 2.4605582524271843, "grad_norm": 8.151251792907715, "learning_rate": 7.76794870706905e-06, "loss": 1.9731042861938477, "memory(GiB)": 44.29, "step": 4055, "token_acc": 0.5264900662251656, "train_speed(iter/s)": 0.579583 }, { "epoch": 2.4635922330097086, "grad_norm": 7.136772632598877, "learning_rate": 7.683119601699757e-06, "loss": 1.9375322341918946, "memory(GiB)": 44.29, "step": 4060, "token_acc": 0.5538922155688623, "train_speed(iter/s)": 0.579566 }, { "epoch": 2.466626213592233, "grad_norm": 8.133397102355957, "learning_rate": 7.598717661770377e-06, "loss": 1.9626676559448242, "memory(GiB)": 44.29, "step": 4065, "token_acc": 0.5642633228840125, "train_speed(iter/s)": 0.579562 }, { "epoch": 2.4696601941747574, "grad_norm": 7.656953811645508, "learning_rate": 7.514743739271696e-06, "loss": 1.7955259323120116, "memory(GiB)": 44.29, "step": 4070, "token_acc": 0.6, "train_speed(iter/s)": 0.579478 }, { "epoch": 2.4726941747572817, "grad_norm": 9.205748558044434, "learning_rate": 7.4311986818738685e-06, "loss": 1.7786579132080078, "memory(GiB)": 44.29, "step": 4075, "token_acc": 0.6013071895424836, "train_speed(iter/s)": 0.579436 }, { "epoch": 2.475728155339806, "grad_norm": 9.025361061096191, "learning_rate": 7.348083332917926e-06, "loss": 2.109883689880371, "memory(GiB)": 44.29, "step": 4080, "token_acc": 0.536, "train_speed(iter/s)": 0.579399 }, { "epoch": 2.47876213592233, "grad_norm": 7.150624752044678, "learning_rate": 7.26539853140723e-06, "loss": 1.8995925903320312, "memory(GiB)": 44.29, "step": 4085, "token_acc": 0.5714285714285714, "train_speed(iter/s)": 0.57952 }, { "epoch": 2.4817961165048543, "grad_norm": 7.212602138519287, "learning_rate": 7.1831451119989955e-06, "loss": 2.258907508850098, "memory(GiB)": 44.29, "step": 4090, "token_acc": 0.4742547425474255, "train_speed(iter/s)": 0.579551 }, { "epoch": 2.4848300970873787, "grad_norm": 9.11235523223877, "learning_rate": 7.1013239049958714e-06, "loss": 1.7706048965454102, "memory(GiB)": 44.29, "step": 4095, "token_acc": 0.5925925925925926, "train_speed(iter/s)": 0.579509 }, { "epoch": 2.487864077669903, "grad_norm": 10.799226760864258, "learning_rate": 7.019935736337585e-06, "loss": 2.1821046829223634, "memory(GiB)": 44.29, "step": 4100, "token_acc": 0.5335463258785943, "train_speed(iter/s)": 0.579558 }, { "epoch": 2.4908980582524274, "grad_norm": 11.041994094848633, "learning_rate": 6.938981427592534e-06, "loss": 2.088601303100586, "memory(GiB)": 44.29, "step": 4105, "token_acc": 0.5171339563862928, "train_speed(iter/s)": 0.579608 }, { "epoch": 2.4939320388349513, "grad_norm": 10.685086250305176, "learning_rate": 6.858461795949583e-06, "loss": 1.5177223205566406, "memory(GiB)": 44.29, "step": 4110, "token_acc": 0.6339285714285714, "train_speed(iter/s)": 0.579595 }, { "epoch": 2.4969660194174756, "grad_norm": 8.770302772521973, "learning_rate": 6.778377654209761e-06, "loss": 1.7158885955810548, "memory(GiB)": 44.29, "step": 4115, "token_acc": 0.5580524344569289, "train_speed(iter/s)": 0.579559 }, { "epoch": 2.5, "grad_norm": 8.05949878692627, "learning_rate": 6.698729810778065e-06, "loss": 2.1136884689331055, "memory(GiB)": 44.29, "step": 4120, "token_acc": 0.5434782608695652, "train_speed(iter/s)": 0.579581 }, { "epoch": 2.5030339805825244, "grad_norm": 9.553973197937012, "learning_rate": 6.619519069655322e-06, "loss": 1.8230070114135741, "memory(GiB)": 44.29, "step": 4125, "token_acc": 0.5797101449275363, "train_speed(iter/s)": 0.579641 }, { "epoch": 2.5060679611650487, "grad_norm": 7.956108093261719, "learning_rate": 6.54074623042999e-06, "loss": 2.0894168853759765, "memory(GiB)": 44.29, "step": 4130, "token_acc": 0.5565749235474006, "train_speed(iter/s)": 0.579668 }, { "epoch": 2.5091019417475726, "grad_norm": 8.481484413146973, "learning_rate": 6.4624120882702535e-06, "loss": 1.8939842224121093, "memory(GiB)": 44.29, "step": 4135, "token_acc": 0.5757575757575758, "train_speed(iter/s)": 0.579678 }, { "epoch": 2.512135922330097, "grad_norm": 9.907540321350098, "learning_rate": 6.384517433915793e-06, "loss": 1.9347640991210937, "memory(GiB)": 44.29, "step": 4140, "token_acc": 0.5447761194029851, "train_speed(iter/s)": 0.579755 }, { "epoch": 2.5151699029126213, "grad_norm": 7.414953231811523, "learning_rate": 6.30706305366996e-06, "loss": 1.730459213256836, "memory(GiB)": 44.29, "step": 4145, "token_acc": 0.5985401459854015, "train_speed(iter/s)": 0.579655 }, { "epoch": 2.5182038834951457, "grad_norm": 8.353326797485352, "learning_rate": 6.230049729391779e-06, "loss": 1.9265541076660155, "memory(GiB)": 44.29, "step": 4150, "token_acc": 0.5632183908045977, "train_speed(iter/s)": 0.57963 }, { "epoch": 2.52123786407767, "grad_norm": 6.5633673667907715, "learning_rate": 6.153478238488019e-06, "loss": 1.7929351806640625, "memory(GiB)": 44.29, "step": 4155, "token_acc": 0.5882352941176471, "train_speed(iter/s)": 0.579611 }, { "epoch": 2.524271844660194, "grad_norm": 7.694858551025391, "learning_rate": 6.077349353905465e-06, "loss": 2.095606231689453, "memory(GiB)": 44.29, "step": 4160, "token_acc": 0.5905511811023622, "train_speed(iter/s)": 0.579685 }, { "epoch": 2.5273058252427183, "grad_norm": 6.272264003753662, "learning_rate": 6.00166384412294e-06, "loss": 2.1394012451171873, "memory(GiB)": 44.29, "step": 4165, "token_acc": 0.5434782608695652, "train_speed(iter/s)": 0.579716 }, { "epoch": 2.5303398058252426, "grad_norm": 8.841377258300781, "learning_rate": 5.926422473143717e-06, "loss": 1.972856330871582, "memory(GiB)": 44.29, "step": 4170, "token_acc": 0.5674740484429066, "train_speed(iter/s)": 0.579732 }, { "epoch": 2.533373786407767, "grad_norm": 7.734652996063232, "learning_rate": 5.851626000487714e-06, "loss": 1.771505355834961, "memory(GiB)": 44.29, "step": 4175, "token_acc": 0.5695364238410596, "train_speed(iter/s)": 0.57976 }, { "epoch": 2.5364077669902914, "grad_norm": 8.464856147766113, "learning_rate": 5.7772751811838165e-06, "loss": 2.1697675704956056, "memory(GiB)": 44.29, "step": 4180, "token_acc": 0.5490196078431373, "train_speed(iter/s)": 0.579766 }, { "epoch": 2.5394417475728153, "grad_norm": 6.249225616455078, "learning_rate": 5.703370765762345e-06, "loss": 2.03582706451416, "memory(GiB)": 44.29, "step": 4185, "token_acc": 0.5623188405797102, "train_speed(iter/s)": 0.57982 }, { "epoch": 2.54247572815534, "grad_norm": 9.059986114501953, "learning_rate": 5.629913500247364e-06, "loss": 2.067348098754883, "memory(GiB)": 44.29, "step": 4190, "token_acc": 0.5376712328767124, "train_speed(iter/s)": 0.579847 }, { "epoch": 2.545509708737864, "grad_norm": 7.16273307800293, "learning_rate": 5.556904126149237e-06, "loss": 1.619649314880371, "memory(GiB)": 44.29, "step": 4195, "token_acc": 0.6006600660066007, "train_speed(iter/s)": 0.579885 }, { "epoch": 2.5485436893203883, "grad_norm": 8.991573333740234, "learning_rate": 5.484343380457125e-06, "loss": 1.7998830795288085, "memory(GiB)": 44.29, "step": 4200, "token_acc": 0.5662650602409639, "train_speed(iter/s)": 0.57986 }, { "epoch": 2.5515776699029127, "grad_norm": 8.396170616149902, "learning_rate": 5.412231995631473e-06, "loss": 1.9479732513427734, "memory(GiB)": 44.29, "step": 4205, "token_acc": 0.5418060200668896, "train_speed(iter/s)": 0.579772 }, { "epoch": 2.554611650485437, "grad_norm": 9.159605979919434, "learning_rate": 5.340570699596769e-06, "loss": 1.8561626434326173, "memory(GiB)": 44.29, "step": 4210, "token_acc": 0.5530973451327433, "train_speed(iter/s)": 0.579768 }, { "epoch": 2.5576456310679614, "grad_norm": 9.651739120483398, "learning_rate": 5.269360215734026e-06, "loss": 2.021830940246582, "memory(GiB)": 44.29, "step": 4215, "token_acc": 0.5821917808219178, "train_speed(iter/s)": 0.579771 }, { "epoch": 2.5606796116504853, "grad_norm": 6.730819225311279, "learning_rate": 5.198601262873593e-06, "loss": 1.8237226486206055, "memory(GiB)": 44.29, "step": 4220, "token_acc": 0.5780821917808219, "train_speed(iter/s)": 0.57971 }, { "epoch": 2.5637135922330097, "grad_norm": 10.186707496643066, "learning_rate": 5.12829455528786e-06, "loss": 1.6941600799560548, "memory(GiB)": 44.29, "step": 4225, "token_acc": 0.6114649681528662, "train_speed(iter/s)": 0.579676 }, { "epoch": 2.566747572815534, "grad_norm": 11.422538757324219, "learning_rate": 5.0584408026840555e-06, "loss": 1.9525514602661134, "memory(GiB)": 44.29, "step": 4230, "token_acc": 0.5769230769230769, "train_speed(iter/s)": 0.57969 }, { "epoch": 2.5697815533980584, "grad_norm": 6.254408836364746, "learning_rate": 4.989040710197068e-06, "loss": 1.8417320251464844, "memory(GiB)": 44.29, "step": 4235, "token_acc": 0.5893854748603352, "train_speed(iter/s)": 0.579743 }, { "epoch": 2.5728155339805827, "grad_norm": 8.989594459533691, "learning_rate": 4.920094978382339e-06, "loss": 2.2028553009033205, "memory(GiB)": 44.29, "step": 4240, "token_acc": 0.52, "train_speed(iter/s)": 0.579778 }, { "epoch": 2.5758495145631066, "grad_norm": 8.0951566696167, "learning_rate": 4.851604303208801e-06, "loss": 1.881844711303711, "memory(GiB)": 44.29, "step": 4245, "token_acc": 0.6153846153846154, "train_speed(iter/s)": 0.579786 }, { "epoch": 2.578883495145631, "grad_norm": 7.341141700744629, "learning_rate": 4.783569376051833e-06, "loss": 2.057468223571777, "memory(GiB)": 44.29, "step": 4250, "token_acc": 0.5373563218390804, "train_speed(iter/s)": 0.579789 }, { "epoch": 2.5819174757281553, "grad_norm": 8.066463470458984, "learning_rate": 4.7159908836862994e-06, "loss": 1.9251741409301757, "memory(GiB)": 44.29, "step": 4255, "token_acc": 0.559322033898305, "train_speed(iter/s)": 0.579759 }, { "epoch": 2.5849514563106797, "grad_norm": 14.030436515808105, "learning_rate": 4.648869508279613e-06, "loss": 1.9517692565917968, "memory(GiB)": 44.29, "step": 4260, "token_acc": 0.5652173913043478, "train_speed(iter/s)": 0.579826 }, { "epoch": 2.587985436893204, "grad_norm": 7.453925132751465, "learning_rate": 4.582205927384814e-06, "loss": 1.7124622344970704, "memory(GiB)": 44.29, "step": 4265, "token_acc": 0.6095890410958904, "train_speed(iter/s)": 0.579851 }, { "epoch": 2.591019417475728, "grad_norm": 9.848562240600586, "learning_rate": 4.51600081393379e-06, "loss": 1.534929084777832, "memory(GiB)": 44.29, "step": 4270, "token_acc": 0.6325757575757576, "train_speed(iter/s)": 0.579913 }, { "epoch": 2.5940533980582523, "grad_norm": 8.778762817382812, "learning_rate": 4.450254836230449e-06, "loss": 1.9810653686523438, "memory(GiB)": 44.29, "step": 4275, "token_acc": 0.5531914893617021, "train_speed(iter/s)": 0.579963 }, { "epoch": 2.5970873786407767, "grad_norm": 7.1793742179870605, "learning_rate": 4.384968657943972e-06, "loss": 2.044744682312012, "memory(GiB)": 44.29, "step": 4280, "token_acc": 0.5632530120481928, "train_speed(iter/s)": 0.579985 }, { "epoch": 2.600121359223301, "grad_norm": 6.974610805511475, "learning_rate": 4.3201429381021285e-06, "loss": 1.750173568725586, "memory(GiB)": 44.29, "step": 4285, "token_acc": 0.6114864864864865, "train_speed(iter/s)": 0.580089 }, { "epoch": 2.6031553398058254, "grad_norm": 7.6272196769714355, "learning_rate": 4.255778331084609e-06, "loss": 2.1643795013427733, "memory(GiB)": 44.29, "step": 4290, "token_acc": 0.5138539042821159, "train_speed(iter/s)": 0.580145 }, { "epoch": 2.6061893203883493, "grad_norm": 8.453348159790039, "learning_rate": 4.1918754866164205e-06, "loss": 2.236542510986328, "memory(GiB)": 44.29, "step": 4295, "token_acc": 0.4793650793650794, "train_speed(iter/s)": 0.580165 }, { "epoch": 2.6092233009708736, "grad_norm": 10.852858543395996, "learning_rate": 4.1284350497613426e-06, "loss": 1.9886856079101562, "memory(GiB)": 44.29, "step": 4300, "token_acc": 0.5439739413680782, "train_speed(iter/s)": 0.58019 }, { "epoch": 2.612257281553398, "grad_norm": 9.123336791992188, "learning_rate": 4.065457660915401e-06, "loss": 1.9303335189819335, "memory(GiB)": 44.29, "step": 4305, "token_acc": 0.5847457627118644, "train_speed(iter/s)": 0.580152 }, { "epoch": 2.6152912621359223, "grad_norm": 7.784154891967773, "learning_rate": 4.002943955800409e-06, "loss": 2.0141778945922852, "memory(GiB)": 44.29, "step": 4310, "token_acc": 0.5393258426966292, "train_speed(iter/s)": 0.580169 }, { "epoch": 2.6183252427184467, "grad_norm": 9.202990531921387, "learning_rate": 3.94089456545757e-06, "loss": 1.936072540283203, "memory(GiB)": 44.29, "step": 4315, "token_acc": 0.5570469798657718, "train_speed(iter/s)": 0.580162 }, { "epoch": 2.6213592233009706, "grad_norm": 8.248907089233398, "learning_rate": 3.879310116241042e-06, "loss": 1.968276596069336, "memory(GiB)": 44.29, "step": 4320, "token_acc": 0.5680272108843537, "train_speed(iter/s)": 0.580185 }, { "epoch": 2.6243932038834954, "grad_norm": 10.208954811096191, "learning_rate": 3.818191229811696e-06, "loss": 1.9195415496826171, "memory(GiB)": 44.29, "step": 4325, "token_acc": 0.5785123966942148, "train_speed(iter/s)": 0.580191 }, { "epoch": 2.6274271844660193, "grad_norm": 8.11597728729248, "learning_rate": 3.757538523130799e-06, "loss": 2.197231674194336, "memory(GiB)": 44.29, "step": 4330, "token_acc": 0.5173501577287066, "train_speed(iter/s)": 0.580246 }, { "epoch": 2.6304611650485437, "grad_norm": 10.075161933898926, "learning_rate": 3.697352608453791e-06, "loss": 2.041206932067871, "memory(GiB)": 44.29, "step": 4335, "token_acc": 0.5785714285714286, "train_speed(iter/s)": 0.580244 }, { "epoch": 2.633495145631068, "grad_norm": 9.632774353027344, "learning_rate": 3.6376340933241104e-06, "loss": 1.9504831314086915, "memory(GiB)": 44.29, "step": 4340, "token_acc": 0.5544217687074829, "train_speed(iter/s)": 0.580233 }, { "epoch": 2.6365291262135924, "grad_norm": 7.268722057342529, "learning_rate": 3.5783835805670183e-06, "loss": 2.2769695281982423, "memory(GiB)": 44.29, "step": 4345, "token_acc": 0.5015197568389058, "train_speed(iter/s)": 0.580233 }, { "epoch": 2.6395631067961167, "grad_norm": 13.444320678710938, "learning_rate": 3.519601668283623e-06, "loss": 1.9888429641723633, "memory(GiB)": 44.29, "step": 4350, "token_acc": 0.5563636363636364, "train_speed(iter/s)": 0.580169 }, { "epoch": 2.6425970873786406, "grad_norm": 8.871038436889648, "learning_rate": 3.4612889498447043e-06, "loss": 1.7693092346191406, "memory(GiB)": 44.29, "step": 4355, "token_acc": 0.5708812260536399, "train_speed(iter/s)": 0.580141 }, { "epoch": 2.645631067961165, "grad_norm": 9.421436309814453, "learning_rate": 3.40344601388482e-06, "loss": 1.8508855819702148, "memory(GiB)": 44.29, "step": 4360, "token_acc": 0.5864197530864198, "train_speed(iter/s)": 0.580145 }, { "epoch": 2.6486650485436893, "grad_norm": 14.255351066589355, "learning_rate": 3.346073444296338e-06, "loss": 1.8605754852294922, "memory(GiB)": 44.29, "step": 4365, "token_acc": 0.6238244514106583, "train_speed(iter/s)": 0.580184 }, { "epoch": 2.6516990291262137, "grad_norm": 9.838223457336426, "learning_rate": 3.289171820223519e-06, "loss": 1.8943605422973633, "memory(GiB)": 44.29, "step": 4370, "token_acc": 0.59375, "train_speed(iter/s)": 0.580196 }, { "epoch": 2.654733009708738, "grad_norm": 7.1384148597717285, "learning_rate": 3.2327417160567196e-06, "loss": 1.945779037475586, "memory(GiB)": 44.29, "step": 4375, "token_acc": 0.5589225589225589, "train_speed(iter/s)": 0.580208 }, { "epoch": 2.657766990291262, "grad_norm": 7.130894184112549, "learning_rate": 3.176783701426528e-06, "loss": 1.920769500732422, "memory(GiB)": 44.29, "step": 4380, "token_acc": 0.5652173913043478, "train_speed(iter/s)": 0.5802 }, { "epoch": 2.6608009708737863, "grad_norm": 7.5801215171813965, "learning_rate": 3.121298341198081e-06, "loss": 2.089648628234863, "memory(GiB)": 44.29, "step": 4385, "token_acc": 0.5445026178010471, "train_speed(iter/s)": 0.580307 }, { "epoch": 2.6638349514563107, "grad_norm": 9.623913764953613, "learning_rate": 3.0662861954653232e-06, "loss": 2.102077674865723, "memory(GiB)": 44.29, "step": 4390, "token_acc": 0.5238095238095238, "train_speed(iter/s)": 0.580352 }, { "epoch": 2.666868932038835, "grad_norm": 8.056645393371582, "learning_rate": 3.0117478195453353e-06, "loss": 2.002307319641113, "memory(GiB)": 44.29, "step": 4395, "token_acc": 0.5625, "train_speed(iter/s)": 0.580375 }, { "epoch": 2.6699029126213594, "grad_norm": 9.664189338684082, "learning_rate": 2.9576837639728073e-06, "loss": 1.638421630859375, "memory(GiB)": 44.29, "step": 4400, "token_acc": 0.6138328530259366, "train_speed(iter/s)": 0.580313 }, { "epoch": 2.6729368932038833, "grad_norm": 7.514981269836426, "learning_rate": 2.9040945744943757e-06, "loss": 1.8152626037597657, "memory(GiB)": 44.29, "step": 4405, "token_acc": 0.5535055350553506, "train_speed(iter/s)": 0.580281 }, { "epoch": 2.6759708737864076, "grad_norm": 7.019512176513672, "learning_rate": 2.850980792063196e-06, "loss": 1.805082130432129, "memory(GiB)": 44.29, "step": 4410, "token_acc": 0.5683890577507599, "train_speed(iter/s)": 0.580336 }, { "epoch": 2.679004854368932, "grad_norm": 8.447052955627441, "learning_rate": 2.798342952833455e-06, "loss": 1.9645135879516602, "memory(GiB)": 44.29, "step": 4415, "token_acc": 0.5562700964630225, "train_speed(iter/s)": 0.580375 }, { "epoch": 2.6820388349514563, "grad_norm": 15.852560997009277, "learning_rate": 2.7461815881549225e-06, "loss": 1.9464908599853517, "memory(GiB)": 44.29, "step": 4420, "token_acc": 0.5913978494623656, "train_speed(iter/s)": 0.580409 }, { "epoch": 2.6850728155339807, "grad_norm": 8.933894157409668, "learning_rate": 2.694497224567688e-06, "loss": 2.005167007446289, "memory(GiB)": 44.29, "step": 4425, "token_acc": 0.5362903225806451, "train_speed(iter/s)": 0.580346 }, { "epoch": 2.6881067961165046, "grad_norm": 5.791989326477051, "learning_rate": 2.6432903837967036e-06, "loss": 1.905177116394043, "memory(GiB)": 44.29, "step": 4430, "token_acc": 0.556923076923077, "train_speed(iter/s)": 0.580417 }, { "epoch": 2.6911407766990294, "grad_norm": 7.198362350463867, "learning_rate": 2.5925615827466444e-06, "loss": 2.0099058151245117, "memory(GiB)": 44.29, "step": 4435, "token_acc": 0.5861111111111111, "train_speed(iter/s)": 0.580452 }, { "epoch": 2.6941747572815533, "grad_norm": 10.059782981872559, "learning_rate": 2.542311333496622e-06, "loss": 2.030255126953125, "memory(GiB)": 44.29, "step": 4440, "token_acc": 0.5693215339233039, "train_speed(iter/s)": 0.580427 }, { "epoch": 2.6972087378640777, "grad_norm": 8.237997055053711, "learning_rate": 2.492540143295036e-06, "loss": 1.9501361846923828, "memory(GiB)": 44.29, "step": 4445, "token_acc": 0.5619335347432024, "train_speed(iter/s)": 0.5805 }, { "epoch": 2.700242718446602, "grad_norm": 8.189979553222656, "learning_rate": 2.4432485145544527e-06, "loss": 2.0908411026000975, "memory(GiB)": 44.29, "step": 4450, "token_acc": 0.5563139931740614, "train_speed(iter/s)": 0.580487 }, { "epoch": 2.7032766990291264, "grad_norm": 7.352850437164307, "learning_rate": 2.394436944846523e-06, "loss": 1.8278610229492187, "memory(GiB)": 44.29, "step": 4455, "token_acc": 0.5739130434782609, "train_speed(iter/s)": 0.580446 }, { "epoch": 2.7063106796116507, "grad_norm": 7.38375997543335, "learning_rate": 2.3461059268969744e-06, "loss": 1.9157276153564453, "memory(GiB)": 44.29, "step": 4460, "token_acc": 0.5667655786350149, "train_speed(iter/s)": 0.580538 }, { "epoch": 2.7093446601941746, "grad_norm": 11.641793251037598, "learning_rate": 2.29825594858063e-06, "loss": 1.723676872253418, "memory(GiB)": 44.29, "step": 4465, "token_acc": 0.5962962962962963, "train_speed(iter/s)": 0.580636 }, { "epoch": 2.712378640776699, "grad_norm": 8.030855178833008, "learning_rate": 2.250887492916487e-06, "loss": 1.855816650390625, "memory(GiB)": 44.29, "step": 4470, "token_acc": 0.5791044776119403, "train_speed(iter/s)": 0.580678 }, { "epoch": 2.7154126213592233, "grad_norm": 8.97354793548584, "learning_rate": 2.204001038062836e-06, "loss": 1.9793785095214844, "memory(GiB)": 44.29, "step": 4475, "token_acc": 0.5257731958762887, "train_speed(iter/s)": 0.58069 }, { "epoch": 2.7184466019417477, "grad_norm": 10.212775230407715, "learning_rate": 2.157597057312444e-06, "loss": 2.099479103088379, "memory(GiB)": 44.29, "step": 4480, "token_acc": 0.5294117647058824, "train_speed(iter/s)": 0.580635 }, { "epoch": 2.721480582524272, "grad_norm": 9.049674034118652, "learning_rate": 2.1116760190877437e-06, "loss": 1.7141408920288086, "memory(GiB)": 44.29, "step": 4485, "token_acc": 0.6265822784810127, "train_speed(iter/s)": 0.580726 }, { "epoch": 2.724514563106796, "grad_norm": 10.3820219039917, "learning_rate": 2.0662383869361645e-06, "loss": 1.9986873626708985, "memory(GiB)": 44.29, "step": 4490, "token_acc": 0.5757575757575758, "train_speed(iter/s)": 0.580832 }, { "epoch": 2.7275485436893203, "grad_norm": 7.463447093963623, "learning_rate": 2.0212846195253987e-06, "loss": 2.1121494293212892, "memory(GiB)": 44.29, "step": 4495, "token_acc": 0.5421052631578948, "train_speed(iter/s)": 0.580767 }, { "epoch": 2.7305825242718447, "grad_norm": 8.770597457885742, "learning_rate": 1.976815170638802e-06, "loss": 2.0751237869262695, "memory(GiB)": 44.29, "step": 4500, "token_acc": 0.5382165605095541, "train_speed(iter/s)": 0.580785 }, { "epoch": 2.7305825242718447, "eval_loss": 1.9814581871032715, "eval_runtime": 12.0819, "eval_samples_per_second": 8.277, "eval_steps_per_second": 8.277, "eval_token_acc": 0.5314591700133868, "step": 4500 }, { "epoch": 2.733616504854369, "grad_norm": 11.466297149658203, "learning_rate": 1.9328304891708003e-06, "loss": 1.710250473022461, "memory(GiB)": 44.29, "step": 4505, "token_acc": 0.5572666025024061, "train_speed(iter/s)": 0.579903 }, { "epoch": 2.7366504854368934, "grad_norm": 7.724918365478516, "learning_rate": 1.8893310191223535e-06, "loss": 1.6978034973144531, "memory(GiB)": 44.29, "step": 4510, "token_acc": 0.5993377483443708, "train_speed(iter/s)": 0.579938 }, { "epoch": 2.7396844660194173, "grad_norm": 7.455316543579102, "learning_rate": 1.8463171995964978e-06, "loss": 1.7312326431274414, "memory(GiB)": 44.29, "step": 4515, "token_acc": 0.584717607973422, "train_speed(iter/s)": 0.579877 }, { "epoch": 2.7427184466019416, "grad_norm": 9.585491180419922, "learning_rate": 1.8037894647938758e-06, "loss": 1.9202953338623048, "memory(GiB)": 44.29, "step": 4520, "token_acc": 0.5942492012779552, "train_speed(iter/s)": 0.579941 }, { "epoch": 2.745752427184466, "grad_norm": 7.608863830566406, "learning_rate": 1.7617482440083931e-06, "loss": 1.9673721313476562, "memory(GiB)": 44.29, "step": 4525, "token_acc": 0.5555555555555556, "train_speed(iter/s)": 0.579965 }, { "epoch": 2.7487864077669903, "grad_norm": 7.024211883544922, "learning_rate": 1.7201939616228569e-06, "loss": 1.9407548904418945, "memory(GiB)": 44.29, "step": 4530, "token_acc": 0.5653333333333334, "train_speed(iter/s)": 0.579964 }, { "epoch": 2.7518203883495147, "grad_norm": 8.106232643127441, "learning_rate": 1.6791270371046997e-06, "loss": 1.7603139877319336, "memory(GiB)": 44.29, "step": 4535, "token_acc": 0.5956112852664577, "train_speed(iter/s)": 0.579993 }, { "epoch": 2.7548543689320386, "grad_norm": 8.373075485229492, "learning_rate": 1.638547885001762e-06, "loss": 2.115154838562012, "memory(GiB)": 44.29, "step": 4540, "token_acc": 0.5537459283387622, "train_speed(iter/s)": 0.58 }, { "epoch": 2.757888349514563, "grad_norm": 8.133313179016113, "learning_rate": 1.5984569149380678e-06, "loss": 1.959366226196289, "memory(GiB)": 44.29, "step": 4545, "token_acc": 0.592948717948718, "train_speed(iter/s)": 0.580015 }, { "epoch": 2.7609223300970873, "grad_norm": 10.857476234436035, "learning_rate": 1.5588545316097269e-06, "loss": 1.636090087890625, "memory(GiB)": 44.29, "step": 4550, "token_acc": 0.5871212121212122, "train_speed(iter/s)": 0.580078 }, { "epoch": 2.7639563106796117, "grad_norm": 7.435564041137695, "learning_rate": 1.51974113478085e-06, "loss": 1.679054069519043, "memory(GiB)": 44.29, "step": 4555, "token_acc": 0.597864768683274, "train_speed(iter/s)": 0.580029 }, { "epoch": 2.766990291262136, "grad_norm": 8.17315673828125, "learning_rate": 1.4811171192794627e-06, "loss": 2.029444694519043, "memory(GiB)": 44.29, "step": 4560, "token_acc": 0.568561872909699, "train_speed(iter/s)": 0.579988 }, { "epoch": 2.77002427184466, "grad_norm": 10.141511917114258, "learning_rate": 1.4429828749936092e-06, "loss": 1.9936655044555665, "memory(GiB)": 44.29, "step": 4565, "token_acc": 0.5769230769230769, "train_speed(iter/s)": 0.580042 }, { "epoch": 2.7730582524271847, "grad_norm": 8.14281177520752, "learning_rate": 1.4053387868673217e-06, "loss": 1.8854488372802733, "memory(GiB)": 44.29, "step": 4570, "token_acc": 0.582089552238806, "train_speed(iter/s)": 0.580069 }, { "epoch": 2.7760922330097086, "grad_norm": 7.9403910636901855, "learning_rate": 1.368185234896796e-06, "loss": 2.005961608886719, "memory(GiB)": 44.29, "step": 4575, "token_acc": 0.5279503105590062, "train_speed(iter/s)": 0.580129 }, { "epoch": 2.779126213592233, "grad_norm": 8.195262908935547, "learning_rate": 1.3315225941265386e-06, "loss": 1.789654541015625, "memory(GiB)": 44.29, "step": 4580, "token_acc": 0.5811688311688312, "train_speed(iter/s)": 0.580137 }, { "epoch": 2.7821601941747574, "grad_norm": 7.385119438171387, "learning_rate": 1.2953512346455643e-06, "loss": 1.678761100769043, "memory(GiB)": 44.29, "step": 4585, "token_acc": 0.5840978593272171, "train_speed(iter/s)": 0.580145 }, { "epoch": 2.7851941747572817, "grad_norm": 6.9788336753845215, "learning_rate": 1.2596715215836996e-06, "loss": 1.8593015670776367, "memory(GiB)": 44.29, "step": 4590, "token_acc": 0.5896551724137931, "train_speed(iter/s)": 0.580152 }, { "epoch": 2.788228155339806, "grad_norm": 7.658742427825928, "learning_rate": 1.224483815107863e-06, "loss": 1.924429702758789, "memory(GiB)": 44.29, "step": 4595, "token_acc": 0.5796610169491525, "train_speed(iter/s)": 0.580188 }, { "epoch": 2.79126213592233, "grad_norm": 7.426290035247803, "learning_rate": 1.1897884704184236e-06, "loss": 1.8148229598999024, "memory(GiB)": 44.29, "step": 4600, "token_acc": 0.6019108280254777, "train_speed(iter/s)": 0.580172 }, { "epoch": 2.7942961165048543, "grad_norm": 9.051165580749512, "learning_rate": 1.1555858377456596e-06, "loss": 1.9418960571289063, "memory(GiB)": 44.29, "step": 4605, "token_acc": 0.6119402985074627, "train_speed(iter/s)": 0.580258 }, { "epoch": 2.7973300970873787, "grad_norm": 6.436223030090332, "learning_rate": 1.1218762623461666e-06, "loss": 1.7338180541992188, "memory(GiB)": 44.29, "step": 4610, "token_acc": 0.6037735849056604, "train_speed(iter/s)": 0.580253 }, { "epoch": 2.800364077669903, "grad_norm": 9.345931053161621, "learning_rate": 1.0886600844994266e-06, "loss": 2.1333446502685547, "memory(GiB)": 44.29, "step": 4615, "token_acc": 0.5629139072847682, "train_speed(iter/s)": 0.580306 }, { "epoch": 2.8033980582524274, "grad_norm": 9.715279579162598, "learning_rate": 1.0559376395043285e-06, "loss": 1.706222152709961, "memory(GiB)": 44.29, "step": 4620, "token_acc": 0.6322314049586777, "train_speed(iter/s)": 0.580351 }, { "epoch": 2.8064320388349513, "grad_norm": 5.747392654418945, "learning_rate": 1.0237092576758034e-06, "loss": 1.9026046752929688, "memory(GiB)": 44.29, "step": 4625, "token_acc": 0.5642458100558659, "train_speed(iter/s)": 0.580353 }, { "epoch": 2.8094660194174756, "grad_norm": 9.29836654663086, "learning_rate": 9.919752643414992e-07, "loss": 1.9644575119018555, "memory(GiB)": 44.29, "step": 4630, "token_acc": 0.5217391304347826, "train_speed(iter/s)": 0.580395 }, { "epoch": 2.8125, "grad_norm": 6.784262657165527, "learning_rate": 9.607359798384785e-07, "loss": 2.0778518676757813, "memory(GiB)": 44.29, "step": 4635, "token_acc": 0.5848375451263538, "train_speed(iter/s)": 0.58043 }, { "epoch": 2.8155339805825244, "grad_norm": 10.704444885253906, "learning_rate": 9.299917195099927e-07, "loss": 1.6303802490234376, "memory(GiB)": 44.29, "step": 4640, "token_acc": 0.5941176470588235, "train_speed(iter/s)": 0.580467 }, { "epoch": 2.8185679611650487, "grad_norm": 9.466361045837402, "learning_rate": 8.997427937023018e-07, "loss": 2.072785758972168, "memory(GiB)": 44.29, "step": 4645, "token_acc": 0.5944272445820433, "train_speed(iter/s)": 0.580569 }, { "epoch": 2.8216019417475726, "grad_norm": 8.331581115722656, "learning_rate": 8.699895077615316e-07, "loss": 1.9922773361206054, "memory(GiB)": 44.29, "step": 4650, "token_acc": 0.5819935691318328, "train_speed(iter/s)": 0.580614 }, { "epoch": 2.824635922330097, "grad_norm": 7.199705600738525, "learning_rate": 8.407321620306108e-07, "loss": 2.1337678909301756, "memory(GiB)": 44.29, "step": 4655, "token_acc": 0.583011583011583, "train_speed(iter/s)": 0.580633 }, { "epoch": 2.8276699029126213, "grad_norm": 11.327582359313965, "learning_rate": 8.119710518462164e-07, "loss": 1.815553855895996, "memory(GiB)": 44.29, "step": 4660, "token_acc": 0.5860058309037901, "train_speed(iter/s)": 0.58062 }, { "epoch": 2.8307038834951457, "grad_norm": 9.220823287963867, "learning_rate": 7.837064675357997e-07, "loss": 2.0095773696899415, "memory(GiB)": 44.29, "step": 4665, "token_acc": 0.5482866043613707, "train_speed(iter/s)": 0.580668 }, { "epoch": 2.83373786407767, "grad_norm": 8.487168312072754, "learning_rate": 7.559386944146762e-07, "loss": 1.874141311645508, "memory(GiB)": 44.29, "step": 4670, "token_acc": 0.5662337662337662, "train_speed(iter/s)": 0.58063 }, { "epoch": 2.836771844660194, "grad_norm": 10.926680564880371, "learning_rate": 7.28668012783107e-07, "loss": 1.9664880752563476, "memory(GiB)": 44.29, "step": 4675, "token_acc": 0.567398119122257, "train_speed(iter/s)": 0.580617 }, { "epoch": 2.8398058252427183, "grad_norm": 8.504984855651855, "learning_rate": 7.018946979234997e-07, "loss": 2.202426528930664, "memory(GiB)": 44.29, "step": 4680, "token_acc": 0.5260416666666666, "train_speed(iter/s)": 0.580652 }, { "epoch": 2.8428398058252426, "grad_norm": 8.054615020751953, "learning_rate": 6.756190200976287e-07, "loss": 2.008488082885742, "memory(GiB)": 44.29, "step": 4685, "token_acc": 0.5735735735735735, "train_speed(iter/s)": 0.580661 }, { "epoch": 2.845873786407767, "grad_norm": 7.760517597198486, "learning_rate": 6.498412445438751e-07, "loss": 1.9507659912109374, "memory(GiB)": 44.29, "step": 4690, "token_acc": 0.5644699140401146, "train_speed(iter/s)": 0.58068 }, { "epoch": 2.8489077669902914, "grad_norm": 8.335232734680176, "learning_rate": 6.245616314746072e-07, "loss": 2.067840576171875, "memory(GiB)": 44.29, "step": 4695, "token_acc": 0.5325779036827195, "train_speed(iter/s)": 0.580562 }, { "epoch": 2.8519417475728153, "grad_norm": 10.580134391784668, "learning_rate": 5.997804360734827e-07, "loss": 2.042892837524414, "memory(GiB)": 44.29, "step": 4700, "token_acc": 0.5509554140127388, "train_speed(iter/s)": 0.580537 }, { "epoch": 2.85497572815534, "grad_norm": 7.85345983505249, "learning_rate": 5.754979084929335e-07, "loss": 1.6745044708251953, "memory(GiB)": 44.29, "step": 4705, "token_acc": 0.6067796610169491, "train_speed(iter/s)": 0.580553 }, { "epoch": 2.858009708737864, "grad_norm": 6.51752233505249, "learning_rate": 5.517142938516074e-07, "loss": 1.8814077377319336, "memory(GiB)": 44.29, "step": 4710, "token_acc": 0.5815384615384616, "train_speed(iter/s)": 0.58047 }, { "epoch": 2.8610436893203883, "grad_norm": 9.65807819366455, "learning_rate": 5.284298322319026e-07, "loss": 2.0154050827026366, "memory(GiB)": 44.29, "step": 4715, "token_acc": 0.5488958990536278, "train_speed(iter/s)": 0.580497 }, { "epoch": 2.8640776699029127, "grad_norm": 6.690892696380615, "learning_rate": 5.056447586775593e-07, "loss": 1.9270032882690429, "memory(GiB)": 44.29, "step": 4720, "token_acc": 0.589041095890411, "train_speed(iter/s)": 0.580493 }, { "epoch": 2.867111650485437, "grad_norm": 7.775207996368408, "learning_rate": 4.833593031912387e-07, "loss": 1.9307134628295899, "memory(GiB)": 44.29, "step": 4725, "token_acc": 0.584045584045584, "train_speed(iter/s)": 0.580464 }, { "epoch": 2.8701456310679614, "grad_norm": 6.894526481628418, "learning_rate": 4.6157369073226984e-07, "loss": 1.5071632385253906, "memory(GiB)": 44.29, "step": 4730, "token_acc": 0.6421725239616614, "train_speed(iter/s)": 0.58048 }, { "epoch": 2.8731796116504853, "grad_norm": 6.513083457946777, "learning_rate": 4.402881412143234e-07, "loss": 2.146462249755859, "memory(GiB)": 44.29, "step": 4735, "token_acc": 0.5506849315068493, "train_speed(iter/s)": 0.580443 }, { "epoch": 2.8762135922330097, "grad_norm": 7.810274600982666, "learning_rate": 4.1950286950321327e-07, "loss": 1.9746414184570313, "memory(GiB)": 44.29, "step": 4740, "token_acc": 0.5451713395638629, "train_speed(iter/s)": 0.580426 }, { "epoch": 2.879247572815534, "grad_norm": 8.50667667388916, "learning_rate": 3.9921808541474316e-07, "loss": 1.7838300704956054, "memory(GiB)": 44.29, "step": 4745, "token_acc": 0.5792880258899676, "train_speed(iter/s)": 0.580387 }, { "epoch": 2.8822815533980584, "grad_norm": 7.629726886749268, "learning_rate": 3.7943399371254686e-07, "loss": 1.6623340606689454, "memory(GiB)": 44.29, "step": 4750, "token_acc": 0.6351791530944625, "train_speed(iter/s)": 0.580241 }, { "epoch": 2.8853155339805827, "grad_norm": 7.58314323425293, "learning_rate": 3.601507941060622e-07, "loss": 2.0338212966918947, "memory(GiB)": 44.29, "step": 4755, "token_acc": 0.5410764872521246, "train_speed(iter/s)": 0.580202 }, { "epoch": 2.8883495145631066, "grad_norm": 11.662416458129883, "learning_rate": 3.41368681248494e-07, "loss": 1.8530158996582031, "memory(GiB)": 44.29, "step": 4760, "token_acc": 0.5753424657534246, "train_speed(iter/s)": 0.580151 }, { "epoch": 2.891383495145631, "grad_norm": 9.764945030212402, "learning_rate": 3.2308784473485956e-07, "loss": 1.810487937927246, "memory(GiB)": 44.29, "step": 4765, "token_acc": 0.6116071428571429, "train_speed(iter/s)": 0.580171 }, { "epoch": 2.8944174757281553, "grad_norm": 7.281760215759277, "learning_rate": 3.053084691000685e-07, "loss": 1.762740707397461, "memory(GiB)": 44.29, "step": 4770, "token_acc": 0.5967213114754099, "train_speed(iter/s)": 0.580116 }, { "epoch": 2.8974514563106797, "grad_norm": 8.245015144348145, "learning_rate": 2.8803073381704626e-07, "loss": 1.84234561920166, "memory(GiB)": 44.29, "step": 4775, "token_acc": 0.5969230769230769, "train_speed(iter/s)": 0.580086 }, { "epoch": 2.900485436893204, "grad_norm": 6.5408935546875, "learning_rate": 2.712548132949577e-07, "loss": 1.842409896850586, "memory(GiB)": 44.29, "step": 4780, "token_acc": 0.60790273556231, "train_speed(iter/s)": 0.580074 }, { "epoch": 2.903519417475728, "grad_norm": 7.102424144744873, "learning_rate": 2.5498087687741424e-07, "loss": 1.610619354248047, "memory(GiB)": 44.64, "step": 4785, "token_acc": 0.6106870229007634, "train_speed(iter/s)": 0.580062 }, { "epoch": 2.9065533980582523, "grad_norm": 8.561509132385254, "learning_rate": 2.3920908884078053e-07, "loss": 1.9039691925048827, "memory(GiB)": 44.64, "step": 4790, "token_acc": 0.5791245791245792, "train_speed(iter/s)": 0.580072 }, { "epoch": 2.9095873786407767, "grad_norm": 10.727002143859863, "learning_rate": 2.239396083925094e-07, "loss": 1.9637014389038085, "memory(GiB)": 44.64, "step": 4795, "token_acc": 0.5355029585798816, "train_speed(iter/s)": 0.579908 }, { "epoch": 2.912621359223301, "grad_norm": 8.442927360534668, "learning_rate": 2.0917258966953733e-07, "loss": 2.2038265228271485, "memory(GiB)": 44.64, "step": 4800, "token_acc": 0.5454545454545454, "train_speed(iter/s)": 0.579874 }, { "epoch": 2.9156553398058254, "grad_norm": 8.77606201171875, "learning_rate": 1.9490818173672486e-07, "loss": 1.8866867065429687, "memory(GiB)": 44.64, "step": 4805, "token_acc": 0.5782747603833865, "train_speed(iter/s)": 0.579853 }, { "epoch": 2.9186893203883493, "grad_norm": 8.638134956359863, "learning_rate": 1.8114652858536862e-07, "loss": 1.8457630157470704, "memory(GiB)": 44.64, "step": 4810, "token_acc": 0.5689655172413793, "train_speed(iter/s)": 0.579838 }, { "epoch": 2.9217233009708736, "grad_norm": 6.532174587249756, "learning_rate": 1.6788776913171932e-07, "loss": 1.879047966003418, "memory(GiB)": 44.64, "step": 4815, "token_acc": 0.5876923076923077, "train_speed(iter/s)": 0.579795 }, { "epoch": 2.924757281553398, "grad_norm": 10.169187545776367, "learning_rate": 1.5513203721559955e-07, "loss": 2.0470817565917967, "memory(GiB)": 44.64, "step": 4820, "token_acc": 0.5559322033898305, "train_speed(iter/s)": 0.57982 }, { "epoch": 2.9277912621359223, "grad_norm": 7.9186015129089355, "learning_rate": 1.428794615990603e-07, "loss": 1.855224609375, "memory(GiB)": 44.64, "step": 4825, "token_acc": 0.559322033898305, "train_speed(iter/s)": 0.579822 }, { "epoch": 2.9308252427184467, "grad_norm": 8.12701416015625, "learning_rate": 1.3113016596503769e-07, "loss": 1.8893653869628906, "memory(GiB)": 44.64, "step": 4830, "token_acc": 0.551948051948052, "train_speed(iter/s)": 0.579803 }, { "epoch": 2.9338592233009706, "grad_norm": 6.349172592163086, "learning_rate": 1.1988426891617054e-07, "loss": 1.6970531463623046, "memory(GiB)": 44.64, "step": 4835, "token_acc": 0.6104294478527608, "train_speed(iter/s)": 0.579803 }, { "epoch": 2.9368932038834954, "grad_norm": 7.324570655822754, "learning_rate": 1.0914188397355141e-07, "loss": 1.8949806213378906, "memory(GiB)": 44.64, "step": 4840, "token_acc": 0.5326797385620915, "train_speed(iter/s)": 0.579773 }, { "epoch": 2.9399271844660193, "grad_norm": 7.8848114013671875, "learning_rate": 9.890311957559406e-08, "loss": 2.149030303955078, "memory(GiB)": 44.64, "step": 4845, "token_acc": 0.5292207792207793, "train_speed(iter/s)": 0.579762 }, { "epoch": 2.9429611650485437, "grad_norm": 7.4910783767700195, "learning_rate": 8.916807907695113e-08, "loss": 2.0563175201416017, "memory(GiB)": 44.64, "step": 4850, "token_acc": 0.5728476821192053, "train_speed(iter/s)": 0.5797 }, { "epoch": 2.945995145631068, "grad_norm": 10.111432075500488, "learning_rate": 7.993686074744821e-08, "loss": 1.8403484344482421, "memory(GiB)": 44.64, "step": 4855, "token_acc": 0.6107594936708861, "train_speed(iter/s)": 0.579672 }, { "epoch": 2.9490291262135924, "grad_norm": 8.500150680541992, "learning_rate": 7.120955777112914e-08, "loss": 1.9626632690429688, "memory(GiB)": 44.64, "step": 4860, "token_acc": 0.5274390243902439, "train_speed(iter/s)": 0.579711 }, { "epoch": 2.9520631067961167, "grad_norm": 9.124574661254883, "learning_rate": 6.298625824527337e-08, "loss": 2.131892776489258, "memory(GiB)": 44.64, "step": 4865, "token_acc": 0.540625, "train_speed(iter/s)": 0.579721 }, { "epoch": 2.9550970873786406, "grad_norm": 8.05234432220459, "learning_rate": 5.526704517951897e-08, "loss": 1.5919179916381836, "memory(GiB)": 44.64, "step": 4870, "token_acc": 0.6409495548961425, "train_speed(iter/s)": 0.579706 }, { "epoch": 2.958131067961165, "grad_norm": 9.051335334777832, "learning_rate": 4.8051996495052096e-08, "loss": 1.8254867553710938, "memory(GiB)": 44.64, "step": 4875, "token_acc": 0.5714285714285714, "train_speed(iter/s)": 0.579633 }, { "epoch": 2.9611650485436893, "grad_norm": 12.33359432220459, "learning_rate": 4.134118502378548e-08, "loss": 1.7505077362060546, "memory(GiB)": 44.64, "step": 4880, "token_acc": 0.6245954692556634, "train_speed(iter/s)": 0.579568 }, { "epoch": 2.9641990291262137, "grad_norm": 8.701041221618652, "learning_rate": 3.5134678507636745e-08, "loss": 1.7970073699951172, "memory(GiB)": 44.64, "step": 4885, "token_acc": 0.5992366412213741, "train_speed(iter/s)": 0.579584 }, { "epoch": 2.967233009708738, "grad_norm": 10.156620979309082, "learning_rate": 2.9432539597851195e-08, "loss": 2.0175302505493162, "memory(GiB)": 44.64, "step": 4890, "token_acc": 0.5270758122743683, "train_speed(iter/s)": 0.579648 }, { "epoch": 2.970266990291262, "grad_norm": 9.249608039855957, "learning_rate": 2.423482585435788e-08, "loss": 1.8950572967529298, "memory(GiB)": 44.64, "step": 4895, "token_acc": 0.5878594249201278, "train_speed(iter/s)": 0.579643 }, { "epoch": 2.9733009708737863, "grad_norm": 8.014542579650879, "learning_rate": 1.9541589745186717e-08, "loss": 1.8426591873168945, "memory(GiB)": 44.64, "step": 4900, "token_acc": 0.5846153846153846, "train_speed(iter/s)": 0.579651 }, { "epoch": 2.9763349514563107, "grad_norm": 7.611429691314697, "learning_rate": 1.5352878645963352e-08, "loss": 2.1125755310058594, "memory(GiB)": 44.64, "step": 4905, "token_acc": 0.5577557755775577, "train_speed(iter/s)": 0.579586 }, { "epoch": 2.979368932038835, "grad_norm": 12.364704132080078, "learning_rate": 1.1668734839404006e-08, "loss": 1.8508235931396484, "memory(GiB)": 44.64, "step": 4910, "token_acc": 0.6196078431372549, "train_speed(iter/s)": 0.579638 }, { "epoch": 2.9824029126213594, "grad_norm": 7.556114196777344, "learning_rate": 8.489195514888027e-09, "loss": 2.153472137451172, "memory(GiB)": 44.64, "step": 4915, "token_acc": 0.5749235474006116, "train_speed(iter/s)": 0.579639 }, { "epoch": 2.9854368932038833, "grad_norm": 5.7980546951293945, "learning_rate": 5.814292768108187e-09, "loss": 1.972738265991211, "memory(GiB)": 44.64, "step": 4920, "token_acc": 0.5417721518987342, "train_speed(iter/s)": 0.579645 }, { "epoch": 2.9884708737864076, "grad_norm": 8.18667221069336, "learning_rate": 3.644053600726505e-09, "loss": 2.019988441467285, "memory(GiB)": 44.64, "step": 4925, "token_acc": 0.5647058823529412, "train_speed(iter/s)": 0.579688 }, { "epoch": 2.991504854368932, "grad_norm": 7.362902641296387, "learning_rate": 1.978499920096688e-09, "loss": 1.9861087799072266, "memory(GiB)": 44.64, "step": 4930, "token_acc": 0.5605095541401274, "train_speed(iter/s)": 0.579756 }, { "epoch": 2.9945388349514563, "grad_norm": 7.812079906463623, "learning_rate": 8.176485390642974e-10, "loss": 1.789814567565918, "memory(GiB)": 44.64, "step": 4935, "token_acc": 0.5348837209302325, "train_speed(iter/s)": 0.579831 }, { "epoch": 2.9975728155339807, "grad_norm": 9.960822105407715, "learning_rate": 1.6151117577800633e-10, "loss": 2.1190773010253907, "memory(GiB)": 44.64, "step": 4940, "token_acc": 0.5483870967741935, "train_speed(iter/s)": 0.579872 }, { "epoch": 3.0, "eval_loss": 1.9859907627105713, "eval_runtime": 12.2556, "eval_samples_per_second": 8.16, "eval_steps_per_second": 8.16, "eval_token_acc": 0.5401554404145078, "step": 4944 } ], "logging_steps": 5, "max_steps": 4944, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.826945200557008e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }