{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.0, "eval_steps": 50, "global_step": 246, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02476780185758514, "grad_norm": 0.5827791094779968, "learning_rate": 0.0, "loss": 0.5993257761001587, "step": 1 }, { "epoch": 0.04953560371517028, "grad_norm": 0.5781313180923462, "learning_rate": 2.5e-07, "loss": 0.5510573387145996, "step": 2 }, { "epoch": 0.07430340557275542, "grad_norm": 0.5830345153808594, "learning_rate": 5e-07, "loss": 0.500480055809021, "step": 3 }, { "epoch": 0.09907120743034056, "grad_norm": 0.5189770460128784, "learning_rate": 7.5e-07, "loss": 0.5299410820007324, "step": 4 }, { "epoch": 0.1238390092879257, "grad_norm": 0.520061194896698, "learning_rate": 1e-06, "loss": 0.5539457201957703, "step": 5 }, { "epoch": 0.14860681114551083, "grad_norm": 0.5419376492500305, "learning_rate": 1.2499999999999999e-06, "loss": 0.5408970713615417, "step": 6 }, { "epoch": 0.17337461300309598, "grad_norm": 0.5576385855674744, "learning_rate": 1.5e-06, "loss": 0.5969724655151367, "step": 7 }, { "epoch": 0.19814241486068113, "grad_norm": 0.5351932048797607, "learning_rate": 1.75e-06, "loss": 0.5394197106361389, "step": 8 }, { "epoch": 0.22291021671826625, "grad_norm": 0.4773852527141571, "learning_rate": 2e-06, "loss": 0.5735222101211548, "step": 9 }, { "epoch": 0.2476780185758514, "grad_norm": 0.5032294392585754, "learning_rate": 1.9999128816724105e-06, "loss": 0.5828520059585571, "step": 10 }, { "epoch": 0.2724458204334365, "grad_norm": 0.49014607071876526, "learning_rate": 1.9996515418688487e-06, "loss": 0.5568044781684875, "step": 11 }, { "epoch": 0.29721362229102166, "grad_norm": 0.5634818077087402, "learning_rate": 1.9992160261242874e-06, "loss": 0.5982780456542969, "step": 12 }, { "epoch": 0.3219814241486068, "grad_norm": 0.4928373396396637, "learning_rate": 1.9986064103215337e-06, "loss": 0.563035249710083, "step": 13 }, { "epoch": 0.34674922600619196, "grad_norm": 0.5265209674835205, "learning_rate": 1.9978228006780053e-06, "loss": 0.588450014591217, "step": 14 }, { "epoch": 0.3715170278637771, "grad_norm": 0.4966702461242676, "learning_rate": 1.996865333727226e-06, "loss": 0.5518300533294678, "step": 15 }, { "epoch": 0.39628482972136225, "grad_norm": 0.5559803247451782, "learning_rate": 1.9957341762950344e-06, "loss": 0.5778566002845764, "step": 16 }, { "epoch": 0.42105263157894735, "grad_norm": 0.5569736957550049, "learning_rate": 1.9944295254705185e-06, "loss": 0.556509256362915, "step": 17 }, { "epoch": 0.4458204334365325, "grad_norm": 0.5971181988716125, "learning_rate": 1.992951608571673e-06, "loss": 0.5314251780509949, "step": 18 }, { "epoch": 0.47058823529411764, "grad_norm": 0.529690146446228, "learning_rate": 1.9913006831057965e-06, "loss": 0.5227062702178955, "step": 19 }, { "epoch": 0.4953560371517028, "grad_norm": 0.6401184797286987, "learning_rate": 1.989477036724619e-06, "loss": 0.5782433152198792, "step": 20 }, { "epoch": 0.5201238390092879, "grad_norm": 0.539941132068634, "learning_rate": 1.9874809871741874e-06, "loss": 0.5736757516860962, "step": 21 }, { "epoch": 0.544891640866873, "grad_norm": 0.5726771950721741, "learning_rate": 1.9853128822394975e-06, "loss": 0.5858570337295532, "step": 22 }, { "epoch": 0.5696594427244582, "grad_norm": 0.55902498960495, "learning_rate": 1.982973099683902e-06, "loss": 0.5574871301651001, "step": 23 }, { "epoch": 0.5944272445820433, "grad_norm": 0.527619481086731, "learning_rate": 1.9804620471832865e-06, "loss": 0.5171317458152771, "step": 24 }, { "epoch": 0.6191950464396285, "grad_norm": 0.5026052594184875, "learning_rate": 1.9777801622550405e-06, "loss": 0.5416678190231323, "step": 25 }, { "epoch": 0.6439628482972136, "grad_norm": 0.47064998745918274, "learning_rate": 1.9749279121818236e-06, "loss": 0.5682564973831177, "step": 26 }, { "epoch": 0.6687306501547987, "grad_norm": 0.5842341184616089, "learning_rate": 1.9719057939301475e-06, "loss": 0.5644649267196655, "step": 27 }, { "epoch": 0.6934984520123839, "grad_norm": 0.49904075264930725, "learning_rate": 1.9687143340637884e-06, "loss": 0.5811545252799988, "step": 28 }, { "epoch": 0.718266253869969, "grad_norm": 1.2309396266937256, "learning_rate": 1.9653540886520385e-06, "loss": 0.605437695980072, "step": 29 }, { "epoch": 0.7430340557275542, "grad_norm": 0.5156847834587097, "learning_rate": 1.9618256431728192e-06, "loss": 0.5422309637069702, "step": 30 }, { "epoch": 0.7678018575851393, "grad_norm": 0.6013903617858887, "learning_rate": 1.958129612410668e-06, "loss": 0.54377281665802, "step": 31 }, { "epoch": 0.7925696594427245, "grad_norm": 0.5307015180587769, "learning_rate": 1.954266640349623e-06, "loss": 0.5074729919433594, "step": 32 }, { "epoch": 0.8173374613003096, "grad_norm": 0.5950272679328918, "learning_rate": 1.950237400061015e-06, "loss": 0.5290631055831909, "step": 33 }, { "epoch": 0.8421052631578947, "grad_norm": 0.5664405226707458, "learning_rate": 1.9460425935861946e-06, "loss": 0.600000262260437, "step": 34 }, { "epoch": 0.8668730650154799, "grad_norm": 0.5338588953018188, "learning_rate": 1.9416829518142113e-06, "loss": 0.5680241584777832, "step": 35 }, { "epoch": 0.891640866873065, "grad_norm": 0.5495931506156921, "learning_rate": 1.9371592343544655e-06, "loss": 0.5304821729660034, "step": 36 }, { "epoch": 0.9164086687306502, "grad_norm": 0.47950977087020874, "learning_rate": 1.932472229404356e-06, "loss": 0.5156245827674866, "step": 37 }, { "epoch": 0.9411764705882353, "grad_norm": 0.5299308896064758, "learning_rate": 1.9276227536119477e-06, "loss": 0.5732549428939819, "step": 38 }, { "epoch": 0.9659442724458205, "grad_norm": 0.5737171173095703, "learning_rate": 1.9226116519336828e-06, "loss": 0.5309604406356812, "step": 39 }, { "epoch": 0.9907120743034056, "grad_norm": 0.693321168422699, "learning_rate": 1.917439797487156e-06, "loss": 0.5797507762908936, "step": 40 }, { "epoch": 1.0, "grad_norm": 0.9561907649040222, "learning_rate": 1.9121080913989878e-06, "loss": 0.5909802913665771, "step": 41 }, { "epoch": 1.0247678018575852, "grad_norm": 0.6066501140594482, "learning_rate": 1.9066174626478126e-06, "loss": 0.6078804135322571, "step": 42 }, { "epoch": 1.0495356037151702, "grad_norm": 0.5243707299232483, "learning_rate": 1.9009688679024189e-06, "loss": 0.5241413116455078, "step": 43 }, { "epoch": 1.0743034055727554, "grad_norm": 0.5240072011947632, "learning_rate": 1.8951632913550625e-06, "loss": 0.5645661950111389, "step": 44 }, { "epoch": 1.0990712074303406, "grad_norm": 0.6983147263526917, "learning_rate": 1.889201744549981e-06, "loss": 0.5029958486557007, "step": 45 }, { "epoch": 1.1238390092879258, "grad_norm": 0.6109921932220459, "learning_rate": 1.8830852662071505e-06, "loss": 0.5748687386512756, "step": 46 }, { "epoch": 1.1486068111455108, "grad_norm": 0.5242897868156433, "learning_rate": 1.8768149220412987e-06, "loss": 0.5576164722442627, "step": 47 }, { "epoch": 1.173374613003096, "grad_norm": 0.5376689434051514, "learning_rate": 1.8703918045762194e-06, "loss": 0.5489684343338013, "step": 48 }, { "epoch": 1.1981424148606812, "grad_norm": 0.5369903445243835, "learning_rate": 1.863817032954416e-06, "loss": 0.5305777192115784, "step": 49 }, { "epoch": 1.2229102167182662, "grad_norm": 0.482452392578125, "learning_rate": 1.8570917527421045e-06, "loss": 0.4907306134700775, "step": 50 }, { "epoch": 1.2229102167182662, "eval_accuracy": 0.8213776795920542, "eval_loss": 0.5645560622215271, "eval_runtime": 16.7311, "eval_samples_per_second": 4.064, "eval_steps_per_second": 2.032, "step": 50 }, { "epoch": 1.2476780185758514, "grad_norm": 0.5009844899177551, "learning_rate": 1.8502171357296142e-06, "loss": 0.5544570088386536, "step": 51 }, { "epoch": 1.2724458204334366, "grad_norm": 0.5807215571403503, "learning_rate": 1.8431943797272185e-06, "loss": 0.5804014205932617, "step": 52 }, { "epoch": 1.2972136222910216, "grad_norm": 0.5564484596252441, "learning_rate": 1.836024708356434e-06, "loss": 0.5661737322807312, "step": 53 }, { "epoch": 1.3219814241486068, "grad_norm": 0.5095818042755127, "learning_rate": 1.8287093708368186e-06, "loss": 0.5299423336982727, "step": 54 }, { "epoch": 1.346749226006192, "grad_norm": 0.5763193368911743, "learning_rate": 1.8212496417683135e-06, "loss": 0.5352605581283569, "step": 55 }, { "epoch": 1.3715170278637772, "grad_norm": 0.5195797681808472, "learning_rate": 1.81364682090916e-06, "loss": 0.530654788017273, "step": 56 }, { "epoch": 1.3962848297213624, "grad_norm": 0.5399932861328125, "learning_rate": 1.805902232949435e-06, "loss": 0.5673707723617554, "step": 57 }, { "epoch": 1.4210526315789473, "grad_norm": 0.5126509666442871, "learning_rate": 1.7980172272802397e-06, "loss": 0.5673764944076538, "step": 58 }, { "epoch": 1.4458204334365325, "grad_norm": 0.5293602347373962, "learning_rate": 1.789993177758588e-06, "loss": 0.5548557043075562, "step": 59 }, { "epoch": 1.4705882352941178, "grad_norm": 0.47508999705314636, "learning_rate": 1.7818314824680298e-06, "loss": 0.5592916011810303, "step": 60 }, { "epoch": 1.4953560371517027, "grad_norm": 0.506854236125946, "learning_rate": 1.773533563475053e-06, "loss": 0.5494035482406616, "step": 61 }, { "epoch": 1.520123839009288, "grad_norm": 0.6375800371170044, "learning_rate": 1.7651008665813081e-06, "loss": 0.5607191324234009, "step": 62 }, { "epoch": 1.5448916408668731, "grad_norm": 0.4859982132911682, "learning_rate": 1.7565348610716958e-06, "loss": 0.5413356423377991, "step": 63 }, { "epoch": 1.5696594427244581, "grad_norm": 0.5644744634628296, "learning_rate": 1.7478370394583643e-06, "loss": 0.5568721294403076, "step": 64 }, { "epoch": 1.5944272445820433, "grad_norm": 0.5623730421066284, "learning_rate": 1.739008917220659e-06, "loss": 0.5305633544921875, "step": 65 }, { "epoch": 1.6191950464396285, "grad_norm": 0.46600863337516785, "learning_rate": 1.7300520325410698e-06, "loss": 0.519407331943512, "step": 66 }, { "epoch": 1.6439628482972135, "grad_norm": 0.5476927161216736, "learning_rate": 1.7209679460372249e-06, "loss": 0.5438145399093628, "step": 67 }, { "epoch": 1.6687306501547987, "grad_norm": 0.5339446663856506, "learning_rate": 1.711758240489971e-06, "loss": 0.5288221836090088, "step": 68 }, { "epoch": 1.693498452012384, "grad_norm": 0.4474664628505707, "learning_rate": 1.7024245205675985e-06, "loss": 0.5665724277496338, "step": 69 }, { "epoch": 1.718266253869969, "grad_norm": 0.5247179865837097, "learning_rate": 1.6929684125462468e-06, "loss": 0.5420582294464111, "step": 70 }, { "epoch": 1.7430340557275543, "grad_norm": 0.6573188304901123, "learning_rate": 1.6833915640265483e-06, "loss": 0.538118839263916, "step": 71 }, { "epoch": 1.7678018575851393, "grad_norm": 0.5430057644844055, "learning_rate": 1.6736956436465573e-06, "loss": 0.5287379026412964, "step": 72 }, { "epoch": 1.7925696594427245, "grad_norm": 1.451054334640503, "learning_rate": 1.6638823407910082e-06, "loss": 0.5065432190895081, "step": 73 }, { "epoch": 1.8173374613003097, "grad_norm": 1.7800654172897339, "learning_rate": 1.6539533652969682e-06, "loss": 0.5422472357749939, "step": 74 }, { "epoch": 1.8421052631578947, "grad_norm": 0.5204485654830933, "learning_rate": 1.6439104471559156e-06, "loss": 0.4941398501396179, "step": 75 }, { "epoch": 1.86687306501548, "grad_norm": 0.4798074960708618, "learning_rate": 1.6337553362123161e-06, "loss": 0.5543307065963745, "step": 76 }, { "epoch": 1.891640866873065, "grad_norm": 0.4639158248901367, "learning_rate": 1.6234898018587336e-06, "loss": 0.5305337905883789, "step": 77 }, { "epoch": 1.91640866873065, "grad_norm": 0.4957791566848755, "learning_rate": 1.613115632727537e-06, "loss": 0.4810314178466797, "step": 78 }, { "epoch": 1.9411764705882353, "grad_norm": 0.542951762676239, "learning_rate": 1.6026346363792564e-06, "loss": 0.5742234587669373, "step": 79 }, { "epoch": 1.9659442724458205, "grad_norm": 0.518661618232727, "learning_rate": 1.592048638987638e-06, "loss": 0.5540245771408081, "step": 80 }, { "epoch": 1.9907120743034055, "grad_norm": 0.48943665623664856, "learning_rate": 1.5813594850214597e-06, "loss": 0.509993851184845, "step": 81 }, { "epoch": 2.0, "grad_norm": 0.8778729438781738, "learning_rate": 1.570569036923155e-06, "loss": 0.539715051651001, "step": 82 }, { "epoch": 2.024767801857585, "grad_norm": 0.4994299113750458, "learning_rate": 1.5596791747843082e-06, "loss": 0.5089604258537292, "step": 83 }, { "epoch": 2.0495356037151704, "grad_norm": 0.5828955173492432, "learning_rate": 1.548691796018074e-06, "loss": 0.5253075361251831, "step": 84 }, { "epoch": 2.0743034055727554, "grad_norm": 0.5461580753326416, "learning_rate": 1.5376088150285774e-06, "loss": 0.5154924392700195, "step": 85 }, { "epoch": 2.0990712074303404, "grad_norm": 0.965928852558136, "learning_rate": 1.5264321628773558e-06, "loss": 0.5028945803642273, "step": 86 }, { "epoch": 2.123839009287926, "grad_norm": 0.45946890115737915, "learning_rate": 1.5151637869468958e-06, "loss": 0.5220765471458435, "step": 87 }, { "epoch": 2.1486068111455108, "grad_norm": 0.4885327219963074, "learning_rate": 1.5038056506013295e-06, "loss": 0.5020776391029358, "step": 88 }, { "epoch": 2.173374613003096, "grad_norm": 0.5246437191963196, "learning_rate": 1.492359732844342e-06, "loss": 0.46335524320602417, "step": 89 }, { "epoch": 2.198142414860681, "grad_norm": 0.5331137180328369, "learning_rate": 1.4808280279743591e-06, "loss": 0.5037820339202881, "step": 90 }, { "epoch": 2.222910216718266, "grad_norm": 0.5505975484848022, "learning_rate": 1.4692125452370662e-06, "loss": 0.5359715223312378, "step": 91 }, { "epoch": 2.2476780185758516, "grad_norm": 0.5390040278434753, "learning_rate": 1.4575153084753232e-06, "loss": 0.5337521433830261, "step": 92 }, { "epoch": 2.2724458204334366, "grad_norm": 0.44791266322135925, "learning_rate": 1.4457383557765383e-06, "loss": 0.5155265927314758, "step": 93 }, { "epoch": 2.2972136222910216, "grad_norm": 0.4978775382041931, "learning_rate": 1.433883739117558e-06, "loss": 0.4920554757118225, "step": 94 }, { "epoch": 2.321981424148607, "grad_norm": 0.5269660353660583, "learning_rate": 1.4219535240071376e-06, "loss": 0.5533995628356934, "step": 95 }, { "epoch": 2.346749226006192, "grad_norm": 0.4875043034553528, "learning_rate": 1.4099497891260537e-06, "loss": 0.523270845413208, "step": 96 }, { "epoch": 2.371517027863777, "grad_norm": 0.5254143476486206, "learning_rate": 1.3978746259649208e-06, "loss": 0.5255824327468872, "step": 97 }, { "epoch": 2.3962848297213624, "grad_norm": 0.5345160365104675, "learning_rate": 1.3857301384597794e-06, "loss": 0.5329371094703674, "step": 98 }, { "epoch": 2.4210526315789473, "grad_norm": 0.46321746706962585, "learning_rate": 1.3735184426255114e-06, "loss": 0.5548917055130005, "step": 99 }, { "epoch": 2.4458204334365323, "grad_norm": 0.5209585428237915, "learning_rate": 1.3612416661871531e-06, "loss": 0.5931960940361023, "step": 100 }, { "epoch": 2.4458204334365323, "eval_accuracy": 0.8215138901886158, "eval_loss": 0.562470018863678, "eval_runtime": 16.4711, "eval_samples_per_second": 4.128, "eval_steps_per_second": 2.064, "step": 100 }, { "epoch": 2.4705882352941178, "grad_norm": 0.483987033367157, "learning_rate": 1.3489019482091667e-06, "loss": 0.5425853133201599, "step": 101 }, { "epoch": 2.4953560371517027, "grad_norm": 0.44485101103782654, "learning_rate": 1.336501438722739e-06, "loss": 0.5403157472610474, "step": 102 }, { "epoch": 2.5201238390092877, "grad_norm": 0.5460787415504456, "learning_rate": 1.324042298351166e-06, "loss": 0.5747348666191101, "step": 103 }, { "epoch": 2.544891640866873, "grad_norm": 0.45323142409324646, "learning_rate": 1.3115266979333914e-06, "loss": 0.5297880172729492, "step": 104 }, { "epoch": 2.569659442724458, "grad_norm": 0.6902194619178772, "learning_rate": 1.2989568181457702e-06, "loss": 0.5073508024215698, "step": 105 }, { "epoch": 2.594427244582043, "grad_norm": 0.5212258100509644, "learning_rate": 1.2863348491221127e-06, "loss": 0.5311723351478577, "step": 106 }, { "epoch": 2.6191950464396285, "grad_norm": 0.5578774809837341, "learning_rate": 1.273662990072083e-06, "loss": 0.5304839015007019, "step": 107 }, { "epoch": 2.6439628482972135, "grad_norm": 0.504798173904419, "learning_rate": 1.2609434488980166e-06, "loss": 0.4865831136703491, "step": 108 }, { "epoch": 2.6687306501547985, "grad_norm": 0.4682161211967468, "learning_rate": 1.2481784418102239e-06, "loss": 0.5439316630363464, "step": 109 }, { "epoch": 2.693498452012384, "grad_norm": 0.5871185064315796, "learning_rate": 1.2353701929408424e-06, "loss": 0.477615088224411, "step": 110 }, { "epoch": 2.718266253869969, "grad_norm": 0.4735322594642639, "learning_rate": 1.2225209339563143e-06, "loss": 0.5605683326721191, "step": 111 }, { "epoch": 2.7430340557275543, "grad_norm": 0.5656632781028748, "learning_rate": 1.2096329036685466e-06, "loss": 0.5053581595420837, "step": 112 }, { "epoch": 2.7678018575851393, "grad_norm": 0.501797616481781, "learning_rate": 1.196708347644828e-06, "loss": 0.5080878734588623, "step": 113 }, { "epoch": 2.7925696594427247, "grad_norm": 1.2063102722167969, "learning_rate": 1.1837495178165704e-06, "loss": 0.552485466003418, "step": 114 }, { "epoch": 2.8173374613003097, "grad_norm": 0.5052933096885681, "learning_rate": 1.1707586720869374e-06, "loss": 0.5424617528915405, "step": 115 }, { "epoch": 2.8421052631578947, "grad_norm": 0.5184856057167053, "learning_rate": 1.1577380739374373e-06, "loss": 0.5432671904563904, "step": 116 }, { "epoch": 2.86687306501548, "grad_norm": 0.5071874260902405, "learning_rate": 1.1446899920335405e-06, "loss": 0.5507460832595825, "step": 117 }, { "epoch": 2.891640866873065, "grad_norm": 0.519482433795929, "learning_rate": 1.1316166998293935e-06, "loss": 0.5559477210044861, "step": 118 }, { "epoch": 2.91640866873065, "grad_norm": 0.5042552947998047, "learning_rate": 1.1185204751717027e-06, "loss": 0.5015457272529602, "step": 119 }, { "epoch": 2.9411764705882355, "grad_norm": 1.3727635145187378, "learning_rate": 1.1054035999028476e-06, "loss": 0.5176253318786621, "step": 120 }, { "epoch": 2.9659442724458205, "grad_norm": 0.5206997990608215, "learning_rate": 1.092268359463302e-06, "loss": 0.5474892258644104, "step": 121 }, { "epoch": 2.9907120743034055, "grad_norm": 0.472130686044693, "learning_rate": 1.0791170424934246e-06, "loss": 0.4985366463661194, "step": 122 }, { "epoch": 3.0, "grad_norm": 1.058793544769287, "learning_rate": 1.0659519404346952e-06, "loss": 0.48316121101379395, "step": 123 }, { "epoch": 3.024767801857585, "grad_norm": 0.5421841740608215, "learning_rate": 1.0527753471304623e-06, "loss": 0.5144573450088501, "step": 124 }, { "epoch": 3.0495356037151704, "grad_norm": 0.5197970271110535, "learning_rate": 1.0395895584262695e-06, "loss": 0.5817261934280396, "step": 125 }, { "epoch": 3.0743034055727554, "grad_norm": 0.49334728717803955, "learning_rate": 1.0263968717698363e-06, "loss": 0.5018012523651123, "step": 126 }, { "epoch": 3.0990712074303404, "grad_norm": 0.6232290267944336, "learning_rate": 1.013199585810759e-06, "loss": 0.5584498643875122, "step": 127 }, { "epoch": 3.123839009287926, "grad_norm": 0.455437034368515, "learning_rate": 1e-06, "loss": 0.5036893486976624, "step": 128 }, { "epoch": 3.1486068111455108, "grad_norm": 0.48946836590766907, "learning_rate": 9.868004141892412e-07, "loss": 0.5123312473297119, "step": 129 }, { "epoch": 3.173374613003096, "grad_norm": 0.5698655843734741, "learning_rate": 9.736031282301638e-07, "loss": 0.5401725172996521, "step": 130 }, { "epoch": 3.198142414860681, "grad_norm": 0.9283490180969238, "learning_rate": 9.604104415737308e-07, "loss": 0.48566514253616333, "step": 131 }, { "epoch": 3.222910216718266, "grad_norm": 2.0157785415649414, "learning_rate": 9.472246528695375e-07, "loss": 0.4537651538848877, "step": 132 }, { "epoch": 3.2476780185758516, "grad_norm": 0.5449803471565247, "learning_rate": 9.340480595653045e-07, "loss": 0.5530433654785156, "step": 133 }, { "epoch": 3.2724458204334366, "grad_norm": 0.4725954532623291, "learning_rate": 9.208829575065753e-07, "loss": 0.5256283283233643, "step": 134 }, { "epoch": 3.2972136222910216, "grad_norm": 0.4579267203807831, "learning_rate": 9.077316405366981e-07, "loss": 0.5190701484680176, "step": 135 }, { "epoch": 3.321981424148607, "grad_norm": 0.544757604598999, "learning_rate": 8.945964000971523e-07, "loss": 0.5290215015411377, "step": 136 }, { "epoch": 3.346749226006192, "grad_norm": 0.4990670084953308, "learning_rate": 8.814795248282973e-07, "loss": 0.5203908085823059, "step": 137 }, { "epoch": 3.371517027863777, "grad_norm": 0.5583924651145935, "learning_rate": 8.683833001706067e-07, "loss": 0.499897837638855, "step": 138 }, { "epoch": 3.3962848297213624, "grad_norm": 0.47875887155532837, "learning_rate": 8.553100079664598e-07, "loss": 0.4940932095050812, "step": 139 }, { "epoch": 3.4210526315789473, "grad_norm": 0.4689862132072449, "learning_rate": 8.422619260625624e-07, "loss": 0.488369345664978, "step": 140 }, { "epoch": 3.4458204334365323, "grad_norm": 0.5019742846488953, "learning_rate": 8.292413279130624e-07, "loss": 0.49827271699905396, "step": 141 }, { "epoch": 3.4705882352941178, "grad_norm": 0.47474774718284607, "learning_rate": 8.162504821834295e-07, "loss": 0.5006945133209229, "step": 142 }, { "epoch": 3.4953560371517027, "grad_norm": 0.5412342548370361, "learning_rate": 8.032916523551719e-07, "loss": 0.5021499395370483, "step": 143 }, { "epoch": 3.5201238390092877, "grad_norm": 0.46898508071899414, "learning_rate": 7.903670963314535e-07, "loss": 0.5173486471176147, "step": 144 }, { "epoch": 3.544891640866873, "grad_norm": 0.5036367177963257, "learning_rate": 7.774790660436857e-07, "loss": 0.5127341151237488, "step": 145 }, { "epoch": 3.569659442724458, "grad_norm": 0.4592057466506958, "learning_rate": 7.646298070591577e-07, "loss": 0.5291725397109985, "step": 146 }, { "epoch": 3.594427244582043, "grad_norm": 0.579252302646637, "learning_rate": 7.518215581897763e-07, "loss": 0.5540162324905396, "step": 147 }, { "epoch": 3.6191950464396285, "grad_norm": 0.5662134885787964, "learning_rate": 7.390565511019833e-07, "loss": 0.5307095646858215, "step": 148 }, { "epoch": 3.6439628482972135, "grad_norm": 0.5780702233314514, "learning_rate": 7.263370099279171e-07, "loss": 0.48574694991111755, "step": 149 }, { "epoch": 3.6687306501547985, "grad_norm": 0.5063837766647339, "learning_rate": 7.136651508778874e-07, "loss": 0.5621860027313232, "step": 150 }, { "epoch": 3.6687306501547985, "eval_accuracy": 0.8215492383391412, "eval_loss": 0.5617780685424805, "eval_runtime": 16.4087, "eval_samples_per_second": 4.144, "eval_steps_per_second": 2.072, "step": 150 }, { "epoch": 3.693498452012384, "grad_norm": 0.5430096387863159, "learning_rate": 7.010431818542297e-07, "loss": 0.4991950988769531, "step": 151 }, { "epoch": 3.718266253869969, "grad_norm": 0.4858173727989197, "learning_rate": 6.884733020666084e-07, "loss": 0.47163355350494385, "step": 152 }, { "epoch": 3.7430340557275543, "grad_norm": 0.4979320168495178, "learning_rate": 6.759577016488343e-07, "loss": 0.5382797718048096, "step": 153 }, { "epoch": 3.7678018575851393, "grad_norm": 0.47822287678718567, "learning_rate": 6.63498561277261e-07, "loss": 0.5248020887374878, "step": 154 }, { "epoch": 3.7925696594427247, "grad_norm": 0.5561540722846985, "learning_rate": 6.510980517908333e-07, "loss": 0.47944825887680054, "step": 155 }, { "epoch": 3.8173374613003097, "grad_norm": 0.510204553604126, "learning_rate": 6.387583338128471e-07, "loss": 0.5094054937362671, "step": 156 }, { "epoch": 3.8421052631578947, "grad_norm": 0.4817684590816498, "learning_rate": 6.264815573744884e-07, "loss": 0.4909018874168396, "step": 157 }, { "epoch": 3.86687306501548, "grad_norm": 0.4790090024471283, "learning_rate": 6.142698615402204e-07, "loss": 0.47690001130104065, "step": 158 }, { "epoch": 3.891640866873065, "grad_norm": 0.4971541464328766, "learning_rate": 6.021253740350792e-07, "loss": 0.5042445659637451, "step": 159 }, { "epoch": 3.91640866873065, "grad_norm": 0.5663966536521912, "learning_rate": 5.900502108739465e-07, "loss": 0.5802559852600098, "step": 160 }, { "epoch": 3.9411764705882355, "grad_norm": 0.6140542030334473, "learning_rate": 5.780464759928623e-07, "loss": 0.5226213932037354, "step": 161 }, { "epoch": 3.9659442724458205, "grad_norm": 0.510217010974884, "learning_rate": 5.661162608824419e-07, "loss": 0.487061470746994, "step": 162 }, { "epoch": 3.9907120743034055, "grad_norm": 0.47863468527793884, "learning_rate": 5.542616442234618e-07, "loss": 0.49519461393356323, "step": 163 }, { "epoch": 4.0, "grad_norm": 0.8134075999259949, "learning_rate": 5.424846915246769e-07, "loss": 0.5006481409072876, "step": 164 }, { "epoch": 4.024767801857585, "grad_norm": 0.5010446906089783, "learning_rate": 5.307874547629339e-07, "loss": 0.5043383240699768, "step": 165 }, { "epoch": 4.04953560371517, "grad_norm": 0.5629169344902039, "learning_rate": 5.191719720256407e-07, "loss": 0.5104990005493164, "step": 166 }, { "epoch": 4.074303405572755, "grad_norm": 0.5630432367324829, "learning_rate": 5.076402671556577e-07, "loss": 0.4841610789299011, "step": 167 }, { "epoch": 4.099071207430341, "grad_norm": 0.46193253993988037, "learning_rate": 4.961943493986708e-07, "loss": 0.5317561030387878, "step": 168 }, { "epoch": 4.123839009287925, "grad_norm": 0.5281070470809937, "learning_rate": 4.848362130531039e-07, "loss": 0.5141686201095581, "step": 169 }, { "epoch": 4.148606811145511, "grad_norm": 0.927697479724884, "learning_rate": 4.7356783712264403e-07, "loss": 0.46369314193725586, "step": 170 }, { "epoch": 4.173374613003096, "grad_norm": 0.5692654252052307, "learning_rate": 4.623911849714225e-07, "loss": 0.48228251934051514, "step": 171 }, { "epoch": 4.198142414860681, "grad_norm": 0.48862549662590027, "learning_rate": 4.5130820398192636e-07, "loss": 0.5285767316818237, "step": 172 }, { "epoch": 4.222910216718266, "grad_norm": 0.5772708058357239, "learning_rate": 4.40320825215692e-07, "loss": 0.5200311541557312, "step": 173 }, { "epoch": 4.247678018575852, "grad_norm": 0.5576812028884888, "learning_rate": 4.294309630768451e-07, "loss": 0.5052947402000427, "step": 174 }, { "epoch": 4.272445820433436, "grad_norm": 0.48456260561943054, "learning_rate": 4.1864051497854027e-07, "loss": 0.5091853141784668, "step": 175 }, { "epoch": 4.2972136222910216, "grad_norm": 0.4992901086807251, "learning_rate": 4.079513610123618e-07, "loss": 0.5285595655441284, "step": 176 }, { "epoch": 4.321981424148607, "grad_norm": 0.560563862323761, "learning_rate": 3.973653636207437e-07, "loss": 0.5327163338661194, "step": 177 }, { "epoch": 4.346749226006192, "grad_norm": 0.48380428552627563, "learning_rate": 3.8688436727246296e-07, "loss": 0.4750836491584778, "step": 178 }, { "epoch": 4.371517027863777, "grad_norm": 0.4964829385280609, "learning_rate": 3.765101981412665e-07, "loss": 0.46454548835754395, "step": 179 }, { "epoch": 4.396284829721362, "grad_norm": 0.4538560211658478, "learning_rate": 3.6624466378768384e-07, "loss": 0.51465904712677, "step": 180 }, { "epoch": 4.421052631578947, "grad_norm": 0.6692084074020386, "learning_rate": 3.560895528440844e-07, "loss": 0.4617176055908203, "step": 181 }, { "epoch": 4.445820433436532, "grad_norm": 0.47236230969429016, "learning_rate": 3.4604663470303186e-07, "loss": 0.5083804130554199, "step": 182 }, { "epoch": 4.470588235294118, "grad_norm": 0.4774688184261322, "learning_rate": 3.3611765920899183e-07, "loss": 0.5058382749557495, "step": 183 }, { "epoch": 4.495356037151703, "grad_norm": 0.47210627794265747, "learning_rate": 3.263043563534428e-07, "loss": 0.5376588106155396, "step": 184 }, { "epoch": 4.520123839009288, "grad_norm": 0.4772137403488159, "learning_rate": 3.166084359734513e-07, "loss": 0.5304179191589355, "step": 185 }, { "epoch": 4.544891640866873, "grad_norm": 0.4682233929634094, "learning_rate": 3.070315874537531e-07, "loss": 0.4820975661277771, "step": 186 }, { "epoch": 4.569659442724459, "grad_norm": 0.48219650983810425, "learning_rate": 2.975754794324015e-07, "loss": 0.5084782838821411, "step": 187 }, { "epoch": 4.594427244582043, "grad_norm": 0.43362459540367126, "learning_rate": 2.8824175951002916e-07, "loss": 0.47581952810287476, "step": 188 }, { "epoch": 4.6191950464396285, "grad_norm": 0.567948579788208, "learning_rate": 2.790320539627754e-07, "loss": 0.5314459800720215, "step": 189 }, { "epoch": 4.643962848297214, "grad_norm": 0.5087016224861145, "learning_rate": 2.6994796745893e-07, "loss": 0.4740360379219055, "step": 190 }, { "epoch": 4.6687306501547985, "grad_norm": 0.5123845338821411, "learning_rate": 2.60991082779341e-07, "loss": 0.5245854258537292, "step": 191 }, { "epoch": 4.693498452012384, "grad_norm": 0.4884699285030365, "learning_rate": 2.521629605416354e-07, "loss": 0.5254173278808594, "step": 192 }, { "epoch": 4.718266253869969, "grad_norm": 0.5492839217185974, "learning_rate": 2.434651389283042e-07, "loss": 0.5060293674468994, "step": 193 }, { "epoch": 4.743034055727554, "grad_norm": 0.4537581503391266, "learning_rate": 2.3489913341869193e-07, "loss": 0.5028636455535889, "step": 194 }, { "epoch": 4.767801857585139, "grad_norm": 0.5206896662712097, "learning_rate": 2.264664365249469e-07, "loss": 0.509818971157074, "step": 195 }, { "epoch": 4.792569659442725, "grad_norm": 0.5348969101905823, "learning_rate": 2.181685175319702e-07, "loss": 0.4900963306427002, "step": 196 }, { "epoch": 4.817337461300309, "grad_norm": 0.478466659784317, "learning_rate": 2.100068222414121e-07, "loss": 0.5366532802581787, "step": 197 }, { "epoch": 4.842105263157895, "grad_norm": 0.4873082637786865, "learning_rate": 2.0198277271976049e-07, "loss": 0.5138839483261108, "step": 198 }, { "epoch": 4.86687306501548, "grad_norm": 0.5307355523109436, "learning_rate": 1.9409776705056514e-07, "loss": 0.48487958312034607, "step": 199 }, { "epoch": 4.891640866873065, "grad_norm": 0.6182578206062317, "learning_rate": 1.863531790908398e-07, "loss": 0.49715912342071533, "step": 200 }, { "epoch": 4.891640866873065, "eval_accuracy": 0.8215848485329422, "eval_loss": 0.5621271133422852, "eval_runtime": 16.3624, "eval_samples_per_second": 4.156, "eval_steps_per_second": 2.078, "step": 200 }, { "epoch": 4.91640866873065, "grad_norm": 0.5110271573066711, "learning_rate": 1.787503582316864e-07, "loss": 0.5255718231201172, "step": 201 }, { "epoch": 4.9411764705882355, "grad_norm": 0.4957195222377777, "learning_rate": 1.7129062916318137e-07, "loss": 0.5106043219566345, "step": 202 }, { "epoch": 4.965944272445821, "grad_norm": 1.4632741212844849, "learning_rate": 1.6397529164356606e-07, "loss": 0.5344016551971436, "step": 203 }, { "epoch": 4.9907120743034055, "grad_norm": 0.533440113067627, "learning_rate": 1.5680562027278154e-07, "loss": 0.5215489268302917, "step": 204 }, { "epoch": 5.0, "grad_norm": 0.8572560548782349, "learning_rate": 1.49782864270386e-07, "loss": 0.5227999687194824, "step": 205 }, { "epoch": 5.024767801857585, "grad_norm": 0.43222400546073914, "learning_rate": 1.429082472578954e-07, "loss": 0.5099145174026489, "step": 206 }, { "epoch": 5.04953560371517, "grad_norm": 0.47421810030937195, "learning_rate": 1.3618296704558364e-07, "loss": 0.5271211862564087, "step": 207 }, { "epoch": 5.074303405572755, "grad_norm": 0.5383461117744446, "learning_rate": 1.2960819542378053e-07, "loss": 0.548247218132019, "step": 208 }, { "epoch": 5.099071207430341, "grad_norm": 0.513953685760498, "learning_rate": 1.2318507795870137e-07, "loss": 0.47977253794670105, "step": 209 }, { "epoch": 5.123839009287925, "grad_norm": 0.5112437605857849, "learning_rate": 1.1691473379284944e-07, "loss": 0.4924686551094055, "step": 210 }, { "epoch": 5.148606811145511, "grad_norm": 0.5439184308052063, "learning_rate": 1.1079825545001886e-07, "loss": 0.4926351308822632, "step": 211 }, { "epoch": 5.173374613003096, "grad_norm": 0.47784221172332764, "learning_rate": 1.0483670864493777e-07, "loss": 0.5255255699157715, "step": 212 }, { "epoch": 5.198142414860681, "grad_norm": 0.48372480273246765, "learning_rate": 9.903113209758096e-08, "loss": 0.5388856530189514, "step": 213 }, { "epoch": 5.222910216718266, "grad_norm": 0.4922617971897125, "learning_rate": 9.338253735218748e-08, "loss": 0.4714866280555725, "step": 214 }, { "epoch": 5.247678018575852, "grad_norm": 0.5694555044174194, "learning_rate": 8.789190860101226e-08, "loss": 0.49757862091064453, "step": 215 }, { "epoch": 5.272445820433436, "grad_norm": 0.5285799503326416, "learning_rate": 8.256020251284379e-08, "loss": 0.5523006916046143, "step": 216 }, { "epoch": 5.2972136222910216, "grad_norm": 0.542019784450531, "learning_rate": 7.73883480663171e-08, "loss": 0.4939878582954407, "step": 217 }, { "epoch": 5.321981424148607, "grad_norm": 0.4783063232898712, "learning_rate": 7.23772463880522e-08, "loss": 0.5162045359611511, "step": 218 }, { "epoch": 5.346749226006192, "grad_norm": 0.4960096776485443, "learning_rate": 6.75277705956443e-08, "loss": 0.5186662673950195, "step": 219 }, { "epoch": 5.371517027863777, "grad_norm": 0.4951794147491455, "learning_rate": 6.284076564553464e-08, "loss": 0.48755860328674316, "step": 220 }, { "epoch": 5.396284829721362, "grad_norm": 0.4898841381072998, "learning_rate": 5.831704818578842e-08, "loss": 0.5034775733947754, "step": 221 }, { "epoch": 5.421052631578947, "grad_norm": 0.540875256061554, "learning_rate": 5.395740641380531e-08, "loss": 0.4632171094417572, "step": 222 }, { "epoch": 5.445820433436532, "grad_norm": 0.45750898122787476, "learning_rate": 4.976259993898502e-08, "loss": 0.49796921014785767, "step": 223 }, { "epoch": 5.470588235294118, "grad_norm": 0.5052651166915894, "learning_rate": 4.573335965037706e-08, "loss": 0.47650158405303955, "step": 224 }, { "epoch": 5.495356037151703, "grad_norm": 0.4999431371688843, "learning_rate": 4.187038758933203e-08, "loss": 0.49834519624710083, "step": 225 }, { "epoch": 5.520123839009288, "grad_norm": 0.5175738334655762, "learning_rate": 3.817435682718095e-08, "loss": 0.46955606341362, "step": 226 }, { "epoch": 5.544891640866873, "grad_norm": 0.4690812826156616, "learning_rate": 3.464591134796135e-08, "loss": 0.5154824256896973, "step": 227 }, { "epoch": 5.569659442724459, "grad_norm": 0.4758513867855072, "learning_rate": 3.1285665936211516e-08, "loss": 0.5336707830429077, "step": 228 }, { "epoch": 5.594427244582043, "grad_norm": 0.442473441362381, "learning_rate": 2.8094206069852355e-08, "loss": 0.4967498779296875, "step": 229 }, { "epoch": 5.6191950464396285, "grad_norm": 0.4868296682834625, "learning_rate": 2.507208781817638e-08, "loss": 0.5311983823776245, "step": 230 }, { "epoch": 5.643962848297214, "grad_norm": 0.5476986169815063, "learning_rate": 2.221983774495928e-08, "loss": 0.5054424405097961, "step": 231 }, { "epoch": 5.6687306501547985, "grad_norm": 0.4974565804004669, "learning_rate": 1.953795281671333e-08, "loss": 0.5006812214851379, "step": 232 }, { "epoch": 5.693498452012384, "grad_norm": 0.5025091767311096, "learning_rate": 1.7026900316098212e-08, "loss": 0.527012825012207, "step": 233 }, { "epoch": 5.718266253869969, "grad_norm": 0.46924424171447754, "learning_rate": 1.4687117760502576e-08, "loss": 0.4735889434814453, "step": 234 }, { "epoch": 5.743034055727554, "grad_norm": 0.454560786485672, "learning_rate": 1.2519012825812803e-08, "loss": 0.49276185035705566, "step": 235 }, { "epoch": 5.767801857585139, "grad_norm": 0.4710627496242523, "learning_rate": 1.0522963275380492e-08, "loss": 0.5048189759254456, "step": 236 }, { "epoch": 5.792569659442725, "grad_norm": 0.4550038278102875, "learning_rate": 8.699316894203223e-09, "loss": 0.513171911239624, "step": 237 }, { "epoch": 5.817337461300309, "grad_norm": 0.5602344870567322, "learning_rate": 7.048391428326584e-09, "loss": 0.5195218324661255, "step": 238 }, { "epoch": 5.842105263157895, "grad_norm": 0.4764668643474579, "learning_rate": 5.570474529481561e-09, "loss": 0.49439120292663574, "step": 239 }, { "epoch": 5.86687306501548, "grad_norm": 0.7008131146430969, "learning_rate": 4.265823704965532e-09, "loss": 0.5026534795761108, "step": 240 }, { "epoch": 5.891640866873065, "grad_norm": 0.5155523419380188, "learning_rate": 3.1346662727740338e-09, "loss": 0.505569338798523, "step": 241 }, { "epoch": 5.91640866873065, "grad_norm": 0.48813626170158386, "learning_rate": 2.1771993219946718e-09, "loss": 0.4332225024700165, "step": 242 }, { "epoch": 5.9411764705882355, "grad_norm": 0.5733649134635925, "learning_rate": 1.393589678466367e-09, "loss": 0.5184577703475952, "step": 243 }, { "epoch": 5.965944272445821, "grad_norm": 0.47005656361579895, "learning_rate": 7.839738757123848e-10, "loss": 0.48927992582321167, "step": 244 }, { "epoch": 5.9907120743034055, "grad_norm": 0.519534170627594, "learning_rate": 3.484581311511414e-10, "loss": 0.5252695679664612, "step": 245 }, { "epoch": 6.0, "grad_norm": 0.8245697617530823, "learning_rate": 8.711832758934168e-11, "loss": 0.485756516456604, "step": 246 } ], "logging_steps": 1, "max_steps": 246, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 63272699183104.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }