{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9945344330716486,
  "eval_steps": 500,
  "global_step": 1257,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.023849746596442414,
      "grad_norm": 3.046875,
      "learning_rate": 2.368421052631579e-05,
      "loss": 1.8936,
      "mean_token_accuracy": 0.6058215032021205,
      "step": 10
    },
    {
      "epoch": 0.04769949319288483,
      "grad_norm": 1.7109375,
      "learning_rate": 5e-05,
      "loss": 1.5408,
      "mean_token_accuracy": 0.6348679741223653,
      "step": 20
    },
    {
      "epoch": 0.07154923978932724,
      "grad_norm": 1.6953125,
      "learning_rate": 7.631578947368422e-05,
      "loss": 1.4672,
      "mean_token_accuracy": 0.6460853961606821,
      "step": 30
    },
    {
      "epoch": 0.09539898638576966,
      "grad_norm": 1.546875,
      "learning_rate": 9.999983395259131e-05,
      "loss": 1.4522,
      "mean_token_accuracy": 0.6488149074216684,
      "step": 40
    },
    {
      "epoch": 0.11924873298221206,
      "grad_norm": 1.96875,
      "learning_rate": 9.997990959798673e-05,
      "loss": 1.431,
      "mean_token_accuracy": 0.6533399142324925,
      "step": 50
    },
    {
      "epoch": 0.14309847957865449,
      "grad_norm": 1.328125,
      "learning_rate": 9.99267909244468e-05,
      "loss": 1.4142,
      "mean_token_accuracy": 0.6570006487270196,
      "step": 60
    },
    {
      "epoch": 0.1669482261750969,
      "grad_norm": 1.6796875,
      "learning_rate": 9.984051321091072e-05,
      "loss": 1.3949,
      "mean_token_accuracy": 0.6594945053259532,
      "step": 70
    },
    {
      "epoch": 0.19079797277153931,
      "grad_norm": 1.4609375,
      "learning_rate": 9.972113375900154e-05,
      "loss": 1.3968,
      "mean_token_accuracy": 0.6598712412019571,
      "step": 80
    },
    {
      "epoch": 0.21464771936798172,
      "grad_norm": 1.7109375,
      "learning_rate": 9.956873185496916e-05,
      "loss": 1.387,
      "mean_token_accuracy": 0.6636203820506732,
      "step": 90
    },
    {
      "epoch": 0.23849746596442412,
      "grad_norm": 1.421875,
      "learning_rate": 9.9383408717032e-05,
      "loss": 1.3872,
      "mean_token_accuracy": 0.6624753788113594,
      "step": 100
    },
    {
      "epoch": 0.2623472125608665,
      "grad_norm": 1.171875,
      "learning_rate": 9.916528742815276e-05,
      "loss": 1.3453,
      "mean_token_accuracy": 0.6708123281598091,
      "step": 110
    },
    {
      "epoch": 0.28619695915730897,
      "grad_norm": 1.4921875,
      "learning_rate": 9.891451285429231e-05,
      "loss": 1.3512,
      "mean_token_accuracy": 0.6688777978221575,
      "step": 120
    },
    {
      "epoch": 0.3100467057537514,
      "grad_norm": 1.1171875,
      "learning_rate": 9.863125154819663e-05,
      "loss": 1.3764,
      "mean_token_accuracy": 0.6649044439196586,
      "step": 130
    },
    {
      "epoch": 0.3338964523501938,
      "grad_norm": 1.1875,
      "learning_rate": 9.831569163878015e-05,
      "loss": 1.3456,
      "mean_token_accuracy": 0.6716934656103452,
      "step": 140
    },
    {
      "epoch": 0.3577461989466362,
      "grad_norm": 1.2578125,
      "learning_rate": 9.796804270617937e-05,
      "loss": 1.3439,
      "mean_token_accuracy": 0.6696053599317868,
      "step": 150
    },
    {
      "epoch": 0.38159594554307863,
      "grad_norm": 1.0390625,
      "learning_rate": 9.758853564255954e-05,
      "loss": 1.3524,
      "mean_token_accuracy": 0.6700724132359028,
      "step": 160
    },
    {
      "epoch": 0.40544569213952103,
      "grad_norm": 1.1640625,
      "learning_rate": 9.71774224987669e-05,
      "loss": 1.3421,
      "mean_token_accuracy": 0.6707797152300675,
      "step": 170
    },
    {
      "epoch": 0.42929543873596343,
      "grad_norm": 1.0546875,
      "learning_rate": 9.673497631692821e-05,
      "loss": 1.3112,
      "mean_token_accuracy": 0.6768868687252204,
      "step": 180
    },
    {
      "epoch": 0.45314518533240583,
      "grad_norm": 1.0546875,
      "learning_rate": 9.626149094910905e-05,
      "loss": 1.3133,
      "mean_token_accuracy": 0.675601539760828,
      "step": 190
    },
    {
      "epoch": 0.47699493192884823,
      "grad_norm": 1.0703125,
      "learning_rate": 9.575728086215092e-05,
      "loss": 1.3186,
      "mean_token_accuracy": 0.6764184221625328,
      "step": 200
    },
    {
      "epoch": 0.5008446785252907,
      "grad_norm": 1.0625,
      "learning_rate": 9.522268092881716e-05,
      "loss": 1.34,
      "mean_token_accuracy": 0.6720153917868932,
      "step": 210
    },
    {
      "epoch": 0.524694425121733,
      "grad_norm": 1.2421875,
      "learning_rate": 9.465804620538601e-05,
      "loss": 1.3072,
      "mean_token_accuracy": 0.6793769620358944,
      "step": 220
    },
    {
      "epoch": 0.5485441717181755,
      "grad_norm": 1.109375,
      "learning_rate": 9.406375169583893e-05,
      "loss": 1.3028,
      "mean_token_accuracy": 0.6790211101373037,
      "step": 230
    },
    {
      "epoch": 0.5723939183146179,
      "grad_norm": 0.98828125,
      "learning_rate": 9.34401921028004e-05,
      "loss": 1.319,
      "mean_token_accuracy": 0.6775049904982249,
      "step": 240
    },
    {
      "epoch": 0.5962436649110603,
      "grad_norm": 0.95703125,
      "learning_rate": 9.278778156539502e-05,
      "loss": 1.2948,
      "mean_token_accuracy": 0.6797074491779009,
      "step": 250
    },
    {
      "epoch": 0.6200934115075027,
      "grad_norm": 0.91796875,
      "learning_rate": 9.210695338419553e-05,
      "loss": 1.2894,
      "mean_token_accuracy": 0.6807446469863255,
      "step": 260
    },
    {
      "epoch": 0.6439431581039451,
      "grad_norm": 0.9140625,
      "learning_rate": 9.139815973344493e-05,
      "loss": 1.3118,
      "mean_token_accuracy": 0.6776151955127716,
      "step": 270
    },
    {
      "epoch": 0.6677929047003875,
      "grad_norm": 1.03125,
      "learning_rate": 9.066187136074344e-05,
      "loss": 1.2922,
      "mean_token_accuracy": 0.6806117855012417,
      "step": 280
    },
    {
      "epoch": 0.69164265129683,
      "grad_norm": 0.98828125,
      "learning_rate": 8.989857727439994e-05,
      "loss": 1.2821,
      "mean_token_accuracy": 0.6829586572945118,
      "step": 290
    },
    {
      "epoch": 0.7154923978932723,
      "grad_norm": 1.0078125,
      "learning_rate": 8.91087844186556e-05,
      "loss": 1.2876,
      "mean_token_accuracy": 0.6828203638394673,
      "step": 300
    },
    {
      "epoch": 0.7393421444897148,
      "grad_norm": 0.94140625,
      "learning_rate": 8.829301733699511e-05,
      "loss": 1.2681,
      "mean_token_accuracy": 0.6843130300442378,
      "step": 310
    },
    {
      "epoch": 0.7631918910861573,
      "grad_norm": 1.015625,
      "learning_rate": 8.745181782376955e-05,
      "loss": 1.2828,
      "mean_token_accuracy": 0.6828058943152427,
      "step": 320
    },
    {
      "epoch": 0.7870416376825996,
      "grad_norm": 0.92578125,
      "learning_rate": 8.658574456436177e-05,
      "loss": 1.2724,
      "mean_token_accuracy": 0.6842800090710323,
      "step": 330
    },
    {
      "epoch": 0.8108913842790421,
      "grad_norm": 0.90234375,
      "learning_rate": 8.56953727641338e-05,
      "loss": 1.2904,
      "mean_token_accuracy": 0.6812497737507025,
      "step": 340
    },
    {
      "epoch": 0.8347411308754844,
      "grad_norm": 0.89453125,
      "learning_rate": 8.47812937664023e-05,
      "loss": 1.2815,
      "mean_token_accuracy": 0.685196939855814,
      "step": 350
    },
    {
      "epoch": 0.8585908774719269,
      "grad_norm": 0.95703125,
      "learning_rate": 8.384411465969597e-05,
      "loss": 1.263,
      "mean_token_accuracy": 0.6859518033762773,
      "step": 360
    },
    {
      "epoch": 0.8824406240683693,
      "grad_norm": 0.8828125,
      "learning_rate": 8.288445787455579e-05,
      "loss": 1.265,
      "mean_token_accuracy": 0.6860440624256928,
      "step": 370
    },
    {
      "epoch": 0.9062903706648117,
      "grad_norm": 0.90234375,
      "learning_rate": 8.190296077014563e-05,
      "loss": 1.277,
      "mean_token_accuracy": 0.6848570423821608,
      "step": 380
    },
    {
      "epoch": 0.9301401172612541,
      "grad_norm": 1.0078125,
      "learning_rate": 8.090027521094826e-05,
      "loss": 1.2564,
      "mean_token_accuracy": 0.6886172123253346,
      "step": 390
    },
    {
      "epoch": 0.9539898638576965,
      "grad_norm": 0.93359375,
      "learning_rate": 7.987706713382735e-05,
      "loss": 1.2191,
      "mean_token_accuracy": 0.694924907386303,
      "step": 400
    },
    {
      "epoch": 0.9778396104541389,
      "grad_norm": 0.921875,
      "learning_rate": 7.883401610574336e-05,
      "loss": 1.2559,
      "mean_token_accuracy": 0.6870772863427798,
      "step": 410
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.40625,
      "learning_rate": 7.777181487241691e-05,
      "loss": 1.2608,
      "mean_token_accuracy": 0.6875777412957674,
      "step": 420
    },
    {
      "epoch": 1.0238497465964425,
      "grad_norm": 0.95703125,
      "learning_rate": 7.669116889823955e-05,
      "loss": 1.0743,
      "mean_token_accuracy": 0.7243270235757033,
      "step": 430
    },
    {
      "epoch": 1.047699493192885,
      "grad_norm": 0.97265625,
      "learning_rate": 7.559279589773715e-05,
      "loss": 1.056,
      "mean_token_accuracy": 0.7269702054560184,
      "step": 440
    },
    {
      "epoch": 1.0715492397893271,
      "grad_norm": 1.0078125,
      "learning_rate": 7.447742535889768e-05,
      "loss": 1.0525,
      "mean_token_accuracy": 0.7275599343081315,
      "step": 450
    },
    {
      "epoch": 1.0953989863857696,
      "grad_norm": 0.98828125,
      "learning_rate": 7.334579805867918e-05,
      "loss": 1.0496,
      "mean_token_accuracy": 0.7283496469259262,
      "step": 460
    },
    {
      "epoch": 1.119248732982212,
      "grad_norm": 0.9765625,
      "learning_rate": 7.219866557102051e-05,
      "loss": 1.0534,
      "mean_token_accuracy": 0.7267930120229721,
      "step": 470
    },
    {
      "epoch": 1.1430984795786545,
      "grad_norm": 1.015625,
      "learning_rate": 7.103678976768102e-05,
      "loss": 1.0539,
      "mean_token_accuracy": 0.7286298322180907,
      "step": 480
    },
    {
      "epoch": 1.166948226175097,
      "grad_norm": 0.9921875,
      "learning_rate": 6.986094231224089e-05,
      "loss": 1.0589,
      "mean_token_accuracy": 0.7247064632674058,
      "step": 490
    },
    {
      "epoch": 1.1907979727715392,
      "grad_norm": 1.015625,
      "learning_rate": 6.867190414759835e-05,
      "loss": 1.0528,
      "mean_token_accuracy": 0.7296631435553232,
      "step": 500
    },
    {
      "epoch": 1.2146477193679817,
      "grad_norm": 0.91015625,
      "learning_rate": 6.747046497730395e-05,
      "loss": 1.067,
      "mean_token_accuracy": 0.7264716491103173,
      "step": 510
    },
    {
      "epoch": 1.2384974659644241,
      "grad_norm": 0.96484375,
      "learning_rate": 6.625742274107625e-05,
      "loss": 1.0602,
      "mean_token_accuracy": 0.7274638287723064,
      "step": 520
    },
    {
      "epoch": 1.2623472125608666,
      "grad_norm": 0.91015625,
      "learning_rate": 6.503358308484771e-05,
      "loss": 1.0547,
      "mean_token_accuracy": 0.7279857188463211,
      "step": 530
    },
    {
      "epoch": 1.286196959157309,
      "grad_norm": 1.0,
      "learning_rate": 6.379975882569224e-05,
      "loss": 1.0496,
      "mean_token_accuracy": 0.7278440314034621,
      "step": 540
    },
    {
      "epoch": 1.3100467057537513,
      "grad_norm": 0.9375,
      "learning_rate": 6.255676941199018e-05,
      "loss": 1.0517,
      "mean_token_accuracy": 0.7278454805413882,
      "step": 550
    },
    {
      "epoch": 1.3338964523501937,
      "grad_norm": 0.87109375,
      "learning_rate": 6.130544037918894e-05,
      "loss": 1.0465,
      "mean_token_accuracy": 0.7296144522726535,
      "step": 560
    },
    {
      "epoch": 1.3577461989466362,
      "grad_norm": 0.99609375,
      "learning_rate": 6.004660280152096e-05,
      "loss": 1.0441,
      "mean_token_accuracy": 0.7295484840869904,
      "step": 570
    },
    {
      "epoch": 1.3815959455430786,
      "grad_norm": 1.2890625,
      "learning_rate": 5.878109274004314e-05,
      "loss": 1.0399,
      "mean_token_accuracy": 0.7302657755712668,
      "step": 580
    },
    {
      "epoch": 1.405445692139521,
      "grad_norm": 0.93359375,
      "learning_rate": 5.7509750687364105e-05,
      "loss": 1.0245,
      "mean_token_accuracy": 0.7333451308310032,
      "step": 590
    },
    {
      "epoch": 1.4292954387359633,
      "grad_norm": 0.921875,
      "learning_rate": 5.62334210094283e-05,
      "loss": 1.0438,
      "mean_token_accuracy": 0.7301900732020538,
      "step": 600
    },
    {
      "epoch": 1.4531451853324058,
      "grad_norm": 1.0,
      "learning_rate": 5.495295138472769e-05,
      "loss": 1.0512,
      "mean_token_accuracy": 0.7292244640489419,
      "step": 610
    },
    {
      "epoch": 1.4769949319288482,
      "grad_norm": 0.94921875,
      "learning_rate": 5.366919224131331e-05,
      "loss": 1.0391,
      "mean_token_accuracy": 0.7299602496127288,
      "step": 620
    },
    {
      "epoch": 1.5008446785252907,
      "grad_norm": 0.9140625,
      "learning_rate": 5.238299619198066e-05,
      "loss": 1.048,
      "mean_token_accuracy": 0.7292950809001922,
      "step": 630
    },
    {
      "epoch": 1.5246944251217331,
      "grad_norm": 0.98828125,
      "learning_rate": 5.109521746800426e-05,
      "loss": 1.0443,
      "mean_token_accuracy": 0.729659582922856,
      "step": 640
    },
    {
      "epoch": 1.5485441717181754,
      "grad_norm": 0.90234375,
      "learning_rate": 4.980671135179708e-05,
      "loss": 1.0539,
      "mean_token_accuracy": 0.7284331674377124,
      "step": 650
    },
    {
      "epoch": 1.572393918314618,
      "grad_norm": 0.9453125,
      "learning_rate": 4.851833360887201e-05,
      "loss": 1.0498,
      "mean_token_accuracy": 0.7298707855244477,
      "step": 660
    },
    {
      "epoch": 1.5962436649110603,
      "grad_norm": 0.9296875,
      "learning_rate": 4.723093991948245e-05,
      "loss": 1.0226,
      "mean_token_accuracy": 0.7343422142167886,
      "step": 670
    },
    {
      "epoch": 1.6200934115075027,
      "grad_norm": 0.90625,
      "learning_rate": 4.5945385310319344e-05,
      "loss": 1.046,
      "mean_token_accuracy": 0.7300175433357556,
      "step": 680
    },
    {
      "epoch": 1.6439431581039452,
      "grad_norm": 0.91796875,
      "learning_rate": 4.466252358664269e-05,
      "loss": 1.0453,
      "mean_token_accuracy": 0.730091076095899,
      "step": 690
    },
    {
      "epoch": 1.6677929047003874,
      "grad_norm": 0.9453125,
      "learning_rate": 4.338320676522374e-05,
      "loss": 1.0366,
      "mean_token_accuracy": 0.732094819098711,
      "step": 700
    },
    {
      "epoch": 1.6916426512968301,
      "grad_norm": 0.87109375,
      "learning_rate": 4.210828450847551e-05,
      "loss": 1.0202,
      "mean_token_accuracy": 0.7349480445186297,
      "step": 710
    },
    {
      "epoch": 1.7154923978932723,
      "grad_norm": 0.890625,
      "learning_rate": 4.08386035601466e-05,
      "loss": 1.0287,
      "mean_token_accuracy": 0.7352750234305858,
      "step": 720
    },
    {
      "epoch": 1.7393421444897148,
      "grad_norm": 0.87890625,
      "learning_rate": 3.9575007182953633e-05,
      "loss": 1.0381,
      "mean_token_accuracy": 0.7321987241506577,
      "step": 730
    },
    {
      "epoch": 1.7631918910861573,
      "grad_norm": 1.015625,
      "learning_rate": 3.8318334598525625e-05,
      "loss": 1.0284,
      "mean_token_accuracy": 0.7348339579999447,
      "step": 740
    },
    {
      "epoch": 1.7870416376825995,
      "grad_norm": 0.95703125,
      "learning_rate": 3.706942043003215e-05,
      "loss": 1.0314,
      "mean_token_accuracy": 0.7322920034329097,
      "step": 750
    },
    {
      "epoch": 1.8108913842790422,
      "grad_norm": 0.859375,
      "learning_rate": 3.582909414786576e-05,
      "loss": 1.0316,
      "mean_token_accuracy": 0.7317293718457222,
      "step": 760
    },
    {
      "epoch": 1.8347411308754844,
      "grad_norm": 0.8984375,
      "learning_rate": 3.4598179518746375e-05,
      "loss": 1.0057,
      "mean_token_accuracy": 0.7380195428927739,
      "step": 770
    },
    {
      "epoch": 1.8585908774719269,
      "grad_norm": 0.8515625,
      "learning_rate": 3.3377494058614145e-05,
      "loss": 1.02,
      "mean_token_accuracy": 0.7366910368204117,
      "step": 780
    },
    {
      "epoch": 1.8824406240683693,
      "grad_norm": 0.8984375,
      "learning_rate": 3.216784848967341e-05,
      "loss": 1.0053,
      "mean_token_accuracy": 0.7390421097477277,
      "step": 790
    },
    {
      "epoch": 1.9062903706648116,
      "grad_norm": 0.8671875,
      "learning_rate": 3.097004620194902e-05,
      "loss": 1.0249,
      "mean_token_accuracy": 0.7343820112446944,
      "step": 800
    },
    {
      "epoch": 1.9301401172612542,
      "grad_norm": 0.8828125,
      "learning_rate": 2.9784882719712248e-05,
      "loss": 1.023,
      "mean_token_accuracy": 0.7356185607612133,
      "step": 810
    },
    {
      "epoch": 1.9539898638576965,
      "grad_norm": 0.9453125,
      "learning_rate": 2.861314517313071e-05,
      "loss": 1.0223,
      "mean_token_accuracy": 0.736139855782191,
      "step": 820
    },
    {
      "epoch": 1.977839610454139,
      "grad_norm": 0.890625,
      "learning_rate": 2.7455611775493363e-05,
      "loss": 1.0058,
      "mean_token_accuracy": 0.7381812597314517,
      "step": 830
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.6796875,
      "learning_rate": 2.6313051306357583e-05,
      "loss": 1.0083,
      "mean_token_accuracy": 0.7398098124516919,
      "step": 840
    },
    {
      "epoch": 2.0238497465964422,
      "grad_norm": 1.2578125,
      "learning_rate": 2.5186222600961785e-05,
      "loss": 0.8511,
      "mean_token_accuracy": 0.7758485838770867,
      "step": 850
    },
    {
      "epoch": 2.047699493192885,
      "grad_norm": 0.9453125,
      "learning_rate": 2.407587404624253e-05,
      "loss": 0.8605,
      "mean_token_accuracy": 0.7730280997852484,
      "step": 860
    },
    {
      "epoch": 2.071549239789327,
      "grad_norm": 1.0703125,
      "learning_rate": 2.2982743083790858e-05,
      "loss": 0.843,
      "mean_token_accuracy": 0.7769588189820449,
      "step": 870
    },
    {
      "epoch": 2.09539898638577,
      "grad_norm": 1.0390625,
      "learning_rate": 2.1907555720078115e-05,
      "loss": 0.8554,
      "mean_token_accuracy": 0.7744472898542881,
      "step": 880
    },
    {
      "epoch": 2.119248732982212,
      "grad_norm": 0.9296875,
      "learning_rate": 2.0851026044276406e-05,
      "loss": 0.8534,
      "mean_token_accuracy": 0.7748738661408424,
      "step": 890
    },
    {
      "epoch": 2.1430984795786543,
      "grad_norm": 0.94140625,
      "learning_rate": 1.9813855753993914e-05,
      "loss": 0.8549,
      "mean_token_accuracy": 0.7733134960134824,
      "step": 900
    },
    {
      "epoch": 2.166948226175097,
      "grad_norm": 0.9765625,
      "learning_rate": 1.879673368924025e-05,
      "loss": 0.8385,
      "mean_token_accuracy": 0.7773586166401704,
      "step": 910
    },
    {
      "epoch": 2.190797972771539,
      "grad_norm": 0.9375,
      "learning_rate": 1.7800335374930998e-05,
      "loss": 0.8688,
      "mean_token_accuracy": 0.7720932170748711,
      "step": 920
    },
    {
      "epoch": 2.214647719367982,
      "grad_norm": 0.9765625,
      "learning_rate": 1.6825322572235767e-05,
      "loss": 0.8432,
      "mean_token_accuracy": 0.7758201735715071,
      "step": 930
    },
    {
      "epoch": 2.238497465964424,
      "grad_norm": 0.93359375,
      "learning_rate": 1.5872342839067306e-05,
      "loss": 0.8467,
      "mean_token_accuracy": 0.7756141935785611,
      "step": 940
    },
    {
      "epoch": 2.2623472125608663,
      "grad_norm": 0.9609375,
      "learning_rate": 1.4942029100003808e-05,
      "loss": 0.8615,
      "mean_token_accuracy": 0.7729161617656549,
      "step": 950
    },
    {
      "epoch": 2.286196959157309,
      "grad_norm": 0.9296875,
      "learning_rate": 1.4034999225930107e-05,
      "loss": 0.8604,
      "mean_token_accuracy": 0.7723979363838832,
      "step": 960
    },
    {
      "epoch": 2.3100467057537513,
      "grad_norm": 0.97265625,
      "learning_rate": 1.3151855623676517e-05,
      "loss": 0.8512,
      "mean_token_accuracy": 0.7750633083283901,
      "step": 970
    },
    {
      "epoch": 2.333896452350194,
      "grad_norm": 0.921875,
      "learning_rate": 1.2293184835928639e-05,
      "loss": 0.8555,
      "mean_token_accuracy": 0.7741216157873472,
      "step": 980
    },
    {
      "epoch": 2.357746198946636,
      "grad_norm": 0.96875,
      "learning_rate": 1.1459557151672933e-05,
      "loss": 0.8452,
      "mean_token_accuracy": 0.7758361441393693,
      "step": 990
    },
    {
      "epoch": 2.3815959455430784,
      "grad_norm": 0.92578125,
      "learning_rate": 1.065152622743753e-05,
      "loss": 0.8344,
      "mean_token_accuracy": 0.7779088502128919,
      "step": 1000
    },
    {
      "epoch": 2.405445692139521,
      "grad_norm": 0.95703125,
      "learning_rate": 9.869628719579448e-06,
      "loss": 0.8474,
      "mean_token_accuracy": 0.7763092644512654,
      "step": 1010
    },
    {
      "epoch": 2.4292954387359633,
      "grad_norm": 0.921875,
      "learning_rate": 9.114383927862502e-06,
      "loss": 0.8307,
      "mean_token_accuracy": 0.7789155239860217,
      "step": 1020
    },
    {
      "epoch": 2.453145185332406,
      "grad_norm": 0.9453125,
      "learning_rate": 8.386293450562755e-06,
      "loss": 0.8316,
      "mean_token_accuracy": 0.7797198290626208,
      "step": 1030
    },
    {
      "epoch": 2.4769949319288482,
      "grad_norm": 0.9296875,
      "learning_rate": 7.685840851330296e-06,
      "loss": 0.8276,
      "mean_token_accuracy": 0.7800254955887794,
      "step": 1040
    },
    {
      "epoch": 2.500844678525291,
      "grad_norm": 0.94921875,
      "learning_rate": 7.013491338028938e-06,
      "loss": 0.858,
      "mean_token_accuracy": 0.7726903068522613,
      "step": 1050
    },
    {
      "epoch": 2.524694425121733,
      "grad_norm": 0.921875,
      "learning_rate": 6.3696914537669184e-06,
      "loss": 0.8433,
      "mean_token_accuracy": 0.7763248644769192,
      "step": 1060
    },
    {
      "epoch": 2.5485441717181754,
      "grad_norm": 0.953125,
      "learning_rate": 5.7548687803237915e-06,
      "loss": 0.8508,
      "mean_token_accuracy": 0.7756962545216084,
      "step": 1070
    },
    {
      "epoch": 2.572393918314618,
      "grad_norm": 0.9453125,
      "learning_rate": 5.16943165417067e-06,
      "loss": 0.8344,
      "mean_token_accuracy": 0.7778078898787498,
      "step": 1080
    },
    {
      "epoch": 2.5962436649110603,
      "grad_norm": 0.91796875,
      "learning_rate": 4.613768895272181e-06,
      "loss": 0.8511,
      "mean_token_accuracy": 0.7750198910633723,
      "step": 1090
    },
    {
      "epoch": 2.6200934115075025,
      "grad_norm": 0.92578125,
      "learning_rate": 4.088249548850442e-06,
      "loss": 0.8429,
      "mean_token_accuracy": 0.7772863209247589,
      "step": 1100
    },
    {
      "epoch": 2.643943158103945,
      "grad_norm": 0.98046875,
      "learning_rate": 3.5932226402824954e-06,
      "loss": 0.8517,
      "mean_token_accuracy": 0.7759914328654607,
      "step": 1110
    },
    {
      "epoch": 2.6677929047003874,
      "grad_norm": 0.95703125,
      "learning_rate": 3.1290169432939553e-06,
      "loss": 0.8544,
      "mean_token_accuracy": 0.7765047900378704,
      "step": 1120
    },
    {
      "epoch": 2.69164265129683,
      "grad_norm": 0.94140625,
      "learning_rate": 2.6959407616028997e-06,
      "loss": 0.8327,
      "mean_token_accuracy": 0.778083398193121,
      "step": 1130
    },
    {
      "epoch": 2.7154923978932723,
      "grad_norm": 0.953125,
      "learning_rate": 2.294281724158892e-06,
      "loss": 0.8517,
      "mean_token_accuracy": 0.7756194410224756,
      "step": 1140
    },
    {
      "epoch": 2.739342144489715,
      "grad_norm": 1.5390625,
      "learning_rate": 1.9243065941133787e-06,
      "loss": 0.8394,
      "mean_token_accuracy": 0.7775321317215761,
      "step": 1150
    },
    {
      "epoch": 2.7631918910861573,
      "grad_norm": 0.91796875,
      "learning_rate": 1.5862610916479893e-06,
      "loss": 0.8432,
      "mean_token_accuracy": 0.7762146083017191,
      "step": 1160
    },
    {
      "epoch": 2.7870416376825995,
      "grad_norm": 0.91796875,
      "learning_rate": 1.280369730778741e-06,
      "loss": 0.8434,
      "mean_token_accuracy": 0.7769906096160412,
      "step": 1170
    },
    {
      "epoch": 2.810891384279042,
      "grad_norm": 1.1796875,
      "learning_rate": 1.0068356702442827e-06,
      "loss": 0.8558,
      "mean_token_accuracy": 0.7744560035566489,
      "step": 1180
    },
    {
      "epoch": 2.8347411308754844,
      "grad_norm": 0.93359375,
      "learning_rate": 7.658405785773926e-07,
      "loss": 0.8373,
      "mean_token_accuracy": 0.7771262871722381,
      "step": 1190
    },
    {
      "epoch": 2.8585908774719266,
      "grad_norm": 0.953125,
      "learning_rate": 5.575445134492374e-07,
      "loss": 0.8336,
      "mean_token_accuracy": 0.7793849507967631,
      "step": 1200
    },
    {
      "epoch": 2.8824406240683693,
      "grad_norm": 0.94921875,
      "learning_rate": 3.8208581536650635e-07,
      "loss": 0.8291,
      "mean_token_accuracy": 0.7792126409709453,
      "step": 1210
    },
    {
      "epoch": 2.9062903706648116,
      "grad_norm": 0.90234375,
      "learning_rate": 2.395810157921674e-07,
      "loss": 0.832,
      "mean_token_accuracy": 0.7793585898975531,
      "step": 1220
    },
    {
      "epoch": 2.9301401172612542,
      "grad_norm": 0.94140625,
      "learning_rate": 1.3012475975068828e-07,
      "loss": 0.8489,
      "mean_token_accuracy": 0.7747443556785584,
      "step": 1230
    },
    {
      "epoch": 2.9539898638576965,
      "grad_norm": 0.94921875,
      "learning_rate": 5.378974296924577e-08,
      "loss": 0.829,
      "mean_token_accuracy": 0.7786815270781517,
      "step": 1240
    },
    {
      "epoch": 2.977839610454139,
      "grad_norm": 1.0234375,
      "learning_rate": 1.0626663596624564e-08,
      "loss": 0.8445,
      "mean_token_accuracy": 0.7772307982047398,
      "step": 1250
    }
  ],
  "logging_steps": 10,
  "max_steps": 1257,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.602088301310771e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}