{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 112, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018018018018018018, "grad_norm": 4.300365447998047, "learning_rate": 1e-05, "loss": 0.6205, "step": 1 }, { "epoch": 0.036036036036036036, "grad_norm": 2.4697682857513428, "learning_rate": 9.999921320324328e-06, "loss": 0.4825, "step": 2 }, { "epoch": 0.05405405405405406, "grad_norm": 1.6707440614700317, "learning_rate": 9.999685283773504e-06, "loss": 0.4016, "step": 3 }, { "epoch": 0.07207207207207207, "grad_norm": 1.063056468963623, "learning_rate": 9.999291897776043e-06, "loss": 0.3491, "step": 4 }, { "epoch": 0.09009009009009009, "grad_norm": 1.9208970069885254, "learning_rate": 9.998741174712534e-06, "loss": 0.3424, "step": 5 }, { "epoch": 0.10810810810810811, "grad_norm": 1.2284053564071655, "learning_rate": 9.998033131915266e-06, "loss": 0.3004, "step": 6 }, { "epoch": 0.12612612612612611, "grad_norm": 0.9112282395362854, "learning_rate": 9.997167791667668e-06, "loss": 0.2775, "step": 7 }, { "epoch": 0.14414414414414414, "grad_norm": 0.9811397790908813, "learning_rate": 9.996145181203616e-06, "loss": 0.2732, "step": 8 }, { "epoch": 0.16216216216216217, "grad_norm": 0.813277542591095, "learning_rate": 9.994965332706574e-06, "loss": 0.2549, "step": 9 }, { "epoch": 0.18018018018018017, "grad_norm": 0.7180718183517456, "learning_rate": 9.993628283308582e-06, "loss": 0.2494, "step": 10 }, { "epoch": 0.1981981981981982, "grad_norm": 0.6469175815582275, "learning_rate": 9.992134075089085e-06, "loss": 0.2428, "step": 11 }, { "epoch": 0.21621621621621623, "grad_norm": 0.6054768562316895, "learning_rate": 9.990482755073607e-06, "loss": 0.2335, "step": 12 }, { "epoch": 0.23423423423423423, "grad_norm": 0.6253430843353271, "learning_rate": 9.98867437523228e-06, "loss": 0.2473, "step": 13 }, { "epoch": 0.25225225225225223, "grad_norm": 0.5254931449890137, "learning_rate": 9.986708992478202e-06, "loss": 0.2345, "step": 14 }, { "epoch": 0.2702702702702703, "grad_norm": 0.47010546922683716, "learning_rate": 9.984586668665641e-06, "loss": 0.2191, "step": 15 }, { "epoch": 0.2882882882882883, "grad_norm": 0.5132431983947754, "learning_rate": 9.982307470588097e-06, "loss": 0.2323, "step": 16 }, { "epoch": 0.3063063063063063, "grad_norm": 0.48005616664886475, "learning_rate": 9.979871469976197e-06, "loss": 0.206, "step": 17 }, { "epoch": 0.32432432432432434, "grad_norm": 0.49143457412719727, "learning_rate": 9.977278743495434e-06, "loss": 0.205, "step": 18 }, { "epoch": 0.34234234234234234, "grad_norm": 0.429962158203125, "learning_rate": 9.974529372743762e-06, "loss": 0.2136, "step": 19 }, { "epoch": 0.36036036036036034, "grad_norm": 0.43287697434425354, "learning_rate": 9.97162344424902e-06, "loss": 0.1981, "step": 20 }, { "epoch": 0.3783783783783784, "grad_norm": 0.4126758873462677, "learning_rate": 9.968561049466214e-06, "loss": 0.2005, "step": 21 }, { "epoch": 0.3963963963963964, "grad_norm": 0.41148409247398376, "learning_rate": 9.965342284774633e-06, "loss": 0.1896, "step": 22 }, { "epoch": 0.4144144144144144, "grad_norm": 0.40187039971351624, "learning_rate": 9.961967251474823e-06, "loss": 0.1869, "step": 23 }, { "epoch": 0.43243243243243246, "grad_norm": 0.4011826813220978, "learning_rate": 9.958436055785391e-06, "loss": 0.1954, "step": 24 }, { "epoch": 0.45045045045045046, "grad_norm": 0.3922724723815918, "learning_rate": 9.954748808839675e-06, "loss": 0.196, "step": 25 }, { "epoch": 0.46846846846846846, "grad_norm": 0.3767087459564209, "learning_rate": 9.950905626682229e-06, "loss": 0.1885, "step": 26 }, { "epoch": 0.4864864864864865, "grad_norm": 0.38024166226387024, "learning_rate": 9.946906630265184e-06, "loss": 0.2059, "step": 27 }, { "epoch": 0.5045045045045045, "grad_norm": 0.35347816348075867, "learning_rate": 9.942751945444437e-06, "loss": 0.1933, "step": 28 }, { "epoch": 0.5225225225225225, "grad_norm": 0.36196964979171753, "learning_rate": 9.938441702975689e-06, "loss": 0.184, "step": 29 }, { "epoch": 0.5405405405405406, "grad_norm": 0.3916337192058563, "learning_rate": 9.933976038510334e-06, "loss": 0.1876, "step": 30 }, { "epoch": 0.5585585585585585, "grad_norm": 0.35248634219169617, "learning_rate": 9.92935509259118e-06, "loss": 0.1831, "step": 31 }, { "epoch": 0.5765765765765766, "grad_norm": 0.3620535433292389, "learning_rate": 9.924579010648042e-06, "loss": 0.1817, "step": 32 }, { "epoch": 0.5945945945945946, "grad_norm": 0.36167603731155396, "learning_rate": 9.91964794299315e-06, "loss": 0.1769, "step": 33 }, { "epoch": 0.6126126126126126, "grad_norm": 0.35638344287872314, "learning_rate": 9.914562044816424e-06, "loss": 0.1765, "step": 34 }, { "epoch": 0.6306306306306306, "grad_norm": 0.35270994901657104, "learning_rate": 9.909321476180594e-06, "loss": 0.1786, "step": 35 }, { "epoch": 0.6486486486486487, "grad_norm": 0.362835168838501, "learning_rate": 9.903926402016153e-06, "loss": 0.1784, "step": 36 }, { "epoch": 0.6666666666666666, "grad_norm": 0.3634340465068817, "learning_rate": 9.898376992116179e-06, "loss": 0.1794, "step": 37 }, { "epoch": 0.6846846846846847, "grad_norm": 0.3652496337890625, "learning_rate": 9.892673421130979e-06, "loss": 0.1714, "step": 38 }, { "epoch": 0.7027027027027027, "grad_norm": 0.34574374556541443, "learning_rate": 9.886815868562596e-06, "loss": 0.1658, "step": 39 }, { "epoch": 0.7207207207207207, "grad_norm": 0.3675323724746704, "learning_rate": 9.88080451875917e-06, "loss": 0.1766, "step": 40 }, { "epoch": 0.7387387387387387, "grad_norm": 0.33044207096099854, "learning_rate": 9.874639560909118e-06, "loss": 0.1814, "step": 41 }, { "epoch": 0.7567567567567568, "grad_norm": 0.3569028079509735, "learning_rate": 9.868321189035196e-06, "loss": 0.1727, "step": 42 }, { "epoch": 0.7747747747747747, "grad_norm": 0.34842026233673096, "learning_rate": 9.861849601988384e-06, "loss": 0.1683, "step": 43 }, { "epoch": 0.7927927927927928, "grad_norm": 0.35948798060417175, "learning_rate": 9.855225003441629e-06, "loss": 0.1719, "step": 44 }, { "epoch": 0.8108108108108109, "grad_norm": 0.3589368164539337, "learning_rate": 9.848447601883436e-06, "loss": 0.1659, "step": 45 }, { "epoch": 0.8288288288288288, "grad_norm": 0.3669843077659607, "learning_rate": 9.841517610611309e-06, "loss": 0.1703, "step": 46 }, { "epoch": 0.8468468468468469, "grad_norm": 0.34229257702827454, "learning_rate": 9.834435247725032e-06, "loss": 0.1766, "step": 47 }, { "epoch": 0.8648648648648649, "grad_norm": 0.3512667417526245, "learning_rate": 9.827200736119815e-06, "loss": 0.1643, "step": 48 }, { "epoch": 0.8828828828828829, "grad_norm": 0.3597641885280609, "learning_rate": 9.819814303479268e-06, "loss": 0.169, "step": 49 }, { "epoch": 0.9009009009009009, "grad_norm": 0.4169813096523285, "learning_rate": 9.812276182268236e-06, "loss": 0.1706, "step": 50 }, { "epoch": 0.918918918918919, "grad_norm": 0.35649940371513367, "learning_rate": 9.804586609725499e-06, "loss": 0.172, "step": 51 }, { "epoch": 0.9369369369369369, "grad_norm": 0.38562002778053284, "learning_rate": 9.79674582785628e-06, "loss": 0.1554, "step": 52 }, { "epoch": 0.954954954954955, "grad_norm": 0.3706550896167755, "learning_rate": 9.788754083424654e-06, "loss": 0.1619, "step": 53 }, { "epoch": 0.972972972972973, "grad_norm": 0.3817874789237976, "learning_rate": 9.78061162794576e-06, "loss": 0.1563, "step": 54 }, { "epoch": 0.990990990990991, "grad_norm": 0.3631809949874878, "learning_rate": 9.772318717677905e-06, "loss": 0.1686, "step": 55 }, { "epoch": 1.0, "grad_norm": 0.3631809949874878, "learning_rate": 9.763875613614482e-06, "loss": 0.0795, "step": 56 }, { "epoch": 1.018018018018018, "grad_norm": 0.37705934047698975, "learning_rate": 9.755282581475769e-06, "loss": 0.1467, "step": 57 }, { "epoch": 1.0360360360360361, "grad_norm": 0.3958112895488739, "learning_rate": 9.746539891700558e-06, "loss": 0.1395, "step": 58 }, { "epoch": 1.054054054054054, "grad_norm": 0.3740142285823822, "learning_rate": 9.737647819437645e-06, "loss": 0.1407, "step": 59 }, { "epoch": 1.072072072072072, "grad_norm": 0.3496512174606323, "learning_rate": 9.728606644537177e-06, "loss": 0.1369, "step": 60 }, { "epoch": 1.09009009009009, "grad_norm": 0.3684415817260742, "learning_rate": 9.719416651541839e-06, "loss": 0.1307, "step": 61 }, { "epoch": 1.1081081081081081, "grad_norm": 0.3743540048599243, "learning_rate": 9.710078129677895e-06, "loss": 0.1378, "step": 62 }, { "epoch": 1.1261261261261262, "grad_norm": 0.39405685663223267, "learning_rate": 9.700591372846096e-06, "loss": 0.1315, "step": 63 }, { "epoch": 1.1441441441441442, "grad_norm": 0.38246047496795654, "learning_rate": 9.690956679612422e-06, "loss": 0.1385, "step": 64 }, { "epoch": 1.1621621621621623, "grad_norm": 0.38582369685173035, "learning_rate": 9.681174353198687e-06, "loss": 0.1295, "step": 65 }, { "epoch": 1.1801801801801801, "grad_norm": 0.3975503742694855, "learning_rate": 9.671244701472999e-06, "loss": 0.1326, "step": 66 }, { "epoch": 1.1981981981981982, "grad_norm": 0.3592880964279175, "learning_rate": 9.661168036940071e-06, "loss": 0.1336, "step": 67 }, { "epoch": 1.2162162162162162, "grad_norm": 0.362184077501297, "learning_rate": 9.650944676731383e-06, "loss": 0.1363, "step": 68 }, { "epoch": 1.2342342342342343, "grad_norm": 0.35390111804008484, "learning_rate": 9.640574942595195e-06, "loss": 0.1385, "step": 69 }, { "epoch": 1.2522522522522523, "grad_norm": 0.3839528262615204, "learning_rate": 9.63005916088644e-06, "loss": 0.133, "step": 70 }, { "epoch": 1.2702702702702702, "grad_norm": 0.3693341612815857, "learning_rate": 9.619397662556434e-06, "loss": 0.1395, "step": 71 }, { "epoch": 1.2882882882882882, "grad_norm": 0.36481085419654846, "learning_rate": 9.608590783142471e-06, "loss": 0.1392, "step": 72 }, { "epoch": 1.3063063063063063, "grad_norm": 0.3655868172645569, "learning_rate": 9.597638862757255e-06, "loss": 0.1294, "step": 73 }, { "epoch": 1.3243243243243243, "grad_norm": 0.35944369435310364, "learning_rate": 9.586542246078203e-06, "loss": 0.1275, "step": 74 }, { "epoch": 1.3423423423423424, "grad_norm": 0.3815653622150421, "learning_rate": 9.5753012823366e-06, "loss": 0.1316, "step": 75 }, { "epoch": 1.3603603603603602, "grad_norm": 0.3833068907260895, "learning_rate": 9.563916325306595e-06, "loss": 0.1271, "step": 76 }, { "epoch": 1.3783783783783785, "grad_norm": 0.369124174118042, "learning_rate": 9.552387733294081e-06, "loss": 0.1352, "step": 77 }, { "epoch": 1.3963963963963963, "grad_norm": 0.35860103368759155, "learning_rate": 9.540715869125407e-06, "loss": 0.1342, "step": 78 }, { "epoch": 1.4144144144144144, "grad_norm": 0.39357277750968933, "learning_rate": 9.528901100135971e-06, "loss": 0.1323, "step": 79 }, { "epoch": 1.4324324324324325, "grad_norm": 0.3725232779979706, "learning_rate": 9.51694379815865e-06, "loss": 0.1309, "step": 80 }, { "epoch": 1.4504504504504505, "grad_norm": 0.3814895749092102, "learning_rate": 9.504844339512096e-06, "loss": 0.1277, "step": 81 }, { "epoch": 1.4684684684684686, "grad_norm": 0.4078717529773712, "learning_rate": 9.492603104988907e-06, "loss": 0.1342, "step": 82 }, { "epoch": 1.4864864864864864, "grad_norm": 0.360506534576416, "learning_rate": 9.480220479843627e-06, "loss": 0.1332, "step": 83 }, { "epoch": 1.5045045045045045, "grad_norm": 0.37059858441352844, "learning_rate": 9.467696853780625e-06, "loss": 0.1303, "step": 84 }, { "epoch": 1.5225225225225225, "grad_norm": 0.38473525643348694, "learning_rate": 9.45503262094184e-06, "loss": 0.1355, "step": 85 }, { "epoch": 1.5405405405405406, "grad_norm": 0.39016297459602356, "learning_rate": 9.442228179894362e-06, "loss": 0.1415, "step": 86 }, { "epoch": 1.5585585585585586, "grad_norm": 0.372662752866745, "learning_rate": 9.4292839336179e-06, "loss": 0.1334, "step": 87 }, { "epoch": 1.5765765765765765, "grad_norm": 0.3725448250770569, "learning_rate": 9.416200289492092e-06, "loss": 0.1285, "step": 88 }, { "epoch": 1.5945945945945947, "grad_norm": 0.3796994090080261, "learning_rate": 9.40297765928369e-06, "loss": 0.1274, "step": 89 }, { "epoch": 1.6126126126126126, "grad_norm": 0.36067870259284973, "learning_rate": 9.389616459133597e-06, "loss": 0.1301, "step": 90 }, { "epoch": 1.6306306306306306, "grad_norm": 0.36582452058792114, "learning_rate": 9.376117109543769e-06, "loss": 0.1319, "step": 91 }, { "epoch": 1.6486486486486487, "grad_norm": 0.38476914167404175, "learning_rate": 9.362480035363987e-06, "loss": 0.1259, "step": 92 }, { "epoch": 1.6666666666666665, "grad_norm": 0.3849290907382965, "learning_rate": 9.348705665778479e-06, "loss": 0.1302, "step": 93 }, { "epoch": 1.6846846846846848, "grad_norm": 0.3458653688430786, "learning_rate": 9.334794434292416e-06, "loss": 0.1229, "step": 94 }, { "epoch": 1.7027027027027026, "grad_norm": 0.369865357875824, "learning_rate": 9.320746778718274e-06, "loss": 0.1348, "step": 95 }, { "epoch": 1.7207207207207207, "grad_norm": 0.34437230229377747, "learning_rate": 9.306563141162046e-06, "loss": 0.1268, "step": 96 }, { "epoch": 1.7387387387387387, "grad_norm": 0.37380245327949524, "learning_rate": 9.292243968009332e-06, "loss": 0.1331, "step": 97 }, { "epoch": 1.7567567567567568, "grad_norm": 0.3576115369796753, "learning_rate": 9.27778970991129e-06, "loss": 0.1327, "step": 98 }, { "epoch": 1.7747747747747749, "grad_norm": 0.3359750211238861, "learning_rate": 9.263200821770462e-06, "loss": 0.1312, "step": 99 }, { "epoch": 1.7927927927927927, "grad_norm": 0.4232477843761444, "learning_rate": 9.248477762726438e-06, "loss": 0.1262, "step": 100 }, { "epoch": 1.810810810810811, "grad_norm": 0.34946057200431824, "learning_rate": 9.233620996141421e-06, "loss": 0.125, "step": 101 }, { "epoch": 1.8288288288288288, "grad_norm": 0.37758418917655945, "learning_rate": 9.218630989585647e-06, "loss": 0.1259, "step": 102 }, { "epoch": 1.8468468468468469, "grad_norm": 0.38684237003326416, "learning_rate": 9.203508214822652e-06, "loss": 0.1282, "step": 103 }, { "epoch": 1.864864864864865, "grad_norm": 0.3739815652370453, "learning_rate": 9.188253147794443e-06, "loss": 0.1238, "step": 104 }, { "epoch": 1.8828828828828827, "grad_norm": 0.3532610237598419, "learning_rate": 9.172866268606514e-06, "loss": 0.1272, "step": 105 }, { "epoch": 1.900900900900901, "grad_norm": 0.37537145614624023, "learning_rate": 9.157348061512728e-06, "loss": 0.1287, "step": 106 }, { "epoch": 1.9189189189189189, "grad_norm": 0.3723607659339905, "learning_rate": 9.141699014900084e-06, "loss": 0.1237, "step": 107 }, { "epoch": 1.936936936936937, "grad_norm": 0.33924514055252075, "learning_rate": 9.125919621273348e-06, "loss": 0.1277, "step": 108 }, { "epoch": 1.954954954954955, "grad_norm": 0.39682239294052124, "learning_rate": 9.110010377239552e-06, "loss": 0.1276, "step": 109 }, { "epoch": 1.972972972972973, "grad_norm": 0.39324480295181274, "learning_rate": 9.093971783492354e-06, "loss": 0.1301, "step": 110 }, { "epoch": 1.990990990990991, "grad_norm": 0.3867541253566742, "learning_rate": 9.077804344796302e-06, "loss": 0.1327, "step": 111 }, { "epoch": 2.0, "grad_norm": 0.39770832657814026, "learning_rate": 9.061508569970926e-06, "loss": 0.0666, "step": 112 } ], "logging_steps": 1, "max_steps": 560, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 56, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.424337771526357e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }