{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 10880, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.09191176470588236, "grad_norm": 2.861119031906128, "learning_rate": 4.955882352941177e-05, "loss": 1.678604736328125, "step": 100 }, { "epoch": 0.18382352941176472, "grad_norm": 1.6152201890945435, "learning_rate": 4.9099264705882355e-05, "loss": 0.554380111694336, "step": 200 }, { "epoch": 0.2757352941176471, "grad_norm": 2.08105206489563, "learning_rate": 4.863970588235294e-05, "loss": 0.33380359649658203, "step": 300 }, { "epoch": 0.36764705882352944, "grad_norm": 1.5297737121582031, "learning_rate": 4.818014705882353e-05, "loss": 0.26274593353271486, "step": 400 }, { "epoch": 0.45955882352941174, "grad_norm": 1.867387294769287, "learning_rate": 4.7720588235294124e-05, "loss": 0.22210750579833985, "step": 500 }, { "epoch": 0.5514705882352942, "grad_norm": 4.296945095062256, "learning_rate": 4.7261029411764704e-05, "loss": 0.1887100601196289, "step": 600 }, { "epoch": 0.6433823529411765, "grad_norm": 1.9779675006866455, "learning_rate": 4.68014705882353e-05, "loss": 0.16664567947387696, "step": 700 }, { "epoch": 0.7352941176470589, "grad_norm": 1.235213279724121, "learning_rate": 4.6341911764705886e-05, "loss": 0.16922021865844727, "step": 800 }, { "epoch": 0.8272058823529411, "grad_norm": 2.484598398208618, "learning_rate": 4.588235294117647e-05, "loss": 0.15672155380249023, "step": 900 }, { "epoch": 0.9191176470588235, "grad_norm": 0.8935145139694214, "learning_rate": 4.542279411764706e-05, "loss": 0.1493326473236084, "step": 1000 }, { "epoch": 1.0, "eval_loss": 0.14647769927978516, "eval_runtime": 4.2967, "eval_samples_per_second": 2025.035, "eval_steps_per_second": 63.304, "step": 1088 }, { "epoch": 1.0110294117647058, "grad_norm": 0.8300992250442505, "learning_rate": 4.496323529411765e-05, "loss": 0.14999670028686524, "step": 1100 }, { "epoch": 1.1029411764705883, "grad_norm": 0.6350510716438293, "learning_rate": 4.4503676470588236e-05, "loss": 0.12446197509765625, "step": 1200 }, { "epoch": 1.1948529411764706, "grad_norm": 1.469220519065857, "learning_rate": 4.404411764705882e-05, "loss": 0.11504798889160156, "step": 1300 }, { "epoch": 1.2867647058823528, "grad_norm": 0.9885977506637573, "learning_rate": 4.358455882352942e-05, "loss": 0.12503914833068847, "step": 1400 }, { "epoch": 1.3786764705882353, "grad_norm": 1.2623215913772583, "learning_rate": 4.3125000000000005e-05, "loss": 0.1185552978515625, "step": 1500 }, { "epoch": 1.4705882352941178, "grad_norm": 1.181009292602539, "learning_rate": 4.2665441176470585e-05, "loss": 0.1074635124206543, "step": 1600 }, { "epoch": 1.5625, "grad_norm": 0.7625616788864136, "learning_rate": 4.220588235294118e-05, "loss": 0.11462491989135742, "step": 1700 }, { "epoch": 1.6544117647058822, "grad_norm": 2.7165768146514893, "learning_rate": 4.174632352941177e-05, "loss": 0.11170839309692383, "step": 1800 }, { "epoch": 1.7463235294117647, "grad_norm": 1.0306402444839478, "learning_rate": 4.1286764705882354e-05, "loss": 0.10664710998535157, "step": 1900 }, { "epoch": 1.8382352941176472, "grad_norm": 1.0458590984344482, "learning_rate": 4.082720588235294e-05, "loss": 0.10655851364135742, "step": 2000 }, { "epoch": 1.9301470588235294, "grad_norm": 1.751387357711792, "learning_rate": 4.036764705882353e-05, "loss": 0.11276634216308594, "step": 2100 }, { "epoch": 2.0, "eval_loss": 0.11876623332500458, "eval_runtime": 3.7959, "eval_samples_per_second": 2292.186, "eval_steps_per_second": 71.656, "step": 2176 }, { "epoch": 2.0220588235294117, "grad_norm": 0.9640232920646667, "learning_rate": 3.9908088235294123e-05, "loss": 0.10104022979736328, "step": 2200 }, { "epoch": 2.113970588235294, "grad_norm": 1.449666976928711, "learning_rate": 3.9448529411764704e-05, "loss": 0.08359379768371582, "step": 2300 }, { "epoch": 2.2058823529411766, "grad_norm": 0.781505286693573, "learning_rate": 3.89889705882353e-05, "loss": 0.08938695907592774, "step": 2400 }, { "epoch": 2.297794117647059, "grad_norm": 0.9161350131034851, "learning_rate": 3.8529411764705886e-05, "loss": 0.09331055641174317, "step": 2500 }, { "epoch": 2.389705882352941, "grad_norm": 0.34266597032546997, "learning_rate": 3.806985294117647e-05, "loss": 0.0942567253112793, "step": 2600 }, { "epoch": 2.4816176470588234, "grad_norm": 0.3938254714012146, "learning_rate": 3.761029411764706e-05, "loss": 0.08378758430480956, "step": 2700 }, { "epoch": 2.5735294117647056, "grad_norm": 0.8159363865852356, "learning_rate": 3.715073529411765e-05, "loss": 0.08920242309570313, "step": 2800 }, { "epoch": 2.6654411764705883, "grad_norm": 0.7843156456947327, "learning_rate": 3.6691176470588235e-05, "loss": 0.09552728652954101, "step": 2900 }, { "epoch": 2.7573529411764706, "grad_norm": 1.678454041481018, "learning_rate": 3.623161764705882e-05, "loss": 0.0881564712524414, "step": 3000 }, { "epoch": 2.849264705882353, "grad_norm": 1.567854642868042, "learning_rate": 3.577205882352942e-05, "loss": 0.09041579246520996, "step": 3100 }, { "epoch": 2.9411764705882355, "grad_norm": 0.587993860244751, "learning_rate": 3.5312500000000005e-05, "loss": 0.08352569580078124, "step": 3200 }, { "epoch": 3.0, "eval_loss": 0.11157828569412231, "eval_runtime": 3.9136, "eval_samples_per_second": 2223.252, "eval_steps_per_second": 69.501, "step": 3264 }, { "epoch": 3.0330882352941178, "grad_norm": 0.7346888184547424, "learning_rate": 3.4852941176470585e-05, "loss": 0.08925918579101562, "step": 3300 }, { "epoch": 3.125, "grad_norm": 0.6136897206306458, "learning_rate": 3.439338235294118e-05, "loss": 0.07912126064300537, "step": 3400 }, { "epoch": 3.2169117647058822, "grad_norm": 0.47108200192451477, "learning_rate": 3.393382352941177e-05, "loss": 0.07419106960296631, "step": 3500 }, { "epoch": 3.3088235294117645, "grad_norm": 0.8382533192634583, "learning_rate": 3.3474264705882354e-05, "loss": 0.0670989227294922, "step": 3600 }, { "epoch": 3.400735294117647, "grad_norm": 0.6706309914588928, "learning_rate": 3.301470588235294e-05, "loss": 0.06986721515655518, "step": 3700 }, { "epoch": 3.4926470588235294, "grad_norm": 0.5485235452651978, "learning_rate": 3.255514705882353e-05, "loss": 0.07686973571777343, "step": 3800 }, { "epoch": 3.5845588235294117, "grad_norm": 0.8460040092468262, "learning_rate": 3.209558823529412e-05, "loss": 0.07120684623718261, "step": 3900 }, { "epoch": 3.6764705882352944, "grad_norm": 0.9563305974006653, "learning_rate": 3.1636029411764704e-05, "loss": 0.07464917659759522, "step": 4000 }, { "epoch": 3.7683823529411766, "grad_norm": 0.6851525902748108, "learning_rate": 3.11764705882353e-05, "loss": 0.07348180770874023, "step": 4100 }, { "epoch": 3.860294117647059, "grad_norm": 0.46768584847450256, "learning_rate": 3.0716911764705886e-05, "loss": 0.08051628112792969, "step": 4200 }, { "epoch": 3.952205882352941, "grad_norm": 0.8145326375961304, "learning_rate": 3.025735294117647e-05, "loss": 0.0793468189239502, "step": 4300 }, { "epoch": 4.0, "eval_loss": 0.11373896896839142, "eval_runtime": 3.8395, "eval_samples_per_second": 2266.171, "eval_steps_per_second": 70.842, "step": 4352 }, { "epoch": 4.044117647058823, "grad_norm": 4.8013997077941895, "learning_rate": 2.979779411764706e-05, "loss": 0.06481593132019042, "step": 4400 }, { "epoch": 4.136029411764706, "grad_norm": 1.51911199092865, "learning_rate": 2.933823529411765e-05, "loss": 0.05568636417388916, "step": 4500 }, { "epoch": 4.227941176470588, "grad_norm": 1.1331921815872192, "learning_rate": 2.8878676470588235e-05, "loss": 0.06395863056182861, "step": 4600 }, { "epoch": 4.319852941176471, "grad_norm": 1.9773746728897095, "learning_rate": 2.8419117647058823e-05, "loss": 0.05171878814697266, "step": 4700 }, { "epoch": 4.411764705882353, "grad_norm": 0.720111608505249, "learning_rate": 2.7959558823529414e-05, "loss": 0.06219084739685059, "step": 4800 }, { "epoch": 4.5036764705882355, "grad_norm": 1.243735671043396, "learning_rate": 2.7500000000000004e-05, "loss": 0.06258386135101318, "step": 4900 }, { "epoch": 4.595588235294118, "grad_norm": 0.7214698195457458, "learning_rate": 2.704044117647059e-05, "loss": 0.057212424278259275, "step": 5000 }, { "epoch": 4.6875, "grad_norm": 2.4246177673339844, "learning_rate": 2.658088235294118e-05, "loss": 0.05896786212921143, "step": 5100 }, { "epoch": 4.779411764705882, "grad_norm": 0.3699852228164673, "learning_rate": 2.6121323529411767e-05, "loss": 0.0625047254562378, "step": 5200 }, { "epoch": 4.8713235294117645, "grad_norm": 0.8965820670127869, "learning_rate": 2.566176470588235e-05, "loss": 0.06457361221313476, "step": 5300 }, { "epoch": 4.963235294117647, "grad_norm": 0.6348599791526794, "learning_rate": 2.520220588235294e-05, "loss": 0.05709341049194336, "step": 5400 }, { "epoch": 5.0, "eval_loss": 0.13100895285606384, "eval_runtime": 3.7913, "eval_samples_per_second": 2294.964, "eval_steps_per_second": 71.742, "step": 5440 }, { "epoch": 5.055147058823529, "grad_norm": 0.30243951082229614, "learning_rate": 2.4742647058823532e-05, "loss": 0.0436199951171875, "step": 5500 }, { "epoch": 5.147058823529412, "grad_norm": 1.0563251972198486, "learning_rate": 2.428308823529412e-05, "loss": 0.03898259401321411, "step": 5600 }, { "epoch": 5.238970588235294, "grad_norm": 0.3339505195617676, "learning_rate": 2.3823529411764707e-05, "loss": 0.03611770153045654, "step": 5700 }, { "epoch": 5.330882352941177, "grad_norm": 3.47481107711792, "learning_rate": 2.3363970588235295e-05, "loss": 0.03747700929641724, "step": 5800 }, { "epoch": 5.422794117647059, "grad_norm": 0.49956804513931274, "learning_rate": 2.2904411764705882e-05, "loss": 0.034790968894958495, "step": 5900 }, { "epoch": 5.514705882352941, "grad_norm": 0.2137073427438736, "learning_rate": 2.2444852941176473e-05, "loss": 0.042610764503479004, "step": 6000 }, { "epoch": 5.606617647058823, "grad_norm": 4.238280296325684, "learning_rate": 2.198529411764706e-05, "loss": 0.041733989715576174, "step": 6100 }, { "epoch": 5.698529411764706, "grad_norm": 0.7751985192298889, "learning_rate": 2.1525735294117648e-05, "loss": 0.042198920249938966, "step": 6200 }, { "epoch": 5.790441176470588, "grad_norm": 6.274240493774414, "learning_rate": 2.1066176470588235e-05, "loss": 0.035168659687042234, "step": 6300 }, { "epoch": 5.882352941176471, "grad_norm": 0.8700118064880371, "learning_rate": 2.0606617647058823e-05, "loss": 0.04176306247711182, "step": 6400 }, { "epoch": 5.974264705882353, "grad_norm": 0.40382614731788635, "learning_rate": 2.0151654411764708e-05, "loss": 0.03780954122543335, "step": 6500 }, { "epoch": 6.0, "eval_loss": 0.1548856645822525, "eval_runtime": 3.8561, "eval_samples_per_second": 2256.438, "eval_steps_per_second": 70.538, "step": 6528 }, { "epoch": 6.0661764705882355, "grad_norm": 0.35931289196014404, "learning_rate": 1.9692095588235295e-05, "loss": 0.026753320693969726, "step": 6600 }, { "epoch": 6.158088235294118, "grad_norm": 1.0790654420852661, "learning_rate": 1.9232536764705883e-05, "loss": 0.022549192905426025, "step": 6700 }, { "epoch": 6.25, "grad_norm": 0.39832767844200134, "learning_rate": 1.8777573529411764e-05, "loss": 0.02674192190170288, "step": 6800 }, { "epoch": 6.341911764705882, "grad_norm": 0.38946613669395447, "learning_rate": 1.8318014705882352e-05, "loss": 0.024337658882141112, "step": 6900 }, { "epoch": 6.4338235294117645, "grad_norm": 6.687967300415039, "learning_rate": 1.7858455882352943e-05, "loss": 0.02405022144317627, "step": 7000 }, { "epoch": 6.525735294117647, "grad_norm": 1.1742165088653564, "learning_rate": 1.739889705882353e-05, "loss": 0.023499369621276855, "step": 7100 }, { "epoch": 6.617647058823529, "grad_norm": 0.728435754776001, "learning_rate": 1.693933823529412e-05, "loss": 0.01860466957092285, "step": 7200 }, { "epoch": 6.709558823529412, "grad_norm": 0.18539367616176605, "learning_rate": 1.6479779411764705e-05, "loss": 0.024487736225128173, "step": 7300 }, { "epoch": 6.801470588235294, "grad_norm": 2.0757601261138916, "learning_rate": 1.6020220588235296e-05, "loss": 0.027930150032043456, "step": 7400 }, { "epoch": 6.893382352941177, "grad_norm": 0.6962282657623291, "learning_rate": 1.5560661764705883e-05, "loss": 0.019371466636657717, "step": 7500 }, { "epoch": 6.985294117647059, "grad_norm": 0.3877858519554138, "learning_rate": 1.510110294117647e-05, "loss": 0.023068771362304688, "step": 7600 }, { "epoch": 7.0, "eval_loss": 0.19883336126804352, "eval_runtime": 3.8871, "eval_samples_per_second": 2238.423, "eval_steps_per_second": 69.975, "step": 7616 }, { "epoch": 7.077205882352941, "grad_norm": 0.32462701201438904, "learning_rate": 1.464154411764706e-05, "loss": 0.01416821002960205, "step": 7700 }, { "epoch": 7.169117647058823, "grad_norm": 0.70732182264328, "learning_rate": 1.4181985294117647e-05, "loss": 0.013301538228988647, "step": 7800 }, { "epoch": 7.261029411764706, "grad_norm": 12.949718475341797, "learning_rate": 1.3722426470588238e-05, "loss": 0.01586754560470581, "step": 7900 }, { "epoch": 7.352941176470588, "grad_norm": 1.7924553155899048, "learning_rate": 1.3262867647058824e-05, "loss": 0.019125467538833617, "step": 8000 }, { "epoch": 7.444852941176471, "grad_norm": 0.45370689034461975, "learning_rate": 1.2803308823529411e-05, "loss": 0.017261466979980468, "step": 8100 }, { "epoch": 7.536764705882353, "grad_norm": 0.24471713602542877, "learning_rate": 1.2343750000000002e-05, "loss": 0.016836028099060058, "step": 8200 }, { "epoch": 7.6286764705882355, "grad_norm": 0.273219496011734, "learning_rate": 1.1884191176470588e-05, "loss": 0.014804782867431641, "step": 8300 }, { "epoch": 7.720588235294118, "grad_norm": 0.27901849150657654, "learning_rate": 1.1424632352941177e-05, "loss": 0.017638254165649413, "step": 8400 }, { "epoch": 7.8125, "grad_norm": 0.41847002506256104, "learning_rate": 1.0965073529411766e-05, "loss": 0.014013255834579469, "step": 8500 }, { "epoch": 7.904411764705882, "grad_norm": 0.3298964500427246, "learning_rate": 1.0505514705882353e-05, "loss": 0.015006015300750733, "step": 8600 }, { "epoch": 7.9963235294117645, "grad_norm": 0.5094680786132812, "learning_rate": 1.0045955882352942e-05, "loss": 0.016840940713882445, "step": 8700 }, { "epoch": 8.0, "eval_loss": 0.22593119740486145, "eval_runtime": 3.8624, "eval_samples_per_second": 2252.772, "eval_steps_per_second": 70.423, "step": 8704 }, { "epoch": 8.088235294117647, "grad_norm": 0.23575666546821594, "learning_rate": 9.58639705882353e-06, "loss": 0.011906511783599853, "step": 8800 }, { "epoch": 8.180147058823529, "grad_norm": 0.39200881123542786, "learning_rate": 9.126838235294117e-06, "loss": 0.009779441356658935, "step": 8900 }, { "epoch": 8.272058823529411, "grad_norm": 0.2954489588737488, "learning_rate": 8.667279411764706e-06, "loss": 0.011333670616149902, "step": 9000 }, { "epoch": 8.363970588235293, "grad_norm": 0.1555805653333664, "learning_rate": 8.207720588235294e-06, "loss": 0.011691917181015015, "step": 9100 }, { "epoch": 8.455882352941176, "grad_norm": 0.6293551921844482, "learning_rate": 7.748161764705883e-06, "loss": 0.010650770664215088, "step": 9200 }, { "epoch": 8.547794117647058, "grad_norm": 0.47241711616516113, "learning_rate": 7.288602941176471e-06, "loss": 0.008451443314552307, "step": 9300 }, { "epoch": 8.639705882352942, "grad_norm": 0.39692994952201843, "learning_rate": 6.829044117647059e-06, "loss": 0.01287778615951538, "step": 9400 }, { "epoch": 8.731617647058824, "grad_norm": 4.867070198059082, "learning_rate": 6.374080882352941e-06, "loss": 0.012674452066421508, "step": 9500 }, { "epoch": 8.823529411764707, "grad_norm": 0.20786941051483154, "learning_rate": 5.9145220588235295e-06, "loss": 0.009658980965614319, "step": 9600 }, { "epoch": 8.915441176470589, "grad_norm": 0.24980570375919342, "learning_rate": 5.454963235294118e-06, "loss": 0.012473410367965699, "step": 9700 }, { "epoch": 9.0, "eval_loss": 0.2505253851413727, "eval_runtime": 3.8579, "eval_samples_per_second": 2255.375, "eval_steps_per_second": 70.505, "step": 9792 }, { "epoch": 9.007352941176471, "grad_norm": 0.34893837571144104, "learning_rate": 4.995404411764706e-06, "loss": 0.01137054443359375, "step": 9800 }, { "epoch": 9.099264705882353, "grad_norm": 0.1509261131286621, "learning_rate": 4.535845588235294e-06, "loss": 0.009393535852432251, "step": 9900 }, { "epoch": 9.191176470588236, "grad_norm": 0.4828801453113556, "learning_rate": 4.076286764705883e-06, "loss": 0.011060981750488282, "step": 10000 }, { "epoch": 9.283088235294118, "grad_norm": 0.10140291601419449, "learning_rate": 3.616727941176471e-06, "loss": 0.008189416527748107, "step": 10100 }, { "epoch": 9.375, "grad_norm": 0.13836342096328735, "learning_rate": 3.1571691176470588e-06, "loss": 0.00860303282737732, "step": 10200 }, { "epoch": 9.466911764705882, "grad_norm": 0.3119546175003052, "learning_rate": 2.6976102941176475e-06, "loss": 0.008158923387527465, "step": 10300 }, { "epoch": 9.558823529411764, "grad_norm": 0.7189019918441772, "learning_rate": 2.2380514705882353e-06, "loss": 0.007920079231262207, "step": 10400 }, { "epoch": 9.650735294117647, "grad_norm": 0.28993985056877136, "learning_rate": 1.7784926470588236e-06, "loss": 0.00800090193748474, "step": 10500 }, { "epoch": 9.742647058823529, "grad_norm": 0.6550254225730896, "learning_rate": 1.3189338235294119e-06, "loss": 0.008071759939193726, "step": 10600 }, { "epoch": 9.834558823529411, "grad_norm": 0.47817108035087585, "learning_rate": 8.593750000000001e-07, "loss": 0.008685371279716492, "step": 10700 }, { "epoch": 9.926470588235293, "grad_norm": 0.5251961350440979, "learning_rate": 3.998161764705882e-07, "loss": 0.007078754305839538, "step": 10800 }, { "epoch": 10.0, "eval_loss": 0.24180535972118378, "eval_runtime": 3.9524, "eval_samples_per_second": 2201.471, "eval_steps_per_second": 68.82, "step": 10880 } ], "logging_steps": 100, "max_steps": 10880, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.138963309056e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }