{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 470,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.021321961620469083,
      "grad_norm": 1.1025959253311157,
      "learning_rate": 2.033898305084746e-06,
      "loss": 1.3396,
      "step": 5
    },
    {
      "epoch": 0.042643923240938165,
      "grad_norm": 0.9394682049751282,
      "learning_rate": 4.576271186440678e-06,
      "loss": 1.3427,
      "step": 10
    },
    {
      "epoch": 0.06396588486140725,
      "grad_norm": 0.5853310823440552,
      "learning_rate": 7.1186440677966106e-06,
      "loss": 1.3113,
      "step": 15
    },
    {
      "epoch": 0.08528784648187633,
      "grad_norm": 1.0313405990600586,
      "learning_rate": 9.661016949152542e-06,
      "loss": 1.2611,
      "step": 20
    },
    {
      "epoch": 0.10660980810234541,
      "grad_norm": 0.548238217830658,
      "learning_rate": 1.2203389830508475e-05,
      "loss": 1.2482,
      "step": 25
    },
    {
      "epoch": 0.1279317697228145,
      "grad_norm": 0.6522016525268555,
      "learning_rate": 1.4745762711864408e-05,
      "loss": 1.2438,
      "step": 30
    },
    {
      "epoch": 0.14925373134328357,
      "grad_norm": 0.6198495626449585,
      "learning_rate": 1.728813559322034e-05,
      "loss": 1.1851,
      "step": 35
    },
    {
      "epoch": 0.17057569296375266,
      "grad_norm": 0.5091772079467773,
      "learning_rate": 1.983050847457627e-05,
      "loss": 1.2328,
      "step": 40
    },
    {
      "epoch": 0.19189765458422176,
      "grad_norm": 0.43554070591926575,
      "learning_rate": 2.2372881355932205e-05,
      "loss": 1.2056,
      "step": 45
    },
    {
      "epoch": 0.21321961620469082,
      "grad_norm": 0.4790154695510864,
      "learning_rate": 2.4915254237288138e-05,
      "loss": 1.1369,
      "step": 50
    },
    {
      "epoch": 0.2345415778251599,
      "grad_norm": 0.5797903537750244,
      "learning_rate": 2.7457627118644068e-05,
      "loss": 1.1525,
      "step": 55
    },
    {
      "epoch": 0.255863539445629,
      "grad_norm": 0.6414856314659119,
      "learning_rate": 3e-05,
      "loss": 1.118,
      "step": 60
    },
    {
      "epoch": 0.2771855010660981,
      "grad_norm": 0.6532033085823059,
      "learning_rate": 2.9998514182537154e-05,
      "loss": 1.0826,
      "step": 65
    },
    {
      "epoch": 0.29850746268656714,
      "grad_norm": 0.5140215158462524,
      "learning_rate": 2.9994057024502427e-05,
      "loss": 1.0808,
      "step": 70
    },
    {
      "epoch": 0.31982942430703626,
      "grad_norm": 0.5573478937149048,
      "learning_rate": 2.998662940889891e-05,
      "loss": 1.0923,
      "step": 75
    },
    {
      "epoch": 0.3411513859275053,
      "grad_norm": 0.6022165417671204,
      "learning_rate": 2.9976232807204073e-05,
      "loss": 1.044,
      "step": 80
    },
    {
      "epoch": 0.3624733475479744,
      "grad_norm": 0.6599242687225342,
      "learning_rate": 2.9962869279078226e-05,
      "loss": 1.0641,
      "step": 85
    },
    {
      "epoch": 0.3837953091684435,
      "grad_norm": 0.5529143810272217,
      "learning_rate": 2.9946541471956496e-05,
      "loss": 1.0251,
      "step": 90
    },
    {
      "epoch": 0.4051172707889126,
      "grad_norm": 0.5225338339805603,
      "learning_rate": 2.9927252620524346e-05,
      "loss": 1.0353,
      "step": 95
    },
    {
      "epoch": 0.42643923240938164,
      "grad_norm": 0.5583758354187012,
      "learning_rate": 2.9905006546076746e-05,
      "loss": 1.0061,
      "step": 100
    },
    {
      "epoch": 0.44776119402985076,
      "grad_norm": 0.5726757645606995,
      "learning_rate": 2.9879807655761145e-05,
      "loss": 1.0173,
      "step": 105
    },
    {
      "epoch": 0.4690831556503198,
      "grad_norm": 0.6370916366577148,
      "learning_rate": 2.985166094170439e-05,
      "loss": 1.0122,
      "step": 110
    },
    {
      "epoch": 0.4904051172707889,
      "grad_norm": 0.6529775857925415,
      "learning_rate": 2.9820571980023704e-05,
      "loss": 0.911,
      "step": 115
    },
    {
      "epoch": 0.511727078891258,
      "grad_norm": 0.6832720637321472,
      "learning_rate": 2.9786546929722055e-05,
      "loss": 0.9745,
      "step": 120
    },
    {
      "epoch": 0.5330490405117271,
      "grad_norm": 0.6681720614433289,
      "learning_rate": 2.974959253146796e-05,
      "loss": 0.9218,
      "step": 125
    },
    {
      "epoch": 0.5543710021321961,
      "grad_norm": 0.8444278836250305,
      "learning_rate": 2.9709716106260115e-05,
      "loss": 0.9233,
      "step": 130
    },
    {
      "epoch": 0.5756929637526652,
      "grad_norm": 0.8203223347663879,
      "learning_rate": 2.966692555397705e-05,
      "loss": 0.9167,
      "step": 135
    },
    {
      "epoch": 0.5970149253731343,
      "grad_norm": 0.6752311587333679,
      "learning_rate": 2.962122935181207e-05,
      "loss": 0.9187,
      "step": 140
    },
    {
      "epoch": 0.6183368869936035,
      "grad_norm": 0.6852523684501648,
      "learning_rate": 2.957263655259387e-05,
      "loss": 0.8627,
      "step": 145
    },
    {
      "epoch": 0.6396588486140725,
      "grad_norm": 0.8514593839645386,
      "learning_rate": 2.9521156782993066e-05,
      "loss": 0.8204,
      "step": 150
    },
    {
      "epoch": 0.6609808102345416,
      "grad_norm": 0.8665833473205566,
      "learning_rate": 2.9466800241615075e-05,
      "loss": 0.8655,
      "step": 155
    },
    {
      "epoch": 0.6823027718550106,
      "grad_norm": 0.9081581234931946,
      "learning_rate": 2.940957769697969e-05,
      "loss": 0.8356,
      "step": 160
    },
    {
      "epoch": 0.7036247334754797,
      "grad_norm": 0.7875135540962219,
      "learning_rate": 2.9349500485387718e-05,
      "loss": 0.8547,
      "step": 165
    },
    {
      "epoch": 0.7249466950959488,
      "grad_norm": 0.837771475315094,
      "learning_rate": 2.9286580508675174e-05,
      "loss": 0.7974,
      "step": 170
    },
    {
      "epoch": 0.746268656716418,
      "grad_norm": 0.8674811720848083,
      "learning_rate": 2.9220830231855417e-05,
      "loss": 0.8343,
      "step": 175
    },
    {
      "epoch": 0.767590618336887,
      "grad_norm": 0.8112357258796692,
      "learning_rate": 2.9152262680649704e-05,
      "loss": 0.7642,
      "step": 180
    },
    {
      "epoch": 0.7889125799573561,
      "grad_norm": 0.855670690536499,
      "learning_rate": 2.90808914389067e-05,
      "loss": 0.778,
      "step": 185
    },
    {
      "epoch": 0.8102345415778252,
      "grad_norm": 0.9481444358825684,
      "learning_rate": 2.900673064591139e-05,
      "loss": 0.7688,
      "step": 190
    },
    {
      "epoch": 0.8315565031982942,
      "grad_norm": 0.9807798266410828,
      "learning_rate": 2.8929794993583937e-05,
      "loss": 0.7645,
      "step": 195
    },
    {
      "epoch": 0.8528784648187633,
      "grad_norm": 1.074795126914978,
      "learning_rate": 2.8850099723569104e-05,
      "loss": 0.7552,
      "step": 200
    },
    {
      "epoch": 0.8742004264392325,
      "grad_norm": 1.0988177061080933,
      "learning_rate": 2.876766062421675e-05,
      "loss": 0.7782,
      "step": 205
    },
    {
      "epoch": 0.8955223880597015,
      "grad_norm": 1.0309362411499023,
      "learning_rate": 2.8682494027454e-05,
      "loss": 0.7621,
      "step": 210
    },
    {
      "epoch": 0.9168443496801706,
      "grad_norm": 0.9497408866882324,
      "learning_rate": 2.8594616805549752e-05,
      "loss": 0.6876,
      "step": 215
    },
    {
      "epoch": 0.9381663113006397,
      "grad_norm": 1.0013132095336914,
      "learning_rate": 2.8504046367772117e-05,
      "loss": 0.6893,
      "step": 220
    },
    {
      "epoch": 0.9594882729211087,
      "grad_norm": 1.0180565118789673,
      "learning_rate": 2.8410800656939512e-05,
      "loss": 0.6678,
      "step": 225
    },
    {
      "epoch": 0.9808102345415778,
      "grad_norm": 1.1235246658325195,
      "learning_rate": 2.8314898145865996e-05,
      "loss": 0.6193,
      "step": 230
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.4771363735198975,
      "learning_rate": 2.8216357833701667e-05,
      "loss": 0.6586,
      "step": 235
    },
    {
      "epoch": 1.0213219616204692,
      "grad_norm": 1.1948978900909424,
      "learning_rate": 2.811519924216873e-05,
      "loss": 0.579,
      "step": 240
    },
    {
      "epoch": 1.0426439232409381,
      "grad_norm": 1.012829303741455,
      "learning_rate": 2.8011442411694105e-05,
      "loss": 0.5649,
      "step": 245
    },
    {
      "epoch": 1.0639658848614073,
      "grad_norm": 1.0202640295028687,
      "learning_rate": 2.79051078974392e-05,
      "loss": 0.5759,
      "step": 250
    },
    {
      "epoch": 1.0852878464818763,
      "grad_norm": 1.1689255237579346,
      "learning_rate": 2.779621676522777e-05,
      "loss": 0.5548,
      "step": 255
    },
    {
      "epoch": 1.1066098081023454,
      "grad_norm": 1.0039416551589966,
      "learning_rate": 2.7684790587372597e-05,
      "loss": 0.5409,
      "step": 260
    },
    {
      "epoch": 1.1279317697228146,
      "grad_norm": 1.0827348232269287,
      "learning_rate": 2.757085143840179e-05,
      "loss": 0.5911,
      "step": 265
    },
    {
      "epoch": 1.1492537313432836,
      "grad_norm": 1.0059739351272583,
      "learning_rate": 2.7454421890685647e-05,
      "loss": 0.5258,
      "step": 270
    },
    {
      "epoch": 1.1705756929637527,
      "grad_norm": 1.232321858406067,
      "learning_rate": 2.7335525009964863e-05,
      "loss": 0.5129,
      "step": 275
    },
    {
      "epoch": 1.1918976545842217,
      "grad_norm": 1.052078127861023,
      "learning_rate": 2.721418435078099e-05,
      "loss": 0.5184,
      "step": 280
    },
    {
      "epoch": 1.2132196162046909,
      "grad_norm": 1.2947639226913452,
      "learning_rate": 2.709042395181008e-05,
      "loss": 0.5388,
      "step": 285
    },
    {
      "epoch": 1.2345415778251598,
      "grad_norm": 1.1052894592285156,
      "learning_rate": 2.6964268331100396e-05,
      "loss": 0.5633,
      "step": 290
    },
    {
      "epoch": 1.255863539445629,
      "grad_norm": 1.0759986639022827,
      "learning_rate": 2.683574248121517e-05,
      "loss": 0.5133,
      "step": 295
    },
    {
      "epoch": 1.2771855010660982,
      "grad_norm": 1.1331149339675903,
      "learning_rate": 2.6704871864281377e-05,
      "loss": 0.55,
      "step": 300
    },
    {
      "epoch": 1.2985074626865671,
      "grad_norm": 1.071257472038269,
      "learning_rate": 2.657168240694541e-05,
      "loss": 0.5122,
      "step": 305
    },
    {
      "epoch": 1.3198294243070363,
      "grad_norm": 1.0727146863937378,
      "learning_rate": 2.6436200495236806e-05,
      "loss": 0.5708,
      "step": 310
    },
    {
      "epoch": 1.3411513859275053,
      "grad_norm": 1.2485096454620361,
      "learning_rate": 2.6298452969340952e-05,
      "loss": 0.5605,
      "step": 315
    },
    {
      "epoch": 1.3624733475479744,
      "grad_norm": 1.1561352014541626,
      "learning_rate": 2.6158467118281765e-05,
      "loss": 0.4775,
      "step": 320
    },
    {
      "epoch": 1.3837953091684434,
      "grad_norm": 1.13988196849823,
      "learning_rate": 2.6016270674515526e-05,
      "loss": 0.5213,
      "step": 325
    },
    {
      "epoch": 1.4051172707889126,
      "grad_norm": 1.4094964265823364,
      "learning_rate": 2.58718918084368e-05,
      "loss": 0.4526,
      "step": 330
    },
    {
      "epoch": 1.4264392324093818,
      "grad_norm": 1.132122278213501,
      "learning_rate": 2.5725359122797657e-05,
      "loss": 0.5135,
      "step": 335
    },
    {
      "epoch": 1.4477611940298507,
      "grad_norm": 1.223320484161377,
      "learning_rate": 2.5576701647041192e-05,
      "loss": 0.4754,
      "step": 340
    },
    {
      "epoch": 1.4690831556503199,
      "grad_norm": 1.1605335474014282,
      "learning_rate": 2.5425948831550528e-05,
      "loss": 0.4845,
      "step": 345
    },
    {
      "epoch": 1.4904051172707888,
      "grad_norm": 1.0932228565216064,
      "learning_rate": 2.5273130541814446e-05,
      "loss": 0.4388,
      "step": 350
    },
    {
      "epoch": 1.511727078891258,
      "grad_norm": 1.064093828201294,
      "learning_rate": 2.511827705251075e-05,
      "loss": 0.4424,
      "step": 355
    },
    {
      "epoch": 1.533049040511727,
      "grad_norm": 1.204820156097412,
      "learning_rate": 2.496141904150859e-05,
      "loss": 0.4394,
      "step": 360
    },
    {
      "epoch": 1.5543710021321961,
      "grad_norm": 1.1509909629821777,
      "learning_rate": 2.4802587583790875e-05,
      "loss": 0.4334,
      "step": 365
    },
    {
      "epoch": 1.5756929637526653,
      "grad_norm": 1.1247512102127075,
      "learning_rate": 2.464181414529809e-05,
      "loss": 0.4587,
      "step": 370
    },
    {
      "epoch": 1.5970149253731343,
      "grad_norm": 1.1641768217086792,
      "learning_rate": 2.447913057669456e-05,
      "loss": 0.4453,
      "step": 375
    },
    {
      "epoch": 1.6183368869936035,
      "grad_norm": 1.1374614238739014,
      "learning_rate": 2.4314569107058572e-05,
      "loss": 0.4602,
      "step": 380
    },
    {
      "epoch": 1.6396588486140726,
      "grad_norm": 1.2946219444274902,
      "learning_rate": 2.4148162337497496e-05,
      "loss": 0.3953,
      "step": 385
    },
    {
      "epoch": 1.6609808102345416,
      "grad_norm": 1.1880476474761963,
      "learning_rate": 2.3979943234689226e-05,
      "loss": 0.4298,
      "step": 390
    },
    {
      "epoch": 1.6823027718550105,
      "grad_norm": 1.429024338722229,
      "learning_rate": 2.3809945124351162e-05,
      "loss": 0.4449,
      "step": 395
    },
    {
      "epoch": 1.7036247334754797,
      "grad_norm": 1.3434990644454956,
      "learning_rate": 2.36382016846381e-05,
      "loss": 0.4315,
      "step": 400
    },
    {
      "epoch": 1.724946695095949,
      "grad_norm": 1.0165458917617798,
      "learning_rate": 2.3464746939470288e-05,
      "loss": 0.4226,
      "step": 405
    },
    {
      "epoch": 1.7462686567164178,
      "grad_norm": 1.1595861911773682,
      "learning_rate": 2.3289615251792984e-05,
      "loss": 0.3784,
      "step": 410
    },
    {
      "epoch": 1.767590618336887,
      "grad_norm": 1.1516300439834595,
      "learning_rate": 2.3112841316768827e-05,
      "loss": 0.3978,
      "step": 415
    },
    {
      "epoch": 1.7889125799573562,
      "grad_norm": 1.2137658596038818,
      "learning_rate": 2.2934460154904436e-05,
      "loss": 0.3699,
      "step": 420
    },
    {
      "epoch": 1.8102345415778252,
      "grad_norm": 1.3457313776016235,
      "learning_rate": 2.2754507105112526e-05,
      "loss": 0.3552,
      "step": 425
    },
    {
      "epoch": 1.831556503198294,
      "grad_norm": 1.3003560304641724,
      "learning_rate": 2.257301781771095e-05,
      "loss": 0.3705,
      "step": 430
    },
    {
      "epoch": 1.8528784648187633,
      "grad_norm": 1.2907923460006714,
      "learning_rate": 2.2390028247360042e-05,
      "loss": 0.3763,
      "step": 435
    },
    {
      "epoch": 1.8742004264392325,
      "grad_norm": 1.1227895021438599,
      "learning_rate": 2.2205574645939684e-05,
      "loss": 0.3499,
      "step": 440
    },
    {
      "epoch": 1.8955223880597014,
      "grad_norm": 1.2199273109436035,
      "learning_rate": 2.201969355536749e-05,
      "loss": 0.3956,
      "step": 445
    },
    {
      "epoch": 1.9168443496801706,
      "grad_norm": 1.1481236219406128,
      "learning_rate": 2.183242180035951e-05,
      "loss": 0.3847,
      "step": 450
    },
    {
      "epoch": 1.9381663113006398,
      "grad_norm": 1.1786848306655884,
      "learning_rate": 2.1643796481134934e-05,
      "loss": 0.3925,
      "step": 455
    },
    {
      "epoch": 1.9594882729211087,
      "grad_norm": 1.4188814163208008,
      "learning_rate": 2.145385496606619e-05,
      "loss": 0.3548,
      "step": 460
    },
    {
      "epoch": 1.9808102345415777,
      "grad_norm": 1.1952940225601196,
      "learning_rate": 2.1262634884275948e-05,
      "loss": 0.3747,
      "step": 465
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.786958932876587,
      "learning_rate": 2.107017411818244e-05,
      "loss": 0.3113,
      "step": 470
    }
  ],
  "logging_steps": 5,
  "max_steps": 1175,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 2000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 6.802258677790147e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}