{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.03385168729503861, "eval_steps": 20, "global_step": 80, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004231460911879826, "grad_norm": 1.6379058361053467, "learning_rate": 2e-05, "loss": 2.4891, "step": 1 }, { "epoch": 0.0004231460911879826, "eval_loss": 2.7103111743927, "eval_runtime": 63.2579, "eval_samples_per_second": 31.459, "eval_steps_per_second": 7.873, "step": 1 }, { "epoch": 0.0008462921823759652, "grad_norm": 1.6534020900726318, "learning_rate": 4e-05, "loss": 2.6718, "step": 2 }, { "epoch": 0.0012694382735639479, "grad_norm": 1.7026811838150024, "learning_rate": 6e-05, "loss": 2.6471, "step": 3 }, { "epoch": 0.0016925843647519305, "grad_norm": 1.6745214462280273, "learning_rate": 8e-05, "loss": 2.6711, "step": 4 }, { "epoch": 0.002115730455939913, "grad_norm": 1.5553100109100342, "learning_rate": 0.0001, "loss": 2.5945, "step": 5 }, { "epoch": 0.0025388765471278957, "grad_norm": 1.4465041160583496, "learning_rate": 0.00012, "loss": 2.6169, "step": 6 }, { "epoch": 0.0029620226383158784, "grad_norm": 1.3366378545761108, "learning_rate": 0.00014, "loss": 2.6392, "step": 7 }, { "epoch": 0.003385168729503861, "grad_norm": 1.3913536071777344, "learning_rate": 0.00016, "loss": 2.5899, "step": 8 }, { "epoch": 0.003808314820691844, "grad_norm": 1.4674654006958008, "learning_rate": 0.00018, "loss": 2.5501, "step": 9 }, { "epoch": 0.004231460911879826, "grad_norm": 1.5482183694839478, "learning_rate": 0.0002, "loss": 2.5371, "step": 10 }, { "epoch": 0.004654607003067809, "grad_norm": 1.5878816843032837, "learning_rate": 0.00019989930665413147, "loss": 2.55, "step": 11 }, { "epoch": 0.0050777530942557915, "grad_norm": 1.4205892086029053, "learning_rate": 0.00019959742939952392, "loss": 2.4969, "step": 12 }, { "epoch": 0.005500899185443774, "grad_norm": 1.4903907775878906, "learning_rate": 0.00019909497617679348, "loss": 2.3645, "step": 13 }, { "epoch": 0.005924045276631757, "grad_norm": 1.3923956155776978, "learning_rate": 0.00019839295885986296, "loss": 2.3951, "step": 14 }, { "epoch": 0.006347191367819739, "grad_norm": 1.3330631256103516, "learning_rate": 0.00019749279121818235, "loss": 2.4397, "step": 15 }, { "epoch": 0.006770337459007722, "grad_norm": 1.3482651710510254, "learning_rate": 0.00019639628606958533, "loss": 2.4354, "step": 16 }, { "epoch": 0.0071934835501957055, "grad_norm": 1.3089728355407715, "learning_rate": 0.00019510565162951537, "loss": 2.3894, "step": 17 }, { "epoch": 0.007616629641383688, "grad_norm": 1.2744436264038086, "learning_rate": 0.00019362348706397373, "loss": 2.3651, "step": 18 }, { "epoch": 0.00803977573257167, "grad_norm": 1.23836350440979, "learning_rate": 0.0001919527772551451, "loss": 2.2486, "step": 19 }, { "epoch": 0.008462921823759652, "grad_norm": 1.2774382829666138, "learning_rate": 0.0001900968867902419, "loss": 2.3211, "step": 20 }, { "epoch": 0.008462921823759652, "eval_loss": 2.3435285091400146, "eval_runtime": 72.0517, "eval_samples_per_second": 27.619, "eval_steps_per_second": 6.912, "step": 20 }, { "epoch": 0.008886067914947636, "grad_norm": 1.351232886314392, "learning_rate": 0.0001880595531856738, "loss": 2.3451, "step": 21 }, { "epoch": 0.009309214006135618, "grad_norm": 1.3749635219573975, "learning_rate": 0.00018584487936018661, "loss": 2.4502, "step": 22 }, { "epoch": 0.009732360097323601, "grad_norm": 1.4125924110412598, "learning_rate": 0.00018345732537213027, "loss": 2.4342, "step": 23 }, { "epoch": 0.010155506188511583, "grad_norm": 1.326550841331482, "learning_rate": 0.00018090169943749476, "loss": 2.3139, "step": 24 }, { "epoch": 0.010578652279699566, "grad_norm": 1.1940902471542358, "learning_rate": 0.000178183148246803, "loss": 2.3268, "step": 25 }, { "epoch": 0.011001798370887548, "grad_norm": 1.294498085975647, "learning_rate": 0.00017530714660036112, "loss": 2.2343, "step": 26 }, { "epoch": 0.011424944462075532, "grad_norm": 1.4341365098953247, "learning_rate": 0.00017227948638273916, "loss": 2.2412, "step": 27 }, { "epoch": 0.011848090553263513, "grad_norm": 1.4379862546920776, "learning_rate": 0.00016910626489868649, "loss": 2.3088, "step": 28 }, { "epoch": 0.012271236644451497, "grad_norm": 1.807862401008606, "learning_rate": 0.00016579387259397127, "loss": 2.3386, "step": 29 }, { "epoch": 0.012694382735639479, "grad_norm": 1.4435659646987915, "learning_rate": 0.00016234898018587337, "loss": 2.2997, "step": 30 }, { "epoch": 0.013117528826827462, "grad_norm": 1.380933403968811, "learning_rate": 0.00015877852522924732, "loss": 2.2587, "step": 31 }, { "epoch": 0.013540674918015444, "grad_norm": 1.3532695770263672, "learning_rate": 0.00015508969814521025, "loss": 2.389, "step": 32 }, { "epoch": 0.013963821009203427, "grad_norm": 1.3885740041732788, "learning_rate": 0.00015128992774059063, "loss": 2.2765, "step": 33 }, { "epoch": 0.014386967100391411, "grad_norm": 1.6315644979476929, "learning_rate": 0.00014738686624729986, "loss": 2.2364, "step": 34 }, { "epoch": 0.014810113191579393, "grad_norm": 1.5489228963851929, "learning_rate": 0.00014338837391175582, "loss": 2.1861, "step": 35 }, { "epoch": 0.015233259282767376, "grad_norm": 1.4338537454605103, "learning_rate": 0.00013930250316539238, "loss": 2.1853, "step": 36 }, { "epoch": 0.015656405373955358, "grad_norm": 1.3952263593673706, "learning_rate": 0.0001351374824081343, "loss": 2.2732, "step": 37 }, { "epoch": 0.01607955146514334, "grad_norm": 1.3308148384094238, "learning_rate": 0.00013090169943749476, "loss": 2.2248, "step": 38 }, { "epoch": 0.016502697556331325, "grad_norm": 1.477274775505066, "learning_rate": 0.00012660368455666752, "loss": 2.2161, "step": 39 }, { "epoch": 0.016925843647519305, "grad_norm": 1.3079551458358765, "learning_rate": 0.00012225209339563145, "loss": 2.1925, "step": 40 }, { "epoch": 0.016925843647519305, "eval_loss": 2.211884021759033, "eval_runtime": 75.9809, "eval_samples_per_second": 26.191, "eval_steps_per_second": 6.554, "step": 40 }, { "epoch": 0.01734898973870729, "grad_norm": 1.243667483329773, "learning_rate": 0.00011785568947986367, "loss": 2.282, "step": 41 }, { "epoch": 0.017772135829895272, "grad_norm": 5.15204381942749, "learning_rate": 0.00011342332658176555, "loss": 2.3092, "step": 42 }, { "epoch": 0.018195281921083255, "grad_norm": 1.3127551078796387, "learning_rate": 0.00010896393089034336, "loss": 2.185, "step": 43 }, { "epoch": 0.018618428012271235, "grad_norm": 1.3681222200393677, "learning_rate": 0.00010448648303505151, "loss": 2.2499, "step": 44 }, { "epoch": 0.01904157410345922, "grad_norm": 1.3882651329040527, "learning_rate": 0.0001, "loss": 2.168, "step": 45 }, { "epoch": 0.019464720194647202, "grad_norm": 1.2424044609069824, "learning_rate": 9.551351696494854e-05, "loss": 2.1857, "step": 46 }, { "epoch": 0.019887866285835186, "grad_norm": 1.449005365371704, "learning_rate": 9.103606910965666e-05, "loss": 2.3204, "step": 47 }, { "epoch": 0.020311012377023166, "grad_norm": 1.2062491178512573, "learning_rate": 8.657667341823448e-05, "loss": 2.1174, "step": 48 }, { "epoch": 0.02073415846821115, "grad_norm": 4.882323741912842, "learning_rate": 8.214431052013634e-05, "loss": 2.3162, "step": 49 }, { "epoch": 0.021157304559399133, "grad_norm": 1.4830926656723022, "learning_rate": 7.774790660436858e-05, "loss": 2.4003, "step": 50 }, { "epoch": 0.021580450650587116, "grad_norm": 1.2459235191345215, "learning_rate": 7.339631544333249e-05, "loss": 2.1419, "step": 51 }, { "epoch": 0.022003596741775096, "grad_norm": 1.3951412439346313, "learning_rate": 6.909830056250527e-05, "loss": 2.114, "step": 52 }, { "epoch": 0.02242674283296308, "grad_norm": 1.2715801000595093, "learning_rate": 6.486251759186572e-05, "loss": 2.0807, "step": 53 }, { "epoch": 0.022849888924151063, "grad_norm": 1.1952708959579468, "learning_rate": 6.069749683460765e-05, "loss": 2.1682, "step": 54 }, { "epoch": 0.023273035015339047, "grad_norm": 1.1247831583023071, "learning_rate": 5.6611626088244194e-05, "loss": 2.062, "step": 55 }, { "epoch": 0.023696181106527027, "grad_norm": 1.2911661863327026, "learning_rate": 5.261313375270014e-05, "loss": 2.234, "step": 56 }, { "epoch": 0.02411932719771501, "grad_norm": 1.1771955490112305, "learning_rate": 4.87100722594094e-05, "loss": 2.2042, "step": 57 }, { "epoch": 0.024542473288902994, "grad_norm": 1.1919941902160645, "learning_rate": 4.491030185478976e-05, "loss": 2.0736, "step": 58 }, { "epoch": 0.024965619380090977, "grad_norm": 1.3543719053268433, "learning_rate": 4.12214747707527e-05, "loss": 2.1515, "step": 59 }, { "epoch": 0.025388765471278957, "grad_norm": 1.123665452003479, "learning_rate": 3.7651019814126654e-05, "loss": 2.1566, "step": 60 }, { "epoch": 0.025388765471278957, "eval_loss": 2.16865873336792, "eval_runtime": 25.383, "eval_samples_per_second": 78.399, "eval_steps_per_second": 19.619, "step": 60 }, { "epoch": 0.02581191156246694, "grad_norm": 1.132775068283081, "learning_rate": 3.4206127406028745e-05, "loss": 2.2303, "step": 61 }, { "epoch": 0.026235057653654924, "grad_norm": 1.1115837097167969, "learning_rate": 3.089373510131354e-05, "loss": 2.1469, "step": 62 }, { "epoch": 0.026658203744842908, "grad_norm": 1.1179817914962769, "learning_rate": 2.7720513617260856e-05, "loss": 2.1541, "step": 63 }, { "epoch": 0.027081349836030888, "grad_norm": 1.3277086019515991, "learning_rate": 2.4692853399638917e-05, "loss": 2.1514, "step": 64 }, { "epoch": 0.02750449592721887, "grad_norm": 1.1748079061508179, "learning_rate": 2.181685175319702e-05, "loss": 2.0713, "step": 65 }, { "epoch": 0.027927642018406855, "grad_norm": 1.210524559020996, "learning_rate": 1.9098300562505266e-05, "loss": 2.1612, "step": 66 }, { "epoch": 0.02835078810959484, "grad_norm": 1.1560407876968384, "learning_rate": 1.6542674627869737e-05, "loss": 2.2039, "step": 67 }, { "epoch": 0.028773934200782822, "grad_norm": 1.1386845111846924, "learning_rate": 1.415512063981339e-05, "loss": 2.1411, "step": 68 }, { "epoch": 0.029197080291970802, "grad_norm": 1.0700095891952515, "learning_rate": 1.19404468143262e-05, "loss": 2.1263, "step": 69 }, { "epoch": 0.029620226383158785, "grad_norm": 1.1910362243652344, "learning_rate": 9.903113209758096e-06, "loss": 2.27, "step": 70 }, { "epoch": 0.03004337247434677, "grad_norm": 1.1575465202331543, "learning_rate": 8.047222744854943e-06, "loss": 2.156, "step": 71 }, { "epoch": 0.030466518565534752, "grad_norm": 1.1447267532348633, "learning_rate": 6.37651293602628e-06, "loss": 2.1257, "step": 72 }, { "epoch": 0.030889664656722732, "grad_norm": 1.1838666200637817, "learning_rate": 4.8943483704846475e-06, "loss": 2.2088, "step": 73 }, { "epoch": 0.031312810747910716, "grad_norm": 1.1437898874282837, "learning_rate": 3.6037139304146762e-06, "loss": 2.0872, "step": 74 }, { "epoch": 0.031735956839098696, "grad_norm": 1.1745190620422363, "learning_rate": 2.5072087818176382e-06, "loss": 2.1591, "step": 75 }, { "epoch": 0.03215910293028668, "grad_norm": 1.2256075143814087, "learning_rate": 1.6070411401370334e-06, "loss": 2.1358, "step": 76 }, { "epoch": 0.03258224902147466, "grad_norm": 1.3967663049697876, "learning_rate": 9.0502382320653e-07, "loss": 2.1687, "step": 77 }, { "epoch": 0.03300539511266265, "grad_norm": 1.205739140510559, "learning_rate": 4.025706004760932e-07, "loss": 2.2164, "step": 78 }, { "epoch": 0.03342854120385063, "grad_norm": 1.2898608446121216, "learning_rate": 1.0069334586854107e-07, "loss": 2.2109, "step": 79 }, { "epoch": 0.03385168729503861, "grad_norm": 1.1767884492874146, "learning_rate": 0.0, "loss": 2.2616, "step": 80 }, { "epoch": 0.03385168729503861, "eval_loss": 2.162363290786743, "eval_runtime": 25.534, "eval_samples_per_second": 77.935, "eval_steps_per_second": 19.503, "step": 80 } ], "logging_steps": 1, "max_steps": 80, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1476504563220480.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }