{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.003514722293004824, "eval_steps": 50, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 3.514722293004824e-05, "grad_norm": 3.201282024383545, "learning_rate": 2.9999999999999997e-05, "loss": 2.6534, "step": 1 }, { "epoch": 3.514722293004824e-05, "eval_loss": 3.5773849487304688, "eval_runtime": 122.8934, "eval_samples_per_second": 4.654, "eval_steps_per_second": 4.654, "step": 1 }, { "epoch": 7.029444586009648e-05, "grad_norm": 10.12950325012207, "learning_rate": 5.9999999999999995e-05, "loss": 3.5291, "step": 2 }, { "epoch": 0.00010544166879014472, "grad_norm": 8.2281494140625, "learning_rate": 8.999999999999999e-05, "loss": 2.8958, "step": 3 }, { "epoch": 0.00014058889172019297, "grad_norm": 7.624779224395752, "learning_rate": 0.00011999999999999999, "loss": 3.5229, "step": 4 }, { "epoch": 0.0001757361146502412, "grad_norm": 9.272777557373047, "learning_rate": 0.00015, "loss": 2.3524, "step": 5 }, { "epoch": 0.00021088333758028944, "grad_norm": 7.767298221588135, "learning_rate": 0.00017999999999999998, "loss": 3.0476, "step": 6 }, { "epoch": 0.0002460305605103377, "grad_norm": 7.62408447265625, "learning_rate": 0.00020999999999999998, "loss": 2.8094, "step": 7 }, { "epoch": 0.00028117778344038594, "grad_norm": 12.137682914733887, "learning_rate": 0.00023999999999999998, "loss": 2.9913, "step": 8 }, { "epoch": 0.00031632500637043413, "grad_norm": 6.323672294616699, "learning_rate": 0.00027, "loss": 2.3277, "step": 9 }, { "epoch": 0.0003514722293004824, "grad_norm": 7.7393903732299805, "learning_rate": 0.0003, "loss": 2.6589, "step": 10 }, { "epoch": 0.00038661945223053063, "grad_norm": 3.816528081893921, "learning_rate": 0.00029990862405286433, "loss": 3.1351, "step": 11 }, { "epoch": 0.0004217666751605789, "grad_norm": 6.924014091491699, "learning_rate": 0.0002996346075389736, "loss": 2.9356, "step": 12 }, { "epoch": 0.00045691389809062713, "grad_norm": 7.251121997833252, "learning_rate": 0.00029917828430524096, "loss": 2.9646, "step": 13 }, { "epoch": 0.0004920611210206754, "grad_norm": 6.662265300750732, "learning_rate": 0.0002985402103112355, "loss": 3.2145, "step": 14 }, { "epoch": 0.0005272083439507236, "grad_norm": 7.596660614013672, "learning_rate": 0.0002977211629518312, "loss": 3.0197, "step": 15 }, { "epoch": 0.0005623555668807719, "grad_norm": 4.958890914916992, "learning_rate": 0.0002967221401100708, "loss": 2.8559, "step": 16 }, { "epoch": 0.0005975027898108201, "grad_norm": 6.678516387939453, "learning_rate": 0.0002955443589413994, "loss": 2.4536, "step": 17 }, { "epoch": 0.0006326500127408683, "grad_norm": 8.238439559936523, "learning_rate": 0.0002941892543907478, "loss": 3.123, "step": 18 }, { "epoch": 0.0006677972356709165, "grad_norm": 8.204689025878906, "learning_rate": 0.00029265847744427303, "loss": 3.4216, "step": 19 }, { "epoch": 0.0007029444586009648, "grad_norm": 7.485687732696533, "learning_rate": 0.0002909538931178862, "loss": 3.3373, "step": 20 }, { "epoch": 0.000738091681531013, "grad_norm": 6.295429229736328, "learning_rate": 0.0002890775781850181, "loss": 3.3456, "step": 21 }, { "epoch": 0.0007732389044610613, "grad_norm": 7.376994609832764, "learning_rate": 0.0002870318186463901, "loss": 3.282, "step": 22 }, { "epoch": 0.0008083861273911095, "grad_norm": 6.7084641456604, "learning_rate": 0.000284819106944875, "loss": 2.7638, "step": 23 }, { "epoch": 0.0008435333503211578, "grad_norm": 7.342163562774658, "learning_rate": 0.000282442138928839, "loss": 2.8477, "step": 24 }, { "epoch": 0.000878680573251206, "grad_norm": 6.798054218292236, "learning_rate": 0.0002799038105676658, "loss": 2.7685, "step": 25 }, { "epoch": 0.0009138277961812543, "grad_norm": 6.962170600891113, "learning_rate": 0.00027720721442346387, "loss": 2.8994, "step": 26 }, { "epoch": 0.0009489750191113025, "grad_norm": 8.554988861083984, "learning_rate": 0.0002743556358832562, "loss": 2.9583, "step": 27 }, { "epoch": 0.0009841222420413508, "grad_norm": 6.769901275634766, "learning_rate": 0.0002713525491562421, "loss": 2.7998, "step": 28 }, { "epoch": 0.001019269464971399, "grad_norm": 9.057374000549316, "learning_rate": 0.00026820161304100823, "loss": 3.3269, "step": 29 }, { "epoch": 0.0010544166879014473, "grad_norm": 10.272748947143555, "learning_rate": 0.00026490666646784665, "loss": 3.6792, "step": 30 }, { "epoch": 0.0010895639108314955, "grad_norm": 7.183256149291992, "learning_rate": 0.00026147172382160914, "loss": 2.88, "step": 31 }, { "epoch": 0.0011247111337615438, "grad_norm": 9.834364891052246, "learning_rate": 0.00025790097005079764, "loss": 2.9482, "step": 32 }, { "epoch": 0.001159858356691592, "grad_norm": 10.326171875, "learning_rate": 0.0002541987555688496, "loss": 2.7405, "step": 33 }, { "epoch": 0.0011950055796216402, "grad_norm": 8.075050354003906, "learning_rate": 0.0002503695909538287, "loss": 3.0049, "step": 34 }, { "epoch": 0.0012301528025516883, "grad_norm": 7.11674165725708, "learning_rate": 0.0002464181414529809, "loss": 2.9412, "step": 35 }, { "epoch": 0.0012653000254817365, "grad_norm": 6.559268951416016, "learning_rate": 0.0002423492212988487, "loss": 2.6168, "step": 36 }, { "epoch": 0.0013004472484117848, "grad_norm": 11.436878204345703, "learning_rate": 0.00023816778784387094, "loss": 3.4648, "step": 37 }, { "epoch": 0.001335594471341833, "grad_norm": 8.354622840881348, "learning_rate": 0.00023387893552061199, "loss": 3.0553, "step": 38 }, { "epoch": 0.0013707416942718813, "grad_norm": 6.86464262008667, "learning_rate": 0.0002294878896349807, "loss": 2.8184, "step": 39 }, { "epoch": 0.0014058889172019295, "grad_norm": 7.403708457946777, "learning_rate": 0.000225, "loss": 2.6431, "step": 40 }, { "epoch": 0.0014410361401319778, "grad_norm": 8.896238327026367, "learning_rate": 0.00022042073441788358, "loss": 2.9453, "step": 41 }, { "epoch": 0.001476183363062026, "grad_norm": 9.727499008178711, "learning_rate": 0.0002157556720183616, "loss": 3.0855, "step": 42 }, { "epoch": 0.0015113305859920743, "grad_norm": 9.778864860534668, "learning_rate": 0.00021101049646137003, "loss": 3.0316, "step": 43 }, { "epoch": 0.0015464778089221225, "grad_norm": 19.503732681274414, "learning_rate": 0.0002061909890123868, "loss": 3.5855, "step": 44 }, { "epoch": 0.0015816250318521708, "grad_norm": 8.386335372924805, "learning_rate": 0.00020130302149885031, "loss": 3.2382, "step": 45 }, { "epoch": 0.001616772254782219, "grad_norm": 9.492271423339844, "learning_rate": 0.0001963525491562421, "loss": 1.8511, "step": 46 }, { "epoch": 0.0016519194777122673, "grad_norm": 9.34842300415039, "learning_rate": 0.00019134560337254986, "loss": 2.8563, "step": 47 }, { "epoch": 0.0016870667006423155, "grad_norm": 15.09698486328125, "learning_rate": 0.00018628828433995013, "loss": 3.2323, "step": 48 }, { "epoch": 0.0017222139235723638, "grad_norm": 12.457684516906738, "learning_rate": 0.00018118675362266385, "loss": 3.075, "step": 49 }, { "epoch": 0.001757361146502412, "grad_norm": 11.620329856872559, "learning_rate": 0.00017604722665003956, "loss": 3.3615, "step": 50 }, { "epoch": 0.001757361146502412, "eval_loss": 2.9333691596984863, "eval_runtime": 122.5269, "eval_samples_per_second": 4.668, "eval_steps_per_second": 4.668, "step": 50 }, { "epoch": 0.0017925083694324603, "grad_norm": 6.184074878692627, "learning_rate": 0.0001708759651440098, "loss": 3.1531, "step": 51 }, { "epoch": 0.0018276555923625085, "grad_norm": 10.926563262939453, "learning_rate": 0.000165679269490148, "loss": 3.6512, "step": 52 }, { "epoch": 0.0018628028152925568, "grad_norm": 6.391138553619385, "learning_rate": 0.00016046347106161876, "loss": 2.8123, "step": 53 }, { "epoch": 0.001897950038222605, "grad_norm": 3.9918131828308105, "learning_rate": 0.00015523492450537517, "loss": 2.6699, "step": 54 }, { "epoch": 0.0019330972611526533, "grad_norm": 5.609512805938721, "learning_rate": 0.00015, "loss": 2.3293, "step": 55 }, { "epoch": 0.0019682444840827015, "grad_norm": 5.8025031089782715, "learning_rate": 0.0001447650754946249, "loss": 2.0517, "step": 56 }, { "epoch": 0.0020033917070127498, "grad_norm": 7.59370756149292, "learning_rate": 0.00013953652893838119, "loss": 3.0335, "step": 57 }, { "epoch": 0.002038538929942798, "grad_norm": 7.709898948669434, "learning_rate": 0.000134320730509852, "loss": 3.0555, "step": 58 }, { "epoch": 0.0020736861528728463, "grad_norm": 5.140454292297363, "learning_rate": 0.0001291240348559902, "loss": 2.6652, "step": 59 }, { "epoch": 0.0021088333758028945, "grad_norm": 6.058025360107422, "learning_rate": 0.00012395277334996044, "loss": 2.7671, "step": 60 }, { "epoch": 0.0021439805987329428, "grad_norm": 9.216087341308594, "learning_rate": 0.00011881324637733611, "loss": 2.7076, "step": 61 }, { "epoch": 0.002179127821662991, "grad_norm": 8.128350257873535, "learning_rate": 0.00011371171566004985, "loss": 2.7331, "step": 62 }, { "epoch": 0.0022142750445930393, "grad_norm": 7.724212169647217, "learning_rate": 0.00010865439662745013, "loss": 2.625, "step": 63 }, { "epoch": 0.0022494222675230875, "grad_norm": 5.113654613494873, "learning_rate": 0.0001036474508437579, "loss": 2.2554, "step": 64 }, { "epoch": 0.0022845694904531358, "grad_norm": 7.291034698486328, "learning_rate": 9.869697850114969e-05, "loss": 2.0766, "step": 65 }, { "epoch": 0.002319716713383184, "grad_norm": 6.133238792419434, "learning_rate": 9.380901098761319e-05, "loss": 2.6534, "step": 66 }, { "epoch": 0.0023548639363132323, "grad_norm": 8.07371711730957, "learning_rate": 8.898950353862998e-05, "loss": 3.2567, "step": 67 }, { "epoch": 0.0023900111592432805, "grad_norm": 8.47569465637207, "learning_rate": 8.424432798163836e-05, "loss": 2.3998, "step": 68 }, { "epoch": 0.0024251583821733283, "grad_norm": 6.5739850997924805, "learning_rate": 7.957926558211642e-05, "loss": 2.8606, "step": 69 }, { "epoch": 0.0024603056051033766, "grad_norm": 10.502620697021484, "learning_rate": 7.500000000000002e-05, "loss": 2.7211, "step": 70 }, { "epoch": 0.002495452828033425, "grad_norm": 6.819766998291016, "learning_rate": 7.051211036501928e-05, "loss": 2.7132, "step": 71 }, { "epoch": 0.002530600050963473, "grad_norm": 8.332561492919922, "learning_rate": 6.612106447938799e-05, "loss": 3.0188, "step": 72 }, { "epoch": 0.0025657472738935213, "grad_norm": 7.98085355758667, "learning_rate": 6.183221215612904e-05, "loss": 1.8015, "step": 73 }, { "epoch": 0.0026008944968235696, "grad_norm": 7.219940662384033, "learning_rate": 5.765077870115125e-05, "loss": 2.4322, "step": 74 }, { "epoch": 0.002636041719753618, "grad_norm": 7.731595993041992, "learning_rate": 5.358185854701909e-05, "loss": 3.1412, "step": 75 }, { "epoch": 0.002671188942683666, "grad_norm": 6.916698932647705, "learning_rate": 4.963040904617131e-05, "loss": 4.0983, "step": 76 }, { "epoch": 0.0027063361656137143, "grad_norm": 7.314121246337891, "learning_rate": 4.5801244431150394e-05, "loss": 2.8968, "step": 77 }, { "epoch": 0.0027414833885437626, "grad_norm": 7.5785441398620605, "learning_rate": 4.209902994920235e-05, "loss": 2.643, "step": 78 }, { "epoch": 0.002776630611473811, "grad_norm": 6.763470649719238, "learning_rate": 3.852827617839084e-05, "loss": 2.8279, "step": 79 }, { "epoch": 0.002811777834403859, "grad_norm": 8.693899154663086, "learning_rate": 3.509333353215331e-05, "loss": 2.7579, "step": 80 }, { "epoch": 0.0028469250573339073, "grad_norm": 5.903140068054199, "learning_rate": 3.1798386958991714e-05, "loss": 2.596, "step": 81 }, { "epoch": 0.0028820722802639556, "grad_norm": 6.02622127532959, "learning_rate": 2.8647450843757897e-05, "loss": 2.5302, "step": 82 }, { "epoch": 0.002917219503194004, "grad_norm": 6.922908306121826, "learning_rate": 2.5644364116743755e-05, "loss": 2.5549, "step": 83 }, { "epoch": 0.002952366726124052, "grad_norm": 6.8705973625183105, "learning_rate": 2.2792785576536105e-05, "loss": 2.7103, "step": 84 }, { "epoch": 0.0029875139490541003, "grad_norm": 8.229166030883789, "learning_rate": 2.009618943233419e-05, "loss": 3.2369, "step": 85 }, { "epoch": 0.0030226611719841485, "grad_norm": 5.061288833618164, "learning_rate": 1.755786107116095e-05, "loss": 2.3396, "step": 86 }, { "epoch": 0.003057808394914197, "grad_norm": 5.73906946182251, "learning_rate": 1.5180893055124977e-05, "loss": 2.1163, "step": 87 }, { "epoch": 0.003092955617844245, "grad_norm": 6.186309814453125, "learning_rate": 1.2968181353609852e-05, "loss": 3.2023, "step": 88 }, { "epoch": 0.0031281028407742933, "grad_norm": 8.73239517211914, "learning_rate": 1.0922421814981901e-05, "loss": 2.935, "step": 89 }, { "epoch": 0.0031632500637043415, "grad_norm": 8.442344665527344, "learning_rate": 9.046106882113751e-06, "loss": 2.048, "step": 90 }, { "epoch": 0.00319839728663439, "grad_norm": 9.889983177185059, "learning_rate": 7.34152255572697e-06, "loss": 3.0998, "step": 91 }, { "epoch": 0.003233544509564438, "grad_norm": 7.554132461547852, "learning_rate": 5.810745609252165e-06, "loss": 2.7425, "step": 92 }, { "epoch": 0.0032686917324944863, "grad_norm": 18.795644760131836, "learning_rate": 4.455641058600528e-06, "loss": 2.7303, "step": 93 }, { "epoch": 0.0033038389554245345, "grad_norm": 9.423656463623047, "learning_rate": 3.2778598899291465e-06, "loss": 3.1626, "step": 94 }, { "epoch": 0.003338986178354583, "grad_norm": 10.255327224731445, "learning_rate": 2.2788370481687965e-06, "loss": 3.7099, "step": 95 }, { "epoch": 0.003374133401284631, "grad_norm": 7.863884925842285, "learning_rate": 1.4597896887644456e-06, "loss": 2.9709, "step": 96 }, { "epoch": 0.0034092806242146793, "grad_norm": 9.880104064941406, "learning_rate": 8.217156947590064e-07, "loss": 3.5264, "step": 97 }, { "epoch": 0.0034444278471447275, "grad_norm": 8.848705291748047, "learning_rate": 3.653924610263703e-07, "loss": 2.9102, "step": 98 }, { "epoch": 0.003479575070074776, "grad_norm": 8.545324325561523, "learning_rate": 9.137594713563568e-08, "loss": 2.644, "step": 99 }, { "epoch": 0.003514722293004824, "grad_norm": 14.05859088897705, "learning_rate": 0.0, "loss": 3.0055, "step": 100 }, { "epoch": 0.003514722293004824, "eval_loss": 2.7083585262298584, "eval_runtime": 122.4923, "eval_samples_per_second": 4.67, "eval_steps_per_second": 4.67, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8964338123538432.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }