LLM-D2 / trainer_state.json
firdavsus's picture
Upload folder using huggingface_hub
c7b53ed verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.357279084551812,
"eval_steps": 500,
"global_step": 2500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.06,
"grad_norm": 1.6613423824310303,
"learning_rate": 6.361323155216286e-07,
"loss": 2.3639,
"step": 25
},
{
"epoch": 0.13,
"grad_norm": 1.3924915790557861,
"learning_rate": 1.2722646310432571e-06,
"loss": 2.3559,
"step": 50
},
{
"epoch": 0.19,
"grad_norm": 0.9411530494689941,
"learning_rate": 1.908396946564886e-06,
"loss": 2.3245,
"step": 75
},
{
"epoch": 0.25,
"grad_norm": 0.7608889937400818,
"learning_rate": 2.5445292620865143e-06,
"loss": 2.2691,
"step": 100
},
{
"epoch": 0.32,
"grad_norm": 0.8099371790885925,
"learning_rate": 3.1806615776081427e-06,
"loss": 2.2409,
"step": 125
},
{
"epoch": 0.38,
"grad_norm": 0.6795283555984497,
"learning_rate": 3.816793893129772e-06,
"loss": 2.2435,
"step": 150
},
{
"epoch": 0.45,
"grad_norm": 0.6041733622550964,
"learning_rate": 4.4529262086514e-06,
"loss": 2.2375,
"step": 175
},
{
"epoch": 0.51,
"grad_norm": 0.644061803817749,
"learning_rate": 5.0890585241730285e-06,
"loss": 2.2012,
"step": 200
},
{
"epoch": 0.57,
"grad_norm": 0.6609583497047424,
"learning_rate": 5.725190839694656e-06,
"loss": 2.213,
"step": 225
},
{
"epoch": 0.64,
"grad_norm": 0.6314918994903564,
"learning_rate": 6.3613231552162854e-06,
"loss": 2.2088,
"step": 250
},
{
"epoch": 0.7,
"grad_norm": 0.6217303276062012,
"learning_rate": 6.997455470737914e-06,
"loss": 2.1823,
"step": 275
},
{
"epoch": 0.76,
"grad_norm": 0.624250054359436,
"learning_rate": 7.633587786259543e-06,
"loss": 2.1685,
"step": 300
},
{
"epoch": 0.83,
"grad_norm": 0.692873477935791,
"learning_rate": 8.26972010178117e-06,
"loss": 2.166,
"step": 325
},
{
"epoch": 0.89,
"grad_norm": 0.6286394000053406,
"learning_rate": 8.9058524173028e-06,
"loss": 2.1521,
"step": 350
},
{
"epoch": 0.95,
"grad_norm": 0.6370307803153992,
"learning_rate": 9.54198473282443e-06,
"loss": 2.149,
"step": 375
},
{
"epoch": 1.02,
"grad_norm": 0.614319384098053,
"learning_rate": 9.999903358354628e-06,
"loss": 2.1586,
"step": 400
},
{
"epoch": 1.08,
"grad_norm": 0.6239405274391174,
"learning_rate": 9.997980516292023e-06,
"loss": 2.1352,
"step": 425
},
{
"epoch": 1.14,
"grad_norm": 0.6648218631744385,
"learning_rate": 9.99359341519765e-06,
"loss": 2.1352,
"step": 450
},
{
"epoch": 1.21,
"grad_norm": 0.6202364563941956,
"learning_rate": 9.986744218135864e-06,
"loss": 2.1187,
"step": 475
},
{
"epoch": 1.27,
"grad_norm": 0.6447356939315796,
"learning_rate": 9.977436302109771e-06,
"loss": 2.1135,
"step": 500
},
{
"epoch": 1.27,
"eval_loss": 1.9783179759979248,
"eval_runtime": 2.6728,
"eval_samples_per_second": 187.071,
"eval_steps_per_second": 23.571,
"step": 500
},
{
"epoch": 1.34,
"grad_norm": 0.6953230500221252,
"learning_rate": 9.96567425639619e-06,
"loss": 2.1071,
"step": 525
},
{
"epoch": 1.4,
"grad_norm": 0.6353166103363037,
"learning_rate": 9.951463880282912e-06,
"loss": 2.109,
"step": 550
},
{
"epoch": 1.46,
"grad_norm": 0.5800075531005859,
"learning_rate": 9.93481218020935e-06,
"loss": 2.1165,
"step": 575
},
{
"epoch": 1.53,
"grad_norm": 0.6457290053367615,
"learning_rate": 9.915727366312012e-06,
"loss": 2.1009,
"step": 600
},
{
"epoch": 1.59,
"grad_norm": 0.5942001938819885,
"learning_rate": 9.894218848376482e-06,
"loss": 2.1133,
"step": 625
},
{
"epoch": 1.65,
"grad_norm": 0.6016191244125366,
"learning_rate": 9.870297231197922e-06,
"loss": 2.111,
"step": 650
},
{
"epoch": 1.72,
"grad_norm": 0.6793413758277893,
"learning_rate": 9.843974309352356e-06,
"loss": 2.0791,
"step": 675
},
{
"epoch": 1.78,
"grad_norm": 0.6286611557006836,
"learning_rate": 9.81526306138136e-06,
"loss": 2.1128,
"step": 700
},
{
"epoch": 1.84,
"grad_norm": 0.6072127223014832,
"learning_rate": 9.784177643392958e-06,
"loss": 2.0818,
"step": 725
},
{
"epoch": 1.91,
"grad_norm": 0.581510066986084,
"learning_rate": 9.750733382081965e-06,
"loss": 2.0686,
"step": 750
},
{
"epoch": 1.97,
"grad_norm": 0.607868492603302,
"learning_rate": 9.714946767173124e-06,
"loss": 2.0733,
"step": 775
},
{
"epoch": 2.03,
"grad_norm": 0.7183848023414612,
"learning_rate": 9.676835443290842e-06,
"loss": 2.0666,
"step": 800
},
{
"epoch": 2.1,
"grad_norm": 0.6486113667488098,
"learning_rate": 9.63641820125949e-06,
"loss": 2.0623,
"step": 825
},
{
"epoch": 2.16,
"grad_norm": 0.6377106308937073,
"learning_rate": 9.593714968838568e-06,
"loss": 2.0564,
"step": 850
},
{
"epoch": 2.23,
"grad_norm": 0.5825337171554565,
"learning_rate": 9.548746800897305e-06,
"loss": 2.0636,
"step": 875
},
{
"epoch": 2.29,
"grad_norm": 0.6640869975090027,
"learning_rate": 9.501535869033537e-06,
"loss": 2.0637,
"step": 900
},
{
"epoch": 2.35,
"grad_norm": 0.604284405708313,
"learning_rate": 9.452105450641974e-06,
"loss": 2.063,
"step": 925
},
{
"epoch": 2.42,
"grad_norm": 0.6545228362083435,
"learning_rate": 9.400479917437267e-06,
"loss": 2.0379,
"step": 950
},
{
"epoch": 2.48,
"grad_norm": 0.6467187404632568,
"learning_rate": 9.346684723437504e-06,
"loss": 2.0654,
"step": 975
},
{
"epoch": 2.54,
"grad_norm": 0.6103580594062805,
"learning_rate": 9.290746392414084e-06,
"loss": 2.0503,
"step": 1000
},
{
"epoch": 2.54,
"eval_loss": 1.9211387634277344,
"eval_runtime": 2.6351,
"eval_samples_per_second": 189.749,
"eval_steps_per_second": 23.908,
"step": 1000
},
{
"epoch": 2.61,
"grad_norm": 0.6254684329032898,
"learning_rate": 9.232692504814154e-06,
"loss": 2.0655,
"step": 1025
},
{
"epoch": 2.67,
"grad_norm": 0.6028435230255127,
"learning_rate": 9.172551684162025e-06,
"loss": 2.0678,
"step": 1050
},
{
"epoch": 2.73,
"grad_norm": 0.609348714351654,
"learning_rate": 9.110353582946341e-06,
"loss": 2.0406,
"step": 1075
},
{
"epoch": 2.8,
"grad_norm": 0.6535661816596985,
"learning_rate": 9.046128867999867e-06,
"loss": 2.072,
"step": 1100
},
{
"epoch": 2.86,
"grad_norm": 0.6350586414337158,
"learning_rate": 8.979909205379198e-06,
"loss": 2.0436,
"step": 1125
},
{
"epoch": 2.92,
"grad_norm": 0.6003224849700928,
"learning_rate": 8.911727244751763e-06,
"loss": 2.0428,
"step": 1150
},
{
"epoch": 2.99,
"grad_norm": 0.6171026825904846,
"learning_rate": 8.84161660329789e-06,
"loss": 2.0466,
"step": 1175
},
{
"epoch": 3.05,
"grad_norm": 0.6239070892333984,
"learning_rate": 8.76961184913581e-06,
"loss": 2.0201,
"step": 1200
},
{
"epoch": 3.12,
"grad_norm": 0.6221436858177185,
"learning_rate": 8.695748484277833e-06,
"loss": 2.0281,
"step": 1225
},
{
"epoch": 3.18,
"grad_norm": 0.6175958514213562,
"learning_rate": 8.620062927126021e-06,
"loss": 2.0472,
"step": 1250
},
{
"epoch": 3.24,
"grad_norm": 0.6366366147994995,
"learning_rate": 8.54259249451608e-06,
"loss": 2.0326,
"step": 1275
},
{
"epoch": 3.31,
"grad_norm": 0.6660250425338745,
"learning_rate": 8.463375383318254e-06,
"loss": 2.0263,
"step": 1300
},
{
"epoch": 3.37,
"grad_norm": 0.5739914178848267,
"learning_rate": 8.382450651604316e-06,
"loss": 2.0265,
"step": 1325
},
{
"epoch": 3.43,
"grad_norm": 0.6790344715118408,
"learning_rate": 8.29985819938996e-06,
"loss": 2.0219,
"step": 1350
},
{
"epoch": 3.5,
"grad_norm": 0.6223481893539429,
"learning_rate": 8.215638748962047e-06,
"loss": 1.9994,
"step": 1375
},
{
"epoch": 3.56,
"grad_norm": 0.5811251997947693,
"learning_rate": 8.129833824800453e-06,
"loss": 2.0206,
"step": 1400
},
{
"epoch": 3.62,
"grad_norm": 0.5943218469619751,
"learning_rate": 8.042485733104382e-06,
"loss": 2.0131,
"step": 1425
},
{
"epoch": 3.69,
"grad_norm": 0.595011293888092,
"learning_rate": 7.953637540933252e-06,
"loss": 2.0231,
"step": 1450
},
{
"epoch": 3.75,
"grad_norm": 0.7110486030578613,
"learning_rate": 7.863333054972443e-06,
"loss": 2.0297,
"step": 1475
},
{
"epoch": 3.81,
"grad_norm": 0.6503390073776245,
"learning_rate": 7.771616799934372e-06,
"loss": 2.0163,
"step": 1500
},
{
"epoch": 3.81,
"eval_loss": 1.8906679153442383,
"eval_runtime": 2.6486,
"eval_samples_per_second": 188.781,
"eval_steps_per_second": 23.786,
"step": 1500
},
{
"epoch": 3.88,
"grad_norm": 0.6919596791267395,
"learning_rate": 7.67853399660553e-06,
"loss": 2.0236,
"step": 1525
},
{
"epoch": 3.94,
"grad_norm": 0.6393699645996094,
"learning_rate": 7.584130539550348e-06,
"loss": 2.0241,
"step": 1550
},
{
"epoch": 4.01,
"grad_norm": 0.6010494232177734,
"learning_rate": 7.488452974482818e-06,
"loss": 2.0123,
"step": 1575
},
{
"epoch": 4.07,
"grad_norm": 0.6905403733253479,
"learning_rate": 7.3915484753171055e-06,
"loss": 2.0073,
"step": 1600
},
{
"epoch": 4.13,
"grad_norm": 0.6041144728660583,
"learning_rate": 7.293464820908392e-06,
"loss": 2.0118,
"step": 1625
},
{
"epoch": 4.2,
"grad_norm": 0.6558405756950378,
"learning_rate": 7.194250371495467e-06,
"loss": 2.0059,
"step": 1650
},
{
"epoch": 4.26,
"grad_norm": 0.6472019553184509,
"learning_rate": 7.093954044856674e-06,
"loss": 1.9895,
"step": 1675
},
{
"epoch": 4.32,
"grad_norm": 0.6358299255371094,
"learning_rate": 6.992625292190942e-06,
"loss": 1.9934,
"step": 1700
},
{
"epoch": 4.39,
"grad_norm": 0.828366219997406,
"learning_rate": 6.89031407373584e-06,
"loss": 2.0051,
"step": 1725
},
{
"epoch": 4.45,
"grad_norm": 0.5889772176742554,
"learning_rate": 6.787070834134618e-06,
"loss": 2.0057,
"step": 1750
},
{
"epoch": 4.51,
"grad_norm": 0.6443700790405273,
"learning_rate": 6.682946477564438e-06,
"loss": 1.9983,
"step": 1775
},
{
"epoch": 4.58,
"grad_norm": 0.6555039286613464,
"learning_rate": 6.57799234263802e-06,
"loss": 2.0049,
"step": 1800
},
{
"epoch": 4.64,
"grad_norm": 0.5760651230812073,
"learning_rate": 6.47226017709109e-06,
"loss": 1.9864,
"step": 1825
},
{
"epoch": 4.7,
"grad_norm": 0.6189552545547485,
"learning_rate": 6.365802112268104e-06,
"loss": 2.0114,
"step": 1850
},
{
"epoch": 4.77,
"grad_norm": 0.5847841501235962,
"learning_rate": 6.258670637418851e-06,
"loss": 1.9923,
"step": 1875
},
{
"epoch": 4.83,
"grad_norm": 0.5979297757148743,
"learning_rate": 6.150918573818569e-06,
"loss": 1.9992,
"step": 1900
},
{
"epoch": 4.9,
"grad_norm": 0.6612520813941956,
"learning_rate": 6.042599048724366e-06,
"loss": 2.0062,
"step": 1925
},
{
"epoch": 4.96,
"grad_norm": 0.6389756202697754,
"learning_rate": 5.933765469180779e-06,
"loss": 1.9897,
"step": 1950
},
{
"epoch": 5.02,
"grad_norm": 0.5569688081741333,
"learning_rate": 5.82447149568738e-06,
"loss": 1.9913,
"step": 1975
},
{
"epoch": 5.09,
"grad_norm": 0.6268473267555237,
"learning_rate": 5.714771015741414e-06,
"loss": 1.9881,
"step": 2000
},
{
"epoch": 5.09,
"eval_loss": 1.872865915298462,
"eval_runtime": 2.6614,
"eval_samples_per_second": 187.873,
"eval_steps_per_second": 23.672,
"step": 2000
},
{
"epoch": 5.15,
"grad_norm": 0.6172594428062439,
"learning_rate": 5.604718117268515e-06,
"loss": 1.9858,
"step": 2025
},
{
"epoch": 5.21,
"grad_norm": 0.6328703165054321,
"learning_rate": 5.494367061954609e-06,
"loss": 1.9934,
"step": 2050
},
{
"epoch": 5.28,
"grad_norm": 0.6563747525215149,
"learning_rate": 5.383772258492135e-06,
"loss": 1.9751,
"step": 2075
},
{
"epoch": 5.34,
"grad_norm": 0.5623390078544617,
"learning_rate": 5.2729882357537864e-06,
"loss": 1.9911,
"step": 2100
},
{
"epoch": 5.4,
"grad_norm": 0.5955666303634644,
"learning_rate": 5.162069615906998e-06,
"loss": 1.9852,
"step": 2125
},
{
"epoch": 5.47,
"grad_norm": 0.6157717108726501,
"learning_rate": 5.051071087482442e-06,
"loss": 1.967,
"step": 2150
},
{
"epoch": 5.53,
"grad_norm": 0.7150459289550781,
"learning_rate": 4.940047378409786e-06,
"loss": 1.9798,
"step": 2175
},
{
"epoch": 5.59,
"grad_norm": 0.5714321732521057,
"learning_rate": 4.829053229034043e-06,
"loss": 1.9864,
"step": 2200
},
{
"epoch": 5.66,
"grad_norm": 0.5759787559509277,
"learning_rate": 4.718143365125784e-06,
"loss": 1.9802,
"step": 2225
},
{
"epoch": 5.72,
"grad_norm": 0.6573282480239868,
"learning_rate": 4.6073724708985575e-06,
"loss": 1.9851,
"step": 2250
},
{
"epoch": 5.79,
"grad_norm": 0.6285906434059143,
"learning_rate": 4.496795162046774e-06,
"loss": 1.9794,
"step": 2275
},
{
"epoch": 5.85,
"grad_norm": 0.645057201385498,
"learning_rate": 4.386465958817396e-06,
"loss": 1.9776,
"step": 2300
},
{
"epoch": 5.91,
"grad_norm": 0.6082957983016968,
"learning_rate": 4.276439259128667e-06,
"loss": 1.987,
"step": 2325
},
{
"epoch": 5.98,
"grad_norm": 0.6126915216445923,
"learning_rate": 4.1667693117491784e-06,
"loss": 1.9837,
"step": 2350
},
{
"epoch": 6.04,
"grad_norm": 0.5728232860565186,
"learning_rate": 4.057510189550456e-06,
"loss": 1.9822,
"step": 2375
},
{
"epoch": 6.1,
"grad_norm": 0.6158954501152039,
"learning_rate": 3.9487157628462784e-06,
"loss": 1.9709,
"step": 2400
},
{
"epoch": 6.17,
"grad_norm": 0.5914610028266907,
"learning_rate": 3.840439672831872e-06,
"loss": 1.9841,
"step": 2425
},
{
"epoch": 6.23,
"grad_norm": 0.6070579290390015,
"learning_rate": 3.7327353051360703e-06,
"loss": 1.965,
"step": 2450
},
{
"epoch": 6.29,
"grad_norm": 0.6792969107627869,
"learning_rate": 3.625655763499467e-06,
"loss": 1.9792,
"step": 2475
},
{
"epoch": 6.36,
"grad_norm": 0.5990138053894043,
"learning_rate": 3.5192538435915834e-06,
"loss": 1.9739,
"step": 2500
},
{
"epoch": 6.36,
"eval_loss": 1.862921118736267,
"eval_runtime": 2.6134,
"eval_samples_per_second": 191.32,
"eval_steps_per_second": 24.106,
"step": 2500
}
],
"logging_steps": 25,
"max_steps": 3930,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}