AFM-Koto / trainer_state.json
Delta-Vector's picture
Upload folder using huggingface_hub
e68cf39 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5002627430373096,
"eval_steps": 500,
"global_step": 238,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0021019442984760903,
"grad_norm": 0.7624253401313755,
"learning_rate": 0.0,
"loss": 2.2955,
"memory/device_mem_reserved(gib)": 68.22,
"memory/max_mem_active(gib)": 63.52,
"memory/max_mem_allocated(gib)": 62.82,
"step": 1
},
{
"epoch": 0.004203888596952181,
"grad_norm": 0.7189116302541813,
"learning_rate": 2e-08,
"loss": 2.2824,
"memory/device_mem_reserved(gib)": 68.31,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 2
},
{
"epoch": 0.006305832895428271,
"grad_norm": 0.8431595274838072,
"learning_rate": 4e-08,
"loss": 2.3101,
"memory/device_mem_reserved(gib)": 68.94,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 3
},
{
"epoch": 0.008407777193904361,
"grad_norm": 0.8637289443313003,
"learning_rate": 6e-08,
"loss": 2.3514,
"memory/device_mem_reserved(gib)": 68.94,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 4
},
{
"epoch": 0.010509721492380452,
"grad_norm": 0.8128827491990301,
"learning_rate": 8e-08,
"loss": 2.3621,
"memory/device_mem_reserved(gib)": 68.94,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 5
},
{
"epoch": 0.012611665790856543,
"grad_norm": 0.9504830158009488,
"learning_rate": 1e-07,
"loss": 2.4108,
"memory/device_mem_reserved(gib)": 68.94,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 6
},
{
"epoch": 0.014713610089332634,
"grad_norm": 0.9140479063802851,
"learning_rate": 1.2e-07,
"loss": 2.2224,
"memory/device_mem_reserved(gib)": 68.94,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 7
},
{
"epoch": 0.016815554387808723,
"grad_norm": 0.8632210617655338,
"learning_rate": 1.4e-07,
"loss": 2.3589,
"memory/device_mem_reserved(gib)": 68.94,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 8
},
{
"epoch": 0.018917498686284815,
"grad_norm": 0.8747745167339828,
"learning_rate": 1.6e-07,
"loss": 2.2411,
"memory/device_mem_reserved(gib)": 68.94,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 9
},
{
"epoch": 0.021019442984760904,
"grad_norm": 0.7739481898974889,
"learning_rate": 1.8e-07,
"loss": 2.2522,
"memory/device_mem_reserved(gib)": 68.94,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 10
},
{
"epoch": 0.023121387283236993,
"grad_norm": 0.8301921811025426,
"learning_rate": 2e-07,
"loss": 2.3565,
"memory/device_mem_reserved(gib)": 68.94,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 11
},
{
"epoch": 0.025223331581713086,
"grad_norm": 0.7677035533090953,
"learning_rate": 2.1999999999999998e-07,
"loss": 2.2208,
"memory/device_mem_reserved(gib)": 68.94,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 12
},
{
"epoch": 0.027325275880189175,
"grad_norm": 0.7834629656153209,
"learning_rate": 2.4e-07,
"loss": 2.2526,
"memory/device_mem_reserved(gib)": 68.94,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 13
},
{
"epoch": 0.029427220178665267,
"grad_norm": 0.776588932490268,
"learning_rate": 2.6e-07,
"loss": 2.2727,
"memory/device_mem_reserved(gib)": 68.94,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 14
},
{
"epoch": 0.03152916447714135,
"grad_norm": 0.7753598356301531,
"learning_rate": 2.8e-07,
"loss": 2.3564,
"memory/device_mem_reserved(gib)": 68.94,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 15
},
{
"epoch": 0.033631108775617445,
"grad_norm": 0.7165697716264268,
"learning_rate": 3e-07,
"loss": 2.3331,
"memory/device_mem_reserved(gib)": 68.94,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 16
},
{
"epoch": 0.03573305307409354,
"grad_norm": 0.729205845829164,
"learning_rate": 3.2e-07,
"loss": 2.322,
"memory/device_mem_reserved(gib)": 68.94,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 17
},
{
"epoch": 0.03783499737256963,
"grad_norm": 0.8495080537327478,
"learning_rate": 3.4000000000000003e-07,
"loss": 2.4952,
"memory/device_mem_reserved(gib)": 68.94,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 18
},
{
"epoch": 0.039936941671045716,
"grad_norm": 0.7578372584471679,
"learning_rate": 3.6e-07,
"loss": 2.3132,
"memory/device_mem_reserved(gib)": 68.94,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 19
},
{
"epoch": 0.04203888596952181,
"grad_norm": 0.6681608647353537,
"learning_rate": 3.7999999999999996e-07,
"loss": 2.3086,
"memory/device_mem_reserved(gib)": 68.94,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 20
},
{
"epoch": 0.0441408302679979,
"grad_norm": 0.7042221303721394,
"learning_rate": 4e-07,
"loss": 2.3644,
"memory/device_mem_reserved(gib)": 68.94,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 21
},
{
"epoch": 0.046242774566473986,
"grad_norm": 0.725951911870576,
"learning_rate": 4.1999999999999995e-07,
"loss": 2.3208,
"memory/device_mem_reserved(gib)": 68.94,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 22
},
{
"epoch": 0.04834471886495008,
"grad_norm": 0.6347256826610295,
"learning_rate": 4.3999999999999997e-07,
"loss": 2.328,
"memory/device_mem_reserved(gib)": 68.94,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 23
},
{
"epoch": 0.05044666316342617,
"grad_norm": 0.5682080423503054,
"learning_rate": 4.6e-07,
"loss": 2.2008,
"memory/device_mem_reserved(gib)": 68.94,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 24
},
{
"epoch": 0.05254860746190226,
"grad_norm": 0.5787647024012217,
"learning_rate": 4.8e-07,
"loss": 2.2841,
"memory/device_mem_reserved(gib)": 68.94,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 25
},
{
"epoch": 0.05465055176037835,
"grad_norm": 0.5807941293103913,
"learning_rate": 5e-07,
"loss": 2.2661,
"memory/device_mem_reserved(gib)": 68.94,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 26
},
{
"epoch": 0.05675249605885444,
"grad_norm": 0.5238787661221586,
"learning_rate": 5.2e-07,
"loss": 2.292,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 27
},
{
"epoch": 0.058854440357330534,
"grad_norm": 0.6607378470156829,
"learning_rate": 5.4e-07,
"loss": 2.346,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 28
},
{
"epoch": 0.06095638465580662,
"grad_norm": 0.5949442201958344,
"learning_rate": 5.6e-07,
"loss": 2.2856,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 29
},
{
"epoch": 0.0630583289542827,
"grad_norm": 0.6213907595973902,
"learning_rate": 5.8e-07,
"loss": 2.3527,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 30
},
{
"epoch": 0.0651602732527588,
"grad_norm": 0.6574213245120029,
"learning_rate": 6e-07,
"loss": 2.2896,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 31
},
{
"epoch": 0.06726221755123489,
"grad_norm": 0.7904069125236015,
"learning_rate": 6.2e-07,
"loss": 2.4192,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 32
},
{
"epoch": 0.06936416184971098,
"grad_norm": 0.6912774106481298,
"learning_rate": 6.4e-07,
"loss": 2.3085,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 33
},
{
"epoch": 0.07146610614818708,
"grad_norm": 0.6819796440725628,
"learning_rate": 6.6e-07,
"loss": 2.2756,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 34
},
{
"epoch": 0.07356805044666316,
"grad_norm": 0.7580978517321655,
"learning_rate": 6.800000000000001e-07,
"loss": 2.3645,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 35
},
{
"epoch": 0.07566999474513926,
"grad_norm": 0.6791446776516942,
"learning_rate": 7e-07,
"loss": 2.2628,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 36
},
{
"epoch": 0.07777193904361535,
"grad_norm": 0.569840280711906,
"learning_rate": 7.2e-07,
"loss": 2.2602,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 37
},
{
"epoch": 0.07987388334209143,
"grad_norm": 0.5498888556096215,
"learning_rate": 7.4e-07,
"loss": 2.3167,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 38
},
{
"epoch": 0.08197582764056753,
"grad_norm": 0.5268765090754575,
"learning_rate": 7.599999999999999e-07,
"loss": 2.2378,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 39
},
{
"epoch": 0.08407777193904362,
"grad_norm": 0.4848125502462646,
"learning_rate": 7.799999999999999e-07,
"loss": 2.2882,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 40
},
{
"epoch": 0.0861797162375197,
"grad_norm": 0.5814992292096023,
"learning_rate": 8e-07,
"loss": 2.3471,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 41
},
{
"epoch": 0.0882816605359958,
"grad_norm": 0.6166392360245904,
"learning_rate": 8.199999999999999e-07,
"loss": 2.441,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 42
},
{
"epoch": 0.09038360483447189,
"grad_norm": 0.6377322312855411,
"learning_rate": 8.399999999999999e-07,
"loss": 2.3912,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 43
},
{
"epoch": 0.09248554913294797,
"grad_norm": 0.5055719418643514,
"learning_rate": 8.599999999999999e-07,
"loss": 2.2561,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 44
},
{
"epoch": 0.09458749343142407,
"grad_norm": 0.49178646668795084,
"learning_rate": 8.799999999999999e-07,
"loss": 2.2599,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 45
},
{
"epoch": 0.09668943772990016,
"grad_norm": 0.47537370207387974,
"learning_rate": 9e-07,
"loss": 2.3064,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 46
},
{
"epoch": 0.09879138202837624,
"grad_norm": 0.5089053853006482,
"learning_rate": 9.2e-07,
"loss": 2.391,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 47
},
{
"epoch": 0.10089332632685234,
"grad_norm": 0.4728302009023318,
"learning_rate": 9.399999999999999e-07,
"loss": 2.3139,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 48
},
{
"epoch": 0.10299527062532843,
"grad_norm": 0.4974785018291372,
"learning_rate": 9.6e-07,
"loss": 2.3599,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 49
},
{
"epoch": 0.10509721492380451,
"grad_norm": 0.5140106787374947,
"learning_rate": 9.8e-07,
"loss": 2.4427,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 50
},
{
"epoch": 0.10719915922228061,
"grad_norm": 0.5361457578321233,
"learning_rate": 1e-06,
"loss": 2.3295,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 51
},
{
"epoch": 0.1093011035207567,
"grad_norm": 0.49844160829734835,
"learning_rate": 9.999863397100894e-07,
"loss": 2.2672,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 52
},
{
"epoch": 0.11140304781923278,
"grad_norm": 0.5385578770440957,
"learning_rate": 9.999453595867715e-07,
"loss": 2.3261,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 53
},
{
"epoch": 0.11350499211770888,
"grad_norm": 0.497092836247932,
"learning_rate": 9.998770618692484e-07,
"loss": 2.3326,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 54
},
{
"epoch": 0.11560693641618497,
"grad_norm": 0.539870187568986,
"learning_rate": 9.997814502893856e-07,
"loss": 2.2381,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 55
},
{
"epoch": 0.11770888071466107,
"grad_norm": 0.5073884711048833,
"learning_rate": 9.996585300715115e-07,
"loss": 2.3122,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 56
},
{
"epoch": 0.11981082501313715,
"grad_norm": 0.5162826315178152,
"learning_rate": 9.99508307932129e-07,
"loss": 2.2719,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 57
},
{
"epoch": 0.12191276931161324,
"grad_norm": 0.5135640558488429,
"learning_rate": 9.9933079207955e-07,
"loss": 2.4354,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 58
},
{
"epoch": 0.12401471361008934,
"grad_norm": 0.48495138081284994,
"learning_rate": 9.991259922134465e-07,
"loss": 2.2913,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 59
},
{
"epoch": 0.1261166579085654,
"grad_norm": 0.5031121760605395,
"learning_rate": 9.98893919524321e-07,
"loss": 2.293,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 60
},
{
"epoch": 0.1282186022070415,
"grad_norm": 0.4053272758920918,
"learning_rate": 9.98634586692894e-07,
"loss": 2.2873,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 61
},
{
"epoch": 0.1303205465055176,
"grad_norm": 0.4532646932853173,
"learning_rate": 9.983480078894123e-07,
"loss": 2.3065,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 62
},
{
"epoch": 0.13242249080399368,
"grad_norm": 0.4496821436560576,
"learning_rate": 9.98034198772874e-07,
"loss": 2.2886,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 63
},
{
"epoch": 0.13452443510246978,
"grad_norm": 0.48430661978532813,
"learning_rate": 9.976931764901733e-07,
"loss": 2.3404,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 64
},
{
"epoch": 0.13662637940094588,
"grad_norm": 0.5163168950805126,
"learning_rate": 9.97324959675163e-07,
"loss": 2.286,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 65
},
{
"epoch": 0.13872832369942195,
"grad_norm": 0.4385342628062273,
"learning_rate": 9.969295684476368e-07,
"loss": 2.2923,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 66
},
{
"epoch": 0.14083026799789805,
"grad_norm": 0.4476245967273303,
"learning_rate": 9.9650702441223e-07,
"loss": 2.2454,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 67
},
{
"epoch": 0.14293221229637415,
"grad_norm": 0.4493507785126621,
"learning_rate": 9.960573506572389e-07,
"loss": 2.3361,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 68
},
{
"epoch": 0.14503415659485025,
"grad_norm": 0.4676473798188462,
"learning_rate": 9.955805717533585e-07,
"loss": 2.3795,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 69
},
{
"epoch": 0.14713610089332632,
"grad_norm": 0.5003504816633514,
"learning_rate": 9.950767137523416e-07,
"loss": 2.3638,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 70
},
{
"epoch": 0.14923804519180242,
"grad_norm": 0.41298653135277646,
"learning_rate": 9.94545804185573e-07,
"loss": 2.2986,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 71
},
{
"epoch": 0.15133998949027852,
"grad_norm": 0.48549576119983434,
"learning_rate": 9.939878720625673e-07,
"loss": 2.3772,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 72
},
{
"epoch": 0.1534419337887546,
"grad_norm": 0.37069853589006974,
"learning_rate": 9.93402947869383e-07,
"loss": 2.2609,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 73
},
{
"epoch": 0.1555438780872307,
"grad_norm": 0.3822824223589903,
"learning_rate": 9.927910635669561e-07,
"loss": 2.3263,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 74
},
{
"epoch": 0.1576458223857068,
"grad_norm": 0.4645424064190486,
"learning_rate": 9.921522525893547e-07,
"loss": 2.421,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 75
},
{
"epoch": 0.15974776668418286,
"grad_norm": 0.40728550126377283,
"learning_rate": 9.91486549841951e-07,
"loss": 2.3488,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 76
},
{
"epoch": 0.16184971098265896,
"grad_norm": 0.39534534329560483,
"learning_rate": 9.907939916995152e-07,
"loss": 2.2277,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 77
},
{
"epoch": 0.16395165528113506,
"grad_norm": 0.3994213467776548,
"learning_rate": 9.900746160042272e-07,
"loss": 2.3751,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 78
},
{
"epoch": 0.16605359957961113,
"grad_norm": 0.3952978443639354,
"learning_rate": 9.893284620636098e-07,
"loss": 2.3407,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 79
},
{
"epoch": 0.16815554387808723,
"grad_norm": 0.3847266788854899,
"learning_rate": 9.88555570648379e-07,
"loss": 2.2882,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 80
},
{
"epoch": 0.17025748817656333,
"grad_norm": 0.3942404109616697,
"learning_rate": 9.877559839902183e-07,
"loss": 2.3809,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 81
},
{
"epoch": 0.1723594324750394,
"grad_norm": 0.3726144315608755,
"learning_rate": 9.869297457794698e-07,
"loss": 2.2965,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 82
},
{
"epoch": 0.1744613767735155,
"grad_norm": 0.4044669149844896,
"learning_rate": 9.860769011627474e-07,
"loss": 2.3778,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 83
},
{
"epoch": 0.1765633210719916,
"grad_norm": 0.44263984303122605,
"learning_rate": 9.851974967404702e-07,
"loss": 2.3655,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 84
},
{
"epoch": 0.17866526537046767,
"grad_norm": 0.3800348736088796,
"learning_rate": 9.842915805643156e-07,
"loss": 2.2951,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 85
},
{
"epoch": 0.18076720966894377,
"grad_norm": 0.38644114608168073,
"learning_rate": 9.833592021345937e-07,
"loss": 2.3567,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 86
},
{
"epoch": 0.18286915396741987,
"grad_norm": 0.5360672745714498,
"learning_rate": 9.824004123975434e-07,
"loss": 2.3769,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 87
},
{
"epoch": 0.18497109826589594,
"grad_norm": 0.3826183850679395,
"learning_rate": 9.814152637425477e-07,
"loss": 2.2676,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 88
},
{
"epoch": 0.18707304256437204,
"grad_norm": 0.3874657198676833,
"learning_rate": 9.804038099992716e-07,
"loss": 2.2044,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 89
},
{
"epoch": 0.18917498686284814,
"grad_norm": 0.42284650951618596,
"learning_rate": 9.793661064347204e-07,
"loss": 2.2791,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 90
},
{
"epoch": 0.19127693116132422,
"grad_norm": 0.4012146632153047,
"learning_rate": 9.783022097502203e-07,
"loss": 2.2554,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 91
},
{
"epoch": 0.19337887545980031,
"grad_norm": 0.37104574503246424,
"learning_rate": 9.772121780783201e-07,
"loss": 2.2696,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 92
},
{
"epoch": 0.19548081975827641,
"grad_norm": 0.4115506199685101,
"learning_rate": 9.76096070979614e-07,
"loss": 2.292,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 93
},
{
"epoch": 0.19758276405675249,
"grad_norm": 0.4949212386577297,
"learning_rate": 9.749539494394885e-07,
"loss": 2.3154,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 94
},
{
"epoch": 0.19968470835522859,
"grad_norm": 0.5305093079330326,
"learning_rate": 9.737858758647889e-07,
"loss": 2.3967,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 95
},
{
"epoch": 0.20178665265370468,
"grad_norm": 0.39802845026570083,
"learning_rate": 9.725919140804098e-07,
"loss": 2.3833,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 96
},
{
"epoch": 0.20388859695218076,
"grad_norm": 0.392517153138478,
"learning_rate": 9.713721293258078e-07,
"loss": 2.3458,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 97
},
{
"epoch": 0.20599054125065686,
"grad_norm": 0.38285765355194634,
"learning_rate": 9.70126588251436e-07,
"loss": 2.2321,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 98
},
{
"epoch": 0.20809248554913296,
"grad_norm": 0.42890083185292094,
"learning_rate": 9.688553589151037e-07,
"loss": 2.2823,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 99
},
{
"epoch": 0.21019442984760903,
"grad_norm": 0.3788992789108253,
"learning_rate": 9.675585107782555e-07,
"loss": 2.2955,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 100
},
{
"epoch": 0.21229637414608513,
"grad_norm": 0.40025954957804155,
"learning_rate": 9.66236114702178e-07,
"loss": 2.3454,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 101
},
{
"epoch": 0.21439831844456123,
"grad_norm": 0.4040329751371346,
"learning_rate": 9.648882429441256e-07,
"loss": 2.3362,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 102
},
{
"epoch": 0.2165002627430373,
"grad_norm": 0.35667806143435715,
"learning_rate": 9.635149691533747e-07,
"loss": 2.3089,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 103
},
{
"epoch": 0.2186022070415134,
"grad_norm": 0.42503183804867145,
"learning_rate": 9.621163683671978e-07,
"loss": 2.3024,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 104
},
{
"epoch": 0.2207041513399895,
"grad_norm": 0.3833710476470682,
"learning_rate": 9.606925170067636e-07,
"loss": 2.2944,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 105
},
{
"epoch": 0.22280609563846557,
"grad_norm": 0.38645757625412946,
"learning_rate": 9.592434928729615e-07,
"loss": 2.2595,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 106
},
{
"epoch": 0.22490803993694167,
"grad_norm": 0.37424692884672933,
"learning_rate": 9.577693751421505e-07,
"loss": 2.3025,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 107
},
{
"epoch": 0.22700998423541777,
"grad_norm": 0.3776781851494623,
"learning_rate": 9.562702443618331e-07,
"loss": 2.2724,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 108
},
{
"epoch": 0.22911192853389384,
"grad_norm": 0.392564325121222,
"learning_rate": 9.547461824462533e-07,
"loss": 2.3737,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 109
},
{
"epoch": 0.23121387283236994,
"grad_norm": 0.3480699195596026,
"learning_rate": 9.531972726719215e-07,
"loss": 2.2591,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 110
},
{
"epoch": 0.23331581713084604,
"grad_norm": 0.3563697131151561,
"learning_rate": 9.516235996730644e-07,
"loss": 2.3639,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 111
},
{
"epoch": 0.23541776142932214,
"grad_norm": 0.4943435915920374,
"learning_rate": 9.500252494369991e-07,
"loss": 2.3605,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 112
},
{
"epoch": 0.2375197057277982,
"grad_norm": 0.3975018845059572,
"learning_rate": 9.484023092994364e-07,
"loss": 2.4139,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 113
},
{
"epoch": 0.2396216500262743,
"grad_norm": 0.37314820478206834,
"learning_rate": 9.467548679397071e-07,
"loss": 2.293,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 114
},
{
"epoch": 0.2417235943247504,
"grad_norm": 0.39240855452269136,
"learning_rate": 9.450830153759176e-07,
"loss": 2.3568,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 115
},
{
"epoch": 0.24382553862322648,
"grad_norm": 0.36202032847414545,
"learning_rate": 9.433868429600309e-07,
"loss": 2.36,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 116
},
{
"epoch": 0.24592748292170258,
"grad_norm": 0.3852500669591038,
"learning_rate": 9.416664433728748e-07,
"loss": 2.335,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 117
},
{
"epoch": 0.24802942722017868,
"grad_norm": 0.35255828976101133,
"learning_rate": 9.399219106190775e-07,
"loss": 2.3367,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 118
},
{
"epoch": 0.2501313715186548,
"grad_norm": 0.4145689168519548,
"learning_rate": 9.381533400219317e-07,
"loss": 2.3807,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 119
},
{
"epoch": 0.2522333158171308,
"grad_norm": 0.3638037446306906,
"learning_rate": 9.363608282181861e-07,
"loss": 2.2441,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 120
},
{
"epoch": 0.2543352601156069,
"grad_norm": 0.3892269635122991,
"learning_rate": 9.345444731527641e-07,
"loss": 2.3285,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 121
},
{
"epoch": 0.256437204414083,
"grad_norm": 0.3848382071231666,
"learning_rate": 9.327043740734128e-07,
"loss": 2.2713,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 122
},
{
"epoch": 0.2585391487125591,
"grad_norm": 0.3602411460013298,
"learning_rate": 9.308406315252798e-07,
"loss": 2.32,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 123
},
{
"epoch": 0.2606410930110352,
"grad_norm": 0.36833348877975325,
"learning_rate": 9.289533473454192e-07,
"loss": 2.1967,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 124
},
{
"epoch": 0.2627430373095113,
"grad_norm": 0.3585772049526573,
"learning_rate": 9.270426246572272e-07,
"loss": 2.3642,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 125
},
{
"epoch": 0.26484498160798736,
"grad_norm": 0.34020805834208123,
"learning_rate": 9.251085678648071e-07,
"loss": 2.237,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 126
},
{
"epoch": 0.26694692590646346,
"grad_norm": 0.38311234004852174,
"learning_rate": 9.23151282647265e-07,
"loss": 2.2439,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 127
},
{
"epoch": 0.26904887020493956,
"grad_norm": 0.40490379874064303,
"learning_rate": 9.211708759529346e-07,
"loss": 2.3447,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 128
},
{
"epoch": 0.27115081450341566,
"grad_norm": 0.38814226346705333,
"learning_rate": 9.191674559935347e-07,
"loss": 2.2642,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 129
},
{
"epoch": 0.27325275880189176,
"grad_norm": 0.392535018684069,
"learning_rate": 9.171411322382551e-07,
"loss": 2.4222,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 130
},
{
"epoch": 0.27535470310036786,
"grad_norm": 0.36293069595975763,
"learning_rate": 9.150920154077753e-07,
"loss": 2.2375,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 131
},
{
"epoch": 0.2774566473988439,
"grad_norm": 0.3827224228744126,
"learning_rate": 9.130202174682153e-07,
"loss": 2.3121,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 132
},
{
"epoch": 0.27955859169732,
"grad_norm": 0.39154739093650776,
"learning_rate": 9.109258516250171e-07,
"loss": 2.3246,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 133
},
{
"epoch": 0.2816605359957961,
"grad_norm": 0.35430283896633147,
"learning_rate": 9.08809032316759e-07,
"loss": 2.2922,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 134
},
{
"epoch": 0.2837624802942722,
"grad_norm": 0.39840449294712393,
"learning_rate": 9.066698752089028e-07,
"loss": 2.34,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 135
},
{
"epoch": 0.2858644245927483,
"grad_norm": 0.3657527770786503,
"learning_rate": 9.045084971874737e-07,
"loss": 2.3127,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 136
},
{
"epoch": 0.2879663688912244,
"grad_norm": 0.40390275500061623,
"learning_rate": 9.02325016352673e-07,
"loss": 2.2761,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 137
},
{
"epoch": 0.2900683131897005,
"grad_norm": 0.3486049947257035,
"learning_rate": 9.001195520124255e-07,
"loss": 2.2909,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 138
},
{
"epoch": 0.29217025748817654,
"grad_norm": 0.369271223650673,
"learning_rate": 8.978922246758606e-07,
"loss": 2.3146,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 139
},
{
"epoch": 0.29427220178665264,
"grad_norm": 0.34559908408986584,
"learning_rate": 8.956431560467266e-07,
"loss": 2.3861,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 140
},
{
"epoch": 0.29637414608512874,
"grad_norm": 0.40663130251420515,
"learning_rate": 8.933724690167416e-07,
"loss": 2.3351,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 141
},
{
"epoch": 0.29847609038360484,
"grad_norm": 0.36345254242299446,
"learning_rate": 8.910802876588781e-07,
"loss": 2.2782,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 142
},
{
"epoch": 0.30057803468208094,
"grad_norm": 0.38393881395986873,
"learning_rate": 8.887667372205838e-07,
"loss": 2.2808,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 143
},
{
"epoch": 0.30267997898055704,
"grad_norm": 0.35972360098945216,
"learning_rate": 8.864319441169372e-07,
"loss": 2.2753,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 144
},
{
"epoch": 0.3047819232790331,
"grad_norm": 0.4197014359486705,
"learning_rate": 8.840760359237411e-07,
"loss": 2.3163,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 145
},
{
"epoch": 0.3068838675775092,
"grad_norm": 0.3698464136493578,
"learning_rate": 8.816991413705514e-07,
"loss": 2.3585,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 146
},
{
"epoch": 0.3089858118759853,
"grad_norm": 0.38628726944167563,
"learning_rate": 8.793013903336427e-07,
"loss": 2.2954,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 147
},
{
"epoch": 0.3110877561744614,
"grad_norm": 0.33899721461114324,
"learning_rate": 8.768829138289122e-07,
"loss": 2.2799,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 148
},
{
"epoch": 0.3131897004729375,
"grad_norm": 0.39286568836433555,
"learning_rate": 8.744438440047206e-07,
"loss": 2.3867,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 149
},
{
"epoch": 0.3152916447714136,
"grad_norm": 0.36680644419068636,
"learning_rate": 8.719843141346717e-07,
"loss": 2.2539,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 150
},
{
"epoch": 0.3173935890698896,
"grad_norm": 0.4226555891529418,
"learning_rate": 8.695044586103295e-07,
"loss": 2.4062,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 151
},
{
"epoch": 0.3194955333683657,
"grad_norm": 0.343763121119237,
"learning_rate": 8.67004412933876e-07,
"loss": 2.2993,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 152
},
{
"epoch": 0.3215974776668418,
"grad_norm": 0.34716852552812194,
"learning_rate": 8.644843137107057e-07,
"loss": 2.3404,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 153
},
{
"epoch": 0.3236994219653179,
"grad_norm": 0.3968883598563259,
"learning_rate": 8.619442986419629e-07,
"loss": 2.3012,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 154
},
{
"epoch": 0.325801366263794,
"grad_norm": 0.33889705699000894,
"learning_rate": 8.593845065170163e-07,
"loss": 2.2621,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 155
},
{
"epoch": 0.3279033105622701,
"grad_norm": 0.351512969072057,
"learning_rate": 8.568050772058761e-07,
"loss": 2.357,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 156
},
{
"epoch": 0.33000525486074617,
"grad_norm": 0.3668822383961036,
"learning_rate": 8.542061516515511e-07,
"loss": 2.3499,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 157
},
{
"epoch": 0.33210719915922227,
"grad_norm": 0.3813104081247767,
"learning_rate": 8.515878718623473e-07,
"loss": 2.3762,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 158
},
{
"epoch": 0.33420914345769837,
"grad_norm": 0.3555623840160132,
"learning_rate": 8.489503809041087e-07,
"loss": 2.2511,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 159
},
{
"epoch": 0.33631108775617446,
"grad_norm": 0.3426684159571787,
"learning_rate": 8.462938228923999e-07,
"loss": 2.3354,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 160
},
{
"epoch": 0.33841303205465056,
"grad_norm": 0.3491214060448838,
"learning_rate": 8.436183429846313e-07,
"loss": 2.2395,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 161
},
{
"epoch": 0.34051497635312666,
"grad_norm": 0.4563165572220967,
"learning_rate": 8.409240873721276e-07,
"loss": 2.3872,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 162
},
{
"epoch": 0.3426169206516027,
"grad_norm": 0.33319192852547314,
"learning_rate": 8.382112032721398e-07,
"loss": 2.3122,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 163
},
{
"epoch": 0.3447188649500788,
"grad_norm": 0.38388679911113793,
"learning_rate": 8.354798389198012e-07,
"loss": 2.3693,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 164
},
{
"epoch": 0.3468208092485549,
"grad_norm": 0.39313380831692907,
"learning_rate": 8.327301435600272e-07,
"loss": 2.3085,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 165
},
{
"epoch": 0.348922753547031,
"grad_norm": 0.41915800484281546,
"learning_rate": 8.299622674393614e-07,
"loss": 2.3851,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 166
},
{
"epoch": 0.3510246978455071,
"grad_norm": 0.35317676640002343,
"learning_rate": 8.271763617977641e-07,
"loss": 2.2271,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 167
},
{
"epoch": 0.3531266421439832,
"grad_norm": 0.3528294909167197,
"learning_rate": 8.243725788603508e-07,
"loss": 2.3087,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 168
},
{
"epoch": 0.35522858644245925,
"grad_norm": 0.38320812537843413,
"learning_rate": 8.215510718290723e-07,
"loss": 2.2441,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 169
},
{
"epoch": 0.35733053074093535,
"grad_norm": 0.403856066195845,
"learning_rate": 8.187119948743449e-07,
"loss": 2.3326,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 170
},
{
"epoch": 0.35943247503941145,
"grad_norm": 0.35785694946938973,
"learning_rate": 8.158555031266254e-07,
"loss": 2.332,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 171
},
{
"epoch": 0.36153441933788755,
"grad_norm": 0.34400894235353774,
"learning_rate": 8.129817526679357e-07,
"loss": 2.2897,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 172
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.4126959309133071,
"learning_rate": 8.100909005233334e-07,
"loss": 2.3507,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 173
},
{
"epoch": 0.36573830793483975,
"grad_norm": 0.42717818377517935,
"learning_rate": 8.071831046523318e-07,
"loss": 2.3917,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 174
},
{
"epoch": 0.3678402522333158,
"grad_norm": 0.3579933408679328,
"learning_rate": 8.042585239402697e-07,
"loss": 2.2518,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 175
},
{
"epoch": 0.3699421965317919,
"grad_norm": 0.39551576662619,
"learning_rate": 8.013173181896282e-07,
"loss": 2.4125,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 176
},
{
"epoch": 0.372044140830268,
"grad_norm": 0.365049869078283,
"learning_rate": 7.983596481113005e-07,
"loss": 2.2727,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 177
},
{
"epoch": 0.3741460851287441,
"grad_norm": 0.4011873410931577,
"learning_rate": 7.953856753158094e-07,
"loss": 2.3436,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 178
},
{
"epoch": 0.3762480294272202,
"grad_norm": 0.3842765318105432,
"learning_rate": 7.923955623044775e-07,
"loss": 2.3529,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 179
},
{
"epoch": 0.3783499737256963,
"grad_norm": 0.3554244239833299,
"learning_rate": 7.893894724605468e-07,
"loss": 2.2397,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 180
},
{
"epoch": 0.3804519180241724,
"grad_norm": 0.4463726239175773,
"learning_rate": 7.863675700402526e-07,
"loss": 2.3635,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 181
},
{
"epoch": 0.38255386232264843,
"grad_norm": 0.36826384823166514,
"learning_rate": 7.833300201638474e-07,
"loss": 2.3262,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 182
},
{
"epoch": 0.38465580662112453,
"grad_norm": 0.40131324496051124,
"learning_rate": 7.802769888065789e-07,
"loss": 2.3718,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 183
},
{
"epoch": 0.38675775091960063,
"grad_norm": 0.3849897857523413,
"learning_rate": 7.772086427896211e-07,
"loss": 2.2332,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 184
},
{
"epoch": 0.38885969521807673,
"grad_norm": 0.36493755016771345,
"learning_rate": 7.741251497709583e-07,
"loss": 2.3377,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 185
},
{
"epoch": 0.39096163951655283,
"grad_norm": 0.35102308079227,
"learning_rate": 7.710266782362247e-07,
"loss": 2.3105,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 186
},
{
"epoch": 0.3930635838150289,
"grad_norm": 0.38998005813653297,
"learning_rate": 7.679133974894982e-07,
"loss": 2.3349,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 187
},
{
"epoch": 0.39516552811350497,
"grad_norm": 0.379125931962091,
"learning_rate": 7.647854776440495e-07,
"loss": 2.2724,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 188
},
{
"epoch": 0.39726747241198107,
"grad_norm": 0.3947787843638888,
"learning_rate": 7.616430896130455e-07,
"loss": 2.337,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 189
},
{
"epoch": 0.39936941671045717,
"grad_norm": 0.37487637035067606,
"learning_rate": 7.584864051002126e-07,
"loss": 2.3746,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 190
},
{
"epoch": 0.40147136100893327,
"grad_norm": 0.366432821290813,
"learning_rate": 7.553155965904534e-07,
"loss": 2.3042,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 191
},
{
"epoch": 0.40357330530740937,
"grad_norm": 0.34566325498775646,
"learning_rate": 7.521308373404217e-07,
"loss": 2.2799,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 192
},
{
"epoch": 0.40567524960588547,
"grad_norm": 0.360571812024321,
"learning_rate": 7.489323013690561e-07,
"loss": 2.1848,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 193
},
{
"epoch": 0.4077771939043615,
"grad_norm": 0.38102049467871574,
"learning_rate": 7.457201634480712e-07,
"loss": 2.3506,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 194
},
{
"epoch": 0.4098791382028376,
"grad_norm": 0.4157458990557322,
"learning_rate": 7.424945990924079e-07,
"loss": 2.2602,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 195
},
{
"epoch": 0.4119810825013137,
"grad_norm": 0.3815988927632132,
"learning_rate": 7.392557845506432e-07,
"loss": 2.39,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 196
},
{
"epoch": 0.4140830267997898,
"grad_norm": 0.3575553199145919,
"learning_rate": 7.360038967953597e-07,
"loss": 2.3257,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 197
},
{
"epoch": 0.4161849710982659,
"grad_norm": 0.37179609481335857,
"learning_rate": 7.327391135134749e-07,
"loss": 2.3281,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 198
},
{
"epoch": 0.418286915396742,
"grad_norm": 0.35686209920084183,
"learning_rate": 7.294616130965336e-07,
"loss": 2.2884,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 199
},
{
"epoch": 0.42038885969521805,
"grad_norm": 0.3966932403444605,
"learning_rate": 7.261715746309593e-07,
"loss": 2.3668,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 200
},
{
"epoch": 0.42249080399369415,
"grad_norm": 0.37119910362955255,
"learning_rate": 7.228691778882692e-07,
"loss": 2.216,
"memory/device_mem_reserved(gib)": 69.0,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 201
},
{
"epoch": 0.42459274829217025,
"grad_norm": 0.34835664991688975,
"learning_rate": 7.195546033152506e-07,
"loss": 2.3013,
"memory/device_mem_reserved(gib)": 69.04,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 202
},
{
"epoch": 0.42669469259064635,
"grad_norm": 0.36756486717782244,
"learning_rate": 7.162280320241019e-07,
"loss": 2.2983,
"memory/device_mem_reserved(gib)": 69.04,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 203
},
{
"epoch": 0.42879663688912245,
"grad_norm": 0.3580849549174155,
"learning_rate": 7.128896457825363e-07,
"loss": 2.2168,
"memory/device_mem_reserved(gib)": 69.04,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 204
},
{
"epoch": 0.43089858118759855,
"grad_norm": 0.38919198730377413,
"learning_rate": 7.095396270038492e-07,
"loss": 2.3673,
"memory/device_mem_reserved(gib)": 69.04,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 205
},
{
"epoch": 0.4330005254860746,
"grad_norm": 0.45321125836545045,
"learning_rate": 7.061781587369518e-07,
"loss": 2.2495,
"memory/device_mem_reserved(gib)": 69.04,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 206
},
{
"epoch": 0.4351024697845507,
"grad_norm": 0.3390158068246942,
"learning_rate": 7.028054246563678e-07,
"loss": 2.2959,
"memory/device_mem_reserved(gib)": 69.04,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 207
},
{
"epoch": 0.4372044140830268,
"grad_norm": 0.3932391160329032,
"learning_rate": 6.99421609052199e-07,
"loss": 2.3348,
"memory/device_mem_reserved(gib)": 69.04,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 208
},
{
"epoch": 0.4393063583815029,
"grad_norm": 0.35196191595880966,
"learning_rate": 6.960268968200538e-07,
"loss": 2.3416,
"memory/device_mem_reserved(gib)": 69.04,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 209
},
{
"epoch": 0.441408302679979,
"grad_norm": 0.3970691259787115,
"learning_rate": 6.92621473450945e-07,
"loss": 2.3328,
"memory/device_mem_reserved(gib)": 69.04,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 210
},
{
"epoch": 0.4435102469784551,
"grad_norm": 0.3362178241906251,
"learning_rate": 6.892055250211551e-07,
"loss": 2.2666,
"memory/device_mem_reserved(gib)": 69.04,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 211
},
{
"epoch": 0.44561219127693114,
"grad_norm": 0.3370093871143424,
"learning_rate": 6.857792381820672e-07,
"loss": 2.3654,
"memory/device_mem_reserved(gib)": 69.04,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 212
},
{
"epoch": 0.44771413557540723,
"grad_norm": 0.3439137104265468,
"learning_rate": 6.823428001499676e-07,
"loss": 2.3236,
"memory/device_mem_reserved(gib)": 69.1,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 213
},
{
"epoch": 0.44981607987388333,
"grad_norm": 0.363363512278423,
"learning_rate": 6.788963986958152e-07,
"loss": 2.3153,
"memory/device_mem_reserved(gib)": 69.1,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 214
},
{
"epoch": 0.45191802417235943,
"grad_norm": 0.3550856155819428,
"learning_rate": 6.754402221349825e-07,
"loss": 2.3337,
"memory/device_mem_reserved(gib)": 69.1,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 215
},
{
"epoch": 0.45401996847083553,
"grad_norm": 0.43364470288014306,
"learning_rate": 6.71974459316964e-07,
"loss": 2.3817,
"memory/device_mem_reserved(gib)": 69.1,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 216
},
{
"epoch": 0.45612191276931163,
"grad_norm": 0.3594186404992842,
"learning_rate": 6.684992996150598e-07,
"loss": 2.282,
"memory/device_mem_reserved(gib)": 69.1,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 217
},
{
"epoch": 0.4582238570677877,
"grad_norm": 0.348193721582919,
"learning_rate": 6.650149329160257e-07,
"loss": 2.3266,
"memory/device_mem_reserved(gib)": 69.1,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 218
},
{
"epoch": 0.4603258013662638,
"grad_norm": 0.36563818617010935,
"learning_rate": 6.615215496096986e-07,
"loss": 2.2706,
"memory/device_mem_reserved(gib)": 69.1,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 219
},
{
"epoch": 0.4624277456647399,
"grad_norm": 0.3766707141167757,
"learning_rate": 6.580193405785938e-07,
"loss": 2.2786,
"memory/device_mem_reserved(gib)": 69.1,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 220
},
{
"epoch": 0.464529689963216,
"grad_norm": 0.37040693778721345,
"learning_rate": 6.545084971874736e-07,
"loss": 2.3041,
"memory/device_mem_reserved(gib)": 69.1,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 221
},
{
"epoch": 0.4666316342616921,
"grad_norm": 0.3814375380964394,
"learning_rate": 6.509892112728928e-07,
"loss": 2.2896,
"memory/device_mem_reserved(gib)": 69.1,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 222
},
{
"epoch": 0.4687335785601682,
"grad_norm": 0.38809585401355357,
"learning_rate": 6.474616751327142e-07,
"loss": 2.407,
"memory/device_mem_reserved(gib)": 69.1,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 223
},
{
"epoch": 0.4708355228586443,
"grad_norm": 0.3450197035654617,
"learning_rate": 6.439260815156038e-07,
"loss": 2.3212,
"memory/device_mem_reserved(gib)": 69.1,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 224
},
{
"epoch": 0.4729374671571203,
"grad_norm": 0.3638524400528564,
"learning_rate": 6.403826236104965e-07,
"loss": 2.3958,
"memory/device_mem_reserved(gib)": 69.1,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 225
},
{
"epoch": 0.4750394114555964,
"grad_norm": 11.416588328761524,
"learning_rate": 6.368314950360415e-07,
"loss": 2.4091,
"memory/device_mem_reserved(gib)": 69.1,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 226
},
{
"epoch": 0.4771413557540725,
"grad_norm": 0.38009234188930396,
"learning_rate": 6.33272889830022e-07,
"loss": 2.3481,
"memory/device_mem_reserved(gib)": 69.1,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 227
},
{
"epoch": 0.4792433000525486,
"grad_norm": 0.3874884323158228,
"learning_rate": 6.297070024387534e-07,
"loss": 2.2936,
"memory/device_mem_reserved(gib)": 69.1,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 228
},
{
"epoch": 0.4813452443510247,
"grad_norm": 0.3875050817077963,
"learning_rate": 6.261340277064578e-07,
"loss": 2.2781,
"memory/device_mem_reserved(gib)": 69.1,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 229
},
{
"epoch": 0.4834471886495008,
"grad_norm": 0.35862524615310853,
"learning_rate": 6.225541608646179e-07,
"loss": 2.317,
"memory/device_mem_reserved(gib)": 69.1,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 230
},
{
"epoch": 0.48554913294797686,
"grad_norm": 0.3684856860526338,
"learning_rate": 6.189675975213093e-07,
"loss": 2.2496,
"memory/device_mem_reserved(gib)": 69.1,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 231
},
{
"epoch": 0.48765107724645296,
"grad_norm": 0.3592072791824982,
"learning_rate": 6.153745336505124e-07,
"loss": 2.3916,
"memory/device_mem_reserved(gib)": 69.1,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 232
},
{
"epoch": 0.48975302154492906,
"grad_norm": 0.3492976591005929,
"learning_rate": 6.117751655814037e-07,
"loss": 2.3432,
"memory/device_mem_reserved(gib)": 69.1,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 233
},
{
"epoch": 0.49185496584340516,
"grad_norm": 0.39657164130018213,
"learning_rate": 6.081696899876281e-07,
"loss": 2.2399,
"memory/device_mem_reserved(gib)": 69.1,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 234
},
{
"epoch": 0.49395691014188126,
"grad_norm": 0.33918396528061745,
"learning_rate": 6.045583038765537e-07,
"loss": 2.2886,
"memory/device_mem_reserved(gib)": 69.1,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 235
},
{
"epoch": 0.49605885444035736,
"grad_norm": 0.3812617838396709,
"learning_rate": 6.009412045785051e-07,
"loss": 2.3345,
"memory/device_mem_reserved(gib)": 69.1,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 236
},
{
"epoch": 0.4981607987388334,
"grad_norm": 0.35316898092715904,
"learning_rate": 5.973185897359827e-07,
"loss": 2.3495,
"memory/device_mem_reserved(gib)": 69.1,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 237
},
{
"epoch": 0.5002627430373096,
"grad_norm": 0.35687689358418945,
"learning_rate": 5.936906572928624e-07,
"loss": 2.3206,
"memory/device_mem_reserved(gib)": 69.1,
"memory/max_mem_active(gib)": 63.57,
"memory/max_mem_allocated(gib)": 62.86,
"step": 238
}
],
"logging_steps": 1,
"max_steps": 475,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 238,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.365676632322867e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}