{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5002627430373096, "eval_steps": 500, "global_step": 238, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021019442984760903, "grad_norm": 0.7624253401313755, "learning_rate": 0.0, "loss": 2.2955, "memory/device_mem_reserved(gib)": 68.22, "memory/max_mem_active(gib)": 63.52, "memory/max_mem_allocated(gib)": 62.82, "step": 1 }, { "epoch": 0.004203888596952181, "grad_norm": 0.7189116302541813, "learning_rate": 2e-08, "loss": 2.2824, "memory/device_mem_reserved(gib)": 68.31, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 2 }, { "epoch": 0.006305832895428271, "grad_norm": 0.8431595274838072, "learning_rate": 4e-08, "loss": 2.3101, "memory/device_mem_reserved(gib)": 68.94, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 3 }, { "epoch": 0.008407777193904361, "grad_norm": 0.8637289443313003, "learning_rate": 6e-08, "loss": 2.3514, "memory/device_mem_reserved(gib)": 68.94, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 4 }, { "epoch": 0.010509721492380452, "grad_norm": 0.8128827491990301, "learning_rate": 8e-08, "loss": 2.3621, "memory/device_mem_reserved(gib)": 68.94, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 5 }, { "epoch": 0.012611665790856543, "grad_norm": 0.9504830158009488, "learning_rate": 1e-07, "loss": 2.4108, "memory/device_mem_reserved(gib)": 68.94, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 6 }, { "epoch": 0.014713610089332634, "grad_norm": 0.9140479063802851, "learning_rate": 1.2e-07, "loss": 2.2224, "memory/device_mem_reserved(gib)": 68.94, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 7 }, { "epoch": 0.016815554387808723, "grad_norm": 0.8632210617655338, "learning_rate": 1.4e-07, "loss": 2.3589, "memory/device_mem_reserved(gib)": 68.94, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 8 }, { "epoch": 0.018917498686284815, "grad_norm": 0.8747745167339828, "learning_rate": 1.6e-07, "loss": 2.2411, "memory/device_mem_reserved(gib)": 68.94, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 9 }, { "epoch": 0.021019442984760904, "grad_norm": 0.7739481898974889, "learning_rate": 1.8e-07, "loss": 2.2522, "memory/device_mem_reserved(gib)": 68.94, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 10 }, { "epoch": 0.023121387283236993, "grad_norm": 0.8301921811025426, "learning_rate": 2e-07, "loss": 2.3565, "memory/device_mem_reserved(gib)": 68.94, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 11 }, { "epoch": 0.025223331581713086, "grad_norm": 0.7677035533090953, "learning_rate": 2.1999999999999998e-07, "loss": 2.2208, "memory/device_mem_reserved(gib)": 68.94, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 12 }, { "epoch": 0.027325275880189175, "grad_norm": 0.7834629656153209, "learning_rate": 2.4e-07, "loss": 2.2526, "memory/device_mem_reserved(gib)": 68.94, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 13 }, { "epoch": 0.029427220178665267, "grad_norm": 0.776588932490268, "learning_rate": 2.6e-07, "loss": 2.2727, "memory/device_mem_reserved(gib)": 68.94, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 14 }, { "epoch": 0.03152916447714135, "grad_norm": 0.7753598356301531, "learning_rate": 2.8e-07, "loss": 2.3564, "memory/device_mem_reserved(gib)": 68.94, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 15 }, { "epoch": 0.033631108775617445, "grad_norm": 0.7165697716264268, "learning_rate": 3e-07, "loss": 2.3331, "memory/device_mem_reserved(gib)": 68.94, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 16 }, { "epoch": 0.03573305307409354, "grad_norm": 0.729205845829164, "learning_rate": 3.2e-07, "loss": 2.322, "memory/device_mem_reserved(gib)": 68.94, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 17 }, { "epoch": 0.03783499737256963, "grad_norm": 0.8495080537327478, "learning_rate": 3.4000000000000003e-07, "loss": 2.4952, "memory/device_mem_reserved(gib)": 68.94, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 18 }, { "epoch": 0.039936941671045716, "grad_norm": 0.7578372584471679, "learning_rate": 3.6e-07, "loss": 2.3132, "memory/device_mem_reserved(gib)": 68.94, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 19 }, { "epoch": 0.04203888596952181, "grad_norm": 0.6681608647353537, "learning_rate": 3.7999999999999996e-07, "loss": 2.3086, "memory/device_mem_reserved(gib)": 68.94, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 20 }, { "epoch": 0.0441408302679979, "grad_norm": 0.7042221303721394, "learning_rate": 4e-07, "loss": 2.3644, "memory/device_mem_reserved(gib)": 68.94, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 21 }, { "epoch": 0.046242774566473986, "grad_norm": 0.725951911870576, "learning_rate": 4.1999999999999995e-07, "loss": 2.3208, "memory/device_mem_reserved(gib)": 68.94, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 22 }, { "epoch": 0.04834471886495008, "grad_norm": 0.6347256826610295, "learning_rate": 4.3999999999999997e-07, "loss": 2.328, "memory/device_mem_reserved(gib)": 68.94, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 23 }, { "epoch": 0.05044666316342617, "grad_norm": 0.5682080423503054, "learning_rate": 4.6e-07, "loss": 2.2008, "memory/device_mem_reserved(gib)": 68.94, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 24 }, { "epoch": 0.05254860746190226, "grad_norm": 0.5787647024012217, "learning_rate": 4.8e-07, "loss": 2.2841, "memory/device_mem_reserved(gib)": 68.94, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 25 }, { "epoch": 0.05465055176037835, "grad_norm": 0.5807941293103913, "learning_rate": 5e-07, "loss": 2.2661, "memory/device_mem_reserved(gib)": 68.94, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 26 }, { "epoch": 0.05675249605885444, "grad_norm": 0.5238787661221586, "learning_rate": 5.2e-07, "loss": 2.292, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 27 }, { "epoch": 0.058854440357330534, "grad_norm": 0.6607378470156829, "learning_rate": 5.4e-07, "loss": 2.346, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 28 }, { "epoch": 0.06095638465580662, "grad_norm": 0.5949442201958344, "learning_rate": 5.6e-07, "loss": 2.2856, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 29 }, { "epoch": 0.0630583289542827, "grad_norm": 0.6213907595973902, "learning_rate": 5.8e-07, "loss": 2.3527, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 30 }, { "epoch": 0.0651602732527588, "grad_norm": 0.6574213245120029, "learning_rate": 6e-07, "loss": 2.2896, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 31 }, { "epoch": 0.06726221755123489, "grad_norm": 0.7904069125236015, "learning_rate": 6.2e-07, "loss": 2.4192, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 32 }, { "epoch": 0.06936416184971098, "grad_norm": 0.6912774106481298, "learning_rate": 6.4e-07, "loss": 2.3085, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 33 }, { "epoch": 0.07146610614818708, "grad_norm": 0.6819796440725628, "learning_rate": 6.6e-07, "loss": 2.2756, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 34 }, { "epoch": 0.07356805044666316, "grad_norm": 0.7580978517321655, "learning_rate": 6.800000000000001e-07, "loss": 2.3645, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 35 }, { "epoch": 0.07566999474513926, "grad_norm": 0.6791446776516942, "learning_rate": 7e-07, "loss": 2.2628, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 36 }, { "epoch": 0.07777193904361535, "grad_norm": 0.569840280711906, "learning_rate": 7.2e-07, "loss": 2.2602, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 37 }, { "epoch": 0.07987388334209143, "grad_norm": 0.5498888556096215, "learning_rate": 7.4e-07, "loss": 2.3167, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 38 }, { "epoch": 0.08197582764056753, "grad_norm": 0.5268765090754575, "learning_rate": 7.599999999999999e-07, "loss": 2.2378, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 39 }, { "epoch": 0.08407777193904362, "grad_norm": 0.4848125502462646, "learning_rate": 7.799999999999999e-07, "loss": 2.2882, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 40 }, { "epoch": 0.0861797162375197, "grad_norm": 0.5814992292096023, "learning_rate": 8e-07, "loss": 2.3471, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 41 }, { "epoch": 0.0882816605359958, "grad_norm": 0.6166392360245904, "learning_rate": 8.199999999999999e-07, "loss": 2.441, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 42 }, { "epoch": 0.09038360483447189, "grad_norm": 0.6377322312855411, "learning_rate": 8.399999999999999e-07, "loss": 2.3912, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 43 }, { "epoch": 0.09248554913294797, "grad_norm": 0.5055719418643514, "learning_rate": 8.599999999999999e-07, "loss": 2.2561, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 44 }, { "epoch": 0.09458749343142407, "grad_norm": 0.49178646668795084, "learning_rate": 8.799999999999999e-07, "loss": 2.2599, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 45 }, { "epoch": 0.09668943772990016, "grad_norm": 0.47537370207387974, "learning_rate": 9e-07, "loss": 2.3064, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 46 }, { "epoch": 0.09879138202837624, "grad_norm": 0.5089053853006482, "learning_rate": 9.2e-07, "loss": 2.391, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 47 }, { "epoch": 0.10089332632685234, "grad_norm": 0.4728302009023318, "learning_rate": 9.399999999999999e-07, "loss": 2.3139, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 48 }, { "epoch": 0.10299527062532843, "grad_norm": 0.4974785018291372, "learning_rate": 9.6e-07, "loss": 2.3599, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 49 }, { "epoch": 0.10509721492380451, "grad_norm": 0.5140106787374947, "learning_rate": 9.8e-07, "loss": 2.4427, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 50 }, { "epoch": 0.10719915922228061, "grad_norm": 0.5361457578321233, "learning_rate": 1e-06, "loss": 2.3295, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 51 }, { "epoch": 0.1093011035207567, "grad_norm": 0.49844160829734835, "learning_rate": 9.999863397100894e-07, "loss": 2.2672, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 52 }, { "epoch": 0.11140304781923278, "grad_norm": 0.5385578770440957, "learning_rate": 9.999453595867715e-07, "loss": 2.3261, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 53 }, { "epoch": 0.11350499211770888, "grad_norm": 0.497092836247932, "learning_rate": 9.998770618692484e-07, "loss": 2.3326, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 54 }, { "epoch": 0.11560693641618497, "grad_norm": 0.539870187568986, "learning_rate": 9.997814502893856e-07, "loss": 2.2381, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 55 }, { "epoch": 0.11770888071466107, "grad_norm": 0.5073884711048833, "learning_rate": 9.996585300715115e-07, "loss": 2.3122, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 56 }, { "epoch": 0.11981082501313715, "grad_norm": 0.5162826315178152, "learning_rate": 9.99508307932129e-07, "loss": 2.2719, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 57 }, { "epoch": 0.12191276931161324, "grad_norm": 0.5135640558488429, "learning_rate": 9.9933079207955e-07, "loss": 2.4354, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 58 }, { "epoch": 0.12401471361008934, "grad_norm": 0.48495138081284994, "learning_rate": 9.991259922134465e-07, "loss": 2.2913, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 59 }, { "epoch": 0.1261166579085654, "grad_norm": 0.5031121760605395, "learning_rate": 9.98893919524321e-07, "loss": 2.293, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 60 }, { "epoch": 0.1282186022070415, "grad_norm": 0.4053272758920918, "learning_rate": 9.98634586692894e-07, "loss": 2.2873, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 61 }, { "epoch": 0.1303205465055176, "grad_norm": 0.4532646932853173, "learning_rate": 9.983480078894123e-07, "loss": 2.3065, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 62 }, { "epoch": 0.13242249080399368, "grad_norm": 0.4496821436560576, "learning_rate": 9.98034198772874e-07, "loss": 2.2886, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 63 }, { "epoch": 0.13452443510246978, "grad_norm": 0.48430661978532813, "learning_rate": 9.976931764901733e-07, "loss": 2.3404, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 64 }, { "epoch": 0.13662637940094588, "grad_norm": 0.5163168950805126, "learning_rate": 9.97324959675163e-07, "loss": 2.286, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 65 }, { "epoch": 0.13872832369942195, "grad_norm": 0.4385342628062273, "learning_rate": 9.969295684476368e-07, "loss": 2.2923, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 66 }, { "epoch": 0.14083026799789805, "grad_norm": 0.4476245967273303, "learning_rate": 9.9650702441223e-07, "loss": 2.2454, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 67 }, { "epoch": 0.14293221229637415, "grad_norm": 0.4493507785126621, "learning_rate": 9.960573506572389e-07, "loss": 2.3361, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 68 }, { "epoch": 0.14503415659485025, "grad_norm": 0.4676473798188462, "learning_rate": 9.955805717533585e-07, "loss": 2.3795, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 69 }, { "epoch": 0.14713610089332632, "grad_norm": 0.5003504816633514, "learning_rate": 9.950767137523416e-07, "loss": 2.3638, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 70 }, { "epoch": 0.14923804519180242, "grad_norm": 0.41298653135277646, "learning_rate": 9.94545804185573e-07, "loss": 2.2986, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 71 }, { "epoch": 0.15133998949027852, "grad_norm": 0.48549576119983434, "learning_rate": 9.939878720625673e-07, "loss": 2.3772, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 72 }, { "epoch": 0.1534419337887546, "grad_norm": 0.37069853589006974, "learning_rate": 9.93402947869383e-07, "loss": 2.2609, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 73 }, { "epoch": 0.1555438780872307, "grad_norm": 0.3822824223589903, "learning_rate": 9.927910635669561e-07, "loss": 2.3263, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 74 }, { "epoch": 0.1576458223857068, "grad_norm": 0.4645424064190486, "learning_rate": 9.921522525893547e-07, "loss": 2.421, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 75 }, { "epoch": 0.15974776668418286, "grad_norm": 0.40728550126377283, "learning_rate": 9.91486549841951e-07, "loss": 2.3488, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 76 }, { "epoch": 0.16184971098265896, "grad_norm": 0.39534534329560483, "learning_rate": 9.907939916995152e-07, "loss": 2.2277, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 77 }, { "epoch": 0.16395165528113506, "grad_norm": 0.3994213467776548, "learning_rate": 9.900746160042272e-07, "loss": 2.3751, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 78 }, { "epoch": 0.16605359957961113, "grad_norm": 0.3952978443639354, "learning_rate": 9.893284620636098e-07, "loss": 2.3407, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 79 }, { "epoch": 0.16815554387808723, "grad_norm": 0.3847266788854899, "learning_rate": 9.88555570648379e-07, "loss": 2.2882, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 80 }, { "epoch": 0.17025748817656333, "grad_norm": 0.3942404109616697, "learning_rate": 9.877559839902183e-07, "loss": 2.3809, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 81 }, { "epoch": 0.1723594324750394, "grad_norm": 0.3726144315608755, "learning_rate": 9.869297457794698e-07, "loss": 2.2965, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 82 }, { "epoch": 0.1744613767735155, "grad_norm": 0.4044669149844896, "learning_rate": 9.860769011627474e-07, "loss": 2.3778, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 83 }, { "epoch": 0.1765633210719916, "grad_norm": 0.44263984303122605, "learning_rate": 9.851974967404702e-07, "loss": 2.3655, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 84 }, { "epoch": 0.17866526537046767, "grad_norm": 0.3800348736088796, "learning_rate": 9.842915805643156e-07, "loss": 2.2951, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 85 }, { "epoch": 0.18076720966894377, "grad_norm": 0.38644114608168073, "learning_rate": 9.833592021345937e-07, "loss": 2.3567, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 86 }, { "epoch": 0.18286915396741987, "grad_norm": 0.5360672745714498, "learning_rate": 9.824004123975434e-07, "loss": 2.3769, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 87 }, { "epoch": 0.18497109826589594, "grad_norm": 0.3826183850679395, "learning_rate": 9.814152637425477e-07, "loss": 2.2676, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 88 }, { "epoch": 0.18707304256437204, "grad_norm": 0.3874657198676833, "learning_rate": 9.804038099992716e-07, "loss": 2.2044, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 89 }, { "epoch": 0.18917498686284814, "grad_norm": 0.42284650951618596, "learning_rate": 9.793661064347204e-07, "loss": 2.2791, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 90 }, { "epoch": 0.19127693116132422, "grad_norm": 0.4012146632153047, "learning_rate": 9.783022097502203e-07, "loss": 2.2554, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 91 }, { "epoch": 0.19337887545980031, "grad_norm": 0.37104574503246424, "learning_rate": 9.772121780783201e-07, "loss": 2.2696, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 92 }, { "epoch": 0.19548081975827641, "grad_norm": 0.4115506199685101, "learning_rate": 9.76096070979614e-07, "loss": 2.292, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 93 }, { "epoch": 0.19758276405675249, "grad_norm": 0.4949212386577297, "learning_rate": 9.749539494394885e-07, "loss": 2.3154, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 94 }, { "epoch": 0.19968470835522859, "grad_norm": 0.5305093079330326, "learning_rate": 9.737858758647889e-07, "loss": 2.3967, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 95 }, { "epoch": 0.20178665265370468, "grad_norm": 0.39802845026570083, "learning_rate": 9.725919140804098e-07, "loss": 2.3833, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 96 }, { "epoch": 0.20388859695218076, "grad_norm": 0.392517153138478, "learning_rate": 9.713721293258078e-07, "loss": 2.3458, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 97 }, { "epoch": 0.20599054125065686, "grad_norm": 0.38285765355194634, "learning_rate": 9.70126588251436e-07, "loss": 2.2321, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 98 }, { "epoch": 0.20809248554913296, "grad_norm": 0.42890083185292094, "learning_rate": 9.688553589151037e-07, "loss": 2.2823, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 99 }, { "epoch": 0.21019442984760903, "grad_norm": 0.3788992789108253, "learning_rate": 9.675585107782555e-07, "loss": 2.2955, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 100 }, { "epoch": 0.21229637414608513, "grad_norm": 0.40025954957804155, "learning_rate": 9.66236114702178e-07, "loss": 2.3454, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 101 }, { "epoch": 0.21439831844456123, "grad_norm": 0.4040329751371346, "learning_rate": 9.648882429441256e-07, "loss": 2.3362, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 102 }, { "epoch": 0.2165002627430373, "grad_norm": 0.35667806143435715, "learning_rate": 9.635149691533747e-07, "loss": 2.3089, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 103 }, { "epoch": 0.2186022070415134, "grad_norm": 0.42503183804867145, "learning_rate": 9.621163683671978e-07, "loss": 2.3024, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 104 }, { "epoch": 0.2207041513399895, "grad_norm": 0.3833710476470682, "learning_rate": 9.606925170067636e-07, "loss": 2.2944, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 105 }, { "epoch": 0.22280609563846557, "grad_norm": 0.38645757625412946, "learning_rate": 9.592434928729615e-07, "loss": 2.2595, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 106 }, { "epoch": 0.22490803993694167, "grad_norm": 0.37424692884672933, "learning_rate": 9.577693751421505e-07, "loss": 2.3025, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 107 }, { "epoch": 0.22700998423541777, "grad_norm": 0.3776781851494623, "learning_rate": 9.562702443618331e-07, "loss": 2.2724, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 108 }, { "epoch": 0.22911192853389384, "grad_norm": 0.392564325121222, "learning_rate": 9.547461824462533e-07, "loss": 2.3737, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 109 }, { "epoch": 0.23121387283236994, "grad_norm": 0.3480699195596026, "learning_rate": 9.531972726719215e-07, "loss": 2.2591, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 110 }, { "epoch": 0.23331581713084604, "grad_norm": 0.3563697131151561, "learning_rate": 9.516235996730644e-07, "loss": 2.3639, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 111 }, { "epoch": 0.23541776142932214, "grad_norm": 0.4943435915920374, "learning_rate": 9.500252494369991e-07, "loss": 2.3605, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 112 }, { "epoch": 0.2375197057277982, "grad_norm": 0.3975018845059572, "learning_rate": 9.484023092994364e-07, "loss": 2.4139, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 113 }, { "epoch": 0.2396216500262743, "grad_norm": 0.37314820478206834, "learning_rate": 9.467548679397071e-07, "loss": 2.293, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 114 }, { "epoch": 0.2417235943247504, "grad_norm": 0.39240855452269136, "learning_rate": 9.450830153759176e-07, "loss": 2.3568, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 115 }, { "epoch": 0.24382553862322648, "grad_norm": 0.36202032847414545, "learning_rate": 9.433868429600309e-07, "loss": 2.36, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 116 }, { "epoch": 0.24592748292170258, "grad_norm": 0.3852500669591038, "learning_rate": 9.416664433728748e-07, "loss": 2.335, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 117 }, { "epoch": 0.24802942722017868, "grad_norm": 0.35255828976101133, "learning_rate": 9.399219106190775e-07, "loss": 2.3367, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 118 }, { "epoch": 0.2501313715186548, "grad_norm": 0.4145689168519548, "learning_rate": 9.381533400219317e-07, "loss": 2.3807, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 119 }, { "epoch": 0.2522333158171308, "grad_norm": 0.3638037446306906, "learning_rate": 9.363608282181861e-07, "loss": 2.2441, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 120 }, { "epoch": 0.2543352601156069, "grad_norm": 0.3892269635122991, "learning_rate": 9.345444731527641e-07, "loss": 2.3285, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 121 }, { "epoch": 0.256437204414083, "grad_norm": 0.3848382071231666, "learning_rate": 9.327043740734128e-07, "loss": 2.2713, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 122 }, { "epoch": 0.2585391487125591, "grad_norm": 0.3602411460013298, "learning_rate": 9.308406315252798e-07, "loss": 2.32, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 123 }, { "epoch": 0.2606410930110352, "grad_norm": 0.36833348877975325, "learning_rate": 9.289533473454192e-07, "loss": 2.1967, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 124 }, { "epoch": 0.2627430373095113, "grad_norm": 0.3585772049526573, "learning_rate": 9.270426246572272e-07, "loss": 2.3642, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 125 }, { "epoch": 0.26484498160798736, "grad_norm": 0.34020805834208123, "learning_rate": 9.251085678648071e-07, "loss": 2.237, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 126 }, { "epoch": 0.26694692590646346, "grad_norm": 0.38311234004852174, "learning_rate": 9.23151282647265e-07, "loss": 2.2439, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 127 }, { "epoch": 0.26904887020493956, "grad_norm": 0.40490379874064303, "learning_rate": 9.211708759529346e-07, "loss": 2.3447, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 128 }, { "epoch": 0.27115081450341566, "grad_norm": 0.38814226346705333, "learning_rate": 9.191674559935347e-07, "loss": 2.2642, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 129 }, { "epoch": 0.27325275880189176, "grad_norm": 0.392535018684069, "learning_rate": 9.171411322382551e-07, "loss": 2.4222, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 130 }, { "epoch": 0.27535470310036786, "grad_norm": 0.36293069595975763, "learning_rate": 9.150920154077753e-07, "loss": 2.2375, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 131 }, { "epoch": 0.2774566473988439, "grad_norm": 0.3827224228744126, "learning_rate": 9.130202174682153e-07, "loss": 2.3121, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 132 }, { "epoch": 0.27955859169732, "grad_norm": 0.39154739093650776, "learning_rate": 9.109258516250171e-07, "loss": 2.3246, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 133 }, { "epoch": 0.2816605359957961, "grad_norm": 0.35430283896633147, "learning_rate": 9.08809032316759e-07, "loss": 2.2922, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 134 }, { "epoch": 0.2837624802942722, "grad_norm": 0.39840449294712393, "learning_rate": 9.066698752089028e-07, "loss": 2.34, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 135 }, { "epoch": 0.2858644245927483, "grad_norm": 0.3657527770786503, "learning_rate": 9.045084971874737e-07, "loss": 2.3127, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 136 }, { "epoch": 0.2879663688912244, "grad_norm": 0.40390275500061623, "learning_rate": 9.02325016352673e-07, "loss": 2.2761, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 137 }, { "epoch": 0.2900683131897005, "grad_norm": 0.3486049947257035, "learning_rate": 9.001195520124255e-07, "loss": 2.2909, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 138 }, { "epoch": 0.29217025748817654, "grad_norm": 0.369271223650673, "learning_rate": 8.978922246758606e-07, "loss": 2.3146, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 139 }, { "epoch": 0.29427220178665264, "grad_norm": 0.34559908408986584, "learning_rate": 8.956431560467266e-07, "loss": 2.3861, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 140 }, { "epoch": 0.29637414608512874, "grad_norm": 0.40663130251420515, "learning_rate": 8.933724690167416e-07, "loss": 2.3351, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 141 }, { "epoch": 0.29847609038360484, "grad_norm": 0.36345254242299446, "learning_rate": 8.910802876588781e-07, "loss": 2.2782, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 142 }, { "epoch": 0.30057803468208094, "grad_norm": 0.38393881395986873, "learning_rate": 8.887667372205838e-07, "loss": 2.2808, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 143 }, { "epoch": 0.30267997898055704, "grad_norm": 0.35972360098945216, "learning_rate": 8.864319441169372e-07, "loss": 2.2753, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 144 }, { "epoch": 0.3047819232790331, "grad_norm": 0.4197014359486705, "learning_rate": 8.840760359237411e-07, "loss": 2.3163, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 145 }, { "epoch": 0.3068838675775092, "grad_norm": 0.3698464136493578, "learning_rate": 8.816991413705514e-07, "loss": 2.3585, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 146 }, { "epoch": 0.3089858118759853, "grad_norm": 0.38628726944167563, "learning_rate": 8.793013903336427e-07, "loss": 2.2954, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 147 }, { "epoch": 0.3110877561744614, "grad_norm": 0.33899721461114324, "learning_rate": 8.768829138289122e-07, "loss": 2.2799, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 148 }, { "epoch": 0.3131897004729375, "grad_norm": 0.39286568836433555, "learning_rate": 8.744438440047206e-07, "loss": 2.3867, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 149 }, { "epoch": 0.3152916447714136, "grad_norm": 0.36680644419068636, "learning_rate": 8.719843141346717e-07, "loss": 2.2539, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 150 }, { "epoch": 0.3173935890698896, "grad_norm": 0.4226555891529418, "learning_rate": 8.695044586103295e-07, "loss": 2.4062, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 151 }, { "epoch": 0.3194955333683657, "grad_norm": 0.343763121119237, "learning_rate": 8.67004412933876e-07, "loss": 2.2993, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 152 }, { "epoch": 0.3215974776668418, "grad_norm": 0.34716852552812194, "learning_rate": 8.644843137107057e-07, "loss": 2.3404, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 153 }, { "epoch": 0.3236994219653179, "grad_norm": 0.3968883598563259, "learning_rate": 8.619442986419629e-07, "loss": 2.3012, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 154 }, { "epoch": 0.325801366263794, "grad_norm": 0.33889705699000894, "learning_rate": 8.593845065170163e-07, "loss": 2.2621, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 155 }, { "epoch": 0.3279033105622701, "grad_norm": 0.351512969072057, "learning_rate": 8.568050772058761e-07, "loss": 2.357, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 156 }, { "epoch": 0.33000525486074617, "grad_norm": 0.3668822383961036, "learning_rate": 8.542061516515511e-07, "loss": 2.3499, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 157 }, { "epoch": 0.33210719915922227, "grad_norm": 0.3813104081247767, "learning_rate": 8.515878718623473e-07, "loss": 2.3762, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 158 }, { "epoch": 0.33420914345769837, "grad_norm": 0.3555623840160132, "learning_rate": 8.489503809041087e-07, "loss": 2.2511, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 159 }, { "epoch": 0.33631108775617446, "grad_norm": 0.3426684159571787, "learning_rate": 8.462938228923999e-07, "loss": 2.3354, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 160 }, { "epoch": 0.33841303205465056, "grad_norm": 0.3491214060448838, "learning_rate": 8.436183429846313e-07, "loss": 2.2395, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 161 }, { "epoch": 0.34051497635312666, "grad_norm": 0.4563165572220967, "learning_rate": 8.409240873721276e-07, "loss": 2.3872, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 162 }, { "epoch": 0.3426169206516027, "grad_norm": 0.33319192852547314, "learning_rate": 8.382112032721398e-07, "loss": 2.3122, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 163 }, { "epoch": 0.3447188649500788, "grad_norm": 0.38388679911113793, "learning_rate": 8.354798389198012e-07, "loss": 2.3693, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 164 }, { "epoch": 0.3468208092485549, "grad_norm": 0.39313380831692907, "learning_rate": 8.327301435600272e-07, "loss": 2.3085, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 165 }, { "epoch": 0.348922753547031, "grad_norm": 0.41915800484281546, "learning_rate": 8.299622674393614e-07, "loss": 2.3851, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 166 }, { "epoch": 0.3510246978455071, "grad_norm": 0.35317676640002343, "learning_rate": 8.271763617977641e-07, "loss": 2.2271, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 167 }, { "epoch": 0.3531266421439832, "grad_norm": 0.3528294909167197, "learning_rate": 8.243725788603508e-07, "loss": 2.3087, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 168 }, { "epoch": 0.35522858644245925, "grad_norm": 0.38320812537843413, "learning_rate": 8.215510718290723e-07, "loss": 2.2441, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 169 }, { "epoch": 0.35733053074093535, "grad_norm": 0.403856066195845, "learning_rate": 8.187119948743449e-07, "loss": 2.3326, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 170 }, { "epoch": 0.35943247503941145, "grad_norm": 0.35785694946938973, "learning_rate": 8.158555031266254e-07, "loss": 2.332, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 171 }, { "epoch": 0.36153441933788755, "grad_norm": 0.34400894235353774, "learning_rate": 8.129817526679357e-07, "loss": 2.2897, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 172 }, { "epoch": 0.36363636363636365, "grad_norm": 0.4126959309133071, "learning_rate": 8.100909005233334e-07, "loss": 2.3507, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 173 }, { "epoch": 0.36573830793483975, "grad_norm": 0.42717818377517935, "learning_rate": 8.071831046523318e-07, "loss": 2.3917, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 174 }, { "epoch": 0.3678402522333158, "grad_norm": 0.3579933408679328, "learning_rate": 8.042585239402697e-07, "loss": 2.2518, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 175 }, { "epoch": 0.3699421965317919, "grad_norm": 0.39551576662619, "learning_rate": 8.013173181896282e-07, "loss": 2.4125, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 176 }, { "epoch": 0.372044140830268, "grad_norm": 0.365049869078283, "learning_rate": 7.983596481113005e-07, "loss": 2.2727, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 177 }, { "epoch": 0.3741460851287441, "grad_norm": 0.4011873410931577, "learning_rate": 7.953856753158094e-07, "loss": 2.3436, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 178 }, { "epoch": 0.3762480294272202, "grad_norm": 0.3842765318105432, "learning_rate": 7.923955623044775e-07, "loss": 2.3529, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 179 }, { "epoch": 0.3783499737256963, "grad_norm": 0.3554244239833299, "learning_rate": 7.893894724605468e-07, "loss": 2.2397, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 180 }, { "epoch": 0.3804519180241724, "grad_norm": 0.4463726239175773, "learning_rate": 7.863675700402526e-07, "loss": 2.3635, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 181 }, { "epoch": 0.38255386232264843, "grad_norm": 0.36826384823166514, "learning_rate": 7.833300201638474e-07, "loss": 2.3262, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 182 }, { "epoch": 0.38465580662112453, "grad_norm": 0.40131324496051124, "learning_rate": 7.802769888065789e-07, "loss": 2.3718, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 183 }, { "epoch": 0.38675775091960063, "grad_norm": 0.3849897857523413, "learning_rate": 7.772086427896211e-07, "loss": 2.2332, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 184 }, { "epoch": 0.38885969521807673, "grad_norm": 0.36493755016771345, "learning_rate": 7.741251497709583e-07, "loss": 2.3377, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 185 }, { "epoch": 0.39096163951655283, "grad_norm": 0.35102308079227, "learning_rate": 7.710266782362247e-07, "loss": 2.3105, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 186 }, { "epoch": 0.3930635838150289, "grad_norm": 0.38998005813653297, "learning_rate": 7.679133974894982e-07, "loss": 2.3349, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 187 }, { "epoch": 0.39516552811350497, "grad_norm": 0.379125931962091, "learning_rate": 7.647854776440495e-07, "loss": 2.2724, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 188 }, { "epoch": 0.39726747241198107, "grad_norm": 0.3947787843638888, "learning_rate": 7.616430896130455e-07, "loss": 2.337, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 189 }, { "epoch": 0.39936941671045717, "grad_norm": 0.37487637035067606, "learning_rate": 7.584864051002126e-07, "loss": 2.3746, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 190 }, { "epoch": 0.40147136100893327, "grad_norm": 0.366432821290813, "learning_rate": 7.553155965904534e-07, "loss": 2.3042, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 191 }, { "epoch": 0.40357330530740937, "grad_norm": 0.34566325498775646, "learning_rate": 7.521308373404217e-07, "loss": 2.2799, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 192 }, { "epoch": 0.40567524960588547, "grad_norm": 0.360571812024321, "learning_rate": 7.489323013690561e-07, "loss": 2.1848, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 193 }, { "epoch": 0.4077771939043615, "grad_norm": 0.38102049467871574, "learning_rate": 7.457201634480712e-07, "loss": 2.3506, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 194 }, { "epoch": 0.4098791382028376, "grad_norm": 0.4157458990557322, "learning_rate": 7.424945990924079e-07, "loss": 2.2602, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 195 }, { "epoch": 0.4119810825013137, "grad_norm": 0.3815988927632132, "learning_rate": 7.392557845506432e-07, "loss": 2.39, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 196 }, { "epoch": 0.4140830267997898, "grad_norm": 0.3575553199145919, "learning_rate": 7.360038967953597e-07, "loss": 2.3257, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 197 }, { "epoch": 0.4161849710982659, "grad_norm": 0.37179609481335857, "learning_rate": 7.327391135134749e-07, "loss": 2.3281, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 198 }, { "epoch": 0.418286915396742, "grad_norm": 0.35686209920084183, "learning_rate": 7.294616130965336e-07, "loss": 2.2884, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 199 }, { "epoch": 0.42038885969521805, "grad_norm": 0.3966932403444605, "learning_rate": 7.261715746309593e-07, "loss": 2.3668, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 200 }, { "epoch": 0.42249080399369415, "grad_norm": 0.37119910362955255, "learning_rate": 7.228691778882692e-07, "loss": 2.216, "memory/device_mem_reserved(gib)": 69.0, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 201 }, { "epoch": 0.42459274829217025, "grad_norm": 0.34835664991688975, "learning_rate": 7.195546033152506e-07, "loss": 2.3013, "memory/device_mem_reserved(gib)": 69.04, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 202 }, { "epoch": 0.42669469259064635, "grad_norm": 0.36756486717782244, "learning_rate": 7.162280320241019e-07, "loss": 2.2983, "memory/device_mem_reserved(gib)": 69.04, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 203 }, { "epoch": 0.42879663688912245, "grad_norm": 0.3580849549174155, "learning_rate": 7.128896457825363e-07, "loss": 2.2168, "memory/device_mem_reserved(gib)": 69.04, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 204 }, { "epoch": 0.43089858118759855, "grad_norm": 0.38919198730377413, "learning_rate": 7.095396270038492e-07, "loss": 2.3673, "memory/device_mem_reserved(gib)": 69.04, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 205 }, { "epoch": 0.4330005254860746, "grad_norm": 0.45321125836545045, "learning_rate": 7.061781587369518e-07, "loss": 2.2495, "memory/device_mem_reserved(gib)": 69.04, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 206 }, { "epoch": 0.4351024697845507, "grad_norm": 0.3390158068246942, "learning_rate": 7.028054246563678e-07, "loss": 2.2959, "memory/device_mem_reserved(gib)": 69.04, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 207 }, { "epoch": 0.4372044140830268, "grad_norm": 0.3932391160329032, "learning_rate": 6.99421609052199e-07, "loss": 2.3348, "memory/device_mem_reserved(gib)": 69.04, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 208 }, { "epoch": 0.4393063583815029, "grad_norm": 0.35196191595880966, "learning_rate": 6.960268968200538e-07, "loss": 2.3416, "memory/device_mem_reserved(gib)": 69.04, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 209 }, { "epoch": 0.441408302679979, "grad_norm": 0.3970691259787115, "learning_rate": 6.92621473450945e-07, "loss": 2.3328, "memory/device_mem_reserved(gib)": 69.04, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 210 }, { "epoch": 0.4435102469784551, "grad_norm": 0.3362178241906251, "learning_rate": 6.892055250211551e-07, "loss": 2.2666, "memory/device_mem_reserved(gib)": 69.04, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 211 }, { "epoch": 0.44561219127693114, "grad_norm": 0.3370093871143424, "learning_rate": 6.857792381820672e-07, "loss": 2.3654, "memory/device_mem_reserved(gib)": 69.04, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 212 }, { "epoch": 0.44771413557540723, "grad_norm": 0.3439137104265468, "learning_rate": 6.823428001499676e-07, "loss": 2.3236, "memory/device_mem_reserved(gib)": 69.1, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 213 }, { "epoch": 0.44981607987388333, "grad_norm": 0.363363512278423, "learning_rate": 6.788963986958152e-07, "loss": 2.3153, "memory/device_mem_reserved(gib)": 69.1, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 214 }, { "epoch": 0.45191802417235943, "grad_norm": 0.3550856155819428, "learning_rate": 6.754402221349825e-07, "loss": 2.3337, "memory/device_mem_reserved(gib)": 69.1, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 215 }, { "epoch": 0.45401996847083553, "grad_norm": 0.43364470288014306, "learning_rate": 6.71974459316964e-07, "loss": 2.3817, "memory/device_mem_reserved(gib)": 69.1, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 216 }, { "epoch": 0.45612191276931163, "grad_norm": 0.3594186404992842, "learning_rate": 6.684992996150598e-07, "loss": 2.282, "memory/device_mem_reserved(gib)": 69.1, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 217 }, { "epoch": 0.4582238570677877, "grad_norm": 0.348193721582919, "learning_rate": 6.650149329160257e-07, "loss": 2.3266, "memory/device_mem_reserved(gib)": 69.1, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 218 }, { "epoch": 0.4603258013662638, "grad_norm": 0.36563818617010935, "learning_rate": 6.615215496096986e-07, "loss": 2.2706, "memory/device_mem_reserved(gib)": 69.1, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 219 }, { "epoch": 0.4624277456647399, "grad_norm": 0.3766707141167757, "learning_rate": 6.580193405785938e-07, "loss": 2.2786, "memory/device_mem_reserved(gib)": 69.1, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 220 }, { "epoch": 0.464529689963216, "grad_norm": 0.37040693778721345, "learning_rate": 6.545084971874736e-07, "loss": 2.3041, "memory/device_mem_reserved(gib)": 69.1, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 221 }, { "epoch": 0.4666316342616921, "grad_norm": 0.3814375380964394, "learning_rate": 6.509892112728928e-07, "loss": 2.2896, "memory/device_mem_reserved(gib)": 69.1, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 222 }, { "epoch": 0.4687335785601682, "grad_norm": 0.38809585401355357, "learning_rate": 6.474616751327142e-07, "loss": 2.407, "memory/device_mem_reserved(gib)": 69.1, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 223 }, { "epoch": 0.4708355228586443, "grad_norm": 0.3450197035654617, "learning_rate": 6.439260815156038e-07, "loss": 2.3212, "memory/device_mem_reserved(gib)": 69.1, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 224 }, { "epoch": 0.4729374671571203, "grad_norm": 0.3638524400528564, "learning_rate": 6.403826236104965e-07, "loss": 2.3958, "memory/device_mem_reserved(gib)": 69.1, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 225 }, { "epoch": 0.4750394114555964, "grad_norm": 11.416588328761524, "learning_rate": 6.368314950360415e-07, "loss": 2.4091, "memory/device_mem_reserved(gib)": 69.1, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 226 }, { "epoch": 0.4771413557540725, "grad_norm": 0.38009234188930396, "learning_rate": 6.33272889830022e-07, "loss": 2.3481, "memory/device_mem_reserved(gib)": 69.1, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 227 }, { "epoch": 0.4792433000525486, "grad_norm": 0.3874884323158228, "learning_rate": 6.297070024387534e-07, "loss": 2.2936, "memory/device_mem_reserved(gib)": 69.1, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 228 }, { "epoch": 0.4813452443510247, "grad_norm": 0.3875050817077963, "learning_rate": 6.261340277064578e-07, "loss": 2.2781, "memory/device_mem_reserved(gib)": 69.1, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 229 }, { "epoch": 0.4834471886495008, "grad_norm": 0.35862524615310853, "learning_rate": 6.225541608646179e-07, "loss": 2.317, "memory/device_mem_reserved(gib)": 69.1, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 230 }, { "epoch": 0.48554913294797686, "grad_norm": 0.3684856860526338, "learning_rate": 6.189675975213093e-07, "loss": 2.2496, "memory/device_mem_reserved(gib)": 69.1, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 231 }, { "epoch": 0.48765107724645296, "grad_norm": 0.3592072791824982, "learning_rate": 6.153745336505124e-07, "loss": 2.3916, "memory/device_mem_reserved(gib)": 69.1, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 232 }, { "epoch": 0.48975302154492906, "grad_norm": 0.3492976591005929, "learning_rate": 6.117751655814037e-07, "loss": 2.3432, "memory/device_mem_reserved(gib)": 69.1, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 233 }, { "epoch": 0.49185496584340516, "grad_norm": 0.39657164130018213, "learning_rate": 6.081696899876281e-07, "loss": 2.2399, "memory/device_mem_reserved(gib)": 69.1, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 234 }, { "epoch": 0.49395691014188126, "grad_norm": 0.33918396528061745, "learning_rate": 6.045583038765537e-07, "loss": 2.2886, "memory/device_mem_reserved(gib)": 69.1, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 235 }, { "epoch": 0.49605885444035736, "grad_norm": 0.3812617838396709, "learning_rate": 6.009412045785051e-07, "loss": 2.3345, "memory/device_mem_reserved(gib)": 69.1, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 236 }, { "epoch": 0.4981607987388334, "grad_norm": 0.35316898092715904, "learning_rate": 5.973185897359827e-07, "loss": 2.3495, "memory/device_mem_reserved(gib)": 69.1, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 237 }, { "epoch": 0.5002627430373096, "grad_norm": 0.35687689358418945, "learning_rate": 5.936906572928624e-07, "loss": 2.3206, "memory/device_mem_reserved(gib)": 69.1, "memory/max_mem_active(gib)": 63.57, "memory/max_mem_allocated(gib)": 62.86, "step": 238 } ], "logging_steps": 1, "max_steps": 475, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 238, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.365676632322867e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }