diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9958368026644462, + "eval_steps": 500, + "global_step": 600, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0033305578684429643, + "grad_norm": 2.1135175063937415, + "learning_rate": 1.3333333333333334e-07, + "loss": 1.6413, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.01, + "memory/max_mem_allocated(gib)": 56.7, + "step": 1 + }, + { + "epoch": 0.006661115736885929, + "grad_norm": 2.0196598114735065, + "learning_rate": 2.6666666666666667e-07, + "loss": 1.6382, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 2 + }, + { + "epoch": 0.009991673605328892, + "grad_norm": 2.037892565480129, + "learning_rate": 4.0000000000000003e-07, + "loss": 1.6536, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 3 + }, + { + "epoch": 0.013322231473771857, + "grad_norm": 1.980939710918612, + "learning_rate": 5.333333333333333e-07, + "loss": 1.6712, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 4 + }, + { + "epoch": 0.01665278934221482, + "grad_norm": 2.0553629965683196, + "learning_rate": 6.666666666666666e-07, + "loss": 1.591, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 5 + }, + { + "epoch": 0.019983347210657785, + "grad_norm": 2.1321442384194493, + "learning_rate": 8.000000000000001e-07, + "loss": 1.6275, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 6 + }, + { + "epoch": 0.02331390507910075, + "grad_norm": 2.0224554441924147, + "learning_rate": 9.333333333333333e-07, + "loss": 1.6802, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 7 + }, + { + "epoch": 0.026644462947543714, + "grad_norm": 2.0657857283218144, + "learning_rate": 1.0666666666666667e-06, + "loss": 1.5768, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 8 + }, + { + "epoch": 0.02997502081598668, + "grad_norm": 2.0104233987359206, + "learning_rate": 1.2e-06, + "loss": 1.6026, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 9 + }, + { + "epoch": 0.03330557868442964, + "grad_norm": 2.098692014200002, + "learning_rate": 1.3333333333333332e-06, + "loss": 1.682, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 10 + }, + { + "epoch": 0.03663613655287261, + "grad_norm": 2.0879014611232116, + "learning_rate": 1.4666666666666667e-06, + "loss": 1.6368, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 11 + }, + { + "epoch": 0.03996669442131557, + "grad_norm": 2.0701872996726443, + "learning_rate": 1.6000000000000001e-06, + "loss": 1.629, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 12 + }, + { + "epoch": 0.04329725228975854, + "grad_norm": 2.105064067100562, + "learning_rate": 1.7333333333333332e-06, + "loss": 1.6568, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 13 + }, + { + "epoch": 0.0466278101582015, + "grad_norm": 2.1084872575258733, + "learning_rate": 1.8666666666666667e-06, + "loss": 1.597, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 14 + }, + { + "epoch": 0.04995836802664446, + "grad_norm": 2.0616161807879965, + "learning_rate": 2e-06, + "loss": 1.6008, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 15 + }, + { + "epoch": 0.05328892589508743, + "grad_norm": 1.92970469468585, + "learning_rate": 2.1333333333333334e-06, + "loss": 1.6815, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 16 + }, + { + "epoch": 0.05661948376353039, + "grad_norm": 2.0527427262697855, + "learning_rate": 2.266666666666667e-06, + "loss": 1.6873, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 17 + }, + { + "epoch": 0.05995004163197336, + "grad_norm": 1.9622305052083537, + "learning_rate": 2.4e-06, + "loss": 1.6334, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 18 + }, + { + "epoch": 0.06328059950041633, + "grad_norm": 1.9979390122219929, + "learning_rate": 2.533333333333333e-06, + "loss": 1.6623, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 19 + }, + { + "epoch": 0.06661115736885928, + "grad_norm": 2.0311968068371367, + "learning_rate": 2.6666666666666664e-06, + "loss": 1.607, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 20 + }, + { + "epoch": 0.06994171523730225, + "grad_norm": 1.968344786501615, + "learning_rate": 2.8e-06, + "loss": 1.6087, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 21 + }, + { + "epoch": 0.07327227310574522, + "grad_norm": 2.1145916019697952, + "learning_rate": 2.9333333333333333e-06, + "loss": 1.5926, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 22 + }, + { + "epoch": 0.07660283097418817, + "grad_norm": 2.0129475295050496, + "learning_rate": 3.066666666666667e-06, + "loss": 1.6171, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 23 + }, + { + "epoch": 0.07993338884263114, + "grad_norm": 1.8817164699193898, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.6552, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 24 + }, + { + "epoch": 0.08326394671107411, + "grad_norm": 1.9306634203997992, + "learning_rate": 3.333333333333333e-06, + "loss": 1.6288, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 25 + }, + { + "epoch": 0.08659450457951708, + "grad_norm": 1.8839715974459492, + "learning_rate": 3.4666666666666664e-06, + "loss": 1.5772, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 26 + }, + { + "epoch": 0.08992506244796003, + "grad_norm": 1.9004207576591563, + "learning_rate": 3.6e-06, + "loss": 1.6019, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 27 + }, + { + "epoch": 0.093255620316403, + "grad_norm": 1.8508009396241183, + "learning_rate": 3.7333333333333333e-06, + "loss": 1.6347, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 28 + }, + { + "epoch": 0.09658617818484597, + "grad_norm": 1.6521817439090796, + "learning_rate": 3.866666666666666e-06, + "loss": 1.6425, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 29 + }, + { + "epoch": 0.09991673605328892, + "grad_norm": 1.5825237347457706, + "learning_rate": 4e-06, + "loss": 1.4999, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 30 + }, + { + "epoch": 0.10324729392173189, + "grad_norm": 1.4406934972277887, + "learning_rate": 4.133333333333333e-06, + "loss": 1.537, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 31 + }, + { + "epoch": 0.10657785179017486, + "grad_norm": 1.3686252476380623, + "learning_rate": 4.266666666666667e-06, + "loss": 1.5054, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 32 + }, + { + "epoch": 0.10990840965861781, + "grad_norm": 1.190989973623068, + "learning_rate": 4.399999999999999e-06, + "loss": 1.5673, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 33 + }, + { + "epoch": 0.11323896752706078, + "grad_norm": 1.0921718147815354, + "learning_rate": 4.533333333333334e-06, + "loss": 1.5383, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 34 + }, + { + "epoch": 0.11656952539550375, + "grad_norm": 0.9720091603452963, + "learning_rate": 4.666666666666666e-06, + "loss": 1.5698, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 35 + }, + { + "epoch": 0.11990008326394672, + "grad_norm": 0.8634677699569875, + "learning_rate": 4.8e-06, + "loss": 1.5286, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 36 + }, + { + "epoch": 0.12323064113238967, + "grad_norm": 0.7720350215206407, + "learning_rate": 4.933333333333333e-06, + "loss": 1.5897, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 37 + }, + { + "epoch": 0.12656119900083265, + "grad_norm": 0.7351438783567595, + "learning_rate": 5.066666666666666e-06, + "loss": 1.471, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 38 + }, + { + "epoch": 0.1298917568692756, + "grad_norm": 0.6436527036047347, + "learning_rate": 5.2e-06, + "loss": 1.5523, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 39 + }, + { + "epoch": 0.13322231473771856, + "grad_norm": 0.5914433909472115, + "learning_rate": 5.333333333333333e-06, + "loss": 1.5169, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 40 + }, + { + "epoch": 0.13655287260616153, + "grad_norm": 0.5708899134928395, + "learning_rate": 5.466666666666667e-06, + "loss": 1.4727, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 41 + }, + { + "epoch": 0.1398834304746045, + "grad_norm": 0.562979308505682, + "learning_rate": 5.6e-06, + "loss": 1.5101, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 42 + }, + { + "epoch": 0.14321398834304747, + "grad_norm": 0.5333098859373814, + "learning_rate": 5.733333333333332e-06, + "loss": 1.5053, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 43 + }, + { + "epoch": 0.14654454621149043, + "grad_norm": 0.518700589700869, + "learning_rate": 5.866666666666667e-06, + "loss": 1.5522, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 44 + }, + { + "epoch": 0.1498751040799334, + "grad_norm": 0.5123134702021855, + "learning_rate": 5.999999999999999e-06, + "loss": 1.4581, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 45 + }, + { + "epoch": 0.15320566194837634, + "grad_norm": 0.5233023339387923, + "learning_rate": 6.133333333333334e-06, + "loss": 1.4503, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 46 + }, + { + "epoch": 0.1565362198168193, + "grad_norm": 0.4984395351799732, + "learning_rate": 6.266666666666666e-06, + "loss": 1.4698, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 47 + }, + { + "epoch": 0.15986677768526228, + "grad_norm": 0.48116733820243823, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.5399, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 48 + }, + { + "epoch": 0.16319733555370525, + "grad_norm": 0.4917159508967155, + "learning_rate": 6.533333333333333e-06, + "loss": 1.4674, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 49 + }, + { + "epoch": 0.16652789342214822, + "grad_norm": 0.4631697484027289, + "learning_rate": 6.666666666666666e-06, + "loss": 1.5063, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 50 + }, + { + "epoch": 0.16985845129059118, + "grad_norm": 0.4506097490342786, + "learning_rate": 6.8e-06, + "loss": 1.4787, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 51 + }, + { + "epoch": 0.17318900915903415, + "grad_norm": 0.4808943580292107, + "learning_rate": 6.933333333333333e-06, + "loss": 1.5355, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 52 + }, + { + "epoch": 0.1765195670274771, + "grad_norm": 0.4353655566788618, + "learning_rate": 7.066666666666667e-06, + "loss": 1.4545, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 53 + }, + { + "epoch": 0.17985012489592006, + "grad_norm": 0.42881276266179474, + "learning_rate": 7.2e-06, + "loss": 1.4726, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 54 + }, + { + "epoch": 0.18318068276436303, + "grad_norm": 0.4243886425058161, + "learning_rate": 7.333333333333332e-06, + "loss": 1.5364, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 55 + }, + { + "epoch": 0.186511240632806, + "grad_norm": 0.4078516232902407, + "learning_rate": 7.466666666666667e-06, + "loss": 1.5441, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 56 + }, + { + "epoch": 0.18984179850124897, + "grad_norm": 0.39819776399963164, + "learning_rate": 7.599999999999999e-06, + "loss": 1.5394, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 57 + }, + { + "epoch": 0.19317235636969193, + "grad_norm": 0.3993196408685462, + "learning_rate": 7.733333333333333e-06, + "loss": 1.4883, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 58 + }, + { + "epoch": 0.1965029142381349, + "grad_norm": 0.3992257742362516, + "learning_rate": 7.866666666666667e-06, + "loss": 1.4933, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 59 + }, + { + "epoch": 0.19983347210657784, + "grad_norm": 0.39782096872195477, + "learning_rate": 8e-06, + "loss": 1.4729, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 60 + }, + { + "epoch": 0.2031640299750208, + "grad_norm": 0.38436305350726707, + "learning_rate": 7.99851604526062e-06, + "loss": 1.4777, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 61 + }, + { + "epoch": 0.20649458784346378, + "grad_norm": 0.3782583438569582, + "learning_rate": 7.99702712746191e-06, + "loss": 1.535, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 62 + }, + { + "epoch": 0.20982514571190675, + "grad_norm": 0.3910775225847348, + "learning_rate": 7.995533221663874e-06, + "loss": 1.4643, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 63 + }, + { + "epoch": 0.21315570358034971, + "grad_norm": 0.37376830993433585, + "learning_rate": 7.994034302759135e-06, + "loss": 1.4265, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 64 + }, + { + "epoch": 0.21648626144879268, + "grad_norm": 0.37205196740456564, + "learning_rate": 7.99253034547152e-06, + "loss": 1.484, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 65 + }, + { + "epoch": 0.21981681931723562, + "grad_norm": 0.37012462931708767, + "learning_rate": 7.991021324354658e-06, + "loss": 1.4668, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 66 + }, + { + "epoch": 0.2231473771856786, + "grad_norm": 0.36609254541987934, + "learning_rate": 7.989507213790519e-06, + "loss": 1.4512, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 67 + }, + { + "epoch": 0.22647793505412156, + "grad_norm": 0.36389643029866026, + "learning_rate": 7.987987987987988e-06, + "loss": 1.4666, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 68 + }, + { + "epoch": 0.22980849292256453, + "grad_norm": 0.3835942907380993, + "learning_rate": 7.986463620981386e-06, + "loss": 1.5581, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 69 + }, + { + "epoch": 0.2331390507910075, + "grad_norm": 0.3709505537460329, + "learning_rate": 7.984934086629002e-06, + "loss": 1.4942, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 70 + }, + { + "epoch": 0.23646960865945046, + "grad_norm": 0.3734284694232727, + "learning_rate": 7.983399358611582e-06, + "loss": 1.5449, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 71 + }, + { + "epoch": 0.23980016652789343, + "grad_norm": 0.38168285139161445, + "learning_rate": 7.981859410430838e-06, + "loss": 1.4972, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 72 + }, + { + "epoch": 0.24313072439633637, + "grad_norm": 0.4166060644404285, + "learning_rate": 7.98031421540791e-06, + "loss": 1.5273, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 73 + }, + { + "epoch": 0.24646128226477934, + "grad_norm": 0.3721773268353121, + "learning_rate": 7.978763746681835e-06, + "loss": 1.5459, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 74 + }, + { + "epoch": 0.2497918401332223, + "grad_norm": 0.3785109036596187, + "learning_rate": 7.977207977207977e-06, + "loss": 1.5221, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 75 + }, + { + "epoch": 0.2531223980016653, + "grad_norm": 0.3798544993330551, + "learning_rate": 7.975646879756469e-06, + "loss": 1.447, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 76 + }, + { + "epoch": 0.25645295587010825, + "grad_norm": 0.3676458399839075, + "learning_rate": 7.974080426910615e-06, + "loss": 1.552, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 77 + }, + { + "epoch": 0.2597835137385512, + "grad_norm": 0.3906392619486636, + "learning_rate": 7.972508591065292e-06, + "loss": 1.5524, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 78 + }, + { + "epoch": 0.2631140716069942, + "grad_norm": 0.4086059406499793, + "learning_rate": 7.97093134442532e-06, + "loss": 1.5347, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 79 + }, + { + "epoch": 0.2664446294754371, + "grad_norm": 0.37866878925235237, + "learning_rate": 7.969348659003832e-06, + "loss": 1.4263, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 80 + }, + { + "epoch": 0.2697751873438801, + "grad_norm": 0.36630829174020924, + "learning_rate": 7.96776050662061e-06, + "loss": 1.4882, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 81 + }, + { + "epoch": 0.27310574521232306, + "grad_norm": 0.3572953266857883, + "learning_rate": 7.966166858900421e-06, + "loss": 1.4996, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 82 + }, + { + "epoch": 0.27643630308076605, + "grad_norm": 0.37034991529495037, + "learning_rate": 7.964567687271327e-06, + "loss": 1.4558, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 83 + }, + { + "epoch": 0.279766860949209, + "grad_norm": 0.39454254411893813, + "learning_rate": 7.962962962962963e-06, + "loss": 1.481, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 84 + }, + { + "epoch": 0.28309741881765194, + "grad_norm": 0.36598142148212737, + "learning_rate": 7.961352657004831e-06, + "loss": 1.4647, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 85 + }, + { + "epoch": 0.28642797668609493, + "grad_norm": 0.40131025635004997, + "learning_rate": 7.959736740224545e-06, + "loss": 1.486, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 86 + }, + { + "epoch": 0.28975853455453787, + "grad_norm": 0.36813808559204136, + "learning_rate": 7.958115183246073e-06, + "loss": 1.5104, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 87 + }, + { + "epoch": 0.29308909242298087, + "grad_norm": 0.4399054897841581, + "learning_rate": 7.956487956487956e-06, + "loss": 1.5511, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 88 + }, + { + "epoch": 0.2964196502914238, + "grad_norm": 0.4137480663423791, + "learning_rate": 7.95485503016151e-06, + "loss": 1.5431, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 89 + }, + { + "epoch": 0.2997502081598668, + "grad_norm": 0.39082659570701933, + "learning_rate": 7.953216374269006e-06, + "loss": 1.5094, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 90 + }, + { + "epoch": 0.30308076602830974, + "grad_norm": 0.4222547479458042, + "learning_rate": 7.951571958601836e-06, + "loss": 1.528, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 91 + }, + { + "epoch": 0.3064113238967527, + "grad_norm": 0.4565848989524497, + "learning_rate": 7.949921752738653e-06, + "loss": 1.4345, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 92 + }, + { + "epoch": 0.3097418817651957, + "grad_norm": 0.3909465393349193, + "learning_rate": 7.948265726043504e-06, + "loss": 1.4885, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 93 + }, + { + "epoch": 0.3130724396336386, + "grad_norm": 0.40399439020361494, + "learning_rate": 7.946603847663918e-06, + "loss": 1.4836, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 94 + }, + { + "epoch": 0.3164029975020816, + "grad_norm": 0.3940685084379771, + "learning_rate": 7.944936086529007e-06, + "loss": 1.4894, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 95 + }, + { + "epoch": 0.31973355537052456, + "grad_norm": 0.36455393248573603, + "learning_rate": 7.943262411347517e-06, + "loss": 1.4765, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 96 + }, + { + "epoch": 0.32306411323896755, + "grad_norm": 0.42216219555871026, + "learning_rate": 7.94158279060588e-06, + "loss": 1.505, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 97 + }, + { + "epoch": 0.3263946711074105, + "grad_norm": 0.3833612688097333, + "learning_rate": 7.93989719256623e-06, + "loss": 1.4803, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 98 + }, + { + "epoch": 0.32972522897585343, + "grad_norm": 0.3793312412105176, + "learning_rate": 7.938205585264408e-06, + "loss": 1.4721, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 99 + }, + { + "epoch": 0.33305578684429643, + "grad_norm": 0.6231405275420779, + "learning_rate": 7.936507936507936e-06, + "loss": 1.4941, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 100 + }, + { + "epoch": 0.33638634471273937, + "grad_norm": 0.39916108511305454, + "learning_rate": 7.934804213873981e-06, + "loss": 1.5113, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 101 + }, + { + "epoch": 0.33971690258118237, + "grad_norm": 0.39832888981715536, + "learning_rate": 7.933094384707288e-06, + "loss": 1.4616, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 102 + }, + { + "epoch": 0.3430474604496253, + "grad_norm": 0.35554379353616694, + "learning_rate": 7.931378416118093e-06, + "loss": 1.4754, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 103 + }, + { + "epoch": 0.3463780183180683, + "grad_norm": 0.3778786204869107, + "learning_rate": 7.929656274980016e-06, + "loss": 1.5204, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 104 + }, + { + "epoch": 0.34970857618651124, + "grad_norm": 0.3979509981477904, + "learning_rate": 7.927927927927927e-06, + "loss": 1.4972, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 105 + }, + { + "epoch": 0.3530391340549542, + "grad_norm": 0.3829152377900939, + "learning_rate": 7.926193341355797e-06, + "loss": 1.4852, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 106 + }, + { + "epoch": 0.3563696919233972, + "grad_norm": 0.3783230292732417, + "learning_rate": 7.924452481414507e-06, + "loss": 1.4605, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 107 + }, + { + "epoch": 0.3597002497918401, + "grad_norm": 0.3702225917786687, + "learning_rate": 7.922705314009662e-06, + "loss": 1.4751, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 108 + }, + { + "epoch": 0.3630308076602831, + "grad_norm": 0.423076463648796, + "learning_rate": 7.920951804799353e-06, + "loss": 1.5043, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 109 + }, + { + "epoch": 0.36636136552872606, + "grad_norm": 0.4015775298544568, + "learning_rate": 7.919191919191919e-06, + "loss": 1.4993, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 110 + }, + { + "epoch": 0.36969192339716905, + "grad_norm": 0.395772531232646, + "learning_rate": 7.917425622343655e-06, + "loss": 1.5074, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 111 + }, + { + "epoch": 0.373022481265612, + "grad_norm": 0.4063035915678222, + "learning_rate": 7.915652879156528e-06, + "loss": 1.5005, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 112 + }, + { + "epoch": 0.37635303913405493, + "grad_norm": 0.3749049780160411, + "learning_rate": 7.913873654275848e-06, + "loss": 1.5016, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 113 + }, + { + "epoch": 0.37968359700249793, + "grad_norm": 0.40207184709524446, + "learning_rate": 7.912087912087911e-06, + "loss": 1.5112, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 114 + }, + { + "epoch": 0.38301415487094087, + "grad_norm": 0.3761596500147066, + "learning_rate": 7.910295616717634e-06, + "loss": 1.4226, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 115 + }, + { + "epoch": 0.38634471273938387, + "grad_norm": 0.34919530357614503, + "learning_rate": 7.908496732026144e-06, + "loss": 1.454, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 116 + }, + { + "epoch": 0.3896752706078268, + "grad_norm": 0.3783249892281946, + "learning_rate": 7.906691221608348e-06, + "loss": 1.3926, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 117 + }, + { + "epoch": 0.3930058284762698, + "grad_norm": 0.38789047851939196, + "learning_rate": 7.904879048790487e-06, + "loss": 1.5148, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 118 + }, + { + "epoch": 0.39633638634471274, + "grad_norm": 0.38028310552438055, + "learning_rate": 7.903060176627645e-06, + "loss": 1.5512, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 119 + }, + { + "epoch": 0.3996669442131557, + "grad_norm": 0.3557857851285413, + "learning_rate": 7.901234567901235e-06, + "loss": 1.5145, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 120 + }, + { + "epoch": 0.4029975020815987, + "grad_norm": 0.3648126505851961, + "learning_rate": 7.89940218511647e-06, + "loss": 1.4616, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 121 + }, + { + "epoch": 0.4063280599500416, + "grad_norm": 0.3518641114757544, + "learning_rate": 7.897562990499793e-06, + "loss": 1.4444, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 122 + }, + { + "epoch": 0.4096586178184846, + "grad_norm": 0.3812409352914946, + "learning_rate": 7.895716945996276e-06, + "loss": 1.4524, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 123 + }, + { + "epoch": 0.41298917568692756, + "grad_norm": 0.37136499335096407, + "learning_rate": 7.893864013266998e-06, + "loss": 1.4495, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 124 + }, + { + "epoch": 0.4163197335553705, + "grad_norm": 0.36965696298407785, + "learning_rate": 7.892004153686396e-06, + "loss": 1.454, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 125 + }, + { + "epoch": 0.4196502914238135, + "grad_norm": 0.37625883797439813, + "learning_rate": 7.890137328339575e-06, + "loss": 1.4738, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 126 + }, + { + "epoch": 0.42298084929225643, + "grad_norm": 0.3891305395409707, + "learning_rate": 7.888263498019595e-06, + "loss": 1.4336, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 127 + }, + { + "epoch": 0.42631140716069943, + "grad_norm": 0.33836499033668194, + "learning_rate": 7.886382623224727e-06, + "loss": 1.4435, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 128 + }, + { + "epoch": 0.42964196502914237, + "grad_norm": 0.39084535016086686, + "learning_rate": 7.88449466415568e-06, + "loss": 1.4598, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 129 + }, + { + "epoch": 0.43297252289758537, + "grad_norm": 0.3896756879145717, + "learning_rate": 7.882599580712787e-06, + "loss": 1.5065, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 130 + }, + { + "epoch": 0.4363030807660283, + "grad_norm": 0.4252827004151611, + "learning_rate": 7.880697332493174e-06, + "loss": 1.4083, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 131 + }, + { + "epoch": 0.43963363863447125, + "grad_norm": 0.3608365697753635, + "learning_rate": 7.878787878787878e-06, + "loss": 1.441, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 132 + }, + { + "epoch": 0.44296419650291424, + "grad_norm": 0.403123415092978, + "learning_rate": 7.876871178578958e-06, + "loss": 1.4627, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 133 + }, + { + "epoch": 0.4462947543713572, + "grad_norm": 0.40013457143727, + "learning_rate": 7.874947190536545e-06, + "loss": 1.4955, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 134 + }, + { + "epoch": 0.4496253122398002, + "grad_norm": 0.3883976625001682, + "learning_rate": 7.873015873015873e-06, + "loss": 1.4298, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 135 + }, + { + "epoch": 0.4529558701082431, + "grad_norm": 0.47893723454397114, + "learning_rate": 7.871077184054283e-06, + "loss": 1.4706, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 136 + }, + { + "epoch": 0.4562864279766861, + "grad_norm": 0.3939594731251799, + "learning_rate": 7.869131081368174e-06, + "loss": 1.4659, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 137 + }, + { + "epoch": 0.45961698584512906, + "grad_norm": 0.39872483940488357, + "learning_rate": 7.867177522349935e-06, + "loss": 1.4428, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 138 + }, + { + "epoch": 0.462947543713572, + "grad_norm": 0.41681968734219343, + "learning_rate": 7.865216464064831e-06, + "loss": 1.5116, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 139 + }, + { + "epoch": 0.466278101582015, + "grad_norm": 0.3950334535334994, + "learning_rate": 7.863247863247863e-06, + "loss": 1.4453, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 140 + }, + { + "epoch": 0.46960865945045793, + "grad_norm": 0.3569883912128034, + "learning_rate": 7.861271676300578e-06, + "loss": 1.462, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 141 + }, + { + "epoch": 0.47293921731890093, + "grad_norm": 0.3784473417547298, + "learning_rate": 7.85928785928786e-06, + "loss": 1.4961, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 142 + }, + { + "epoch": 0.47626977518734387, + "grad_norm": 0.35459480974078084, + "learning_rate": 7.857296367934665e-06, + "loss": 1.5362, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 143 + }, + { + "epoch": 0.47960033305578686, + "grad_norm": 0.3662426670901604, + "learning_rate": 7.85529715762274e-06, + "loss": 1.3832, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 144 + }, + { + "epoch": 0.4829308909242298, + "grad_norm": 0.4066610425979986, + "learning_rate": 7.85329018338727e-06, + "loss": 1.4641, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 145 + }, + { + "epoch": 0.48626144879267275, + "grad_norm": 0.3545713986492447, + "learning_rate": 7.851275399913532e-06, + "loss": 1.4675, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 146 + }, + { + "epoch": 0.48959200666111574, + "grad_norm": 0.3664688735051096, + "learning_rate": 7.849252761533463e-06, + "loss": 1.4683, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 147 + }, + { + "epoch": 0.4929225645295587, + "grad_norm": 0.3733605661751341, + "learning_rate": 7.847222222222221e-06, + "loss": 1.4315, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 148 + }, + { + "epoch": 0.4962531223980017, + "grad_norm": 0.3380374141462393, + "learning_rate": 7.845183735594695e-06, + "loss": 1.4401, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 149 + }, + { + "epoch": 0.4995836802664446, + "grad_norm": 0.407518229964574, + "learning_rate": 7.84313725490196e-06, + "loss": 1.4437, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 150 + }, + { + "epoch": 0.5029142381348876, + "grad_norm": 0.36142186690399497, + "learning_rate": 7.841082733027723e-06, + "loss": 1.4444, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 151 + }, + { + "epoch": 0.5062447960033306, + "grad_norm": 0.35245555484230673, + "learning_rate": 7.839020122484688e-06, + "loss": 1.4013, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 152 + }, + { + "epoch": 0.5095753538717736, + "grad_norm": 0.3751518274944043, + "learning_rate": 7.836949375410913e-06, + "loss": 1.4325, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 153 + }, + { + "epoch": 0.5129059117402165, + "grad_norm": 0.364299926744196, + "learning_rate": 7.834870443566096e-06, + "loss": 1.4757, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 154 + }, + { + "epoch": 0.5162364696086594, + "grad_norm": 0.352709296353453, + "learning_rate": 7.832783278327833e-06, + "loss": 1.4405, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 155 + }, + { + "epoch": 0.5195670274771024, + "grad_norm": 0.3595882748979197, + "learning_rate": 7.830687830687831e-06, + "loss": 1.5005, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 156 + }, + { + "epoch": 0.5228975853455454, + "grad_norm": 0.3663062717533196, + "learning_rate": 7.828584051248068e-06, + "loss": 1.4916, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 157 + }, + { + "epoch": 0.5262281432139884, + "grad_norm": 0.39230898190550817, + "learning_rate": 7.82647189021691e-06, + "loss": 1.5925, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 158 + }, + { + "epoch": 0.5295587010824313, + "grad_norm": 0.34764213510621217, + "learning_rate": 7.824351297405189e-06, + "loss": 1.533, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 159 + }, + { + "epoch": 0.5328892589508742, + "grad_norm": 0.4356036173061448, + "learning_rate": 7.822222222222222e-06, + "loss": 1.4768, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 160 + }, + { + "epoch": 0.5362198168193172, + "grad_norm": 0.3650633676087402, + "learning_rate": 7.820084613671788e-06, + "loss": 1.4834, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 161 + }, + { + "epoch": 0.5395503746877602, + "grad_norm": 0.36003662026404476, + "learning_rate": 7.81793842034806e-06, + "loss": 1.4745, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 162 + }, + { + "epoch": 0.5428809325562032, + "grad_norm": 0.45089712637002705, + "learning_rate": 7.815783590431477e-06, + "loss": 1.4762, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 163 + }, + { + "epoch": 0.5462114904246461, + "grad_norm": 0.3727325869359898, + "learning_rate": 7.813620071684589e-06, + "loss": 1.4605, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 164 + }, + { + "epoch": 0.5495420482930891, + "grad_norm": 0.3396845072209277, + "learning_rate": 7.81144781144781e-06, + "loss": 1.4793, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 165 + }, + { + "epoch": 0.5528726061615321, + "grad_norm": 0.35005093334327886, + "learning_rate": 7.809266756635177e-06, + "loss": 1.4699, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 166 + }, + { + "epoch": 0.556203164029975, + "grad_norm": 0.3836826797224187, + "learning_rate": 7.807076853729998e-06, + "loss": 1.4727, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 167 + }, + { + "epoch": 0.559533721898418, + "grad_norm": 0.3866747204941054, + "learning_rate": 7.804878048780487e-06, + "loss": 1.4656, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 168 + }, + { + "epoch": 0.5628642797668609, + "grad_norm": 0.3754060351606817, + "learning_rate": 7.802670287395338e-06, + "loss": 1.4427, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 169 + }, + { + "epoch": 0.5661948376353039, + "grad_norm": 0.3560392764595894, + "learning_rate": 7.80045351473923e-06, + "loss": 1.469, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 170 + }, + { + "epoch": 0.5695253955037469, + "grad_norm": 0.38308734497417124, + "learning_rate": 7.79822767552829e-06, + "loss": 1.5086, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 171 + }, + { + "epoch": 0.5728559533721899, + "grad_norm": 0.359072776955667, + "learning_rate": 7.7959927140255e-06, + "loss": 1.4531, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 172 + }, + { + "epoch": 0.5761865112406328, + "grad_norm": 0.3922686356985507, + "learning_rate": 7.793748574036049e-06, + "loss": 1.5004, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 173 + }, + { + "epoch": 0.5795170691090757, + "grad_norm": 0.38139432931110967, + "learning_rate": 7.791495198902606e-06, + "loss": 1.4596, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 174 + }, + { + "epoch": 0.5828476269775187, + "grad_norm": 0.386725308323352, + "learning_rate": 7.789232531500573e-06, + "loss": 1.4107, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 175 + }, + { + "epoch": 0.5861781848459617, + "grad_norm": 0.3590860738790805, + "learning_rate": 7.786960514233242e-06, + "loss": 1.4748, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 176 + }, + { + "epoch": 0.5895087427144047, + "grad_norm": 0.38618438471328675, + "learning_rate": 7.784679089026915e-06, + "loss": 1.481, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 177 + }, + { + "epoch": 0.5928393005828476, + "grad_norm": 0.35271293932202913, + "learning_rate": 7.782388197325957e-06, + "loss": 1.4445, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 178 + }, + { + "epoch": 0.5961698584512906, + "grad_norm": 0.3587339394337467, + "learning_rate": 7.78008778008778e-06, + "loss": 1.482, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 179 + }, + { + "epoch": 0.5995004163197336, + "grad_norm": 0.4051854093619042, + "learning_rate": 7.777777777777777e-06, + "loss": 1.4538, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 180 + }, + { + "epoch": 0.6028309741881765, + "grad_norm": 0.3624830177676393, + "learning_rate": 7.775458130364185e-06, + "loss": 1.3882, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 181 + }, + { + "epoch": 0.6061615320566195, + "grad_norm": 0.40327439887058536, + "learning_rate": 7.773128777312878e-06, + "loss": 1.4439, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 182 + }, + { + "epoch": 0.6094920899250624, + "grad_norm": 0.3849485884337272, + "learning_rate": 7.77078965758211e-06, + "loss": 1.4598, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 183 + }, + { + "epoch": 0.6128226477935054, + "grad_norm": 0.3800546336095655, + "learning_rate": 7.76844070961718e-06, + "loss": 1.5077, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 184 + }, + { + "epoch": 0.6161532056619484, + "grad_norm": 0.4058514640829756, + "learning_rate": 7.766081871345029e-06, + "loss": 1.4557, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 185 + }, + { + "epoch": 0.6194837635303914, + "grad_norm": 0.3547116281420189, + "learning_rate": 7.763713080168775e-06, + "loss": 1.4465, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 186 + }, + { + "epoch": 0.6228143213988343, + "grad_norm": 0.36935786461716674, + "learning_rate": 7.76133427296218e-06, + "loss": 1.3674, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 187 + }, + { + "epoch": 0.6261448792672772, + "grad_norm": 0.3465066682351456, + "learning_rate": 7.75894538606403e-06, + "loss": 1.5018, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 188 + }, + { + "epoch": 0.6294754371357202, + "grad_norm": 0.37821929789921876, + "learning_rate": 7.75654635527247e-06, + "loss": 1.46, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 189 + }, + { + "epoch": 0.6328059950041632, + "grad_norm": 0.42147975033129337, + "learning_rate": 7.754137115839244e-06, + "loss": 1.4324, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 190 + }, + { + "epoch": 0.6361365528726062, + "grad_norm": 0.395112631651776, + "learning_rate": 7.751717602463872e-06, + "loss": 1.4682, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 191 + }, + { + "epoch": 0.6394671107410491, + "grad_norm": 0.3866087697502269, + "learning_rate": 7.749287749287749e-06, + "loss": 1.4845, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 192 + }, + { + "epoch": 0.6427976686094921, + "grad_norm": 0.39380953384339784, + "learning_rate": 7.746847489888173e-06, + "loss": 1.4628, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 193 + }, + { + "epoch": 0.6461282264779351, + "grad_norm": 0.38499086799547566, + "learning_rate": 7.744396757272294e-06, + "loss": 1.4485, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 194 + }, + { + "epoch": 0.649458784346378, + "grad_norm": 0.3628021970554608, + "learning_rate": 7.741935483870966e-06, + "loss": 1.4306, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 195 + }, + { + "epoch": 0.652789342214821, + "grad_norm": 0.37886204626432507, + "learning_rate": 7.739463601532567e-06, + "loss": 1.4178, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 196 + }, + { + "epoch": 0.6561199000832639, + "grad_norm": 0.36347566586862995, + "learning_rate": 7.736981041516678e-06, + "loss": 1.3917, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 197 + }, + { + "epoch": 0.6594504579517069, + "grad_norm": 0.3808525608826558, + "learning_rate": 7.734487734487733e-06, + "loss": 1.425, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 198 + }, + { + "epoch": 0.6627810158201499, + "grad_norm": 0.36703672958616185, + "learning_rate": 7.731983610508556e-06, + "loss": 1.3963, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 199 + }, + { + "epoch": 0.6661115736885929, + "grad_norm": 0.3449284331155099, + "learning_rate": 7.729468599033817e-06, + "loss": 1.5389, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 200 + }, + { + "epoch": 0.6694421315570358, + "grad_norm": 0.38098871722055255, + "learning_rate": 7.726942628903412e-06, + "loss": 1.4354, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 201 + }, + { + "epoch": 0.6727726894254787, + "grad_norm": 0.37447535098026113, + "learning_rate": 7.72440562833576e-06, + "loss": 1.4238, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 202 + }, + { + "epoch": 0.6761032472939217, + "grad_norm": 0.3815996192127943, + "learning_rate": 7.721857524920983e-06, + "loss": 1.4465, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 203 + }, + { + "epoch": 0.6794338051623647, + "grad_norm": 0.43830167523580127, + "learning_rate": 7.719298245614036e-06, + "loss": 1.4464, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 204 + }, + { + "epoch": 0.6827643630308077, + "grad_norm": 0.49374391843463344, + "learning_rate": 7.716727716727717e-06, + "loss": 1.4326, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 205 + }, + { + "epoch": 0.6860949208992506, + "grad_norm": 0.40611516537871767, + "learning_rate": 7.714145863925599e-06, + "loss": 1.4867, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 206 + }, + { + "epoch": 0.6894254787676936, + "grad_norm": 0.39306412548059455, + "learning_rate": 7.711552612214863e-06, + "loss": 1.4879, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 207 + }, + { + "epoch": 0.6927560366361366, + "grad_norm": 0.3732547746311456, + "learning_rate": 7.708947885939036e-06, + "loss": 1.5305, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 208 + }, + { + "epoch": 0.6960865945045795, + "grad_norm": 0.3749992070235647, + "learning_rate": 7.706331608770632e-06, + "loss": 1.4422, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 209 + }, + { + "epoch": 0.6994171523730225, + "grad_norm": 0.4236632648954227, + "learning_rate": 7.703703703703702e-06, + "loss": 1.4362, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 210 + }, + { + "epoch": 0.7027477102414654, + "grad_norm": 0.3799687473741569, + "learning_rate": 7.701064093046274e-06, + "loss": 1.512, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 211 + }, + { + "epoch": 0.7060782681099084, + "grad_norm": 0.3724271784543797, + "learning_rate": 7.698412698412699e-06, + "loss": 1.469, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 212 + }, + { + "epoch": 0.7094088259783514, + "grad_norm": 0.364477503994216, + "learning_rate": 7.695749440715883e-06, + "loss": 1.4811, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 213 + }, + { + "epoch": 0.7127393838467944, + "grad_norm": 0.3925520005032744, + "learning_rate": 7.693074240159441e-06, + "loss": 1.5027, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 214 + }, + { + "epoch": 0.7160699417152373, + "grad_norm": 0.40921223587397654, + "learning_rate": 7.690387016229713e-06, + "loss": 1.488, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 215 + }, + { + "epoch": 0.7194004995836802, + "grad_norm": 0.3981162315328969, + "learning_rate": 7.687687687687688e-06, + "loss": 1.4343, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 216 + }, + { + "epoch": 0.7227310574521232, + "grad_norm": 0.35388766488814566, + "learning_rate": 7.684976172560823e-06, + "loss": 1.4599, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 217 + }, + { + "epoch": 0.7260616153205662, + "grad_norm": 0.3449802535833205, + "learning_rate": 7.682252388134742e-06, + "loss": 1.442, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 218 + }, + { + "epoch": 0.7293921731890092, + "grad_norm": 0.34627676487411824, + "learning_rate": 7.679516250944822e-06, + "loss": 1.4461, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 219 + }, + { + "epoch": 0.7327227310574521, + "grad_norm": 0.35799089084524466, + "learning_rate": 7.676767676767677e-06, + "loss": 1.4731, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 220 + }, + { + "epoch": 0.736053288925895, + "grad_norm": 0.3820520257947768, + "learning_rate": 7.674006580612503e-06, + "loss": 1.4566, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 221 + }, + { + "epoch": 0.7393838467943381, + "grad_norm": 0.3641120307221186, + "learning_rate": 7.671232876712327e-06, + "loss": 1.4525, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 222 + }, + { + "epoch": 0.742714404662781, + "grad_norm": 0.37136269720782134, + "learning_rate": 7.668446478515128e-06, + "loss": 1.4548, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 223 + }, + { + "epoch": 0.746044962531224, + "grad_norm": 0.4138383130083843, + "learning_rate": 7.665647298674822e-06, + "loss": 1.5395, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 224 + }, + { + "epoch": 0.7493755203996669, + "grad_norm": 0.37512729325167443, + "learning_rate": 7.662835249042145e-06, + "loss": 1.4348, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 225 + }, + { + "epoch": 0.7527060782681099, + "grad_norm": 0.3574220209010036, + "learning_rate": 7.660010240655401e-06, + "loss": 1.4205, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 226 + }, + { + "epoch": 0.7560366361365529, + "grad_norm": 0.3509015504877034, + "learning_rate": 7.657172183731076e-06, + "loss": 1.4074, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 227 + }, + { + "epoch": 0.7593671940049959, + "grad_norm": 0.4191818637620366, + "learning_rate": 7.654320987654322e-06, + "loss": 1.434, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 228 + }, + { + "epoch": 0.7626977518734388, + "grad_norm": 0.38073125720358314, + "learning_rate": 7.651456560969322e-06, + "loss": 1.4315, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 229 + }, + { + "epoch": 0.7660283097418817, + "grad_norm": 0.3489534004367162, + "learning_rate": 7.648578811369509e-06, + "loss": 1.4292, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 230 + }, + { + "epoch": 0.7693588676103247, + "grad_norm": 0.39880199669766575, + "learning_rate": 7.645687645687645e-06, + "loss": 1.4797, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 231 + }, + { + "epoch": 0.7726894254787677, + "grad_norm": 0.3377554646810836, + "learning_rate": 7.642782969885774e-06, + "loss": 1.3638, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 232 + }, + { + "epoch": 0.7760199833472107, + "grad_norm": 0.45577113603344144, + "learning_rate": 7.639864689045015e-06, + "loss": 1.5272, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 233 + }, + { + "epoch": 0.7793505412156536, + "grad_norm": 0.3872639106321951, + "learning_rate": 7.636932707355241e-06, + "loss": 1.5223, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 234 + }, + { + "epoch": 0.7826810990840966, + "grad_norm": 0.41241615465906434, + "learning_rate": 7.633986928104575e-06, + "loss": 1.4047, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 235 + }, + { + "epoch": 0.7860116569525396, + "grad_norm": 0.350902547985464, + "learning_rate": 7.631027253668762e-06, + "loss": 1.4599, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 236 + }, + { + "epoch": 0.7893422148209825, + "grad_norm": 0.36780129033305325, + "learning_rate": 7.6280535855003936e-06, + "loss": 1.4872, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 237 + }, + { + "epoch": 0.7926727726894255, + "grad_norm": 0.3504301681190647, + "learning_rate": 7.625065824117956e-06, + "loss": 1.4508, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 238 + }, + { + "epoch": 0.7960033305578684, + "grad_norm": 0.425786005279154, + "learning_rate": 7.622063869094748e-06, + "loss": 1.5359, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 239 + }, + { + "epoch": 0.7993338884263114, + "grad_norm": 0.3423914333711706, + "learning_rate": 7.619047619047619e-06, + "loss": 1.5116, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 240 + }, + { + "epoch": 0.8026644462947544, + "grad_norm": 0.39752748882813016, + "learning_rate": 7.616016971625564e-06, + "loss": 1.3967, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 241 + }, + { + "epoch": 0.8059950041631974, + "grad_norm": 0.35349720101513005, + "learning_rate": 7.61297182349814e-06, + "loss": 1.428, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 242 + }, + { + "epoch": 0.8093255620316403, + "grad_norm": 0.3592529486243108, + "learning_rate": 7.609912070343725e-06, + "loss": 1.4716, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 243 + }, + { + "epoch": 0.8126561199000832, + "grad_norm": 0.41007914987868593, + "learning_rate": 7.606837606837607e-06, + "loss": 1.4601, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 244 + }, + { + "epoch": 0.8159866777685262, + "grad_norm": 0.4368820717106569, + "learning_rate": 7.603748326639893e-06, + "loss": 1.4299, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 245 + }, + { + "epoch": 0.8193172356369692, + "grad_norm": 0.34781376516299506, + "learning_rate": 7.600644122383253e-06, + "loss": 1.3832, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 246 + }, + { + "epoch": 0.8226477935054122, + "grad_norm": 0.4378928638690168, + "learning_rate": 7.597524885660478e-06, + "loss": 1.5006, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 247 + }, + { + "epoch": 0.8259783513738551, + "grad_norm": 0.38866511125189074, + "learning_rate": 7.594390507011865e-06, + "loss": 1.3808, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 248 + }, + { + "epoch": 0.829308909242298, + "grad_norm": 0.3796151796802332, + "learning_rate": 7.591240875912408e-06, + "loss": 1.4048, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 249 + }, + { + "epoch": 0.832639467110741, + "grad_norm": 0.47512939093169254, + "learning_rate": 7.588075880758807e-06, + "loss": 1.4533, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 250 + }, + { + "epoch": 0.835970024979184, + "grad_norm": 0.4002177494781384, + "learning_rate": 7.584895408856289e-06, + "loss": 1.4364, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 251 + }, + { + "epoch": 0.839300582847627, + "grad_norm": 0.41628446885968545, + "learning_rate": 7.581699346405228e-06, + "loss": 1.5213, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 252 + }, + { + "epoch": 0.8426311407160699, + "grad_norm": 0.41586597700526384, + "learning_rate": 7.578487578487578e-06, + "loss": 1.4163, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 253 + }, + { + "epoch": 0.8459616985845129, + "grad_norm": 0.37080164887555395, + "learning_rate": 7.575259989053093e-06, + "loss": 1.4262, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 254 + }, + { + "epoch": 0.8492922564529559, + "grad_norm": 0.44276862899193814, + "learning_rate": 7.57201646090535e-06, + "loss": 1.4434, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 255 + }, + { + "epoch": 0.8526228143213989, + "grad_norm": 0.3565514945143501, + "learning_rate": 7.568756875687569e-06, + "loss": 1.4628, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 256 + }, + { + "epoch": 0.8559533721898418, + "grad_norm": 0.3424453222650746, + "learning_rate": 7.565481113868211e-06, + "loss": 1.4397, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 257 + }, + { + "epoch": 0.8592839300582847, + "grad_norm": 0.36361177745212486, + "learning_rate": 7.562189054726368e-06, + "loss": 1.434, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 258 + }, + { + "epoch": 0.8626144879267277, + "grad_norm": 0.39961369778575284, + "learning_rate": 7.558880576336936e-06, + "loss": 1.3263, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 259 + }, + { + "epoch": 0.8659450457951707, + "grad_norm": 0.3694683835624918, + "learning_rate": 7.555555555555556e-06, + "loss": 1.465, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 260 + }, + { + "epoch": 0.8692756036636137, + "grad_norm": 0.38926907075141, + "learning_rate": 7.552213868003341e-06, + "loss": 1.4639, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 261 + }, + { + "epoch": 0.8726061615320566, + "grad_norm": 0.41002402289266, + "learning_rate": 7.548855388051367e-06, + "loss": 1.4583, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 262 + }, + { + "epoch": 0.8759367194004996, + "grad_norm": 0.39476689396263037, + "learning_rate": 7.545479988804925e-06, + "loss": 1.5369, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 263 + }, + { + "epoch": 0.8792672772689425, + "grad_norm": 0.42338165790994337, + "learning_rate": 7.542087542087541e-06, + "loss": 1.4149, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 264 + }, + { + "epoch": 0.8825978351373855, + "grad_norm": 0.37580056414171503, + "learning_rate": 7.538677918424753e-06, + "loss": 1.4767, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 265 + }, + { + "epoch": 0.8859283930058285, + "grad_norm": 0.35524395112624974, + "learning_rate": 7.535250987027637e-06, + "loss": 1.4565, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 266 + }, + { + "epoch": 0.8892589508742714, + "grad_norm": 0.333001020301385, + "learning_rate": 7.531806615776081e-06, + "loss": 1.4653, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 267 + }, + { + "epoch": 0.8925895087427144, + "grad_norm": 0.325740334034441, + "learning_rate": 7.5283446712018136e-06, + "loss": 1.4583, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 268 + }, + { + "epoch": 0.8959200666111574, + "grad_norm": 0.3579186582787629, + "learning_rate": 7.524865018471157e-06, + "loss": 1.441, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 269 + }, + { + "epoch": 0.8992506244796004, + "grad_norm": 0.38100337783570354, + "learning_rate": 7.521367521367521e-06, + "loss": 1.4725, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 270 + }, + { + "epoch": 0.9025811823480433, + "grad_norm": 0.4439224251441086, + "learning_rate": 7.5178520422736365e-06, + "loss": 1.4433, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 271 + }, + { + "epoch": 0.9059117402164862, + "grad_norm": 0.36404138775247186, + "learning_rate": 7.514318442153494e-06, + "loss": 1.4502, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 272 + }, + { + "epoch": 0.9092422980849292, + "grad_norm": 0.3739016590981095, + "learning_rate": 7.5107665805340226e-06, + "loss": 1.5614, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 273 + }, + { + "epoch": 0.9125728559533722, + "grad_norm": 0.34618213523089303, + "learning_rate": 7.5071963154864715e-06, + "loss": 1.4818, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 274 + }, + { + "epoch": 0.9159034138218152, + "grad_norm": 0.3516565079552471, + "learning_rate": 7.5036075036075024e-06, + "loss": 1.4811, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 275 + }, + { + "epoch": 0.9192339716902581, + "grad_norm": 0.372847709765313, + "learning_rate": 7.499999999999999e-06, + "loss": 1.4314, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 276 + }, + { + "epoch": 0.922564529558701, + "grad_norm": 0.33633722585110437, + "learning_rate": 7.496373658253553e-06, + "loss": 1.467, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 277 + }, + { + "epoch": 0.925895087427144, + "grad_norm": 0.3502677047499933, + "learning_rate": 7.4927283304246645e-06, + "loss": 1.465, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 278 + }, + { + "epoch": 0.929225645295587, + "grad_norm": 0.3301559549021256, + "learning_rate": 7.4890638670166225e-06, + "loss": 1.4345, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 279 + }, + { + "epoch": 0.93255620316403, + "grad_norm": 0.3995138440783666, + "learning_rate": 7.485380116959064e-06, + "loss": 1.3383, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 280 + }, + { + "epoch": 0.9358867610324729, + "grad_norm": 0.3871081436839965, + "learning_rate": 7.481676927587217e-06, + "loss": 1.4796, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 281 + }, + { + "epoch": 0.9392173189009159, + "grad_norm": 0.4376413636164512, + "learning_rate": 7.4779541446208115e-06, + "loss": 1.4644, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 282 + }, + { + "epoch": 0.9425478767693589, + "grad_norm": 0.43206377875237645, + "learning_rate": 7.474211612142647e-06, + "loss": 1.4107, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 283 + }, + { + "epoch": 0.9458784346378019, + "grad_norm": 0.4025303715871277, + "learning_rate": 7.470449172576832e-06, + "loss": 1.4318, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 284 + }, + { + "epoch": 0.9492089925062448, + "grad_norm": 0.37724819780360036, + "learning_rate": 7.466666666666667e-06, + "loss": 1.4454, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 285 + }, + { + "epoch": 0.9525395503746877, + "grad_norm": 0.35328542805788227, + "learning_rate": 7.462863933452169e-06, + "loss": 1.4175, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 286 + }, + { + "epoch": 0.9558701082431307, + "grad_norm": 0.3788093515621439, + "learning_rate": 7.459040810247245e-06, + "loss": 1.4429, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 287 + }, + { + "epoch": 0.9592006661115737, + "grad_norm": 0.4018461850957888, + "learning_rate": 7.455197132616486e-06, + "loss": 1.4679, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 288 + }, + { + "epoch": 0.9625312239800167, + "grad_norm": 0.3792573314031364, + "learning_rate": 7.451332734351601e-06, + "loss": 1.5191, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 289 + }, + { + "epoch": 0.9658617818484596, + "grad_norm": 0.4173737668171256, + "learning_rate": 7.447447447447447e-06, + "loss": 1.4999, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 290 + }, + { + "epoch": 0.9691923397169026, + "grad_norm": 0.34073643176316165, + "learning_rate": 7.443541102077687e-06, + "loss": 1.3667, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 291 + }, + { + "epoch": 0.9725228975853455, + "grad_norm": 0.37255255694817807, + "learning_rate": 7.439613526570048e-06, + "loss": 1.4196, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 292 + }, + { + "epoch": 0.9758534554537885, + "grad_norm": 0.3751657637349412, + "learning_rate": 7.435664547381168e-06, + "loss": 1.4545, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 293 + }, + { + "epoch": 0.9791840133222315, + "grad_norm": 0.45200307278108437, + "learning_rate": 7.431693989071039e-06, + "loss": 1.4199, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 294 + }, + { + "epoch": 0.9825145711906744, + "grad_norm": 0.3889800375255201, + "learning_rate": 7.427701674277017e-06, + "loss": 1.4801, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 295 + }, + { + "epoch": 0.9858451290591174, + "grad_norm": 0.3473904572951369, + "learning_rate": 7.4236874236874235e-06, + "loss": 1.393, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 296 + }, + { + "epoch": 0.9891756869275604, + "grad_norm": 0.3581161377664693, + "learning_rate": 7.419651056014692e-06, + "loss": 1.4073, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 297 + }, + { + "epoch": 0.9925062447960034, + "grad_norm": 0.35620326479274533, + "learning_rate": 7.415592387968079e-06, + "loss": 1.43, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 298 + }, + { + "epoch": 0.9958368026644463, + "grad_norm": 0.3676482591149261, + "learning_rate": 7.4115112342259155e-06, + "loss": 1.4148, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 299 + }, + { + "epoch": 0.9991673605328892, + "grad_norm": 0.43615077194471996, + "learning_rate": 7.407407407407408e-06, + "loss": 1.4154, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 300 + }, + { + "epoch": 1.0, + "grad_norm": 0.5905255990041776, + "learning_rate": 7.403280718043948e-06, + "loss": 1.4231, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 301 + }, + { + "epoch": 1.003330557868443, + "grad_norm": 0.4020383385971024, + "learning_rate": 7.399130974549968e-06, + "loss": 1.4394, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 302 + }, + { + "epoch": 1.0066611157368859, + "grad_norm": 0.3865309278317666, + "learning_rate": 7.394957983193276e-06, + "loss": 1.4413, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 303 + }, + { + "epoch": 1.009991673605329, + "grad_norm": 0.40327921143010825, + "learning_rate": 7.390761548064918e-06, + "loss": 1.4576, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 304 + }, + { + "epoch": 1.0133222314737718, + "grad_norm": 0.4070700680157323, + "learning_rate": 7.386541471048513e-06, + "loss": 1.4817, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 305 + }, + { + "epoch": 1.0166527893422148, + "grad_norm": 0.43460108634631706, + "learning_rate": 7.382297551789077e-06, + "loss": 1.3939, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 306 + }, + { + "epoch": 1.0199833472106579, + "grad_norm": 0.42277608172713, + "learning_rate": 7.378029587661315e-06, + "loss": 1.4239, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 307 + }, + { + "epoch": 1.0233139050791007, + "grad_norm": 0.37583795807106635, + "learning_rate": 7.373737373737374e-06, + "loss": 1.4878, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 308 + }, + { + "epoch": 1.0266444629475437, + "grad_norm": 0.4013147771199415, + "learning_rate": 7.3694207027540355e-06, + "loss": 1.3758, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 309 + }, + { + "epoch": 1.0299750208159866, + "grad_norm": 0.38150977748656323, + "learning_rate": 7.365079365079365e-06, + "loss": 1.4122, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 310 + }, + { + "epoch": 1.0333055786844296, + "grad_norm": 0.36288526161353013, + "learning_rate": 7.360713148678764e-06, + "loss": 1.4775, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 311 + }, + { + "epoch": 1.0366361365528727, + "grad_norm": 0.5122154405495047, + "learning_rate": 7.35632183908046e-06, + "loss": 1.4385, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 312 + }, + { + "epoch": 1.0399666944213155, + "grad_norm": 0.4907680124574417, + "learning_rate": 7.351905219340377e-06, + "loss": 1.4321, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 313 + }, + { + "epoch": 1.0432972522897586, + "grad_norm": 0.3750039319171418, + "learning_rate": 7.347463070006422e-06, + "loss": 1.4609, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 314 + }, + { + "epoch": 1.0466278101582014, + "grad_norm": 0.415847010986813, + "learning_rate": 7.342995169082125e-06, + "loss": 1.399, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 315 + }, + { + "epoch": 1.0499583680266444, + "grad_norm": 0.40484373034787197, + "learning_rate": 7.338501291989663e-06, + "loss": 1.4082, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 316 + }, + { + "epoch": 1.0532889258950875, + "grad_norm": 0.3556695114896482, + "learning_rate": 7.333981211532231e-06, + "loss": 1.5045, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 317 + }, + { + "epoch": 1.0566194837635303, + "grad_norm": 0.4217178250762373, + "learning_rate": 7.329434697855749e-06, + "loss": 1.5051, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 318 + }, + { + "epoch": 1.0599500416319734, + "grad_norm": 0.44541116731095065, + "learning_rate": 7.324861518409905e-06, + "loss": 1.454, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 319 + }, + { + "epoch": 1.0632805995004164, + "grad_norm": 0.3722519430085194, + "learning_rate": 7.320261437908496e-06, + "loss": 1.4864, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 320 + }, + { + "epoch": 1.0666111573688593, + "grad_norm": 0.49955941789670055, + "learning_rate": 7.315634218289086e-06, + "loss": 1.4328, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 321 + }, + { + "epoch": 1.0699417152373023, + "grad_norm": 0.559077472675475, + "learning_rate": 7.310979618671926e-06, + "loss": 1.4387, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 322 + }, + { + "epoch": 1.0732722731057451, + "grad_norm": 0.38492000673298576, + "learning_rate": 7.306297395318167e-06, + "loss": 1.4173, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 323 + }, + { + "epoch": 1.0766028309741882, + "grad_norm": 0.46264263086480695, + "learning_rate": 7.301587301587301e-06, + "loss": 1.458, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 324 + }, + { + "epoch": 1.0799333888426312, + "grad_norm": 0.48393689092527553, + "learning_rate": 7.296849087893865e-06, + "loss": 1.4984, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 325 + }, + { + "epoch": 1.083263946711074, + "grad_norm": 0.3833552546352091, + "learning_rate": 7.29208250166334e-06, + "loss": 1.4801, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 326 + }, + { + "epoch": 1.0865945045795171, + "grad_norm": 0.45636346067253053, + "learning_rate": 7.287287287287286e-06, + "loss": 1.4335, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 327 + }, + { + "epoch": 1.08992506244796, + "grad_norm": 0.5170759134234261, + "learning_rate": 7.282463186077643e-06, + "loss": 1.4619, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 328 + }, + { + "epoch": 1.093255620316403, + "grad_norm": 0.41413948804668765, + "learning_rate": 7.277609936220207e-06, + "loss": 1.4976, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 329 + }, + { + "epoch": 1.096586178184846, + "grad_norm": 0.47675818666743885, + "learning_rate": 7.272727272727272e-06, + "loss": 1.5166, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 330 + }, + { + "epoch": 1.0999167360532889, + "grad_norm": 0.4446284191251516, + "learning_rate": 7.267814927389396e-06, + "loss": 1.3756, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 331 + }, + { + "epoch": 1.103247293921732, + "grad_norm": 0.3712448236233366, + "learning_rate": 7.262872628726287e-06, + "loss": 1.4177, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 332 + }, + { + "epoch": 1.1065778517901748, + "grad_norm": 0.354780077610888, + "learning_rate": 7.257900101936799e-06, + "loss": 1.3888, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 333 + }, + { + "epoch": 1.1099084096586178, + "grad_norm": 0.4287556354375581, + "learning_rate": 7.252897068847988e-06, + "loss": 1.463, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 334 + }, + { + "epoch": 1.1132389675270609, + "grad_norm": 0.43855718184558823, + "learning_rate": 7.247863247863247e-06, + "loss": 1.4375, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 335 + }, + { + "epoch": 1.1165695253955037, + "grad_norm": 0.36652449074881177, + "learning_rate": 7.242798353909463e-06, + "loss": 1.4724, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 336 + }, + { + "epoch": 1.1199000832639467, + "grad_norm": 0.41471476618444547, + "learning_rate": 7.237702098383213e-06, + "loss": 1.4368, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 337 + }, + { + "epoch": 1.1232306411323896, + "grad_norm": 0.3584246686612814, + "learning_rate": 7.2325741890959285e-06, + "loss": 1.507, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 338 + }, + { + "epoch": 1.1265611990008326, + "grad_norm": 0.35472951006324893, + "learning_rate": 7.227414330218068e-06, + "loss": 1.3847, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 339 + }, + { + "epoch": 1.1298917568692757, + "grad_norm": 0.40770232084467445, + "learning_rate": 7.222222222222222e-06, + "loss": 1.4722, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 340 + }, + { + "epoch": 1.1332223147377185, + "grad_norm": 0.3854760192656062, + "learning_rate": 7.216997561825147e-06, + "loss": 1.4397, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 341 + }, + { + "epoch": 1.1365528726061616, + "grad_norm": 0.3425435570180868, + "learning_rate": 7.211740041928721e-06, + "loss": 1.3917, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 342 + }, + { + "epoch": 1.1398834304746046, + "grad_norm": 0.3629363871231361, + "learning_rate": 7.206449351559762e-06, + "loss": 1.4329, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 343 + }, + { + "epoch": 1.1432139883430474, + "grad_norm": 0.3746351865474382, + "learning_rate": 7.20112517580872e-06, + "loss": 1.4325, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 344 + }, + { + "epoch": 1.1465445462114905, + "grad_norm": 0.35633065876642767, + "learning_rate": 7.195767195767195e-06, + "loss": 1.4802, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 345 + }, + { + "epoch": 1.1498751040799333, + "grad_norm": 0.41086591430313346, + "learning_rate": 7.1903750884642605e-06, + "loss": 1.386, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 346 + }, + { + "epoch": 1.1532056619483764, + "grad_norm": 0.4248601636564269, + "learning_rate": 7.184948526801562e-06, + "loss": 1.3764, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 347 + }, + { + "epoch": 1.1565362198168194, + "grad_norm": 0.3677689809276377, + "learning_rate": 7.179487179487179e-06, + "loss": 1.3977, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 348 + }, + { + "epoch": 1.1598667776852623, + "grad_norm": 0.4562607243713519, + "learning_rate": 7.173990710968203e-06, + "loss": 1.4743, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 349 + }, + { + "epoch": 1.1631973355537053, + "grad_norm": 0.36851546433374166, + "learning_rate": 7.168458781362006e-06, + "loss": 1.3998, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 350 + }, + { + "epoch": 1.1665278934221481, + "grad_norm": 0.3440172529786023, + "learning_rate": 7.1628910463861915e-06, + "loss": 1.4388, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 351 + }, + { + "epoch": 1.1698584512905912, + "grad_norm": 0.34899417175359176, + "learning_rate": 7.157287157287158e-06, + "loss": 1.4109, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 352 + }, + { + "epoch": 1.1731890091590342, + "grad_norm": 0.3369095274891404, + "learning_rate": 7.151646760767281e-06, + "loss": 1.4721, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 353 + }, + { + "epoch": 1.176519567027477, + "grad_norm": 0.3373083746918916, + "learning_rate": 7.145969498910675e-06, + "loss": 1.3879, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 354 + }, + { + "epoch": 1.1798501248959201, + "grad_norm": 0.3127699546260214, + "learning_rate": 7.140255009107467e-06, + "loss": 1.4067, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 355 + }, + { + "epoch": 1.183180682764363, + "grad_norm": 0.3455714541263257, + "learning_rate": 7.1345029239766076e-06, + "loss": 1.4729, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 356 + }, + { + "epoch": 1.186511240632806, + "grad_norm": 0.35459135197814406, + "learning_rate": 7.128712871287129e-06, + "loss": 1.4845, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 357 + }, + { + "epoch": 1.189841798501249, + "grad_norm": 0.3382392537839561, + "learning_rate": 7.122884473877851e-06, + "loss": 1.4796, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 358 + }, + { + "epoch": 1.1931723563696919, + "grad_norm": 0.3229617810865785, + "learning_rate": 7.117017349575488e-06, + "loss": 1.4258, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 359 + }, + { + "epoch": 1.196502914238135, + "grad_norm": 0.4132878845320615, + "learning_rate": 7.11111111111111e-06, + "loss": 1.4344, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 360 + }, + { + "epoch": 1.1998334721065778, + "grad_norm": 0.3909252234186588, + "learning_rate": 7.105165366034932e-06, + "loss": 1.4136, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 361 + }, + { + "epoch": 1.2031640299750208, + "grad_norm": 0.4166542946239009, + "learning_rate": 7.0991797166293805e-06, + "loss": 1.418, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 362 + }, + { + "epoch": 1.2064945878434639, + "grad_norm": 0.3978265092622875, + "learning_rate": 7.093153759820426e-06, + "loss": 1.4778, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 363 + }, + { + "epoch": 1.2098251457119067, + "grad_norm": 0.3362476483926624, + "learning_rate": 7.087087087087086e-06, + "loss": 1.4045, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 364 + }, + { + "epoch": 1.2131557035803497, + "grad_norm": 0.34227618124914144, + "learning_rate": 7.0809792843691135e-06, + "loss": 1.3691, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 365 + }, + { + "epoch": 1.2164862614487926, + "grad_norm": 0.4154500404546309, + "learning_rate": 7.074829931972789e-06, + "loss": 1.4296, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 366 + }, + { + "epoch": 1.2198168193172356, + "grad_norm": 0.3910386371341375, + "learning_rate": 7.068638604474782e-06, + "loss": 1.4128, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 367 + }, + { + "epoch": 1.2231473771856787, + "grad_norm": 0.32973016037230485, + "learning_rate": 7.062404870624048e-06, + "loss": 1.3952, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 368 + }, + { + "epoch": 1.2264779350541215, + "grad_norm": 0.3476414929125133, + "learning_rate": 7.056128293241695e-06, + "loss": 1.4132, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 369 + }, + { + "epoch": 1.2298084929225646, + "grad_norm": 0.3490075847160727, + "learning_rate": 7.0498084291187725e-06, + "loss": 1.5034, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 370 + }, + { + "epoch": 1.2331390507910074, + "grad_norm": 0.4213209017684047, + "learning_rate": 7.043444828911956e-06, + "loss": 1.4407, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 371 + }, + { + "epoch": 1.2364696086594504, + "grad_norm": 0.41156726116014214, + "learning_rate": 7.037037037037037e-06, + "loss": 1.4922, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 372 + }, + { + "epoch": 1.2398001665278935, + "grad_norm": 0.3274736563867899, + "learning_rate": 7.0305845915602e-06, + "loss": 1.4443, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 373 + }, + { + "epoch": 1.2431307243963363, + "grad_norm": 0.4016877039684572, + "learning_rate": 7.024087024087023e-06, + "loss": 1.4765, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 374 + }, + { + "epoch": 1.2464612822647794, + "grad_norm": 0.37926187648963133, + "learning_rate": 7.017543859649123e-06, + "loss": 1.4944, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 375 + }, + { + "epoch": 1.2497918401332222, + "grad_norm": 0.3995775555374175, + "learning_rate": 7.0109546165884185e-06, + "loss": 1.4737, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 376 + }, + { + "epoch": 1.2531223980016652, + "grad_norm": 0.4179330927454956, + "learning_rate": 7.0043188064389475e-06, + "loss": 1.396, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 377 + }, + { + "epoch": 1.2564529558701083, + "grad_norm": 0.4026676583822718, + "learning_rate": 6.997635933806146e-06, + "loss": 1.5024, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 378 + }, + { + "epoch": 1.2597835137385511, + "grad_norm": 0.3729935293489866, + "learning_rate": 6.9909054962435735e-06, + "loss": 1.5035, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 379 + }, + { + "epoch": 1.2631140716069942, + "grad_norm": 0.37785861617292904, + "learning_rate": 6.984126984126983e-06, + "loss": 1.4859, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 380 + }, + { + "epoch": 1.266444629475437, + "grad_norm": 0.34618072727066834, + "learning_rate": 6.977299880525687e-06, + "loss": 1.3753, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 381 + }, + { + "epoch": 1.26977518734388, + "grad_norm": 0.3603657688818211, + "learning_rate": 6.970423661071143e-06, + "loss": 1.4396, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 382 + }, + { + "epoch": 1.2731057452123231, + "grad_norm": 0.31695078316874364, + "learning_rate": 6.963497793822704e-06, + "loss": 1.4512, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 383 + }, + { + "epoch": 1.2764363030807662, + "grad_norm": 0.3998834526312468, + "learning_rate": 6.956521739130433e-06, + "loss": 1.4068, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 384 + }, + { + "epoch": 1.279766860949209, + "grad_norm": 0.40218592316674945, + "learning_rate": 6.949494949494949e-06, + "loss": 1.4314, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 385 + }, + { + "epoch": 1.2830974188176518, + "grad_norm": 0.4377216092057675, + "learning_rate": 6.942416869424169e-06, + "loss": 1.4159, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 386 + }, + { + "epoch": 1.2864279766860949, + "grad_norm": 0.3806613338175727, + "learning_rate": 6.935286935286935e-06, + "loss": 1.4383, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 387 + }, + { + "epoch": 1.289758534554538, + "grad_norm": 0.41315217581288083, + "learning_rate": 6.928104575163398e-06, + "loss": 1.4639, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 388 + }, + { + "epoch": 1.293089092422981, + "grad_norm": 0.4242068360276873, + "learning_rate": 6.920869208692086e-06, + "loss": 1.5043, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 389 + }, + { + "epoch": 1.2964196502914238, + "grad_norm": 0.40526133848179174, + "learning_rate": 6.913580246913578e-06, + "loss": 1.4969, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 390 + }, + { + "epoch": 1.2997502081598669, + "grad_norm": 0.4390648977103527, + "learning_rate": 6.9062370921106965e-06, + "loss": 1.4634, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 391 + }, + { + "epoch": 1.3030807660283097, + "grad_norm": 0.3293053257002483, + "learning_rate": 6.898839137645108e-06, + "loss": 1.4837, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 392 + }, + { + "epoch": 1.3064113238967527, + "grad_norm": 0.3741205703169676, + "learning_rate": 6.891385767790261e-06, + "loss": 1.3888, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 393 + }, + { + "epoch": 1.3097418817651958, + "grad_norm": 0.36736277922290345, + "learning_rate": 6.883876357560567e-06, + "loss": 1.4422, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 394 + }, + { + "epoch": 1.3130724396336386, + "grad_norm": 0.34987451065304387, + "learning_rate": 6.876310272536688e-06, + "loss": 1.4384, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 395 + }, + { + "epoch": 1.3164029975020817, + "grad_norm": 0.3574591374681954, + "learning_rate": 6.868686868686868e-06, + "loss": 1.4453, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 396 + }, + { + "epoch": 1.3197335553705245, + "grad_norm": 0.31108139602911883, + "learning_rate": 6.861005492184199e-06, + "loss": 1.4302, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 397 + }, + { + "epoch": 1.3230641132389676, + "grad_norm": 0.3317920901111113, + "learning_rate": 6.853265479219677e-06, + "loss": 1.4599, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 398 + }, + { + "epoch": 1.3263946711074106, + "grad_norm": 0.3319586529185681, + "learning_rate": 6.8454661558109825e-06, + "loss": 1.4349, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 399 + }, + { + "epoch": 1.3297252289758534, + "grad_norm": 0.35385561486286676, + "learning_rate": 6.837606837606837e-06, + "loss": 1.4262, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 400 + }, + { + "epoch": 1.3330557868442965, + "grad_norm": 0.36123629997437273, + "learning_rate": 6.82968682968683e-06, + "loss": 1.4475, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 401 + }, + { + "epoch": 1.3363863447127393, + "grad_norm": 0.3678679588945442, + "learning_rate": 6.821705426356589e-06, + "loss": 1.4662, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 402 + }, + { + "epoch": 1.3397169025811824, + "grad_norm": 0.35623277676543963, + "learning_rate": 6.813661910938175e-06, + "loss": 1.4157, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 403 + }, + { + "epoch": 1.3430474604496254, + "grad_norm": 0.3670378130601921, + "learning_rate": 6.805555555555554e-06, + "loss": 1.4289, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 404 + }, + { + "epoch": 1.3463780183180682, + "grad_norm": 0.37365353793241013, + "learning_rate": 6.797385620915031e-06, + "loss": 1.4758, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 405 + }, + { + "epoch": 1.3497085761865113, + "grad_norm": 0.4227767618895852, + "learning_rate": 6.78915135608049e-06, + "loss": 1.4522, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 406 + }, + { + "epoch": 1.3530391340549541, + "grad_norm": 0.392419028331304, + "learning_rate": 6.780851998243303e-06, + "loss": 1.4386, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 407 + }, + { + "epoch": 1.3563696919233972, + "grad_norm": 0.34332622194519336, + "learning_rate": 6.772486772486772e-06, + "loss": 1.4143, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 408 + }, + { + "epoch": 1.3597002497918402, + "grad_norm": 0.39554534288670906, + "learning_rate": 6.76405489154493e-06, + "loss": 1.4289, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 409 + }, + { + "epoch": 1.363030807660283, + "grad_norm": 0.3680781980427255, + "learning_rate": 6.7555555555555545e-06, + "loss": 1.4604, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 410 + }, + { + "epoch": 1.3663613655287261, + "grad_norm": 0.359696703224119, + "learning_rate": 6.7469879518072274e-06, + "loss": 1.4552, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 411 + }, + { + "epoch": 1.369691923397169, + "grad_norm": 0.43946142988468057, + "learning_rate": 6.738351254480287e-06, + "loss": 1.4649, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 412 + }, + { + "epoch": 1.373022481265612, + "grad_norm": 0.38158471914984216, + "learning_rate": 6.729644624381466e-06, + "loss": 1.4553, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 413 + }, + { + "epoch": 1.376353039134055, + "grad_norm": 0.3785264160376015, + "learning_rate": 6.720867208672086e-06, + "loss": 1.457, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 414 + }, + { + "epoch": 1.3796835970024979, + "grad_norm": 0.40275915279313634, + "learning_rate": 6.712018140589569e-06, + "loss": 1.4665, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 415 + }, + { + "epoch": 1.383014154870941, + "grad_norm": 0.37268382461278277, + "learning_rate": 6.703096539162113e-06, + "loss": 1.377, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 416 + }, + { + "epoch": 1.3863447127393838, + "grad_norm": 0.3362832443073036, + "learning_rate": 6.694101508916324e-06, + "loss": 1.4122, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 417 + }, + { + "epoch": 1.3896752706078268, + "grad_norm": 0.3428291854645596, + "learning_rate": 6.6850321395775945e-06, + "loss": 1.3466, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 418 + }, + { + "epoch": 1.3930058284762699, + "grad_norm": 0.38976496538071015, + "learning_rate": 6.675887505763023e-06, + "loss": 1.4711, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 419 + }, + { + "epoch": 1.3963363863447127, + "grad_norm": 0.4034367524201395, + "learning_rate": 6.666666666666666e-06, + "loss": 1.5079, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 420 + }, + { + "epoch": 1.3996669442131557, + "grad_norm": 0.38251655422807695, + "learning_rate": 6.657368665736867e-06, + "loss": 1.4715, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 421 + }, + { + "epoch": 1.4029975020815986, + "grad_norm": 0.3636615753904805, + "learning_rate": 6.647992530345471e-06, + "loss": 1.4175, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 422 + }, + { + "epoch": 1.4063280599500416, + "grad_norm": 0.3543871236347375, + "learning_rate": 6.6385372714486634e-06, + "loss": 1.4008, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 423 + }, + { + "epoch": 1.4096586178184847, + "grad_norm": 0.4270397824248548, + "learning_rate": 6.6290018832391705e-06, + "loss": 1.4082, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 424 + }, + { + "epoch": 1.4129891756869275, + "grad_norm": 0.41956585580281563, + "learning_rate": 6.6193853427895966e-06, + "loss": 1.4075, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 425 + }, + { + "epoch": 1.4163197335553706, + "grad_norm": 0.47761484099497725, + "learning_rate": 6.60968660968661e-06, + "loss": 1.4104, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 426 + }, + { + "epoch": 1.4196502914238134, + "grad_norm": 0.4078261265233408, + "learning_rate": 6.599904625655699e-06, + "loss": 1.4317, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 427 + }, + { + "epoch": 1.4229808492922564, + "grad_norm": 0.37691692681004796, + "learning_rate": 6.590038314176245e-06, + "loss": 1.391, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 428 + }, + { + "epoch": 1.4263114071606995, + "grad_norm": 0.3288508827565593, + "learning_rate": 6.580086580086579e-06, + "loss": 1.401, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 429 + }, + { + "epoch": 1.4296419650291423, + "grad_norm": 0.338319616372442, + "learning_rate": 6.570048309178745e-06, + "loss": 1.4178, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 430 + }, + { + "epoch": 1.4329725228975854, + "grad_norm": 0.414862604672987, + "learning_rate": 6.559922367782628e-06, + "loss": 1.4642, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 431 + }, + { + "epoch": 1.4363030807660282, + "grad_norm": 0.3949017633125201, + "learning_rate": 6.54970760233918e-06, + "loss": 1.3643, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 432 + }, + { + "epoch": 1.4396336386344712, + "grad_norm": 0.3872663647349424, + "learning_rate": 6.53940283896231e-06, + "loss": 1.3998, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 433 + }, + { + "epoch": 1.4429641965029143, + "grad_norm": 0.3778182716944692, + "learning_rate": 6.529006882989183e-06, + "loss": 1.421, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 434 + }, + { + "epoch": 1.4462947543713571, + "grad_norm": 0.3368637084806252, + "learning_rate": 6.518518518518519e-06, + "loss": 1.4562, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 435 + }, + { + "epoch": 1.4496253122398002, + "grad_norm": 0.37088068849156625, + "learning_rate": 6.507936507936509e-06, + "loss": 1.389, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 436 + }, + { + "epoch": 1.452955870108243, + "grad_norm": 0.4171977510324979, + "learning_rate": 6.497259591429994e-06, + "loss": 1.4314, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 437 + }, + { + "epoch": 1.456286427976686, + "grad_norm": 0.36493233792748947, + "learning_rate": 6.486486486486486e-06, + "loss": 1.4239, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 438 + }, + { + "epoch": 1.4596169858451291, + "grad_norm": 0.36409025362836434, + "learning_rate": 6.475615887380592e-06, + "loss": 1.4011, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 439 + }, + { + "epoch": 1.462947543713572, + "grad_norm": 0.3432633374051585, + "learning_rate": 6.464646464646463e-06, + "loss": 1.4706, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 440 + }, + { + "epoch": 1.466278101582015, + "grad_norm": 0.36918146681400343, + "learning_rate": 6.453576864535769e-06, + "loss": 1.4048, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 441 + }, + { + "epoch": 1.4696086594504578, + "grad_norm": 0.3558974109435063, + "learning_rate": 6.442405708460755e-06, + "loss": 1.4233, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 442 + }, + { + "epoch": 1.4729392173189009, + "grad_norm": 0.3319935101093491, + "learning_rate": 6.431131592421914e-06, + "loss": 1.4557, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 443 + }, + { + "epoch": 1.476269775187344, + "grad_norm": 0.3957835276431251, + "learning_rate": 6.419753086419752e-06, + "loss": 1.4974, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 444 + }, + { + "epoch": 1.479600333055787, + "grad_norm": 0.46743126820019115, + "learning_rate": 6.408268733850127e-06, + "loss": 1.3428, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 445 + }, + { + "epoch": 1.4829308909242298, + "grad_norm": 0.32072751511352704, + "learning_rate": 6.396677050882658e-06, + "loss": 1.4252, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 446 + }, + { + "epoch": 1.4862614487926726, + "grad_norm": 0.3691624108782593, + "learning_rate": 6.384976525821596e-06, + "loss": 1.4288, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 447 + }, + { + "epoch": 1.4895920066611157, + "grad_norm": 0.41832466518878647, + "learning_rate": 6.373165618448636e-06, + "loss": 1.4287, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 448 + }, + { + "epoch": 1.4929225645295587, + "grad_norm": 0.3728200914294547, + "learning_rate": 6.361242759347024e-06, + "loss": 1.391, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 449 + }, + { + "epoch": 1.4962531223980018, + "grad_norm": 0.3489172461380398, + "learning_rate": 6.349206349206349e-06, + "loss": 1.4012, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 450 + }, + { + "epoch": 1.4995836802664446, + "grad_norm": 0.45831242097179337, + "learning_rate": 6.337054758107389e-06, + "loss": 1.4062, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 451 + }, + { + "epoch": 1.5029142381348874, + "grad_norm": 0.4485083988308969, + "learning_rate": 6.324786324786324e-06, + "loss": 1.4077, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 452 + }, + { + "epoch": 1.5062447960033305, + "grad_norm": 0.3469124587165823, + "learning_rate": 6.312399355877616e-06, + "loss": 1.3635, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 453 + }, + { + "epoch": 1.5095753538717736, + "grad_norm": 0.4359834643536742, + "learning_rate": 6.299892125134842e-06, + "loss": 1.3951, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 454 + }, + { + "epoch": 1.5129059117402166, + "grad_norm": 0.43347338145656295, + "learning_rate": 6.287262872628726e-06, + "loss": 1.438, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 455 + }, + { + "epoch": 1.5162364696086594, + "grad_norm": 0.3544519721589859, + "learning_rate": 6.274509803921569e-06, + "loss": 1.4028, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 456 + }, + { + "epoch": 1.5195670274771023, + "grad_norm": 0.4175623558211923, + "learning_rate": 6.261631089217296e-06, + "loss": 1.4649, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 457 + }, + { + "epoch": 1.5228975853455453, + "grad_norm": 0.47794327593006264, + "learning_rate": 6.248624862486248e-06, + "loss": 1.4552, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 458 + }, + { + "epoch": 1.5262281432139884, + "grad_norm": 0.5102221497723193, + "learning_rate": 6.235489220563847e-06, + "loss": 1.5577, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 459 + }, + { + "epoch": 1.5295587010824314, + "grad_norm": 0.361727454686882, + "learning_rate": 6.2222222222222215e-06, + "loss": 1.4977, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 460 + }, + { + "epoch": 1.5328892589508742, + "grad_norm": 0.43568797487755334, + "learning_rate": 6.208821887213847e-06, + "loss": 1.4417, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 461 + }, + { + "epoch": 1.536219816819317, + "grad_norm": 0.39795557103291623, + "learning_rate": 6.195286195286195e-06, + "loss": 1.4479, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 462 + }, + { + "epoch": 1.5395503746877601, + "grad_norm": 0.3699426752838303, + "learning_rate": 6.181613085166384e-06, + "loss": 1.4379, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 463 + }, + { + "epoch": 1.5428809325562032, + "grad_norm": 0.5138765482501748, + "learning_rate": 6.167800453514738e-06, + "loss": 1.4433, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 464 + }, + { + "epoch": 1.5462114904246462, + "grad_norm": 0.5597671339637968, + "learning_rate": 6.153846153846153e-06, + "loss": 1.4255, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 465 + }, + { + "epoch": 1.549542048293089, + "grad_norm": 0.4443208189107028, + "learning_rate": 6.1397479954180976e-06, + "loss": 1.4458, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 466 + }, + { + "epoch": 1.552872606161532, + "grad_norm": 0.41782304334586917, + "learning_rate": 6.125503742084053e-06, + "loss": 1.4362, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 467 + }, + { + "epoch": 1.556203164029975, + "grad_norm": 0.511701451750574, + "learning_rate": 6.11111111111111e-06, + "loss": 1.4378, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 468 + }, + { + "epoch": 1.559533721898418, + "grad_norm": 0.4272528437058103, + "learning_rate": 6.096567771960442e-06, + "loss": 1.4315, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 469 + }, + { + "epoch": 1.562864279766861, + "grad_norm": 0.42099653002903337, + "learning_rate": 6.0818713450292395e-06, + "loss": 1.4092, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 470 + }, + { + "epoch": 1.5661948376353039, + "grad_norm": 0.4635591149261861, + "learning_rate": 6.067019400352732e-06, + "loss": 1.4357, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 471 + }, + { + "epoch": 1.569525395503747, + "grad_norm": 0.5318262046494987, + "learning_rate": 6.052009456264775e-06, + "loss": 1.4753, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 472 + }, + { + "epoch": 1.5728559533721898, + "grad_norm": 0.4098578230232083, + "learning_rate": 6.036838978015449e-06, + "loss": 1.4192, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 473 + }, + { + "epoch": 1.5761865112406328, + "grad_norm": 0.4563174114919455, + "learning_rate": 6.021505376344085e-06, + "loss": 1.4676, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 474 + }, + { + "epoch": 1.5795170691090759, + "grad_norm": 0.5270544922424331, + "learning_rate": 6.006006006006005e-06, + "loss": 1.4267, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 475 + }, + { + "epoch": 1.5828476269775187, + "grad_norm": 0.3910787909582668, + "learning_rate": 5.990338164251208e-06, + "loss": 1.3766, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 476 + }, + { + "epoch": 1.5861781848459617, + "grad_norm": 0.4736515430850208, + "learning_rate": 5.974499089253187e-06, + "loss": 1.4437, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 477 + }, + { + "epoch": 1.5895087427144046, + "grad_norm": 0.5430796464569592, + "learning_rate": 5.958485958485957e-06, + "loss": 1.4482, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 478 + }, + { + "epoch": 1.5928393005828476, + "grad_norm": 0.38226206389298173, + "learning_rate": 5.942295887047268e-06, + "loss": 1.412, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 479 + }, + { + "epoch": 1.5961698584512907, + "grad_norm": 0.3721223079028304, + "learning_rate": 5.925925925925925e-06, + "loss": 1.45, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 480 + }, + { + "epoch": 1.5995004163197337, + "grad_norm": 0.3827064109331823, + "learning_rate": 5.909373060211049e-06, + "loss": 1.4217, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 481 + }, + { + "epoch": 1.6028309741881765, + "grad_norm": 0.33684324932641296, + "learning_rate": 5.892634207240949e-06, + "loss": 1.3557, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 482 + }, + { + "epoch": 1.6061615320566194, + "grad_norm": 0.31468847211788964, + "learning_rate": 5.875706214689265e-06, + "loss": 1.4122, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 483 + }, + { + "epoch": 1.6094920899250624, + "grad_norm": 0.4442799216781044, + "learning_rate": 5.858585858585859e-06, + "loss": 1.4285, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 484 + }, + { + "epoch": 1.6128226477935055, + "grad_norm": 0.4567121702156198, + "learning_rate": 5.841269841269841e-06, + "loss": 1.4764, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 485 + }, + { + "epoch": 1.6161532056619485, + "grad_norm": 0.3590206566567271, + "learning_rate": 5.82375478927203e-06, + "loss": 1.4229, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 486 + }, + { + "epoch": 1.6194837635303914, + "grad_norm": 0.3652198930331244, + "learning_rate": 5.806037251123956e-06, + "loss": 1.4151, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 487 + }, + { + "epoch": 1.6228143213988342, + "grad_norm": 0.35866861963268476, + "learning_rate": 5.7881136950904385e-06, + "loss": 1.3369, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 488 + }, + { + "epoch": 1.6261448792672772, + "grad_norm": 0.4750936045573692, + "learning_rate": 5.7699805068226105e-06, + "loss": 1.4715, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 489 + }, + { + "epoch": 1.6294754371357203, + "grad_norm": 0.3613198830707804, + "learning_rate": 5.7516339869281045e-06, + "loss": 1.4291, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 490 + }, + { + "epoch": 1.6328059950041633, + "grad_norm": 0.43606379412430957, + "learning_rate": 5.733070348454964e-06, + "loss": 1.4011, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 491 + }, + { + "epoch": 1.6361365528726062, + "grad_norm": 0.35042984426925494, + "learning_rate": 5.7142857142857145e-06, + "loss": 1.4368, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 492 + }, + { + "epoch": 1.639467110741049, + "grad_norm": 0.31661366243629, + "learning_rate": 5.695276114437791e-06, + "loss": 1.4541, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 493 + }, + { + "epoch": 1.642797668609492, + "grad_norm": 0.3561358967067642, + "learning_rate": 5.676037483266399e-06, + "loss": 1.433, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 494 + }, + { + "epoch": 1.646128226477935, + "grad_norm": 0.3931637346563919, + "learning_rate": 5.656565656565656e-06, + "loss": 1.4193, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 495 + }, + { + "epoch": 1.6494587843463782, + "grad_norm": 0.48631366553960975, + "learning_rate": 5.6368563685636855e-06, + "loss": 1.4012, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 496 + }, + { + "epoch": 1.652789342214821, + "grad_norm": 0.41348933242163105, + "learning_rate": 5.616905248807089e-06, + "loss": 1.3883, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 497 + }, + { + "epoch": 1.6561199000832638, + "grad_norm": 0.3541766139316355, + "learning_rate": 5.59670781893004e-06, + "loss": 1.363, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 498 + }, + { + "epoch": 1.6594504579517069, + "grad_norm": 0.410383164470969, + "learning_rate": 5.576259489302967e-06, + "loss": 1.3955, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 499 + }, + { + "epoch": 1.66278101582015, + "grad_norm": 0.4100549908496841, + "learning_rate": 5.555555555555555e-06, + "loss": 1.3663, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 500 + }, + { + "epoch": 1.666111573688593, + "grad_norm": 0.4122832272958553, + "learning_rate": 5.534591194968553e-06, + "loss": 1.5108, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 501 + }, + { + "epoch": 1.6694421315570358, + "grad_norm": 0.33209617039282874, + "learning_rate": 5.513361462728551e-06, + "loss": 1.4069, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 502 + }, + { + "epoch": 1.6727726894254786, + "grad_norm": 0.34650064809899755, + "learning_rate": 5.491861288039631e-06, + "loss": 1.3953, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 503 + }, + { + "epoch": 1.6761032472939217, + "grad_norm": 0.3583592015376779, + "learning_rate": 5.47008547008547e-06, + "loss": 1.4181, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 504 + }, + { + "epoch": 1.6794338051623647, + "grad_norm": 0.34343414571245584, + "learning_rate": 5.448028673835125e-06, + "loss": 1.418, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 505 + }, + { + "epoch": 1.6827643630308078, + "grad_norm": 0.35638669107128673, + "learning_rate": 5.425685425685425e-06, + "loss": 1.4052, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 506 + }, + { + "epoch": 1.6860949208992506, + "grad_norm": 0.3467424021658532, + "learning_rate": 5.403050108932461e-06, + "loss": 1.4581, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 507 + }, + { + "epoch": 1.6894254787676934, + "grad_norm": 0.32381127071831955, + "learning_rate": 5.3801169590643285e-06, + "loss": 1.459, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 508 + }, + { + "epoch": 1.6927560366361365, + "grad_norm": 0.3811936086039866, + "learning_rate": 5.356880058866813e-06, + "loss": 1.5033, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 509 + }, + { + "epoch": 1.6960865945045795, + "grad_norm": 0.3612050754686712, + "learning_rate": 5.333333333333333e-06, + "loss": 1.4137, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 510 + }, + { + "epoch": 1.6994171523730226, + "grad_norm": 0.35765265665477713, + "learning_rate": 5.309470544369873e-06, + "loss": 1.4087, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 511 + }, + { + "epoch": 1.7027477102414654, + "grad_norm": 0.3357163323849947, + "learning_rate": 5.285285285285285e-06, + "loss": 1.4851, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 512 + }, + { + "epoch": 1.7060782681099083, + "grad_norm": 0.3449646759899252, + "learning_rate": 5.260770975056689e-06, + "loss": 1.442, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 513 + }, + { + "epoch": 1.7094088259783513, + "grad_norm": 0.3335919341097906, + "learning_rate": 5.235920852359208e-06, + "loss": 1.454, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 514 + }, + { + "epoch": 1.7127393838467944, + "grad_norm": 0.3414007515866483, + "learning_rate": 5.210727969348659e-06, + "loss": 1.4762, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 515 + }, + { + "epoch": 1.7160699417152374, + "grad_norm": 0.37174665041283544, + "learning_rate": 5.185185185185185e-06, + "loss": 1.4615, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 516 + }, + { + "epoch": 1.7194004995836802, + "grad_norm": 0.37265087217053033, + "learning_rate": 5.159285159285159e-06, + "loss": 1.4072, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 517 + }, + { + "epoch": 1.722731057452123, + "grad_norm": 0.3445160578098801, + "learning_rate": 5.1330203442879505e-06, + "loss": 1.4337, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 518 + }, + { + "epoch": 1.7260616153205661, + "grad_norm": 0.3675807887019101, + "learning_rate": 5.106382978723403e-06, + "loss": 1.4147, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 519 + }, + { + "epoch": 1.7293921731890092, + "grad_norm": 0.34584285856367675, + "learning_rate": 5.079365079365079e-06, + "loss": 1.4193, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 520 + }, + { + "epoch": 1.7327227310574522, + "grad_norm": 0.3685778739128953, + "learning_rate": 5.051958433253396e-06, + "loss": 1.4466, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 521 + }, + { + "epoch": 1.736053288925895, + "grad_norm": 0.35632916296360506, + "learning_rate": 5.02415458937198e-06, + "loss": 1.4299, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 522 + }, + { + "epoch": 1.739383846794338, + "grad_norm": 0.3941128956001842, + "learning_rate": 4.995944849959448e-06, + "loss": 1.4264, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 523 + }, + { + "epoch": 1.742714404662781, + "grad_norm": 0.3481786883352737, + "learning_rate": 4.967320261437908e-06, + "loss": 1.4279, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 524 + }, + { + "epoch": 1.746044962531224, + "grad_norm": 0.3627527951339854, + "learning_rate": 4.938271604938271e-06, + "loss": 1.5152, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 525 + }, + { + "epoch": 1.749375520399667, + "grad_norm": 0.34527513358988937, + "learning_rate": 4.9087893864013265e-06, + "loss": 1.4088, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 526 + }, + { + "epoch": 1.7527060782681099, + "grad_norm": 0.368823414133051, + "learning_rate": 4.878863826232247e-06, + "loss": 1.3944, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 527 + }, + { + "epoch": 1.756036636136553, + "grad_norm": 0.3471938836863914, + "learning_rate": 4.848484848484849e-06, + "loss": 1.3809, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 528 + }, + { + "epoch": 1.7593671940049957, + "grad_norm": 0.39760839658681035, + "learning_rate": 4.817642069550467e-06, + "loss": 1.4081, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 529 + }, + { + "epoch": 1.7626977518734388, + "grad_norm": 0.35630999152948084, + "learning_rate": 4.786324786324786e-06, + "loss": 1.4049, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 530 + }, + { + "epoch": 1.7660283097418819, + "grad_norm": 0.3123127862091999, + "learning_rate": 4.754521963824289e-06, + "loss": 1.4033, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 531 + }, + { + "epoch": 1.7693588676103247, + "grad_norm": 0.3565716669933871, + "learning_rate": 4.722222222222222e-06, + "loss": 1.4548, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 532 + }, + { + "epoch": 1.7726894254787677, + "grad_norm": 0.3415824605451111, + "learning_rate": 4.68941382327209e-06, + "loss": 1.3379, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 533 + }, + { + "epoch": 1.7760199833472106, + "grad_norm": 0.37445157627374487, + "learning_rate": 4.6560846560846555e-06, + "loss": 1.5023, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 534 + }, + { + "epoch": 1.7793505412156536, + "grad_norm": 0.4140970552339397, + "learning_rate": 4.622222222222222e-06, + "loss": 1.4982, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 535 + }, + { + "epoch": 1.7826810990840967, + "grad_norm": 0.3696216853055909, + "learning_rate": 4.587813620071684e-06, + "loss": 1.3795, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 536 + }, + { + "epoch": 1.7860116569525397, + "grad_norm": 0.3374796769034963, + "learning_rate": 4.552845528455284e-06, + "loss": 1.4356, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 537 + }, + { + "epoch": 1.7893422148209825, + "grad_norm": 0.4227610049072286, + "learning_rate": 4.517304189435337e-06, + "loss": 1.4625, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 538 + }, + { + "epoch": 1.7926727726894254, + "grad_norm": 0.36612259553982557, + "learning_rate": 4.4811753902663e-06, + "loss": 1.4274, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 539 + }, + { + "epoch": 1.7960033305578684, + "grad_norm": 0.4222638209328834, + "learning_rate": 4.444444444444443e-06, + "loss": 1.5129, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 540 + }, + { + "epoch": 1.7993338884263115, + "grad_norm": 0.41009576553628174, + "learning_rate": 4.407096171802053e-06, + "loss": 1.4873, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 541 + }, + { + "epoch": 1.8026644462947545, + "grad_norm": 0.35086922544434007, + "learning_rate": 4.369114877589454e-06, + "loss": 1.3718, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 542 + }, + { + "epoch": 1.8059950041631974, + "grad_norm": 0.35855015526438827, + "learning_rate": 4.33048433048433e-06, + "loss": 1.4031, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 543 + }, + { + "epoch": 1.8093255620316402, + "grad_norm": 0.42477533100459036, + "learning_rate": 4.291187739463601e-06, + "loss": 1.4473, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 544 + }, + { + "epoch": 1.8126561199000832, + "grad_norm": 0.39791782472493653, + "learning_rate": 4.251207729468599e-06, + "loss": 1.4374, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 545 + }, + { + "epoch": 1.8159866777685263, + "grad_norm": 0.3444343384513091, + "learning_rate": 4.210526315789473e-06, + "loss": 1.4048, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 546 + }, + { + "epoch": 1.8193172356369693, + "grad_norm": 0.3453119165966736, + "learning_rate": 4.169124877089478e-06, + "loss": 1.3581, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 547 + }, + { + "epoch": 1.8226477935054122, + "grad_norm": 0.38186414289634574, + "learning_rate": 4.126984126984126e-06, + "loss": 1.4774, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 548 + }, + { + "epoch": 1.825978351373855, + "grad_norm": 0.3371300332212375, + "learning_rate": 4.084084084084084e-06, + "loss": 1.3565, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 549 + }, + { + "epoch": 1.829308909242298, + "grad_norm": 0.32042065002080106, + "learning_rate": 4.0404040404040395e-06, + "loss": 1.3807, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 550 + }, + { + "epoch": 1.832639467110741, + "grad_norm": 0.3776475075214216, + "learning_rate": 3.995922528032619e-06, + "loss": 1.4305, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 551 + }, + { + "epoch": 1.8359700249791842, + "grad_norm": 0.3351717661136717, + "learning_rate": 3.9506172839506175e-06, + "loss": 1.4133, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 552 + }, + { + "epoch": 1.839300582847627, + "grad_norm": 0.37528610178789024, + "learning_rate": 3.904465212876428e-06, + "loss": 1.4994, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 553 + }, + { + "epoch": 1.8426311407160698, + "grad_norm": 0.38936785329254486, + "learning_rate": 3.857442348008385e-06, + "loss": 1.393, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 554 + }, + { + "epoch": 1.8459616985845129, + "grad_norm": 0.40525496168183883, + "learning_rate": 3.8095238095238094e-06, + "loss": 1.4019, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 555 + }, + { + "epoch": 1.849292256452956, + "grad_norm": 0.4169994094961459, + "learning_rate": 3.7606837606837604e-06, + "loss": 1.4208, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 556 + }, + { + "epoch": 1.852622814321399, + "grad_norm": 0.4093560262894869, + "learning_rate": 3.710895361380798e-06, + "loss": 1.44, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 557 + }, + { + "epoch": 1.8559533721898418, + "grad_norm": 0.35662577286334196, + "learning_rate": 3.660130718954248e-06, + "loss": 1.4168, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 558 + }, + { + "epoch": 1.8592839300582846, + "grad_norm": 0.3469062029498766, + "learning_rate": 3.6083608360836084e-06, + "loss": 1.4109, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 559 + }, + { + "epoch": 1.8626144879267277, + "grad_norm": 0.35913894186601036, + "learning_rate": 3.5555555555555546e-06, + "loss": 1.3026, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 560 + }, + { + "epoch": 1.8659450457951707, + "grad_norm": 0.3601783041537011, + "learning_rate": 3.501683501683501e-06, + "loss": 1.4429, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 561 + }, + { + "epoch": 1.8692756036636138, + "grad_norm": 0.4301246312907219, + "learning_rate": 3.4467120181405894e-06, + "loss": 1.4415, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 562 + }, + { + "epoch": 1.8726061615320566, + "grad_norm": 0.44543619950365937, + "learning_rate": 3.390607101947308e-06, + "loss": 1.4354, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 563 + }, + { + "epoch": 1.8759367194004994, + "grad_norm": 0.4110006980910609, + "learning_rate": 3.333333333333333e-06, + "loss": 1.5156, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 564 + }, + { + "epoch": 1.8792672772689425, + "grad_norm": 0.36681035057341954, + "learning_rate": 3.27485380116959e-06, + "loss": 1.3926, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 565 + }, + { + "epoch": 1.8825978351373855, + "grad_norm": 0.3639552416710322, + "learning_rate": 3.215130023640661e-06, + "loss": 1.4537, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 566 + }, + { + "epoch": 1.8859283930058286, + "grad_norm": 0.3809201109198225, + "learning_rate": 3.154121863799283e-06, + "loss": 1.4344, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 567 + }, + { + "epoch": 1.8892589508742714, + "grad_norm": 0.5067748995958425, + "learning_rate": 3.0917874396135263e-06, + "loss": 1.444, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 568 + }, + { + "epoch": 1.8925895087427143, + "grad_norm": 0.39339545860925257, + "learning_rate": 3.028083028083028e-06, + "loss": 1.4368, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 569 + }, + { + "epoch": 1.8959200666111573, + "grad_norm": 0.3645143242760266, + "learning_rate": 2.9629629629629625e-06, + "loss": 1.4189, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 570 + }, + { + "epoch": 1.8992506244796004, + "grad_norm": 0.41301404150023885, + "learning_rate": 2.8963795255930087e-06, + "loss": 1.4513, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 571 + }, + { + "epoch": 1.9025811823480434, + "grad_norm": 0.35445322756534786, + "learning_rate": 2.828282828282828e-06, + "loss": 1.4212, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 572 + }, + { + "epoch": 1.9059117402164862, + "grad_norm": 0.31609898679838344, + "learning_rate": 2.758620689655172e-06, + "loss": 1.4282, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 573 + }, + { + "epoch": 1.909242298084929, + "grad_norm": 0.38641815454972966, + "learning_rate": 2.6873385012919895e-06, + "loss": 1.5401, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 574 + }, + { + "epoch": 1.9125728559533721, + "grad_norm": 0.38729985084754753, + "learning_rate": 2.6143790849673204e-06, + "loss": 1.461, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 575 + }, + { + "epoch": 1.9159034138218152, + "grad_norm": 0.4550081298663739, + "learning_rate": 2.5396825396825395e-06, + "loss": 1.4602, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 576 + }, + { + "epoch": 1.9192339716902582, + "grad_norm": 0.3605173725442084, + "learning_rate": 2.4631860776439087e-06, + "loss": 1.4104, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 577 + }, + { + "epoch": 1.922564529558701, + "grad_norm": 0.38548981376382463, + "learning_rate": 2.384823848238482e-06, + "loss": 1.4465, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 578 + }, + { + "epoch": 1.9258950874271439, + "grad_norm": 0.39748551935246357, + "learning_rate": 2.304526748971193e-06, + "loss": 1.443, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 579 + }, + { + "epoch": 1.929225645295587, + "grad_norm": 0.3638886379639791, + "learning_rate": 2.222222222222222e-06, + "loss": 1.4129, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 580 + }, + { + "epoch": 1.93255620316403, + "grad_norm": 0.36953734209449074, + "learning_rate": 2.1378340365682133e-06, + "loss": 1.3176, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 581 + }, + { + "epoch": 1.935886761032473, + "grad_norm": 0.3267944344034355, + "learning_rate": 2.051282051282051e-06, + "loss": 1.4588, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 582 + }, + { + "epoch": 1.9392173189009159, + "grad_norm": 0.3915434082543582, + "learning_rate": 1.962481962481962e-06, + "loss": 1.4441, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 583 + }, + { + "epoch": 1.942547876769359, + "grad_norm": 0.3556155258308632, + "learning_rate": 1.871345029239766e-06, + "loss": 1.3898, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 584 + }, + { + "epoch": 1.9458784346378017, + "grad_norm": 0.35583427100431714, + "learning_rate": 1.7777777777777775e-06, + "loss": 1.4117, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 585 + }, + { + "epoch": 1.9492089925062448, + "grad_norm": 0.3312617219719275, + "learning_rate": 1.6816816816816814e-06, + "loss": 1.4243, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 586 + }, + { + "epoch": 1.9525395503746878, + "grad_norm": 0.3171322439070156, + "learning_rate": 1.582952815829528e-06, + "loss": 1.3974, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 587 + }, + { + "epoch": 1.9558701082431307, + "grad_norm": 0.2931034713127486, + "learning_rate": 1.4814814814814812e-06, + "loss": 1.4232, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 588 + }, + { + "epoch": 1.9592006661115737, + "grad_norm": 0.31803832338980526, + "learning_rate": 1.3771517996870107e-06, + "loss": 1.4475, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 589 + }, + { + "epoch": 1.9625312239800166, + "grad_norm": 0.3102745879861819, + "learning_rate": 1.2698412698412697e-06, + "loss": 1.4991, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 590 + }, + { + "epoch": 1.9658617818484596, + "grad_norm": 0.35190966791382605, + "learning_rate": 1.1594202898550724e-06, + "loss": 1.4806, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 591 + }, + { + "epoch": 1.9691923397169027, + "grad_norm": 0.3133274529689738, + "learning_rate": 1.045751633986928e-06, + "loss": 1.347, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 592 + }, + { + "epoch": 1.9725228975853455, + "grad_norm": 0.30605048339614954, + "learning_rate": 9.286898839137644e-07, + "loss": 1.3999, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 593 + }, + { + "epoch": 1.9758534554537885, + "grad_norm": 0.3151090112991302, + "learning_rate": 8.08080808080808e-07, + "loss": 1.4339, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 594 + }, + { + "epoch": 1.9791840133222314, + "grad_norm": 0.35650599063479166, + "learning_rate": 6.837606837606837e-07, + "loss": 1.4009, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 595 + }, + { + "epoch": 1.9825145711906744, + "grad_norm": 0.3384958491564326, + "learning_rate": 5.555555555555555e-07, + "loss": 1.4611, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 596 + }, + { + "epoch": 1.9858451290591175, + "grad_norm": 0.3335636198476521, + "learning_rate": 4.2328042328042324e-07, + "loss": 1.3728, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 597 + }, + { + "epoch": 1.9891756869275605, + "grad_norm": 0.32277296814250667, + "learning_rate": 2.8673835125448024e-07, + "loss": 1.387, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 598 + }, + { + "epoch": 1.9925062447960034, + "grad_norm": 0.3467254801927619, + "learning_rate": 1.4571948998178507e-07, + "loss": 1.4114, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 599 + }, + { + "epoch": 1.9958368026644462, + "grad_norm": 0.3207639144479259, + "learning_rate": 0, + "loss": 1.3956, + "memory/device_mem_reserved(gib)": 59.75, + "memory/max_mem_active(gib)": 57.09, + "memory/max_mem_allocated(gib)": 56.77, + "step": 600 + } + ], + "logging_steps": 1, + "max_steps": 600, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 150, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.439031159441326e+18, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}