TDVE_Weight / trainer_state.json
Moyao001's picture
Upload folder using huggingface_hub
35ee3f2 verified
{
"best_metric": 0.8585651674989221,
"best_model_checkpoint": "/DATA/DATA3/wjt/AIGV-qwen2.5/AIGV-main/output/quality_MLP/v2-20250503-111817/checkpoint-2000",
"epoch": 6.480881399870382,
"eval_steps": 2000,
"global_step": 10000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006480881399870382,
"grad_norm": 1.0703125,
"learning_rate": 1.295336787564767e-07,
"loss": 0.5365753173828125,
"memory(GiB)": 32.2,
"step": 1,
"train_speed(iter/s)": 0.260463
},
{
"epoch": 0.0032404406999351912,
"grad_norm": 1.078125,
"learning_rate": 6.476683937823834e-07,
"loss": 0.49399423599243164,
"memory(GiB)": 32.2,
"step": 5,
"train_speed(iter/s)": 0.639071
},
{
"epoch": 0.0064808813998703824,
"grad_norm": 1.09375,
"learning_rate": 1.2953367875647669e-06,
"loss": 0.5098413467407227,
"memory(GiB)": 34.73,
"step": 10,
"train_speed(iter/s)": 0.770125
},
{
"epoch": 0.009721322099805573,
"grad_norm": 1.0859375,
"learning_rate": 1.9430051813471504e-06,
"loss": 0.5465492248535156,
"memory(GiB)": 38.69,
"step": 15,
"train_speed(iter/s)": 0.741698
},
{
"epoch": 0.012961762799740765,
"grad_norm": 0.5703125,
"learning_rate": 2.5906735751295338e-06,
"loss": 0.5868881225585938,
"memory(GiB)": 38.69,
"step": 20,
"train_speed(iter/s)": 0.769326
},
{
"epoch": 0.016202203499675955,
"grad_norm": 1.078125,
"learning_rate": 3.238341968911917e-06,
"loss": 0.4694648742675781,
"memory(GiB)": 38.69,
"step": 25,
"train_speed(iter/s)": 0.810966
},
{
"epoch": 0.019442644199611146,
"grad_norm": 1.078125,
"learning_rate": 3.886010362694301e-06,
"loss": 0.5193641662597657,
"memory(GiB)": 38.69,
"step": 30,
"train_speed(iter/s)": 0.788689
},
{
"epoch": 0.02268308489954634,
"grad_norm": 1.0703125,
"learning_rate": 4.533678756476685e-06,
"loss": 0.6055156707763671,
"memory(GiB)": 43.05,
"step": 35,
"train_speed(iter/s)": 0.770943
},
{
"epoch": 0.02592352559948153,
"grad_norm": 0.55078125,
"learning_rate": 5.1813471502590676e-06,
"loss": 0.562900161743164,
"memory(GiB)": 43.05,
"step": 40,
"train_speed(iter/s)": 0.780659
},
{
"epoch": 0.02916396629941672,
"grad_norm": 1.078125,
"learning_rate": 5.829015544041451e-06,
"loss": 0.5973004817962646,
"memory(GiB)": 43.05,
"step": 45,
"train_speed(iter/s)": 0.781833
},
{
"epoch": 0.03240440699935191,
"grad_norm": 0.5390625,
"learning_rate": 6.476683937823834e-06,
"loss": 0.6351554870605469,
"memory(GiB)": 43.05,
"step": 50,
"train_speed(iter/s)": 0.784208
},
{
"epoch": 0.0356448476992871,
"grad_norm": 0.5390625,
"learning_rate": 7.124352331606218e-06,
"loss": 0.5043731689453125,
"memory(GiB)": 43.05,
"step": 55,
"train_speed(iter/s)": 0.801354
},
{
"epoch": 0.03888528839922229,
"grad_norm": 0.53515625,
"learning_rate": 7.772020725388602e-06,
"loss": 0.509942626953125,
"memory(GiB)": 43.05,
"step": 60,
"train_speed(iter/s)": 0.811061
},
{
"epoch": 0.04212572909915749,
"grad_norm": 1.046875,
"learning_rate": 8.419689119170985e-06,
"loss": 0.5730291366577148,
"memory(GiB)": 43.05,
"step": 65,
"train_speed(iter/s)": 0.796095
},
{
"epoch": 0.04536616979909268,
"grad_norm": 1.0703125,
"learning_rate": 9.06735751295337e-06,
"loss": 0.5984684944152832,
"memory(GiB)": 43.05,
"step": 70,
"train_speed(iter/s)": 0.782284
},
{
"epoch": 0.04860661049902787,
"grad_norm": 1.078125,
"learning_rate": 9.715025906735752e-06,
"loss": 0.5932106018066406,
"memory(GiB)": 43.05,
"step": 75,
"train_speed(iter/s)": 0.754338
},
{
"epoch": 0.05184705119896306,
"grad_norm": 0.5703125,
"learning_rate": 1.0362694300518135e-05,
"loss": 0.5908912658691406,
"memory(GiB)": 43.05,
"step": 80,
"train_speed(iter/s)": 0.753761
},
{
"epoch": 0.05508749189889825,
"grad_norm": 1.078125,
"learning_rate": 1.101036269430052e-05,
"loss": 0.5384979248046875,
"memory(GiB)": 43.05,
"step": 85,
"train_speed(iter/s)": 0.759779
},
{
"epoch": 0.05832793259883344,
"grad_norm": 1.09375,
"learning_rate": 1.1658031088082903e-05,
"loss": 0.5038749694824218,
"memory(GiB)": 43.05,
"step": 90,
"train_speed(iter/s)": 0.759167
},
{
"epoch": 0.06156837329876863,
"grad_norm": 1.078125,
"learning_rate": 1.2305699481865286e-05,
"loss": 0.5468292713165284,
"memory(GiB)": 43.05,
"step": 95,
"train_speed(iter/s)": 0.759403
},
{
"epoch": 0.06480881399870382,
"grad_norm": 1.0625,
"learning_rate": 1.2953367875647668e-05,
"loss": 0.48979339599609373,
"memory(GiB)": 43.05,
"step": 100,
"train_speed(iter/s)": 0.755654
},
{
"epoch": 0.06804925469863901,
"grad_norm": 0.54296875,
"learning_rate": 1.3601036269430053e-05,
"loss": 0.5300430297851563,
"memory(GiB)": 43.05,
"step": 105,
"train_speed(iter/s)": 0.763136
},
{
"epoch": 0.0712896953985742,
"grad_norm": 1.0546875,
"learning_rate": 1.4248704663212436e-05,
"loss": 0.5134796142578125,
"memory(GiB)": 43.05,
"step": 110,
"train_speed(iter/s)": 0.772151
},
{
"epoch": 0.07453013609850939,
"grad_norm": 1.0546875,
"learning_rate": 1.4896373056994819e-05,
"loss": 0.5596694946289062,
"memory(GiB)": 43.05,
"step": 115,
"train_speed(iter/s)": 0.768512
},
{
"epoch": 0.07777057679844458,
"grad_norm": 1.0625,
"learning_rate": 1.5544041450777204e-05,
"loss": 0.6054977416992188,
"memory(GiB)": 43.05,
"step": 120,
"train_speed(iter/s)": 0.753057
},
{
"epoch": 0.08101101749837979,
"grad_norm": 1.09375,
"learning_rate": 1.6191709844559585e-05,
"loss": 0.5818267822265625,
"memory(GiB)": 43.05,
"step": 125,
"train_speed(iter/s)": 0.752964
},
{
"epoch": 0.08425145819831498,
"grad_norm": 1.09375,
"learning_rate": 1.683937823834197e-05,
"loss": 0.4538330078125,
"memory(GiB)": 43.05,
"step": 130,
"train_speed(iter/s)": 0.760911
},
{
"epoch": 0.08749189889825017,
"grad_norm": 1.0625,
"learning_rate": 1.7487046632124354e-05,
"loss": 0.56064453125,
"memory(GiB)": 43.05,
"step": 135,
"train_speed(iter/s)": 0.770633
},
{
"epoch": 0.09073233959818536,
"grad_norm": 1.0703125,
"learning_rate": 1.813471502590674e-05,
"loss": 0.5388214111328125,
"memory(GiB)": 43.05,
"step": 140,
"train_speed(iter/s)": 0.765569
},
{
"epoch": 0.09397278029812055,
"grad_norm": 1.09375,
"learning_rate": 1.878238341968912e-05,
"loss": 0.4977081298828125,
"memory(GiB)": 43.05,
"step": 145,
"train_speed(iter/s)": 0.767162
},
{
"epoch": 0.09721322099805574,
"grad_norm": 1.0625,
"learning_rate": 1.9430051813471504e-05,
"loss": 0.502288818359375,
"memory(GiB)": 43.05,
"step": 150,
"train_speed(iter/s)": 0.767997
},
{
"epoch": 0.10045366169799093,
"grad_norm": 1.09375,
"learning_rate": 2.0077720207253886e-05,
"loss": 0.5877655029296875,
"memory(GiB)": 43.05,
"step": 155,
"train_speed(iter/s)": 0.764005
},
{
"epoch": 0.10369410239792612,
"grad_norm": 1.1171875,
"learning_rate": 2.072538860103627e-05,
"loss": 0.497576904296875,
"memory(GiB)": 43.05,
"step": 160,
"train_speed(iter/s)": 0.765148
},
{
"epoch": 0.10693454309786131,
"grad_norm": 1.1015625,
"learning_rate": 2.1373056994818655e-05,
"loss": 0.482635498046875,
"memory(GiB)": 43.05,
"step": 165,
"train_speed(iter/s)": 0.768335
},
{
"epoch": 0.1101749837977965,
"grad_norm": 1.1015625,
"learning_rate": 2.202072538860104e-05,
"loss": 0.600677490234375,
"memory(GiB)": 43.05,
"step": 170,
"train_speed(iter/s)": 0.768261
},
{
"epoch": 0.11341542449773169,
"grad_norm": 1.140625,
"learning_rate": 2.266839378238342e-05,
"loss": 0.50074462890625,
"memory(GiB)": 43.05,
"step": 175,
"train_speed(iter/s)": 0.774096
},
{
"epoch": 0.11665586519766688,
"grad_norm": 1.109375,
"learning_rate": 2.3316062176165805e-05,
"loss": 0.5025146484375,
"memory(GiB)": 43.05,
"step": 180,
"train_speed(iter/s)": 0.77619
},
{
"epoch": 0.11989630589760207,
"grad_norm": 1.1484375,
"learning_rate": 2.3963730569948187e-05,
"loss": 0.457147216796875,
"memory(GiB)": 43.05,
"step": 185,
"train_speed(iter/s)": 0.778384
},
{
"epoch": 0.12313674659753726,
"grad_norm": 1.1953125,
"learning_rate": 2.461139896373057e-05,
"loss": 0.518804931640625,
"memory(GiB)": 43.05,
"step": 190,
"train_speed(iter/s)": 0.772501
},
{
"epoch": 0.12637718729747247,
"grad_norm": 1.1796875,
"learning_rate": 2.5259067357512956e-05,
"loss": 0.52044677734375,
"memory(GiB)": 43.05,
"step": 195,
"train_speed(iter/s)": 0.768668
},
{
"epoch": 0.12961762799740764,
"grad_norm": 1.25,
"learning_rate": 2.5906735751295337e-05,
"loss": 0.51678466796875,
"memory(GiB)": 43.05,
"step": 200,
"train_speed(iter/s)": 0.769841
},
{
"epoch": 0.13285806869734285,
"grad_norm": 1.296875,
"learning_rate": 2.655440414507772e-05,
"loss": 0.57254638671875,
"memory(GiB)": 43.05,
"step": 205,
"train_speed(iter/s)": 0.769437
},
{
"epoch": 0.13609850939727802,
"grad_norm": 1.359375,
"learning_rate": 2.7202072538860106e-05,
"loss": 0.5950927734375,
"memory(GiB)": 43.05,
"step": 210,
"train_speed(iter/s)": 0.758835
},
{
"epoch": 0.13933895009721323,
"grad_norm": 1.40625,
"learning_rate": 2.7849740932642487e-05,
"loss": 0.4545166015625,
"memory(GiB)": 43.05,
"step": 215,
"train_speed(iter/s)": 0.758691
},
{
"epoch": 0.1425793907971484,
"grad_norm": 1.5390625,
"learning_rate": 2.8497409326424872e-05,
"loss": 0.4698974609375,
"memory(GiB)": 43.05,
"step": 220,
"train_speed(iter/s)": 0.763472
},
{
"epoch": 0.1458198314970836,
"grad_norm": 1.6796875,
"learning_rate": 2.9145077720207253e-05,
"loss": 0.527783203125,
"memory(GiB)": 43.05,
"step": 225,
"train_speed(iter/s)": 0.759259
},
{
"epoch": 0.14906027219701878,
"grad_norm": 1.640625,
"learning_rate": 2.9792746113989638e-05,
"loss": 0.3810546875,
"memory(GiB)": 43.05,
"step": 230,
"train_speed(iter/s)": 0.759974
},
{
"epoch": 0.152300712896954,
"grad_norm": 2.03125,
"learning_rate": 3.0440414507772026e-05,
"loss": 0.6087646484375,
"memory(GiB)": 43.05,
"step": 235,
"train_speed(iter/s)": 0.759738
},
{
"epoch": 0.15554115359688916,
"grad_norm": 1.9765625,
"learning_rate": 3.108808290155441e-05,
"loss": 0.539697265625,
"memory(GiB)": 43.05,
"step": 240,
"train_speed(iter/s)": 0.757412
},
{
"epoch": 0.15878159429682437,
"grad_norm": 2.546875,
"learning_rate": 3.173575129533679e-05,
"loss": 0.414892578125,
"memory(GiB)": 43.05,
"step": 245,
"train_speed(iter/s)": 0.757164
},
{
"epoch": 0.16202203499675957,
"grad_norm": 2.765625,
"learning_rate": 3.238341968911917e-05,
"loss": 0.466015625,
"memory(GiB)": 43.05,
"step": 250,
"train_speed(iter/s)": 0.760448
},
{
"epoch": 0.16526247569669475,
"grad_norm": 3.34375,
"learning_rate": 3.303108808290156e-05,
"loss": 0.4650390625,
"memory(GiB)": 43.05,
"step": 255,
"train_speed(iter/s)": 0.760227
},
{
"epoch": 0.16850291639662995,
"grad_norm": 3.53125,
"learning_rate": 3.367875647668394e-05,
"loss": 0.42080078125,
"memory(GiB)": 43.05,
"step": 260,
"train_speed(iter/s)": 0.758035
},
{
"epoch": 0.17174335709656513,
"grad_norm": 3.9375,
"learning_rate": 3.432642487046632e-05,
"loss": 0.360791015625,
"memory(GiB)": 43.05,
"step": 265,
"train_speed(iter/s)": 0.759979
},
{
"epoch": 0.17498379779650033,
"grad_norm": 4.75,
"learning_rate": 3.497409326424871e-05,
"loss": 0.40830078125,
"memory(GiB)": 43.05,
"step": 270,
"train_speed(iter/s)": 0.760981
},
{
"epoch": 0.1782242384964355,
"grad_norm": 5.46875,
"learning_rate": 3.562176165803109e-05,
"loss": 0.3232421875,
"memory(GiB)": 43.05,
"step": 275,
"train_speed(iter/s)": 0.76279
},
{
"epoch": 0.18146467919637072,
"grad_norm": 5.28125,
"learning_rate": 3.626943005181348e-05,
"loss": 0.35078125,
"memory(GiB)": 43.05,
"step": 280,
"train_speed(iter/s)": 0.760403
},
{
"epoch": 0.1847051198963059,
"grad_norm": 6.125,
"learning_rate": 3.691709844559585e-05,
"loss": 0.262109375,
"memory(GiB)": 43.05,
"step": 285,
"train_speed(iter/s)": 0.75989
},
{
"epoch": 0.1879455605962411,
"grad_norm": 7.40625,
"learning_rate": 3.756476683937824e-05,
"loss": 0.240234375,
"memory(GiB)": 43.05,
"step": 290,
"train_speed(iter/s)": 0.764385
},
{
"epoch": 0.19118600129617627,
"grad_norm": 8.1875,
"learning_rate": 3.821243523316063e-05,
"loss": 0.2201171875,
"memory(GiB)": 43.05,
"step": 295,
"train_speed(iter/s)": 0.761947
},
{
"epoch": 0.19442644199611148,
"grad_norm": 8.1875,
"learning_rate": 3.886010362694301e-05,
"loss": 0.13984375,
"memory(GiB)": 43.05,
"step": 300,
"train_speed(iter/s)": 0.76188
},
{
"epoch": 0.19766688269604665,
"grad_norm": 10.625,
"learning_rate": 3.950777202072539e-05,
"loss": 0.1181640625,
"memory(GiB)": 43.05,
"step": 305,
"train_speed(iter/s)": 0.766249
},
{
"epoch": 0.20090732339598186,
"grad_norm": 0.54296875,
"learning_rate": 4.015544041450777e-05,
"loss": 0.1419921875,
"memory(GiB)": 43.05,
"step": 310,
"train_speed(iter/s)": 0.762176
},
{
"epoch": 0.20414776409591703,
"grad_norm": 1.9140625,
"learning_rate": 4.080310880829016e-05,
"loss": 0.14765625,
"memory(GiB)": 43.05,
"step": 315,
"train_speed(iter/s)": 0.759722
},
{
"epoch": 0.20738820479585224,
"grad_norm": 2.34375,
"learning_rate": 4.145077720207254e-05,
"loss": 0.1498046875,
"memory(GiB)": 43.05,
"step": 320,
"train_speed(iter/s)": 0.759166
},
{
"epoch": 0.21062864549578741,
"grad_norm": 1.0390625,
"learning_rate": 4.209844559585492e-05,
"loss": 0.0775390625,
"memory(GiB)": 43.05,
"step": 325,
"train_speed(iter/s)": 0.76154
},
{
"epoch": 0.21386908619572262,
"grad_norm": 10.625,
"learning_rate": 4.274611398963731e-05,
"loss": 0.09013671875,
"memory(GiB)": 43.05,
"step": 330,
"train_speed(iter/s)": 0.763867
},
{
"epoch": 0.21710952689565782,
"grad_norm": 0.7734375,
"learning_rate": 4.339378238341969e-05,
"loss": 0.1416015625,
"memory(GiB)": 43.05,
"step": 335,
"train_speed(iter/s)": 0.761462
},
{
"epoch": 0.220349967595593,
"grad_norm": 6.28125,
"learning_rate": 4.404145077720208e-05,
"loss": 0.0611328125,
"memory(GiB)": 43.05,
"step": 340,
"train_speed(iter/s)": 0.763926
},
{
"epoch": 0.2235904082955282,
"grad_norm": 5.5625,
"learning_rate": 4.468911917098445e-05,
"loss": 0.080859375,
"memory(GiB)": 43.05,
"step": 345,
"train_speed(iter/s)": 0.763924
},
{
"epoch": 0.22683084899546338,
"grad_norm": 2.203125,
"learning_rate": 4.533678756476684e-05,
"loss": 0.096484375,
"memory(GiB)": 43.05,
"step": 350,
"train_speed(iter/s)": 0.764442
},
{
"epoch": 0.23007128969539858,
"grad_norm": 10.25,
"learning_rate": 4.598445595854923e-05,
"loss": 0.059375,
"memory(GiB)": 43.05,
"step": 355,
"train_speed(iter/s)": 0.765663
},
{
"epoch": 0.23331173039533376,
"grad_norm": 10.5,
"learning_rate": 4.663212435233161e-05,
"loss": 0.1283203125,
"memory(GiB)": 43.05,
"step": 360,
"train_speed(iter/s)": 0.762173
},
{
"epoch": 0.23655217109526896,
"grad_norm": 12.9375,
"learning_rate": 4.727979274611399e-05,
"loss": 0.1201171875,
"memory(GiB)": 43.05,
"step": 365,
"train_speed(iter/s)": 0.761207
},
{
"epoch": 0.23979261179520414,
"grad_norm": 4.5625,
"learning_rate": 4.792746113989637e-05,
"loss": 0.05654296875,
"memory(GiB)": 43.05,
"step": 370,
"train_speed(iter/s)": 0.761617
},
{
"epoch": 0.24303305249513935,
"grad_norm": 3.921875,
"learning_rate": 4.857512953367876e-05,
"loss": 0.05234375,
"memory(GiB)": 43.05,
"step": 375,
"train_speed(iter/s)": 0.765087
},
{
"epoch": 0.24627349319507452,
"grad_norm": 4.96875,
"learning_rate": 4.922279792746114e-05,
"loss": 0.0787109375,
"memory(GiB)": 43.05,
"step": 380,
"train_speed(iter/s)": 0.763822
},
{
"epoch": 0.24951393389500973,
"grad_norm": 6.6875,
"learning_rate": 4.9870466321243523e-05,
"loss": 0.1029296875,
"memory(GiB)": 43.05,
"step": 385,
"train_speed(iter/s)": 0.763392
},
{
"epoch": 0.25275437459494493,
"grad_norm": 10.875,
"learning_rate": 5.051813471502591e-05,
"loss": 0.075390625,
"memory(GiB)": 43.05,
"step": 390,
"train_speed(iter/s)": 0.764437
},
{
"epoch": 0.2559948152948801,
"grad_norm": 4.34375,
"learning_rate": 5.11658031088083e-05,
"loss": 0.0646484375,
"memory(GiB)": 43.05,
"step": 395,
"train_speed(iter/s)": 0.762921
},
{
"epoch": 0.2592352559948153,
"grad_norm": 9.5,
"learning_rate": 5.1813471502590674e-05,
"loss": 0.0505859375,
"memory(GiB)": 43.05,
"step": 400,
"train_speed(iter/s)": 0.763592
},
{
"epoch": 0.26247569669475046,
"grad_norm": 3.296875,
"learning_rate": 5.2461139896373055e-05,
"loss": 0.0591796875,
"memory(GiB)": 43.05,
"step": 405,
"train_speed(iter/s)": 0.761955
},
{
"epoch": 0.2657161373946857,
"grad_norm": 0.7734375,
"learning_rate": 5.310880829015544e-05,
"loss": 0.065234375,
"memory(GiB)": 43.05,
"step": 410,
"train_speed(iter/s)": 0.765098
},
{
"epoch": 0.26895657809462087,
"grad_norm": 12.9375,
"learning_rate": 5.375647668393783e-05,
"loss": 0.0865234375,
"memory(GiB)": 43.05,
"step": 415,
"train_speed(iter/s)": 0.76542
},
{
"epoch": 0.27219701879455604,
"grad_norm": 2.640625,
"learning_rate": 5.440414507772021e-05,
"loss": 0.0478515625,
"memory(GiB)": 43.05,
"step": 420,
"train_speed(iter/s)": 0.768599
},
{
"epoch": 0.2754374594944913,
"grad_norm": 12.5625,
"learning_rate": 5.505181347150259e-05,
"loss": 0.07587890625,
"memory(GiB)": 43.05,
"step": 425,
"train_speed(iter/s)": 0.769594
},
{
"epoch": 0.27867790019442645,
"grad_norm": 2.90625,
"learning_rate": 5.5699481865284975e-05,
"loss": 0.0716796875,
"memory(GiB)": 43.05,
"step": 430,
"train_speed(iter/s)": 0.77124
},
{
"epoch": 0.28191834089436163,
"grad_norm": 14.4375,
"learning_rate": 5.634715025906736e-05,
"loss": 0.06484375,
"memory(GiB)": 43.05,
"step": 435,
"train_speed(iter/s)": 0.773115
},
{
"epoch": 0.2851587815942968,
"grad_norm": 9.75,
"learning_rate": 5.6994818652849744e-05,
"loss": 0.0515625,
"memory(GiB)": 43.05,
"step": 440,
"train_speed(iter/s)": 0.77487
},
{
"epoch": 0.28839922229423204,
"grad_norm": 11.0,
"learning_rate": 5.764248704663213e-05,
"loss": 0.0740234375,
"memory(GiB)": 43.05,
"step": 445,
"train_speed(iter/s)": 0.775776
},
{
"epoch": 0.2916396629941672,
"grad_norm": 1.2734375,
"learning_rate": 5.8290155440414506e-05,
"loss": 0.08212890625,
"memory(GiB)": 43.05,
"step": 450,
"train_speed(iter/s)": 0.776063
},
{
"epoch": 0.2948801036941024,
"grad_norm": 8.875,
"learning_rate": 5.8937823834196894e-05,
"loss": 0.06875,
"memory(GiB)": 43.05,
"step": 455,
"train_speed(iter/s)": 0.77501
},
{
"epoch": 0.29812054439403757,
"grad_norm": 1.46875,
"learning_rate": 5.9585492227979276e-05,
"loss": 0.05693359375,
"memory(GiB)": 43.05,
"step": 460,
"train_speed(iter/s)": 0.776518
},
{
"epoch": 0.3013609850939728,
"grad_norm": 1.8359375,
"learning_rate": 6.0233160621761664e-05,
"loss": 0.0486328125,
"memory(GiB)": 43.05,
"step": 465,
"train_speed(iter/s)": 0.777886
},
{
"epoch": 0.304601425793908,
"grad_norm": 15.1875,
"learning_rate": 6.088082901554405e-05,
"loss": 0.0626953125,
"memory(GiB)": 43.05,
"step": 470,
"train_speed(iter/s)": 0.778123
},
{
"epoch": 0.30784186649384315,
"grad_norm": 9.9375,
"learning_rate": 6.152849740932643e-05,
"loss": 0.0470703125,
"memory(GiB)": 43.05,
"step": 475,
"train_speed(iter/s)": 0.7802
},
{
"epoch": 0.31108230719377833,
"grad_norm": 12.0,
"learning_rate": 6.217616580310881e-05,
"loss": 0.05234375,
"memory(GiB)": 43.05,
"step": 480,
"train_speed(iter/s)": 0.781066
},
{
"epoch": 0.31432274789371356,
"grad_norm": 9.625,
"learning_rate": 6.28238341968912e-05,
"loss": 0.05703125,
"memory(GiB)": 43.05,
"step": 485,
"train_speed(iter/s)": 0.777163
},
{
"epoch": 0.31756318859364874,
"grad_norm": 1.4453125,
"learning_rate": 6.347150259067358e-05,
"loss": 0.0689453125,
"memory(GiB)": 43.05,
"step": 490,
"train_speed(iter/s)": 0.776769
},
{
"epoch": 0.3208036292935839,
"grad_norm": 12.4375,
"learning_rate": 6.411917098445595e-05,
"loss": 0.11328125,
"memory(GiB)": 43.05,
"step": 495,
"train_speed(iter/s)": 0.776231
},
{
"epoch": 0.32404406999351915,
"grad_norm": 1.1796875,
"learning_rate": 6.476683937823834e-05,
"loss": 0.0337890625,
"memory(GiB)": 43.05,
"step": 500,
"train_speed(iter/s)": 0.777694
},
{
"epoch": 0.3272845106934543,
"grad_norm": 8.6875,
"learning_rate": 6.541450777202073e-05,
"loss": 0.06787109375,
"memory(GiB)": 43.05,
"step": 505,
"train_speed(iter/s)": 0.779091
},
{
"epoch": 0.3305249513933895,
"grad_norm": 1.1953125,
"learning_rate": 6.606217616580311e-05,
"loss": 0.06767578125,
"memory(GiB)": 43.05,
"step": 510,
"train_speed(iter/s)": 0.77883
},
{
"epoch": 0.3337653920933247,
"grad_norm": 1.9140625,
"learning_rate": 6.67098445595855e-05,
"loss": 0.0328125,
"memory(GiB)": 43.05,
"step": 515,
"train_speed(iter/s)": 0.777959
},
{
"epoch": 0.3370058327932599,
"grad_norm": 10.5,
"learning_rate": 6.735751295336788e-05,
"loss": 0.04677734375,
"memory(GiB)": 43.05,
"step": 520,
"train_speed(iter/s)": 0.777442
},
{
"epoch": 0.3402462734931951,
"grad_norm": 9.6875,
"learning_rate": 6.800518134715027e-05,
"loss": 0.0787109375,
"memory(GiB)": 43.05,
"step": 525,
"train_speed(iter/s)": 0.777622
},
{
"epoch": 0.34348671419313026,
"grad_norm": 10.0625,
"learning_rate": 6.865284974093264e-05,
"loss": 0.06318359375,
"memory(GiB)": 43.05,
"step": 530,
"train_speed(iter/s)": 0.7802
},
{
"epoch": 0.34672715489306544,
"grad_norm": 11.125,
"learning_rate": 6.930051813471503e-05,
"loss": 0.0388671875,
"memory(GiB)": 43.05,
"step": 535,
"train_speed(iter/s)": 0.780842
},
{
"epoch": 0.34996759559300067,
"grad_norm": 0.66015625,
"learning_rate": 6.994818652849742e-05,
"loss": 0.0392578125,
"memory(GiB)": 43.05,
"step": 540,
"train_speed(iter/s)": 0.779714
},
{
"epoch": 0.35320803629293585,
"grad_norm": 7.375,
"learning_rate": 7.059585492227979e-05,
"loss": 0.0400390625,
"memory(GiB)": 43.05,
"step": 545,
"train_speed(iter/s)": 0.779558
},
{
"epoch": 0.356448476992871,
"grad_norm": 1.671875,
"learning_rate": 7.124352331606218e-05,
"loss": 0.0501953125,
"memory(GiB)": 43.05,
"step": 550,
"train_speed(iter/s)": 0.779993
},
{
"epoch": 0.3596889176928062,
"grad_norm": 8.875,
"learning_rate": 7.189119170984457e-05,
"loss": 0.0609375,
"memory(GiB)": 43.05,
"step": 555,
"train_speed(iter/s)": 0.782429
},
{
"epoch": 0.36292935839274143,
"grad_norm": 16.125,
"learning_rate": 7.253886010362695e-05,
"loss": 0.082421875,
"memory(GiB)": 43.05,
"step": 560,
"train_speed(iter/s)": 0.782177
},
{
"epoch": 0.3661697990926766,
"grad_norm": 3.78125,
"learning_rate": 7.318652849740933e-05,
"loss": 0.066015625,
"memory(GiB)": 43.05,
"step": 565,
"train_speed(iter/s)": 0.780785
},
{
"epoch": 0.3694102397926118,
"grad_norm": 1.890625,
"learning_rate": 7.38341968911917e-05,
"loss": 0.0556640625,
"memory(GiB)": 43.05,
"step": 570,
"train_speed(iter/s)": 0.778912
},
{
"epoch": 0.37265068049254696,
"grad_norm": 11.875,
"learning_rate": 7.448186528497409e-05,
"loss": 0.0791015625,
"memory(GiB)": 43.05,
"step": 575,
"train_speed(iter/s)": 0.77911
},
{
"epoch": 0.3758911211924822,
"grad_norm": 2.5,
"learning_rate": 7.512953367875648e-05,
"loss": 0.0390625,
"memory(GiB)": 43.05,
"step": 580,
"train_speed(iter/s)": 0.779861
},
{
"epoch": 0.37913156189241737,
"grad_norm": 14.9375,
"learning_rate": 7.577720207253887e-05,
"loss": 0.037890625,
"memory(GiB)": 43.05,
"step": 585,
"train_speed(iter/s)": 0.779469
},
{
"epoch": 0.38237200259235254,
"grad_norm": 0.94921875,
"learning_rate": 7.642487046632126e-05,
"loss": 0.05390625,
"memory(GiB)": 43.05,
"step": 590,
"train_speed(iter/s)": 0.781608
},
{
"epoch": 0.3856124432922878,
"grad_norm": 1.265625,
"learning_rate": 7.707253886010363e-05,
"loss": 0.0486328125,
"memory(GiB)": 43.05,
"step": 595,
"train_speed(iter/s)": 0.782382
},
{
"epoch": 0.38885288399222295,
"grad_norm": 8.4375,
"learning_rate": 7.772020725388602e-05,
"loss": 0.0291015625,
"memory(GiB)": 43.05,
"step": 600,
"train_speed(iter/s)": 0.781728
},
{
"epoch": 0.39209332469215813,
"grad_norm": 10.1875,
"learning_rate": 7.836787564766839e-05,
"loss": 0.0505859375,
"memory(GiB)": 43.05,
"step": 605,
"train_speed(iter/s)": 0.780685
},
{
"epoch": 0.3953337653920933,
"grad_norm": 8.3125,
"learning_rate": 7.901554404145078e-05,
"loss": 0.03671875,
"memory(GiB)": 43.05,
"step": 610,
"train_speed(iter/s)": 0.78192
},
{
"epoch": 0.39857420609202854,
"grad_norm": 9.4375,
"learning_rate": 7.966321243523317e-05,
"loss": 0.0421875,
"memory(GiB)": 43.05,
"step": 615,
"train_speed(iter/s)": 0.781089
},
{
"epoch": 0.4018146467919637,
"grad_norm": 3.78125,
"learning_rate": 8.031088082901554e-05,
"loss": 0.04609375,
"memory(GiB)": 43.05,
"step": 620,
"train_speed(iter/s)": 0.779088
},
{
"epoch": 0.4050550874918989,
"grad_norm": 5.125,
"learning_rate": 8.095854922279793e-05,
"loss": 0.03359375,
"memory(GiB)": 43.05,
"step": 625,
"train_speed(iter/s)": 0.778978
},
{
"epoch": 0.40829552819183407,
"grad_norm": 2.890625,
"learning_rate": 8.160621761658032e-05,
"loss": 0.05361328125,
"memory(GiB)": 43.05,
"step": 630,
"train_speed(iter/s)": 0.779658
},
{
"epoch": 0.4115359688917693,
"grad_norm": 1.2734375,
"learning_rate": 8.22538860103627e-05,
"loss": 0.0625,
"memory(GiB)": 43.05,
"step": 635,
"train_speed(iter/s)": 0.780894
},
{
"epoch": 0.4147764095917045,
"grad_norm": 1.859375,
"learning_rate": 8.290155440414508e-05,
"loss": 0.0771484375,
"memory(GiB)": 43.05,
"step": 640,
"train_speed(iter/s)": 0.778867
},
{
"epoch": 0.41801685029163965,
"grad_norm": 1.21875,
"learning_rate": 8.354922279792747e-05,
"loss": 0.065625,
"memory(GiB)": 43.05,
"step": 645,
"train_speed(iter/s)": 0.77719
},
{
"epoch": 0.42125729099157483,
"grad_norm": 1.5546875,
"learning_rate": 8.419689119170984e-05,
"loss": 0.06796875,
"memory(GiB)": 43.05,
"step": 650,
"train_speed(iter/s)": 0.778792
},
{
"epoch": 0.42449773169151006,
"grad_norm": 1.171875,
"learning_rate": 8.484455958549223e-05,
"loss": 0.04267578125,
"memory(GiB)": 43.05,
"step": 655,
"train_speed(iter/s)": 0.777084
},
{
"epoch": 0.42773817239144524,
"grad_norm": 0.81640625,
"learning_rate": 8.549222797927462e-05,
"loss": 0.03828125,
"memory(GiB)": 43.05,
"step": 660,
"train_speed(iter/s)": 0.778245
},
{
"epoch": 0.4309786130913804,
"grad_norm": 9.8125,
"learning_rate": 8.6139896373057e-05,
"loss": 0.06953125,
"memory(GiB)": 43.05,
"step": 665,
"train_speed(iter/s)": 0.778039
},
{
"epoch": 0.43421905379131565,
"grad_norm": 4.09375,
"learning_rate": 8.678756476683938e-05,
"loss": 0.06484375,
"memory(GiB)": 43.05,
"step": 670,
"train_speed(iter/s)": 0.777077
},
{
"epoch": 0.4374594944912508,
"grad_norm": 12.6875,
"learning_rate": 8.743523316062177e-05,
"loss": 0.052734375,
"memory(GiB)": 43.05,
"step": 675,
"train_speed(iter/s)": 0.777329
},
{
"epoch": 0.440699935191186,
"grad_norm": 6.9375,
"learning_rate": 8.808290155440416e-05,
"loss": 0.045703125,
"memory(GiB)": 43.05,
"step": 680,
"train_speed(iter/s)": 0.7785
},
{
"epoch": 0.4439403758911212,
"grad_norm": 2.328125,
"learning_rate": 8.873056994818653e-05,
"loss": 0.05625,
"memory(GiB)": 43.05,
"step": 685,
"train_speed(iter/s)": 0.776505
},
{
"epoch": 0.4471808165910564,
"grad_norm": 16.625,
"learning_rate": 8.93782383419689e-05,
"loss": 0.0560546875,
"memory(GiB)": 43.05,
"step": 690,
"train_speed(iter/s)": 0.776332
},
{
"epoch": 0.4504212572909916,
"grad_norm": 13.4375,
"learning_rate": 9.00259067357513e-05,
"loss": 0.0673828125,
"memory(GiB)": 43.05,
"step": 695,
"train_speed(iter/s)": 0.777024
},
{
"epoch": 0.45366169799092676,
"grad_norm": 9.1875,
"learning_rate": 9.067357512953368e-05,
"loss": 0.0634765625,
"memory(GiB)": 43.05,
"step": 700,
"train_speed(iter/s)": 0.77715
},
{
"epoch": 0.45690213869086194,
"grad_norm": 1.09375,
"learning_rate": 9.132124352331607e-05,
"loss": 0.03935546875,
"memory(GiB)": 43.05,
"step": 705,
"train_speed(iter/s)": 0.778282
},
{
"epoch": 0.46014257939079717,
"grad_norm": 9.875,
"learning_rate": 9.196891191709846e-05,
"loss": 0.07890625,
"memory(GiB)": 43.05,
"step": 710,
"train_speed(iter/s)": 0.778004
},
{
"epoch": 0.46338302009073234,
"grad_norm": 7.15625,
"learning_rate": 9.261658031088083e-05,
"loss": 0.0337890625,
"memory(GiB)": 43.05,
"step": 715,
"train_speed(iter/s)": 0.778247
},
{
"epoch": 0.4666234607906675,
"grad_norm": 13.9375,
"learning_rate": 9.326424870466322e-05,
"loss": 0.0638671875,
"memory(GiB)": 43.05,
"step": 720,
"train_speed(iter/s)": 0.779281
},
{
"epoch": 0.4698639014906027,
"grad_norm": 9.8125,
"learning_rate": 9.39119170984456e-05,
"loss": 0.036328125,
"memory(GiB)": 43.05,
"step": 725,
"train_speed(iter/s)": 0.779538
},
{
"epoch": 0.47310434219053793,
"grad_norm": 4.34375,
"learning_rate": 9.455958549222798e-05,
"loss": 0.0443359375,
"memory(GiB)": 43.05,
"step": 730,
"train_speed(iter/s)": 0.777998
},
{
"epoch": 0.4763447828904731,
"grad_norm": 3.96875,
"learning_rate": 9.520725388601037e-05,
"loss": 0.04453125,
"memory(GiB)": 43.05,
"step": 735,
"train_speed(iter/s)": 0.775346
},
{
"epoch": 0.4795852235904083,
"grad_norm": 0.62890625,
"learning_rate": 9.585492227979275e-05,
"loss": 0.0478515625,
"memory(GiB)": 43.05,
"step": 740,
"train_speed(iter/s)": 0.775944
},
{
"epoch": 0.48282566429034346,
"grad_norm": 0.79296875,
"learning_rate": 9.650259067357513e-05,
"loss": 0.0458984375,
"memory(GiB)": 43.05,
"step": 745,
"train_speed(iter/s)": 0.776402
},
{
"epoch": 0.4860661049902787,
"grad_norm": 13.375,
"learning_rate": 9.715025906735752e-05,
"loss": 0.0447265625,
"memory(GiB)": 43.05,
"step": 750,
"train_speed(iter/s)": 0.77542
},
{
"epoch": 0.48930654569021387,
"grad_norm": 8.375,
"learning_rate": 9.779792746113991e-05,
"loss": 0.0296875,
"memory(GiB)": 43.05,
"step": 755,
"train_speed(iter/s)": 0.777383
},
{
"epoch": 0.49254698639014904,
"grad_norm": 10.375,
"learning_rate": 9.844559585492228e-05,
"loss": 0.065625,
"memory(GiB)": 43.05,
"step": 760,
"train_speed(iter/s)": 0.77834
},
{
"epoch": 0.4957874270900843,
"grad_norm": 8.9375,
"learning_rate": 9.909326424870466e-05,
"loss": 0.06845703125,
"memory(GiB)": 43.05,
"step": 765,
"train_speed(iter/s)": 0.778175
},
{
"epoch": 0.49902786779001945,
"grad_norm": 0.875,
"learning_rate": 9.974093264248705e-05,
"loss": 0.070703125,
"memory(GiB)": 43.05,
"step": 770,
"train_speed(iter/s)": 0.776861
},
{
"epoch": 0.5022683084899546,
"grad_norm": 5.71875,
"learning_rate": 9.999998966446853e-05,
"loss": 0.0751953125,
"memory(GiB)": 43.05,
"step": 775,
"train_speed(iter/s)": 0.77442
},
{
"epoch": 0.5055087491898899,
"grad_norm": 13.5,
"learning_rate": 9.999992650290278e-05,
"loss": 0.07265625,
"memory(GiB)": 43.05,
"step": 780,
"train_speed(iter/s)": 0.774088
},
{
"epoch": 0.508749189889825,
"grad_norm": 0.671875,
"learning_rate": 9.999980592180564e-05,
"loss": 0.0541015625,
"memory(GiB)": 43.05,
"step": 785,
"train_speed(iter/s)": 0.775409
},
{
"epoch": 0.5119896305897602,
"grad_norm": 0.59765625,
"learning_rate": 9.999962792131561e-05,
"loss": 0.076171875,
"memory(GiB)": 43.05,
"step": 790,
"train_speed(iter/s)": 0.775991
},
{
"epoch": 0.5152300712896954,
"grad_norm": 12.375,
"learning_rate": 9.999939250163708e-05,
"loss": 0.038671875,
"memory(GiB)": 43.05,
"step": 795,
"train_speed(iter/s)": 0.776217
},
{
"epoch": 0.5184705119896306,
"grad_norm": 10.5,
"learning_rate": 9.999909966304044e-05,
"loss": 0.0849609375,
"memory(GiB)": 43.05,
"step": 800,
"train_speed(iter/s)": 0.774889
},
{
"epoch": 0.5217109526895658,
"grad_norm": 12.1875,
"learning_rate": 9.999874940586194e-05,
"loss": 0.0599609375,
"memory(GiB)": 43.05,
"step": 805,
"train_speed(iter/s)": 0.775316
},
{
"epoch": 0.5249513933895009,
"grad_norm": 19.0,
"learning_rate": 9.999834173050383e-05,
"loss": 0.0408203125,
"memory(GiB)": 43.05,
"step": 810,
"train_speed(iter/s)": 0.774057
},
{
"epoch": 0.5281918340894362,
"grad_norm": 11.25,
"learning_rate": 9.99978766374343e-05,
"loss": 0.0421875,
"memory(GiB)": 43.05,
"step": 815,
"train_speed(iter/s)": 0.774313
},
{
"epoch": 0.5314322747893714,
"grad_norm": 18.0,
"learning_rate": 9.999735412718742e-05,
"loss": 0.0634765625,
"memory(GiB)": 43.05,
"step": 820,
"train_speed(iter/s)": 0.773696
},
{
"epoch": 0.5346727154893065,
"grad_norm": 9.6875,
"learning_rate": 9.999677420036327e-05,
"loss": 0.075390625,
"memory(GiB)": 43.05,
"step": 825,
"train_speed(iter/s)": 0.774409
},
{
"epoch": 0.5379131561892417,
"grad_norm": 11.1875,
"learning_rate": 9.999613685762782e-05,
"loss": 0.0291015625,
"memory(GiB)": 43.05,
"step": 830,
"train_speed(iter/s)": 0.775219
},
{
"epoch": 0.541153596889177,
"grad_norm": 10.625,
"learning_rate": 9.999544209971299e-05,
"loss": 0.0587890625,
"memory(GiB)": 43.05,
"step": 835,
"train_speed(iter/s)": 0.774369
},
{
"epoch": 0.5443940375891121,
"grad_norm": 14.25,
"learning_rate": 9.999468992741665e-05,
"loss": 0.05078125,
"memory(GiB)": 43.05,
"step": 840,
"train_speed(iter/s)": 0.771281
},
{
"epoch": 0.5476344782890473,
"grad_norm": 2.109375,
"learning_rate": 9.999388034160256e-05,
"loss": 0.03955078125,
"memory(GiB)": 43.05,
"step": 845,
"train_speed(iter/s)": 0.772797
},
{
"epoch": 0.5508749189889826,
"grad_norm": 1.6015625,
"learning_rate": 9.999301334320046e-05,
"loss": 0.043359375,
"memory(GiB)": 43.05,
"step": 850,
"train_speed(iter/s)": 0.77259
},
{
"epoch": 0.5541153596889177,
"grad_norm": 10.75,
"learning_rate": 9.999208893320602e-05,
"loss": 0.0517578125,
"memory(GiB)": 43.05,
"step": 855,
"train_speed(iter/s)": 0.772764
},
{
"epoch": 0.5573558003888529,
"grad_norm": 3.375,
"learning_rate": 9.999110711268078e-05,
"loss": 0.026953125,
"memory(GiB)": 43.05,
"step": 860,
"train_speed(iter/s)": 0.772924
},
{
"epoch": 0.560596241088788,
"grad_norm": 5.875,
"learning_rate": 9.99900678827523e-05,
"loss": 0.0490234375,
"memory(GiB)": 43.05,
"step": 865,
"train_speed(iter/s)": 0.771177
},
{
"epoch": 0.5638366817887233,
"grad_norm": 8.4375,
"learning_rate": 9.998897124461401e-05,
"loss": 0.0421875,
"memory(GiB)": 43.05,
"step": 870,
"train_speed(iter/s)": 0.771316
},
{
"epoch": 0.5670771224886585,
"grad_norm": 0.69140625,
"learning_rate": 9.998781719952526e-05,
"loss": 0.0564453125,
"memory(GiB)": 43.05,
"step": 875,
"train_speed(iter/s)": 0.772151
},
{
"epoch": 0.5703175631885936,
"grad_norm": 12.8125,
"learning_rate": 9.998660574881138e-05,
"loss": 0.09287109375,
"memory(GiB)": 43.05,
"step": 880,
"train_speed(iter/s)": 0.772395
},
{
"epoch": 0.5735580038885288,
"grad_norm": 5.8125,
"learning_rate": 9.998533689386357e-05,
"loss": 0.08984375,
"memory(GiB)": 43.05,
"step": 885,
"train_speed(iter/s)": 0.772617
},
{
"epoch": 0.5767984445884641,
"grad_norm": 16.875,
"learning_rate": 9.998401063613897e-05,
"loss": 0.040625,
"memory(GiB)": 43.05,
"step": 890,
"train_speed(iter/s)": 0.77251
},
{
"epoch": 0.5800388852883992,
"grad_norm": 14.3125,
"learning_rate": 9.998262697716065e-05,
"loss": 0.089453125,
"memory(GiB)": 43.05,
"step": 895,
"train_speed(iter/s)": 0.771789
},
{
"epoch": 0.5832793259883344,
"grad_norm": 2.046875,
"learning_rate": 9.998118591851762e-05,
"loss": 0.053125,
"memory(GiB)": 43.05,
"step": 900,
"train_speed(iter/s)": 0.772184
},
{
"epoch": 0.5865197666882696,
"grad_norm": 7.28125,
"learning_rate": 9.997968746186472e-05,
"loss": 0.0642578125,
"memory(GiB)": 43.05,
"step": 905,
"train_speed(iter/s)": 0.773333
},
{
"epoch": 0.5897602073882048,
"grad_norm": 18.0,
"learning_rate": 9.997813160892283e-05,
"loss": 0.0443359375,
"memory(GiB)": 43.05,
"step": 910,
"train_speed(iter/s)": 0.772565
},
{
"epoch": 0.59300064808814,
"grad_norm": 3.5,
"learning_rate": 9.997651836147864e-05,
"loss": 0.0650390625,
"memory(GiB)": 43.05,
"step": 915,
"train_speed(iter/s)": 0.772992
},
{
"epoch": 0.5962410887880751,
"grad_norm": 8.6875,
"learning_rate": 9.99748477213848e-05,
"loss": 0.04140625,
"memory(GiB)": 43.05,
"step": 920,
"train_speed(iter/s)": 0.774383
},
{
"epoch": 0.5994815294880104,
"grad_norm": 1.421875,
"learning_rate": 9.997311969055987e-05,
"loss": 0.0591796875,
"memory(GiB)": 43.05,
"step": 925,
"train_speed(iter/s)": 0.77552
},
{
"epoch": 0.6027219701879456,
"grad_norm": 11.625,
"learning_rate": 9.99713342709883e-05,
"loss": 0.0369140625,
"memory(GiB)": 43.05,
"step": 930,
"train_speed(iter/s)": 0.775589
},
{
"epoch": 0.6059624108878807,
"grad_norm": 1.8046875,
"learning_rate": 9.996949146472045e-05,
"loss": 0.0556640625,
"memory(GiB)": 43.05,
"step": 935,
"train_speed(iter/s)": 0.774582
},
{
"epoch": 0.609202851587816,
"grad_norm": 2.03125,
"learning_rate": 9.996759127387258e-05,
"loss": 0.0376953125,
"memory(GiB)": 43.05,
"step": 940,
"train_speed(iter/s)": 0.774099
},
{
"epoch": 0.6124432922877512,
"grad_norm": 10.5,
"learning_rate": 9.996563370062685e-05,
"loss": 0.0390625,
"memory(GiB)": 43.05,
"step": 945,
"train_speed(iter/s)": 0.773654
},
{
"epoch": 0.6156837329876863,
"grad_norm": 2.046875,
"learning_rate": 9.996361874723137e-05,
"loss": 0.04248046875,
"memory(GiB)": 43.05,
"step": 950,
"train_speed(iter/s)": 0.774127
},
{
"epoch": 0.6189241736876215,
"grad_norm": 2.484375,
"learning_rate": 9.996154641600004e-05,
"loss": 0.058203125,
"memory(GiB)": 43.05,
"step": 955,
"train_speed(iter/s)": 0.773992
},
{
"epoch": 0.6221646143875567,
"grad_norm": 10.1875,
"learning_rate": 9.995941670931272e-05,
"loss": 0.0546875,
"memory(GiB)": 43.05,
"step": 960,
"train_speed(iter/s)": 0.774066
},
{
"epoch": 0.6254050550874919,
"grad_norm": 0.69140625,
"learning_rate": 9.995722962961517e-05,
"loss": 0.05546875,
"memory(GiB)": 43.05,
"step": 965,
"train_speed(iter/s)": 0.774572
},
{
"epoch": 0.6286454957874271,
"grad_norm": 8.75,
"learning_rate": 9.9954985179419e-05,
"loss": 0.0599609375,
"memory(GiB)": 43.05,
"step": 970,
"train_speed(iter/s)": 0.774691
},
{
"epoch": 0.6318859364873622,
"grad_norm": 11.125,
"learning_rate": 9.995268336130173e-05,
"loss": 0.0427734375,
"memory(GiB)": 43.05,
"step": 975,
"train_speed(iter/s)": 0.774947
},
{
"epoch": 0.6351263771872975,
"grad_norm": 2.0,
"learning_rate": 9.995032417790673e-05,
"loss": 0.0779296875,
"memory(GiB)": 43.05,
"step": 980,
"train_speed(iter/s)": 0.774809
},
{
"epoch": 0.6383668178872327,
"grad_norm": 5.96875,
"learning_rate": 9.994790763194329e-05,
"loss": 0.0666015625,
"memory(GiB)": 43.05,
"step": 985,
"train_speed(iter/s)": 0.774857
},
{
"epoch": 0.6416072585871678,
"grad_norm": 3.703125,
"learning_rate": 9.994543372618654e-05,
"loss": 0.0865234375,
"memory(GiB)": 43.05,
"step": 990,
"train_speed(iter/s)": 0.77391
},
{
"epoch": 0.6448476992871031,
"grad_norm": 11.375,
"learning_rate": 9.994290246347751e-05,
"loss": 0.04453125,
"memory(GiB)": 43.05,
"step": 995,
"train_speed(iter/s)": 0.774097
},
{
"epoch": 0.6480881399870383,
"grad_norm": 8.8125,
"learning_rate": 9.994031384672306e-05,
"loss": 0.052734375,
"memory(GiB)": 43.05,
"step": 1000,
"train_speed(iter/s)": 0.773134
},
{
"epoch": 0.6513285806869734,
"grad_norm": 0.72265625,
"learning_rate": 9.993766787889596e-05,
"loss": 0.04765625,
"memory(GiB)": 43.05,
"step": 1005,
"train_speed(iter/s)": 0.773592
},
{
"epoch": 0.6545690213869086,
"grad_norm": 1.8046875,
"learning_rate": 9.99349645630348e-05,
"loss": 0.04921875,
"memory(GiB)": 43.05,
"step": 1010,
"train_speed(iter/s)": 0.77459
},
{
"epoch": 0.6578094620868438,
"grad_norm": 11.75,
"learning_rate": 9.993220390224405e-05,
"loss": 0.040625,
"memory(GiB)": 43.05,
"step": 1015,
"train_speed(iter/s)": 0.774071
},
{
"epoch": 0.661049902786779,
"grad_norm": 8.4375,
"learning_rate": 9.992938589969405e-05,
"loss": 0.0345703125,
"memory(GiB)": 43.05,
"step": 1020,
"train_speed(iter/s)": 0.774983
},
{
"epoch": 0.6642903434867142,
"grad_norm": 0.7421875,
"learning_rate": 9.992651055862094e-05,
"loss": 0.04921875,
"memory(GiB)": 43.05,
"step": 1025,
"train_speed(iter/s)": 0.775407
},
{
"epoch": 0.6675307841866494,
"grad_norm": 10.625,
"learning_rate": 9.992357788232677e-05,
"loss": 0.07265625,
"memory(GiB)": 43.05,
"step": 1030,
"train_speed(iter/s)": 0.775489
},
{
"epoch": 0.6707712248865846,
"grad_norm": 11.0625,
"learning_rate": 9.992058787417941e-05,
"loss": 0.0412109375,
"memory(GiB)": 43.05,
"step": 1035,
"train_speed(iter/s)": 0.77588
},
{
"epoch": 0.6740116655865198,
"grad_norm": 0.93359375,
"learning_rate": 9.991754053761253e-05,
"loss": 0.04052734375,
"memory(GiB)": 43.05,
"step": 1040,
"train_speed(iter/s)": 0.775803
},
{
"epoch": 0.6772521062864549,
"grad_norm": 1.4609375,
"learning_rate": 9.991443587612567e-05,
"loss": 0.0552734375,
"memory(GiB)": 43.05,
"step": 1045,
"train_speed(iter/s)": 0.775196
},
{
"epoch": 0.6804925469863902,
"grad_norm": 7.34375,
"learning_rate": 9.991127389328423e-05,
"loss": 0.0451171875,
"memory(GiB)": 43.05,
"step": 1050,
"train_speed(iter/s)": 0.776071
},
{
"epoch": 0.6837329876863253,
"grad_norm": 7.625,
"learning_rate": 9.990805459271936e-05,
"loss": 0.048828125,
"memory(GiB)": 43.05,
"step": 1055,
"train_speed(iter/s)": 0.776
},
{
"epoch": 0.6869734283862605,
"grad_norm": 0.94921875,
"learning_rate": 9.990477797812814e-05,
"loss": 0.0818359375,
"memory(GiB)": 43.05,
"step": 1060,
"train_speed(iter/s)": 0.776054
},
{
"epoch": 0.6902138690861958,
"grad_norm": 1.75,
"learning_rate": 9.990144405327336e-05,
"loss": 0.0416015625,
"memory(GiB)": 43.05,
"step": 1065,
"train_speed(iter/s)": 0.775956
},
{
"epoch": 0.6934543097861309,
"grad_norm": 16.25,
"learning_rate": 9.98980528219837e-05,
"loss": 0.0697265625,
"memory(GiB)": 43.05,
"step": 1070,
"train_speed(iter/s)": 0.776794
},
{
"epoch": 0.6966947504860661,
"grad_norm": 1.0625,
"learning_rate": 9.989460428815362e-05,
"loss": 0.0607421875,
"memory(GiB)": 43.05,
"step": 1075,
"train_speed(iter/s)": 0.777
},
{
"epoch": 0.6999351911860013,
"grad_norm": 13.0625,
"learning_rate": 9.989109845574336e-05,
"loss": 0.0482421875,
"memory(GiB)": 43.05,
"step": 1080,
"train_speed(iter/s)": 0.776921
},
{
"epoch": 0.7031756318859365,
"grad_norm": 12.3125,
"learning_rate": 9.988753532877904e-05,
"loss": 0.059375,
"memory(GiB)": 43.05,
"step": 1085,
"train_speed(iter/s)": 0.776292
},
{
"epoch": 0.7064160725858717,
"grad_norm": 1.6484375,
"learning_rate": 9.98839149113525e-05,
"loss": 0.03642578125,
"memory(GiB)": 43.05,
"step": 1090,
"train_speed(iter/s)": 0.776828
},
{
"epoch": 0.7096565132858069,
"grad_norm": 10.3125,
"learning_rate": 9.988023720762138e-05,
"loss": 0.05390625,
"memory(GiB)": 43.05,
"step": 1095,
"train_speed(iter/s)": 0.776747
},
{
"epoch": 0.712896953985742,
"grad_norm": 12.875,
"learning_rate": 9.987650222180917e-05,
"loss": 0.046875,
"memory(GiB)": 43.05,
"step": 1100,
"train_speed(iter/s)": 0.774409
},
{
"epoch": 0.7161373946856773,
"grad_norm": 5.53125,
"learning_rate": 9.987270995820508e-05,
"loss": 0.0396484375,
"memory(GiB)": 43.05,
"step": 1105,
"train_speed(iter/s)": 0.774632
},
{
"epoch": 0.7193778353856124,
"grad_norm": 3.8125,
"learning_rate": 9.986886042116413e-05,
"loss": 0.04638671875,
"memory(GiB)": 43.05,
"step": 1110,
"train_speed(iter/s)": 0.774254
},
{
"epoch": 0.7226182760855476,
"grad_norm": 1.5546875,
"learning_rate": 9.986495361510705e-05,
"loss": 0.04453125,
"memory(GiB)": 43.05,
"step": 1115,
"train_speed(iter/s)": 0.774629
},
{
"epoch": 0.7258587167854829,
"grad_norm": 11.125,
"learning_rate": 9.986098954452043e-05,
"loss": 0.053515625,
"memory(GiB)": 43.05,
"step": 1120,
"train_speed(iter/s)": 0.775487
},
{
"epoch": 0.729099157485418,
"grad_norm": 12.8125,
"learning_rate": 9.985696821395659e-05,
"loss": 0.0421875,
"memory(GiB)": 43.05,
"step": 1125,
"train_speed(iter/s)": 0.776194
},
{
"epoch": 0.7323395981853532,
"grad_norm": 0.9375,
"learning_rate": 9.985288962803354e-05,
"loss": 0.0541015625,
"memory(GiB)": 43.05,
"step": 1130,
"train_speed(iter/s)": 0.776858
},
{
"epoch": 0.7355800388852884,
"grad_norm": 15.625,
"learning_rate": 9.984875379143515e-05,
"loss": 0.0466796875,
"memory(GiB)": 43.05,
"step": 1135,
"train_speed(iter/s)": 0.776744
},
{
"epoch": 0.7388204795852236,
"grad_norm": 12.375,
"learning_rate": 9.984456070891094e-05,
"loss": 0.0470703125,
"memory(GiB)": 43.05,
"step": 1140,
"train_speed(iter/s)": 0.777105
},
{
"epoch": 0.7420609202851588,
"grad_norm": 13.75,
"learning_rate": 9.984031038527628e-05,
"loss": 0.03984375,
"memory(GiB)": 43.05,
"step": 1145,
"train_speed(iter/s)": 0.775913
},
{
"epoch": 0.7453013609850939,
"grad_norm": 10.8125,
"learning_rate": 9.983600282541213e-05,
"loss": 0.0427734375,
"memory(GiB)": 43.05,
"step": 1150,
"train_speed(iter/s)": 0.776782
},
{
"epoch": 0.7485418016850292,
"grad_norm": 0.546875,
"learning_rate": 9.98316380342653e-05,
"loss": 0.0521484375,
"memory(GiB)": 43.05,
"step": 1155,
"train_speed(iter/s)": 0.777432
},
{
"epoch": 0.7517822423849644,
"grad_norm": 4.25,
"learning_rate": 9.98272160168483e-05,
"loss": 0.0470703125,
"memory(GiB)": 43.05,
"step": 1160,
"train_speed(iter/s)": 0.77706
},
{
"epoch": 0.7550226830848995,
"grad_norm": 13.375,
"learning_rate": 9.982273677823928e-05,
"loss": 0.0568359375,
"memory(GiB)": 43.05,
"step": 1165,
"train_speed(iter/s)": 0.777442
},
{
"epoch": 0.7582631237848347,
"grad_norm": 9.8125,
"learning_rate": 9.981820032358222e-05,
"loss": 0.055859375,
"memory(GiB)": 43.05,
"step": 1170,
"train_speed(iter/s)": 0.777855
},
{
"epoch": 0.76150356448477,
"grad_norm": 10.125,
"learning_rate": 9.981360665808675e-05,
"loss": 0.0314453125,
"memory(GiB)": 43.05,
"step": 1175,
"train_speed(iter/s)": 0.778025
},
{
"epoch": 0.7647440051847051,
"grad_norm": 2.953125,
"learning_rate": 9.980895578702815e-05,
"loss": 0.06181640625,
"memory(GiB)": 43.05,
"step": 1180,
"train_speed(iter/s)": 0.778134
},
{
"epoch": 0.7679844458846403,
"grad_norm": 7.0,
"learning_rate": 9.980424771574749e-05,
"loss": 0.0611328125,
"memory(GiB)": 43.05,
"step": 1185,
"train_speed(iter/s)": 0.778486
},
{
"epoch": 0.7712248865845756,
"grad_norm": 11.6875,
"learning_rate": 9.979948244965147e-05,
"loss": 0.048046875,
"memory(GiB)": 43.05,
"step": 1190,
"train_speed(iter/s)": 0.777604
},
{
"epoch": 0.7744653272845107,
"grad_norm": 2.265625,
"learning_rate": 9.979465999421247e-05,
"loss": 0.04833984375,
"memory(GiB)": 43.05,
"step": 1195,
"train_speed(iter/s)": 0.776874
},
{
"epoch": 0.7777057679844459,
"grad_norm": 7.34375,
"learning_rate": 9.978978035496858e-05,
"loss": 0.05361328125,
"memory(GiB)": 43.05,
"step": 1200,
"train_speed(iter/s)": 0.776012
},
{
"epoch": 0.780946208684381,
"grad_norm": 0.81640625,
"learning_rate": 9.978484353752354e-05,
"loss": 0.0462890625,
"memory(GiB)": 43.05,
"step": 1205,
"train_speed(iter/s)": 0.77616
},
{
"epoch": 0.7841866493843163,
"grad_norm": 1.0703125,
"learning_rate": 9.977984954754674e-05,
"loss": 0.04921875,
"memory(GiB)": 43.05,
"step": 1210,
"train_speed(iter/s)": 0.776525
},
{
"epoch": 0.7874270900842515,
"grad_norm": 2.515625,
"learning_rate": 9.977479839077326e-05,
"loss": 0.036328125,
"memory(GiB)": 43.05,
"step": 1215,
"train_speed(iter/s)": 0.777313
},
{
"epoch": 0.7906675307841866,
"grad_norm": 16.125,
"learning_rate": 9.976969007300378e-05,
"loss": 0.0369140625,
"memory(GiB)": 43.05,
"step": 1220,
"train_speed(iter/s)": 0.776634
},
{
"epoch": 0.7939079714841218,
"grad_norm": 12.75,
"learning_rate": 9.976452460010468e-05,
"loss": 0.0384765625,
"memory(GiB)": 43.05,
"step": 1225,
"train_speed(iter/s)": 0.776792
},
{
"epoch": 0.7971484121840571,
"grad_norm": 3.0,
"learning_rate": 9.975930197800794e-05,
"loss": 0.046875,
"memory(GiB)": 43.05,
"step": 1230,
"train_speed(iter/s)": 0.777111
},
{
"epoch": 0.8003888528839922,
"grad_norm": 10.25,
"learning_rate": 9.975402221271117e-05,
"loss": 0.0427734375,
"memory(GiB)": 43.05,
"step": 1235,
"train_speed(iter/s)": 0.777639
},
{
"epoch": 0.8036292935839274,
"grad_norm": 0.875,
"learning_rate": 9.974868531027761e-05,
"loss": 0.0509765625,
"memory(GiB)": 43.05,
"step": 1240,
"train_speed(iter/s)": 0.777899
},
{
"epoch": 0.8068697342838627,
"grad_norm": 15.9375,
"learning_rate": 9.974329127683614e-05,
"loss": 0.07109375,
"memory(GiB)": 43.05,
"step": 1245,
"train_speed(iter/s)": 0.777904
},
{
"epoch": 0.8101101749837978,
"grad_norm": 12.9375,
"learning_rate": 9.973784011858123e-05,
"loss": 0.060546875,
"memory(GiB)": 43.05,
"step": 1250,
"train_speed(iter/s)": 0.777258
},
{
"epoch": 0.813350615683733,
"grad_norm": 10.0625,
"learning_rate": 9.97323318417729e-05,
"loss": 0.0505859375,
"memory(GiB)": 43.05,
"step": 1255,
"train_speed(iter/s)": 0.776907
},
{
"epoch": 0.8165910563836681,
"grad_norm": 2.03125,
"learning_rate": 9.972676645273688e-05,
"loss": 0.037890625,
"memory(GiB)": 43.05,
"step": 1260,
"train_speed(iter/s)": 0.776807
},
{
"epoch": 0.8198314970836034,
"grad_norm": 12.9375,
"learning_rate": 9.972114395786436e-05,
"loss": 0.048828125,
"memory(GiB)": 43.05,
"step": 1265,
"train_speed(iter/s)": 0.77681
},
{
"epoch": 0.8230719377835386,
"grad_norm": 12.4375,
"learning_rate": 9.971546436361221e-05,
"loss": 0.05,
"memory(GiB)": 43.05,
"step": 1270,
"train_speed(iter/s)": 0.775909
},
{
"epoch": 0.8263123784834737,
"grad_norm": 4.15625,
"learning_rate": 9.970972767650281e-05,
"loss": 0.04521484375,
"memory(GiB)": 43.05,
"step": 1275,
"train_speed(iter/s)": 0.775947
},
{
"epoch": 0.829552819183409,
"grad_norm": 4.875,
"learning_rate": 9.970393390312414e-05,
"loss": 0.06591796875,
"memory(GiB)": 43.05,
"step": 1280,
"train_speed(iter/s)": 0.77626
},
{
"epoch": 0.8327932598833442,
"grad_norm": 7.125,
"learning_rate": 9.969808305012971e-05,
"loss": 0.0515625,
"memory(GiB)": 43.05,
"step": 1285,
"train_speed(iter/s)": 0.776127
},
{
"epoch": 0.8360337005832793,
"grad_norm": 10.8125,
"learning_rate": 9.96921751242386e-05,
"loss": 0.0662109375,
"memory(GiB)": 43.05,
"step": 1290,
"train_speed(iter/s)": 0.775879
},
{
"epoch": 0.8392741412832145,
"grad_norm": 1.4140625,
"learning_rate": 9.968621013223544e-05,
"loss": 0.0544921875,
"memory(GiB)": 43.05,
"step": 1295,
"train_speed(iter/s)": 0.775725
},
{
"epoch": 0.8425145819831497,
"grad_norm": 6.21875,
"learning_rate": 9.968018808097039e-05,
"loss": 0.043359375,
"memory(GiB)": 43.05,
"step": 1300,
"train_speed(iter/s)": 0.775627
},
{
"epoch": 0.8457550226830849,
"grad_norm": 1.609375,
"learning_rate": 9.967410897735909e-05,
"loss": 0.058203125,
"memory(GiB)": 43.05,
"step": 1305,
"train_speed(iter/s)": 0.775025
},
{
"epoch": 0.8489954633830201,
"grad_norm": 4.53125,
"learning_rate": 9.966797282838274e-05,
"loss": 0.0521484375,
"memory(GiB)": 43.05,
"step": 1310,
"train_speed(iter/s)": 0.775352
},
{
"epoch": 0.8522359040829552,
"grad_norm": 17.75,
"learning_rate": 9.966177964108809e-05,
"loss": 0.0470703125,
"memory(GiB)": 43.05,
"step": 1315,
"train_speed(iter/s)": 0.77452
},
{
"epoch": 0.8554763447828905,
"grad_norm": 6.0625,
"learning_rate": 9.96555294225873e-05,
"loss": 0.034375,
"memory(GiB)": 43.05,
"step": 1320,
"train_speed(iter/s)": 0.775285
},
{
"epoch": 0.8587167854828257,
"grad_norm": 11.375,
"learning_rate": 9.964922218005812e-05,
"loss": 0.045703125,
"memory(GiB)": 43.05,
"step": 1325,
"train_speed(iter/s)": 0.775617
},
{
"epoch": 0.8619572261827608,
"grad_norm": 8.6875,
"learning_rate": 9.964285792074368e-05,
"loss": 0.0630859375,
"memory(GiB)": 43.05,
"step": 1330,
"train_speed(iter/s)": 0.775421
},
{
"epoch": 0.8651976668826961,
"grad_norm": 12.9375,
"learning_rate": 9.96364366519527e-05,
"loss": 0.0404296875,
"memory(GiB)": 43.05,
"step": 1335,
"train_speed(iter/s)": 0.775758
},
{
"epoch": 0.8684381075826313,
"grad_norm": 3.46875,
"learning_rate": 9.962995838105929e-05,
"loss": 0.0572265625,
"memory(GiB)": 43.05,
"step": 1340,
"train_speed(iter/s)": 0.774714
},
{
"epoch": 0.8716785482825664,
"grad_norm": 3.90625,
"learning_rate": 9.962342311550305e-05,
"loss": 0.042578125,
"memory(GiB)": 43.05,
"step": 1345,
"train_speed(iter/s)": 0.773981
},
{
"epoch": 0.8749189889825016,
"grad_norm": 3.671875,
"learning_rate": 9.961683086278903e-05,
"loss": 0.0626953125,
"memory(GiB)": 43.05,
"step": 1350,
"train_speed(iter/s)": 0.773107
},
{
"epoch": 0.8781594296824368,
"grad_norm": 0.59765625,
"learning_rate": 9.961018163048773e-05,
"loss": 0.0419921875,
"memory(GiB)": 43.05,
"step": 1355,
"train_speed(iter/s)": 0.772617
},
{
"epoch": 0.881399870382372,
"grad_norm": 9.4375,
"learning_rate": 9.960347542623504e-05,
"loss": 0.0521484375,
"memory(GiB)": 43.05,
"step": 1360,
"train_speed(iter/s)": 0.773595
},
{
"epoch": 0.8846403110823072,
"grad_norm": 9.6875,
"learning_rate": 9.959671225773237e-05,
"loss": 0.0345703125,
"memory(GiB)": 43.05,
"step": 1365,
"train_speed(iter/s)": 0.774588
},
{
"epoch": 0.8878807517822424,
"grad_norm": 3.515625,
"learning_rate": 9.958989213274646e-05,
"loss": 0.05546875,
"memory(GiB)": 43.05,
"step": 1370,
"train_speed(iter/s)": 0.77517
},
{
"epoch": 0.8911211924821776,
"grad_norm": 1.1328125,
"learning_rate": 9.958301505910948e-05,
"loss": 0.0599609375,
"memory(GiB)": 43.05,
"step": 1375,
"train_speed(iter/s)": 0.774153
},
{
"epoch": 0.8943616331821128,
"grad_norm": 11.125,
"learning_rate": 9.957608104471903e-05,
"loss": 0.0654296875,
"memory(GiB)": 43.05,
"step": 1380,
"train_speed(iter/s)": 0.77468
},
{
"epoch": 0.8976020738820479,
"grad_norm": 0.77734375,
"learning_rate": 9.956909009753807e-05,
"loss": 0.0630859375,
"memory(GiB)": 43.05,
"step": 1385,
"train_speed(iter/s)": 0.77371
},
{
"epoch": 0.9008425145819832,
"grad_norm": 4.9375,
"learning_rate": 9.956204222559495e-05,
"loss": 0.045703125,
"memory(GiB)": 43.05,
"step": 1390,
"train_speed(iter/s)": 0.773416
},
{
"epoch": 0.9040829552819183,
"grad_norm": 9.8125,
"learning_rate": 9.955493743698339e-05,
"loss": 0.047265625,
"memory(GiB)": 43.05,
"step": 1395,
"train_speed(iter/s)": 0.774051
},
{
"epoch": 0.9073233959818535,
"grad_norm": 7.96875,
"learning_rate": 9.954777573986247e-05,
"loss": 0.031640625,
"memory(GiB)": 43.05,
"step": 1400,
"train_speed(iter/s)": 0.773786
},
{
"epoch": 0.9105638366817888,
"grad_norm": 2.65625,
"learning_rate": 9.954055714245665e-05,
"loss": 0.0419921875,
"memory(GiB)": 43.05,
"step": 1405,
"train_speed(iter/s)": 0.774347
},
{
"epoch": 0.9138042773817239,
"grad_norm": 9.3125,
"learning_rate": 9.953328165305568e-05,
"loss": 0.0462890625,
"memory(GiB)": 43.05,
"step": 1410,
"train_speed(iter/s)": 0.774895
},
{
"epoch": 0.9170447180816591,
"grad_norm": 13.5625,
"learning_rate": 9.95259492800147e-05,
"loss": 0.0556640625,
"memory(GiB)": 43.05,
"step": 1415,
"train_speed(iter/s)": 0.774985
},
{
"epoch": 0.9202851587815943,
"grad_norm": 15.6875,
"learning_rate": 9.951856003175414e-05,
"loss": 0.0466796875,
"memory(GiB)": 43.05,
"step": 1420,
"train_speed(iter/s)": 0.77452
},
{
"epoch": 0.9235255994815295,
"grad_norm": 3.265625,
"learning_rate": 9.951111391675976e-05,
"loss": 0.0322265625,
"memory(GiB)": 43.05,
"step": 1425,
"train_speed(iter/s)": 0.774202
},
{
"epoch": 0.9267660401814647,
"grad_norm": 2.8125,
"learning_rate": 9.950361094358263e-05,
"loss": 0.0607421875,
"memory(GiB)": 43.05,
"step": 1430,
"train_speed(iter/s)": 0.774296
},
{
"epoch": 0.9300064808813999,
"grad_norm": 0.69921875,
"learning_rate": 9.949605112083909e-05,
"loss": 0.03828125,
"memory(GiB)": 43.05,
"step": 1435,
"train_speed(iter/s)": 0.775216
},
{
"epoch": 0.933246921581335,
"grad_norm": 9.1875,
"learning_rate": 9.948843445721079e-05,
"loss": 0.04453125,
"memory(GiB)": 43.05,
"step": 1440,
"train_speed(iter/s)": 0.774922
},
{
"epoch": 0.9364873622812703,
"grad_norm": 3.765625,
"learning_rate": 9.948076096144463e-05,
"loss": 0.0392578125,
"memory(GiB)": 43.05,
"step": 1445,
"train_speed(iter/s)": 0.774879
},
{
"epoch": 0.9397278029812054,
"grad_norm": 12.4375,
"learning_rate": 9.947303064235283e-05,
"loss": 0.041015625,
"memory(GiB)": 43.05,
"step": 1450,
"train_speed(iter/s)": 0.775133
},
{
"epoch": 0.9429682436811406,
"grad_norm": 12.875,
"learning_rate": 9.946524350881282e-05,
"loss": 0.046484375,
"memory(GiB)": 43.05,
"step": 1455,
"train_speed(iter/s)": 0.774842
},
{
"epoch": 0.9462086843810759,
"grad_norm": 6.65625,
"learning_rate": 9.945739956976725e-05,
"loss": 0.0421875,
"memory(GiB)": 43.05,
"step": 1460,
"train_speed(iter/s)": 0.774929
},
{
"epoch": 0.949449125081011,
"grad_norm": 0.4375,
"learning_rate": 9.944949883422408e-05,
"loss": 0.0283203125,
"memory(GiB)": 43.05,
"step": 1465,
"train_speed(iter/s)": 0.775231
},
{
"epoch": 0.9526895657809462,
"grad_norm": 4.1875,
"learning_rate": 9.944154131125642e-05,
"loss": 0.050390625,
"memory(GiB)": 43.05,
"step": 1470,
"train_speed(iter/s)": 0.775942
},
{
"epoch": 0.9559300064808814,
"grad_norm": 1.0234375,
"learning_rate": 9.943352701000266e-05,
"loss": 0.0587890625,
"memory(GiB)": 43.05,
"step": 1475,
"train_speed(iter/s)": 0.776016
},
{
"epoch": 0.9591704471808166,
"grad_norm": 1.1015625,
"learning_rate": 9.942545593966636e-05,
"loss": 0.0517578125,
"memory(GiB)": 43.05,
"step": 1480,
"train_speed(iter/s)": 0.776078
},
{
"epoch": 0.9624108878807518,
"grad_norm": 14.25,
"learning_rate": 9.941732810951626e-05,
"loss": 0.0322265625,
"memory(GiB)": 43.05,
"step": 1485,
"train_speed(iter/s)": 0.775252
},
{
"epoch": 0.9656513285806869,
"grad_norm": 2.8125,
"learning_rate": 9.940914352888628e-05,
"loss": 0.073828125,
"memory(GiB)": 43.05,
"step": 1490,
"train_speed(iter/s)": 0.774903
},
{
"epoch": 0.9688917692806222,
"grad_norm": 13.5,
"learning_rate": 9.940090220717556e-05,
"loss": 0.0498046875,
"memory(GiB)": 43.05,
"step": 1495,
"train_speed(iter/s)": 0.775228
},
{
"epoch": 0.9721322099805574,
"grad_norm": 1.5625,
"learning_rate": 9.939260415384837e-05,
"loss": 0.040234375,
"memory(GiB)": 43.05,
"step": 1500,
"train_speed(iter/s)": 0.773932
},
{
"epoch": 0.9753726506804925,
"grad_norm": 1.25,
"learning_rate": 9.93842493784341e-05,
"loss": 0.0744140625,
"memory(GiB)": 43.05,
"step": 1505,
"train_speed(iter/s)": 0.77458
},
{
"epoch": 0.9786130913804277,
"grad_norm": 14.4375,
"learning_rate": 9.937583789052735e-05,
"loss": 0.04296875,
"memory(GiB)": 43.05,
"step": 1510,
"train_speed(iter/s)": 0.774348
},
{
"epoch": 0.981853532080363,
"grad_norm": 10.6875,
"learning_rate": 9.936736969978778e-05,
"loss": 0.0515625,
"memory(GiB)": 43.05,
"step": 1515,
"train_speed(iter/s)": 0.774064
},
{
"epoch": 0.9850939727802981,
"grad_norm": 9.0,
"learning_rate": 9.93588448159402e-05,
"loss": 0.0326171875,
"memory(GiB)": 43.05,
"step": 1520,
"train_speed(iter/s)": 0.774565
},
{
"epoch": 0.9883344134802333,
"grad_norm": 6.71875,
"learning_rate": 9.935026324877455e-05,
"loss": 0.030078125,
"memory(GiB)": 43.05,
"step": 1525,
"train_speed(iter/s)": 0.774861
},
{
"epoch": 0.9915748541801686,
"grad_norm": 1.265625,
"learning_rate": 9.93416250081458e-05,
"loss": 0.055859375,
"memory(GiB)": 43.05,
"step": 1530,
"train_speed(iter/s)": 0.774348
},
{
"epoch": 0.9948152948801037,
"grad_norm": 3.96875,
"learning_rate": 9.933293010397403e-05,
"loss": 0.03984375,
"memory(GiB)": 43.05,
"step": 1535,
"train_speed(iter/s)": 0.773239
},
{
"epoch": 0.9980557355800389,
"grad_norm": 8.875,
"learning_rate": 9.932417854624444e-05,
"loss": 0.04375,
"memory(GiB)": 43.05,
"step": 1540,
"train_speed(iter/s)": 0.77374
},
{
"epoch": 1.0012961762799741,
"grad_norm": 2.375,
"learning_rate": 9.931537034500723e-05,
"loss": 0.0490234375,
"memory(GiB)": 43.05,
"step": 1545,
"train_speed(iter/s)": 0.774118
},
{
"epoch": 1.0045366169799093,
"grad_norm": 3.984375,
"learning_rate": 9.930650551037769e-05,
"loss": 0.0484375,
"memory(GiB)": 43.05,
"step": 1550,
"train_speed(iter/s)": 0.77415
},
{
"epoch": 1.0077770576798444,
"grad_norm": 9.875,
"learning_rate": 9.929758405253608e-05,
"loss": 0.0498046875,
"memory(GiB)": 43.05,
"step": 1555,
"train_speed(iter/s)": 0.774997
},
{
"epoch": 1.0110174983797797,
"grad_norm": 0.63671875,
"learning_rate": 9.928860598172778e-05,
"loss": 0.03359375,
"memory(GiB)": 43.05,
"step": 1560,
"train_speed(iter/s)": 0.775007
},
{
"epoch": 1.0142579390797148,
"grad_norm": 9.8125,
"learning_rate": 9.927957130826313e-05,
"loss": 0.025,
"memory(GiB)": 43.05,
"step": 1565,
"train_speed(iter/s)": 0.775272
},
{
"epoch": 1.01749837977965,
"grad_norm": 3.625,
"learning_rate": 9.927048004251747e-05,
"loss": 0.048828125,
"memory(GiB)": 43.05,
"step": 1570,
"train_speed(iter/s)": 0.774385
},
{
"epoch": 1.0207388204795853,
"grad_norm": 8.4375,
"learning_rate": 9.926133219493115e-05,
"loss": 0.0291015625,
"memory(GiB)": 43.05,
"step": 1575,
"train_speed(iter/s)": 0.774424
},
{
"epoch": 1.0239792611795204,
"grad_norm": 0.78125,
"learning_rate": 9.925212777600946e-05,
"loss": 0.03671875,
"memory(GiB)": 43.05,
"step": 1580,
"train_speed(iter/s)": 0.774282
},
{
"epoch": 1.0272197018794555,
"grad_norm": 13.375,
"learning_rate": 9.92428667963227e-05,
"loss": 0.0404296875,
"memory(GiB)": 43.05,
"step": 1585,
"train_speed(iter/s)": 0.773988
},
{
"epoch": 1.030460142579391,
"grad_norm": 0.76171875,
"learning_rate": 9.923354926650614e-05,
"loss": 0.040234375,
"memory(GiB)": 43.05,
"step": 1590,
"train_speed(iter/s)": 0.773749
},
{
"epoch": 1.033700583279326,
"grad_norm": 18.375,
"learning_rate": 9.922417519725992e-05,
"loss": 0.091015625,
"memory(GiB)": 43.05,
"step": 1595,
"train_speed(iter/s)": 0.773458
},
{
"epoch": 1.0369410239792611,
"grad_norm": 10.75,
"learning_rate": 9.921474459934917e-05,
"loss": 0.0474609375,
"memory(GiB)": 43.05,
"step": 1600,
"train_speed(iter/s)": 0.772849
},
{
"epoch": 1.0401814646791965,
"grad_norm": 12.375,
"learning_rate": 9.920525748360389e-05,
"loss": 0.0490234375,
"memory(GiB)": 43.05,
"step": 1605,
"train_speed(iter/s)": 0.772784
},
{
"epoch": 1.0434219053791316,
"grad_norm": 9.8125,
"learning_rate": 9.919571386091904e-05,
"loss": 0.031640625,
"memory(GiB)": 43.05,
"step": 1610,
"train_speed(iter/s)": 0.772857
},
{
"epoch": 1.0466623460790667,
"grad_norm": 7.875,
"learning_rate": 9.918611374225442e-05,
"loss": 0.0560546875,
"memory(GiB)": 43.05,
"step": 1615,
"train_speed(iter/s)": 0.772536
},
{
"epoch": 1.0499027867790018,
"grad_norm": 3.859375,
"learning_rate": 9.917645713863475e-05,
"loss": 0.0421875,
"memory(GiB)": 43.05,
"step": 1620,
"train_speed(iter/s)": 0.771715
},
{
"epoch": 1.0531432274789372,
"grad_norm": 10.1875,
"learning_rate": 9.916674406114959e-05,
"loss": 0.0455078125,
"memory(GiB)": 43.05,
"step": 1625,
"train_speed(iter/s)": 0.771869
},
{
"epoch": 1.0563836681788723,
"grad_norm": 10.5625,
"learning_rate": 9.915697452095337e-05,
"loss": 0.037109375,
"memory(GiB)": 43.05,
"step": 1630,
"train_speed(iter/s)": 0.772117
},
{
"epoch": 1.0596241088788074,
"grad_norm": 16.5,
"learning_rate": 9.914714852926535e-05,
"loss": 0.0384765625,
"memory(GiB)": 43.05,
"step": 1635,
"train_speed(iter/s)": 0.771676
},
{
"epoch": 1.0628645495787428,
"grad_norm": 3.859375,
"learning_rate": 9.913726609736961e-05,
"loss": 0.039453125,
"memory(GiB)": 43.05,
"step": 1640,
"train_speed(iter/s)": 0.771235
},
{
"epoch": 1.0661049902786779,
"grad_norm": 3.296875,
"learning_rate": 9.912732723661511e-05,
"loss": 0.0455078125,
"memory(GiB)": 43.05,
"step": 1645,
"train_speed(iter/s)": 0.770824
},
{
"epoch": 1.069345430978613,
"grad_norm": 2.046875,
"learning_rate": 9.911733195841549e-05,
"loss": 0.03515625,
"memory(GiB)": 43.05,
"step": 1650,
"train_speed(iter/s)": 0.769905
},
{
"epoch": 1.0725858716785484,
"grad_norm": 11.625,
"learning_rate": 9.91072802742493e-05,
"loss": 0.0427734375,
"memory(GiB)": 43.05,
"step": 1655,
"train_speed(iter/s)": 0.769874
},
{
"epoch": 1.0758263123784835,
"grad_norm": 12.8125,
"learning_rate": 9.90971721956598e-05,
"loss": 0.0451171875,
"memory(GiB)": 43.05,
"step": 1660,
"train_speed(iter/s)": 0.770173
},
{
"epoch": 1.0790667530784186,
"grad_norm": 2.71875,
"learning_rate": 9.908700773425503e-05,
"loss": 0.04921875,
"memory(GiB)": 43.05,
"step": 1665,
"train_speed(iter/s)": 0.77044
},
{
"epoch": 1.082307193778354,
"grad_norm": 0.93359375,
"learning_rate": 9.907678690170779e-05,
"loss": 0.041015625,
"memory(GiB)": 43.05,
"step": 1670,
"train_speed(iter/s)": 0.770527
},
{
"epoch": 1.085547634478289,
"grad_norm": 13.8125,
"learning_rate": 9.906650970975558e-05,
"loss": 0.054296875,
"memory(GiB)": 43.05,
"step": 1675,
"train_speed(iter/s)": 0.770283
},
{
"epoch": 1.0887880751782242,
"grad_norm": 3.59375,
"learning_rate": 9.905617617020068e-05,
"loss": 0.058984375,
"memory(GiB)": 43.05,
"step": 1680,
"train_speed(iter/s)": 0.770746
},
{
"epoch": 1.0920285158781595,
"grad_norm": 10.3125,
"learning_rate": 9.904578629491003e-05,
"loss": 0.0447265625,
"memory(GiB)": 43.05,
"step": 1685,
"train_speed(iter/s)": 0.771044
},
{
"epoch": 1.0952689565780946,
"grad_norm": 8.6875,
"learning_rate": 9.903534009581528e-05,
"loss": 0.062109375,
"memory(GiB)": 43.05,
"step": 1690,
"train_speed(iter/s)": 0.771311
},
{
"epoch": 1.0985093972780298,
"grad_norm": 1.7421875,
"learning_rate": 9.902483758491277e-05,
"loss": 0.053125,
"memory(GiB)": 43.05,
"step": 1695,
"train_speed(iter/s)": 0.770537
},
{
"epoch": 1.101749837977965,
"grad_norm": 2.484375,
"learning_rate": 9.90142787742635e-05,
"loss": 0.0431640625,
"memory(GiB)": 43.05,
"step": 1700,
"train_speed(iter/s)": 0.771309
},
{
"epoch": 1.1049902786779002,
"grad_norm": 3.109375,
"learning_rate": 9.900366367599314e-05,
"loss": 0.0470703125,
"memory(GiB)": 43.05,
"step": 1705,
"train_speed(iter/s)": 0.771763
},
{
"epoch": 1.1082307193778353,
"grad_norm": 11.75,
"learning_rate": 9.899299230229197e-05,
"loss": 0.059765625,
"memory(GiB)": 43.05,
"step": 1710,
"train_speed(iter/s)": 0.771543
},
{
"epoch": 1.1114711600777705,
"grad_norm": 11.5625,
"learning_rate": 9.898226466541493e-05,
"loss": 0.036328125,
"memory(GiB)": 43.05,
"step": 1715,
"train_speed(iter/s)": 0.771128
},
{
"epoch": 1.1147116007777058,
"grad_norm": 12.375,
"learning_rate": 9.897148077768155e-05,
"loss": 0.068359375,
"memory(GiB)": 43.05,
"step": 1720,
"train_speed(iter/s)": 0.771249
},
{
"epoch": 1.117952041477641,
"grad_norm": 1.125,
"learning_rate": 9.896064065147595e-05,
"loss": 0.0314453125,
"memory(GiB)": 43.05,
"step": 1725,
"train_speed(iter/s)": 0.771506
},
{
"epoch": 1.121192482177576,
"grad_norm": 3.21875,
"learning_rate": 9.894974429924686e-05,
"loss": 0.0455078125,
"memory(GiB)": 43.05,
"step": 1730,
"train_speed(iter/s)": 0.771759
},
{
"epoch": 1.1244329228775114,
"grad_norm": 9.1875,
"learning_rate": 9.893879173350757e-05,
"loss": 0.052734375,
"memory(GiB)": 43.05,
"step": 1735,
"train_speed(iter/s)": 0.771993
},
{
"epoch": 1.1276733635774465,
"grad_norm": 2.421875,
"learning_rate": 9.892778296683591e-05,
"loss": 0.0341796875,
"memory(GiB)": 43.05,
"step": 1740,
"train_speed(iter/s)": 0.772457
},
{
"epoch": 1.1309138042773816,
"grad_norm": 14.3125,
"learning_rate": 9.891671801187428e-05,
"loss": 0.058984375,
"memory(GiB)": 43.05,
"step": 1745,
"train_speed(iter/s)": 0.772356
},
{
"epoch": 1.134154244977317,
"grad_norm": 12.9375,
"learning_rate": 9.890559688132956e-05,
"loss": 0.039453125,
"memory(GiB)": 43.05,
"step": 1750,
"train_speed(iter/s)": 0.773111
},
{
"epoch": 1.137394685677252,
"grad_norm": 14.25,
"learning_rate": 9.88944195879732e-05,
"loss": 0.0640625,
"memory(GiB)": 43.05,
"step": 1755,
"train_speed(iter/s)": 0.772017
},
{
"epoch": 1.1406351263771872,
"grad_norm": 13.25,
"learning_rate": 9.888318614464113e-05,
"loss": 0.045703125,
"memory(GiB)": 43.05,
"step": 1760,
"train_speed(iter/s)": 0.771971
},
{
"epoch": 1.1438755670771226,
"grad_norm": 8.5625,
"learning_rate": 9.88718965642337e-05,
"loss": 0.0365234375,
"memory(GiB)": 43.05,
"step": 1765,
"train_speed(iter/s)": 0.771885
},
{
"epoch": 1.1471160077770577,
"grad_norm": 9.125,
"learning_rate": 9.886055085971583e-05,
"loss": 0.040234375,
"memory(GiB)": 43.05,
"step": 1770,
"train_speed(iter/s)": 0.772214
},
{
"epoch": 1.1503564484769928,
"grad_norm": 11.75,
"learning_rate": 9.88491490441168e-05,
"loss": 0.0470703125,
"memory(GiB)": 43.05,
"step": 1775,
"train_speed(iter/s)": 0.772198
},
{
"epoch": 1.1535968891769282,
"grad_norm": 8.6875,
"learning_rate": 9.883769113053039e-05,
"loss": 0.0298828125,
"memory(GiB)": 43.05,
"step": 1780,
"train_speed(iter/s)": 0.771899
},
{
"epoch": 1.1568373298768633,
"grad_norm": 0.66796875,
"learning_rate": 9.882617713211477e-05,
"loss": 0.0267578125,
"memory(GiB)": 43.05,
"step": 1785,
"train_speed(iter/s)": 0.771741
},
{
"epoch": 1.1600777705767984,
"grad_norm": 13.25,
"learning_rate": 9.881460706209254e-05,
"loss": 0.041015625,
"memory(GiB)": 43.05,
"step": 1790,
"train_speed(iter/s)": 0.771772
},
{
"epoch": 1.1633182112767337,
"grad_norm": 1.265625,
"learning_rate": 9.880298093375064e-05,
"loss": 0.035546875,
"memory(GiB)": 43.05,
"step": 1795,
"train_speed(iter/s)": 0.771523
},
{
"epoch": 1.1665586519766689,
"grad_norm": 9.5,
"learning_rate": 9.879129876044048e-05,
"loss": 0.035546875,
"memory(GiB)": 43.05,
"step": 1800,
"train_speed(iter/s)": 0.771301
},
{
"epoch": 1.169799092676604,
"grad_norm": 2.0,
"learning_rate": 9.877956055557776e-05,
"loss": 0.0326171875,
"memory(GiB)": 43.05,
"step": 1805,
"train_speed(iter/s)": 0.771532
},
{
"epoch": 1.173039533376539,
"grad_norm": 8.0625,
"learning_rate": 9.876776633264254e-05,
"loss": 0.0322265625,
"memory(GiB)": 43.05,
"step": 1810,
"train_speed(iter/s)": 0.771949
},
{
"epoch": 1.1762799740764744,
"grad_norm": 1.734375,
"learning_rate": 9.87559161051792e-05,
"loss": 0.0552734375,
"memory(GiB)": 43.05,
"step": 1815,
"train_speed(iter/s)": 0.772095
},
{
"epoch": 1.1795204147764096,
"grad_norm": 9.5625,
"learning_rate": 9.874400988679646e-05,
"loss": 0.040234375,
"memory(GiB)": 43.05,
"step": 1820,
"train_speed(iter/s)": 0.772592
},
{
"epoch": 1.1827608554763447,
"grad_norm": 6.96875,
"learning_rate": 9.873204769116736e-05,
"loss": 0.0671875,
"memory(GiB)": 43.05,
"step": 1825,
"train_speed(iter/s)": 0.772865
},
{
"epoch": 1.18600129617628,
"grad_norm": 2.359375,
"learning_rate": 9.872002953202914e-05,
"loss": 0.03125,
"memory(GiB)": 43.05,
"step": 1830,
"train_speed(iter/s)": 0.771885
},
{
"epoch": 1.1892417368762151,
"grad_norm": 18.125,
"learning_rate": 9.87079554231834e-05,
"loss": 0.05791015625,
"memory(GiB)": 43.05,
"step": 1835,
"train_speed(iter/s)": 0.772038
},
{
"epoch": 1.1924821775761503,
"grad_norm": 10.5,
"learning_rate": 9.869582537849593e-05,
"loss": 0.05693359375,
"memory(GiB)": 43.05,
"step": 1840,
"train_speed(iter/s)": 0.771967
},
{
"epoch": 1.1957226182760856,
"grad_norm": 11.375,
"learning_rate": 9.86836394118968e-05,
"loss": 0.0322265625,
"memory(GiB)": 43.05,
"step": 1845,
"train_speed(iter/s)": 0.772659
},
{
"epoch": 1.1989630589760207,
"grad_norm": 3.359375,
"learning_rate": 9.867139753738028e-05,
"loss": 0.022265625,
"memory(GiB)": 43.05,
"step": 1850,
"train_speed(iter/s)": 0.771954
},
{
"epoch": 1.2022034996759559,
"grad_norm": 1.15625,
"learning_rate": 9.865909976900484e-05,
"loss": 0.05078125,
"memory(GiB)": 43.05,
"step": 1855,
"train_speed(iter/s)": 0.772176
},
{
"epoch": 1.2054439403758912,
"grad_norm": 16.5,
"learning_rate": 9.864674612089313e-05,
"loss": 0.0376953125,
"memory(GiB)": 43.05,
"step": 1860,
"train_speed(iter/s)": 0.77217
},
{
"epoch": 1.2086843810758263,
"grad_norm": 10.5625,
"learning_rate": 9.8634336607232e-05,
"loss": 0.045703125,
"memory(GiB)": 43.05,
"step": 1865,
"train_speed(iter/s)": 0.772255
},
{
"epoch": 1.2119248217757614,
"grad_norm": 7.34375,
"learning_rate": 9.862187124227245e-05,
"loss": 0.02890625,
"memory(GiB)": 43.05,
"step": 1870,
"train_speed(iter/s)": 0.772989
},
{
"epoch": 1.2151652624756968,
"grad_norm": 3.390625,
"learning_rate": 9.860935004032957e-05,
"loss": 0.0259765625,
"memory(GiB)": 43.05,
"step": 1875,
"train_speed(iter/s)": 0.773017
},
{
"epoch": 1.218405703175632,
"grad_norm": 9.5,
"learning_rate": 9.859677301578265e-05,
"loss": 0.04921875,
"memory(GiB)": 43.05,
"step": 1880,
"train_speed(iter/s)": 0.77328
},
{
"epoch": 1.221646143875567,
"grad_norm": 5.71875,
"learning_rate": 9.858414018307503e-05,
"loss": 0.0431640625,
"memory(GiB)": 43.05,
"step": 1885,
"train_speed(iter/s)": 0.773031
},
{
"epoch": 1.2248865845755024,
"grad_norm": 12.1875,
"learning_rate": 9.857145155671417e-05,
"loss": 0.0412109375,
"memory(GiB)": 43.05,
"step": 1890,
"train_speed(iter/s)": 0.773256
},
{
"epoch": 1.2281270252754375,
"grad_norm": 1.265625,
"learning_rate": 9.855870715127158e-05,
"loss": 0.04462890625,
"memory(GiB)": 43.05,
"step": 1895,
"train_speed(iter/s)": 0.773677
},
{
"epoch": 1.2313674659753726,
"grad_norm": 3.25,
"learning_rate": 9.854590698138283e-05,
"loss": 0.0408203125,
"memory(GiB)": 43.05,
"step": 1900,
"train_speed(iter/s)": 0.772772
},
{
"epoch": 1.2346079066753077,
"grad_norm": 1.7578125,
"learning_rate": 9.853305106174756e-05,
"loss": 0.049609375,
"memory(GiB)": 43.05,
"step": 1905,
"train_speed(iter/s)": 0.772561
},
{
"epoch": 1.237848347375243,
"grad_norm": 1.609375,
"learning_rate": 9.852013940712938e-05,
"loss": 0.032421875,
"memory(GiB)": 43.05,
"step": 1910,
"train_speed(iter/s)": 0.772825
},
{
"epoch": 1.2410887880751782,
"grad_norm": 1.9765625,
"learning_rate": 9.850717203235598e-05,
"loss": 0.030078125,
"memory(GiB)": 43.05,
"step": 1915,
"train_speed(iter/s)": 0.773516
},
{
"epoch": 1.2443292287751135,
"grad_norm": 11.375,
"learning_rate": 9.849414895231895e-05,
"loss": 0.0642578125,
"memory(GiB)": 43.05,
"step": 1920,
"train_speed(iter/s)": 0.773144
},
{
"epoch": 1.2475696694750487,
"grad_norm": 2.65625,
"learning_rate": 9.848107018197393e-05,
"loss": 0.081640625,
"memory(GiB)": 43.05,
"step": 1925,
"train_speed(iter/s)": 0.773611
},
{
"epoch": 1.2508101101749838,
"grad_norm": 5.15625,
"learning_rate": 9.846793573634043e-05,
"loss": 0.0564453125,
"memory(GiB)": 43.05,
"step": 1930,
"train_speed(iter/s)": 0.773066
},
{
"epoch": 1.254050550874919,
"grad_norm": 0.69921875,
"learning_rate": 9.845474563050199e-05,
"loss": 0.0720703125,
"memory(GiB)": 43.05,
"step": 1935,
"train_speed(iter/s)": 0.773323
},
{
"epoch": 1.2572909915748542,
"grad_norm": 9.3125,
"learning_rate": 9.8441499879606e-05,
"loss": 0.057421875,
"memory(GiB)": 43.05,
"step": 1940,
"train_speed(iter/s)": 0.773681
},
{
"epoch": 1.2605314322747894,
"grad_norm": 2.84375,
"learning_rate": 9.842819849886382e-05,
"loss": 0.0447265625,
"memory(GiB)": 43.05,
"step": 1945,
"train_speed(iter/s)": 0.772924
},
{
"epoch": 1.2637718729747245,
"grad_norm": 6.5625,
"learning_rate": 9.841484150355061e-05,
"loss": 0.0296875,
"memory(GiB)": 43.05,
"step": 1950,
"train_speed(iter/s)": 0.772429
},
{
"epoch": 1.2670123136746598,
"grad_norm": 17.5,
"learning_rate": 9.840142890900546e-05,
"loss": 0.03671875,
"memory(GiB)": 43.05,
"step": 1955,
"train_speed(iter/s)": 0.771885
},
{
"epoch": 1.270252754374595,
"grad_norm": 13.4375,
"learning_rate": 9.838796073063127e-05,
"loss": 0.0357421875,
"memory(GiB)": 43.05,
"step": 1960,
"train_speed(iter/s)": 0.771377
},
{
"epoch": 1.27349319507453,
"grad_norm": 1.3828125,
"learning_rate": 9.837443698389482e-05,
"loss": 0.0326171875,
"memory(GiB)": 43.05,
"step": 1965,
"train_speed(iter/s)": 0.77133
},
{
"epoch": 1.2767336357744652,
"grad_norm": 7.40625,
"learning_rate": 9.836085768432665e-05,
"loss": 0.057421875,
"memory(GiB)": 43.05,
"step": 1970,
"train_speed(iter/s)": 0.771328
},
{
"epoch": 1.2799740764744005,
"grad_norm": 4.5,
"learning_rate": 9.834722284752116e-05,
"loss": 0.0560546875,
"memory(GiB)": 43.05,
"step": 1975,
"train_speed(iter/s)": 0.771233
},
{
"epoch": 1.2832145171743357,
"grad_norm": 11.625,
"learning_rate": 9.833353248913647e-05,
"loss": 0.034765625,
"memory(GiB)": 43.05,
"step": 1980,
"train_speed(iter/s)": 0.771228
},
{
"epoch": 1.286454957874271,
"grad_norm": 2.921875,
"learning_rate": 9.831978662489447e-05,
"loss": 0.0404296875,
"memory(GiB)": 43.05,
"step": 1985,
"train_speed(iter/s)": 0.771556
},
{
"epoch": 1.2896953985742061,
"grad_norm": 5.34375,
"learning_rate": 9.830598527058082e-05,
"loss": 0.058203125,
"memory(GiB)": 43.05,
"step": 1990,
"train_speed(iter/s)": 0.770866
},
{
"epoch": 1.2929358392741412,
"grad_norm": 5.8125,
"learning_rate": 9.82921284420449e-05,
"loss": 0.04091796875,
"memory(GiB)": 43.05,
"step": 1995,
"train_speed(iter/s)": 0.770663
},
{
"epoch": 1.2961762799740764,
"grad_norm": 11.0625,
"learning_rate": 9.827821615519976e-05,
"loss": 0.0369140625,
"memory(GiB)": 43.05,
"step": 2000,
"train_speed(iter/s)": 0.771025
},
{
"epoch": 1.2994167206740117,
"grad_norm": 8.625,
"learning_rate": 9.826424842602218e-05,
"loss": 0.051171875,
"memory(GiB)": 43.05,
"step": 2005,
"train_speed(iter/s)": 0.611913
},
{
"epoch": 1.3026571613739468,
"grad_norm": 12.875,
"learning_rate": 9.825022527055258e-05,
"loss": 0.058203125,
"memory(GiB)": 43.05,
"step": 2010,
"train_speed(iter/s)": 0.611729
},
{
"epoch": 1.3058976020738822,
"grad_norm": 0.89453125,
"learning_rate": 9.823614670489507e-05,
"loss": 0.0326171875,
"memory(GiB)": 43.05,
"step": 2015,
"train_speed(iter/s)": 0.612226
},
{
"epoch": 1.3091380427738173,
"grad_norm": 0.63671875,
"learning_rate": 9.822201274521734e-05,
"loss": 0.0423828125,
"memory(GiB)": 43.05,
"step": 2020,
"train_speed(iter/s)": 0.612871
},
{
"epoch": 1.3123784834737524,
"grad_norm": 16.125,
"learning_rate": 9.820782340775072e-05,
"loss": 0.0529296875,
"memory(GiB)": 43.05,
"step": 2025,
"train_speed(iter/s)": 0.612987
},
{
"epoch": 1.3156189241736875,
"grad_norm": 1.96875,
"learning_rate": 9.819357870879016e-05,
"loss": 0.0486328125,
"memory(GiB)": 43.05,
"step": 2030,
"train_speed(iter/s)": 0.613366
},
{
"epoch": 1.3188593648736229,
"grad_norm": 2.140625,
"learning_rate": 9.817927866469414e-05,
"loss": 0.0599609375,
"memory(GiB)": 43.05,
"step": 2035,
"train_speed(iter/s)": 0.613579
},
{
"epoch": 1.322099805573558,
"grad_norm": 1.015625,
"learning_rate": 9.816492329188474e-05,
"loss": 0.06015625,
"memory(GiB)": 43.05,
"step": 2040,
"train_speed(iter/s)": 0.613311
},
{
"epoch": 1.3253402462734931,
"grad_norm": 1.40625,
"learning_rate": 9.815051260684753e-05,
"loss": 0.05029296875,
"memory(GiB)": 43.05,
"step": 2045,
"train_speed(iter/s)": 0.613476
},
{
"epoch": 1.3285806869734285,
"grad_norm": 14.625,
"learning_rate": 9.813604662613168e-05,
"loss": 0.03515625,
"memory(GiB)": 43.05,
"step": 2050,
"train_speed(iter/s)": 0.613536
},
{
"epoch": 1.3318211276733636,
"grad_norm": 13.8125,
"learning_rate": 9.812152536634975e-05,
"loss": 0.0609375,
"memory(GiB)": 43.05,
"step": 2055,
"train_speed(iter/s)": 0.613809
},
{
"epoch": 1.3350615683732987,
"grad_norm": 11.3125,
"learning_rate": 9.810694884417788e-05,
"loss": 0.02646484375,
"memory(GiB)": 43.05,
"step": 2060,
"train_speed(iter/s)": 0.614351
},
{
"epoch": 1.3383020090732338,
"grad_norm": 1.71875,
"learning_rate": 9.809231707635565e-05,
"loss": 0.0388671875,
"memory(GiB)": 43.05,
"step": 2065,
"train_speed(iter/s)": 0.614905
},
{
"epoch": 1.3415424497731692,
"grad_norm": 14.875,
"learning_rate": 9.807763007968602e-05,
"loss": 0.0361328125,
"memory(GiB)": 43.05,
"step": 2070,
"train_speed(iter/s)": 0.615482
},
{
"epoch": 1.3447828904731043,
"grad_norm": 1.046875,
"learning_rate": 9.806288787103548e-05,
"loss": 0.0431640625,
"memory(GiB)": 43.05,
"step": 2075,
"train_speed(iter/s)": 0.61574
},
{
"epoch": 1.3480233311730396,
"grad_norm": 12.0,
"learning_rate": 9.804809046733383e-05,
"loss": 0.0330078125,
"memory(GiB)": 43.05,
"step": 2080,
"train_speed(iter/s)": 0.61596
},
{
"epoch": 1.3512637718729748,
"grad_norm": 1.6328125,
"learning_rate": 9.80332378855743e-05,
"loss": 0.0400390625,
"memory(GiB)": 43.05,
"step": 2085,
"train_speed(iter/s)": 0.61648
},
{
"epoch": 1.3545042125729099,
"grad_norm": 14.9375,
"learning_rate": 9.80183301428135e-05,
"loss": 0.053125,
"memory(GiB)": 43.05,
"step": 2090,
"train_speed(iter/s)": 0.616824
},
{
"epoch": 1.357744653272845,
"grad_norm": 3.453125,
"learning_rate": 9.800336725617135e-05,
"loss": 0.039453125,
"memory(GiB)": 43.05,
"step": 2095,
"train_speed(iter/s)": 0.617314
},
{
"epoch": 1.3609850939727803,
"grad_norm": 1.8046875,
"learning_rate": 9.798834924283112e-05,
"loss": 0.0375,
"memory(GiB)": 43.05,
"step": 2100,
"train_speed(iter/s)": 0.617762
},
{
"epoch": 1.3642255346727155,
"grad_norm": 0.73046875,
"learning_rate": 9.797327612003938e-05,
"loss": 0.0322265625,
"memory(GiB)": 43.05,
"step": 2105,
"train_speed(iter/s)": 0.617954
},
{
"epoch": 1.3674659753726508,
"grad_norm": 2.671875,
"learning_rate": 9.7958147905106e-05,
"loss": 0.037890625,
"memory(GiB)": 43.05,
"step": 2110,
"train_speed(iter/s)": 0.618398
},
{
"epoch": 1.370706416072586,
"grad_norm": 5.15625,
"learning_rate": 9.794296461540407e-05,
"loss": 0.0455078125,
"memory(GiB)": 43.05,
"step": 2115,
"train_speed(iter/s)": 0.618813
},
{
"epoch": 1.373946856772521,
"grad_norm": 13.25,
"learning_rate": 9.792772626837001e-05,
"loss": 0.034375,
"memory(GiB)": 43.05,
"step": 2120,
"train_speed(iter/s)": 0.618524
},
{
"epoch": 1.3771872974724562,
"grad_norm": 10.625,
"learning_rate": 9.791243288150338e-05,
"loss": 0.0265625,
"memory(GiB)": 43.05,
"step": 2125,
"train_speed(iter/s)": 0.619045
},
{
"epoch": 1.3804277381723915,
"grad_norm": 14.1875,
"learning_rate": 9.789708447236702e-05,
"loss": 0.0435546875,
"memory(GiB)": 43.05,
"step": 2130,
"train_speed(iter/s)": 0.619342
},
{
"epoch": 1.3836681788723266,
"grad_norm": 12.9375,
"learning_rate": 9.788168105858691e-05,
"loss": 0.0517578125,
"memory(GiB)": 43.05,
"step": 2135,
"train_speed(iter/s)": 0.61966
},
{
"epoch": 1.3869086195722617,
"grad_norm": 0.86328125,
"learning_rate": 9.786622265785221e-05,
"loss": 0.0447265625,
"memory(GiB)": 43.05,
"step": 2140,
"train_speed(iter/s)": 0.619981
},
{
"epoch": 1.390149060272197,
"grad_norm": 1.8671875,
"learning_rate": 9.785070928791525e-05,
"loss": 0.049609375,
"memory(GiB)": 43.05,
"step": 2145,
"train_speed(iter/s)": 0.620392
},
{
"epoch": 1.3933895009721322,
"grad_norm": 12.5625,
"learning_rate": 9.783514096659141e-05,
"loss": 0.0359375,
"memory(GiB)": 43.05,
"step": 2150,
"train_speed(iter/s)": 0.620642
},
{
"epoch": 1.3966299416720673,
"grad_norm": 2.109375,
"learning_rate": 9.78195177117593e-05,
"loss": 0.035546875,
"memory(GiB)": 43.05,
"step": 2155,
"train_speed(iter/s)": 0.621305
},
{
"epoch": 1.3998703823720027,
"grad_norm": 12.0,
"learning_rate": 9.78038395413605e-05,
"loss": 0.04921875,
"memory(GiB)": 43.05,
"step": 2160,
"train_speed(iter/s)": 0.621819
},
{
"epoch": 1.4031108230719378,
"grad_norm": 10.625,
"learning_rate": 9.778810647339971e-05,
"loss": 0.05625,
"memory(GiB)": 43.05,
"step": 2165,
"train_speed(iter/s)": 0.62221
},
{
"epoch": 1.406351263771873,
"grad_norm": 1.28125,
"learning_rate": 9.777231852594467e-05,
"loss": 0.015234375,
"memory(GiB)": 43.05,
"step": 2170,
"train_speed(iter/s)": 0.622883
},
{
"epoch": 1.4095917044718083,
"grad_norm": 3.15625,
"learning_rate": 9.775647571712614e-05,
"loss": 0.0431640625,
"memory(GiB)": 43.05,
"step": 2175,
"train_speed(iter/s)": 0.62319
},
{
"epoch": 1.4128321451717434,
"grad_norm": 0.8125,
"learning_rate": 9.774057806513788e-05,
"loss": 0.044140625,
"memory(GiB)": 43.05,
"step": 2180,
"train_speed(iter/s)": 0.623107
},
{
"epoch": 1.4160725858716785,
"grad_norm": 15.75,
"learning_rate": 9.772462558823662e-05,
"loss": 0.0615234375,
"memory(GiB)": 43.05,
"step": 2185,
"train_speed(iter/s)": 0.623169
},
{
"epoch": 1.4193130265716136,
"grad_norm": 8.25,
"learning_rate": 9.770861830474208e-05,
"loss": 0.0404296875,
"memory(GiB)": 43.05,
"step": 2190,
"train_speed(iter/s)": 0.623578
},
{
"epoch": 1.422553467271549,
"grad_norm": 3.75,
"learning_rate": 9.769255623303687e-05,
"loss": 0.031640625,
"memory(GiB)": 43.05,
"step": 2195,
"train_speed(iter/s)": 0.62381
},
{
"epoch": 1.425793907971484,
"grad_norm": 1.53125,
"learning_rate": 9.767643939156658e-05,
"loss": 0.0638671875,
"memory(GiB)": 43.05,
"step": 2200,
"train_speed(iter/s)": 0.623993
},
{
"epoch": 1.4290343486714194,
"grad_norm": 14.4375,
"learning_rate": 9.766026779883966e-05,
"loss": 0.0453125,
"memory(GiB)": 43.05,
"step": 2205,
"train_speed(iter/s)": 0.624369
},
{
"epoch": 1.4322747893713546,
"grad_norm": 2.109375,
"learning_rate": 9.764404147342742e-05,
"loss": 0.0376953125,
"memory(GiB)": 43.05,
"step": 2210,
"train_speed(iter/s)": 0.624681
},
{
"epoch": 1.4355152300712897,
"grad_norm": 0.89453125,
"learning_rate": 9.76277604339641e-05,
"loss": 0.03203125,
"memory(GiB)": 43.05,
"step": 2215,
"train_speed(iter/s)": 0.624586
},
{
"epoch": 1.4387556707712248,
"grad_norm": 1.65625,
"learning_rate": 9.761142469914666e-05,
"loss": 0.027734375,
"memory(GiB)": 43.05,
"step": 2220,
"train_speed(iter/s)": 0.624822
},
{
"epoch": 1.4419961114711601,
"grad_norm": 13.125,
"learning_rate": 9.759503428773498e-05,
"loss": 0.0884765625,
"memory(GiB)": 43.05,
"step": 2225,
"train_speed(iter/s)": 0.625064
},
{
"epoch": 1.4452365521710953,
"grad_norm": 9.5625,
"learning_rate": 9.757858921855166e-05,
"loss": 0.060546875,
"memory(GiB)": 43.05,
"step": 2230,
"train_speed(iter/s)": 0.624889
},
{
"epoch": 1.4484769928710304,
"grad_norm": 0.71484375,
"learning_rate": 9.756208951048207e-05,
"loss": 0.0423828125,
"memory(GiB)": 43.05,
"step": 2235,
"train_speed(iter/s)": 0.624644
},
{
"epoch": 1.4517174335709657,
"grad_norm": 4.78125,
"learning_rate": 9.75455351824744e-05,
"loss": 0.0736328125,
"memory(GiB)": 43.05,
"step": 2240,
"train_speed(iter/s)": 0.624896
},
{
"epoch": 1.4549578742709008,
"grad_norm": 13.0625,
"learning_rate": 9.752892625353946e-05,
"loss": 0.0400390625,
"memory(GiB)": 43.05,
"step": 2245,
"train_speed(iter/s)": 0.625284
},
{
"epoch": 1.458198314970836,
"grad_norm": 5.5625,
"learning_rate": 9.751226274275085e-05,
"loss": 0.033984375,
"memory(GiB)": 43.05,
"step": 2250,
"train_speed(iter/s)": 0.625567
},
{
"epoch": 1.4614387556707713,
"grad_norm": 1.9140625,
"learning_rate": 9.749554466924482e-05,
"loss": 0.034375,
"memory(GiB)": 43.05,
"step": 2255,
"train_speed(iter/s)": 0.625805
},
{
"epoch": 1.4646791963707064,
"grad_norm": 11.6875,
"learning_rate": 9.747877205222027e-05,
"loss": 0.0376953125,
"memory(GiB)": 43.05,
"step": 2260,
"train_speed(iter/s)": 0.626097
},
{
"epoch": 1.4679196370706415,
"grad_norm": 14.5625,
"learning_rate": 9.746194491093871e-05,
"loss": 0.0513671875,
"memory(GiB)": 43.05,
"step": 2265,
"train_speed(iter/s)": 0.626494
},
{
"epoch": 1.471160077770577,
"grad_norm": 13.4375,
"learning_rate": 9.744506326472435e-05,
"loss": 0.0513671875,
"memory(GiB)": 43.05,
"step": 2270,
"train_speed(iter/s)": 0.626361
},
{
"epoch": 1.474400518470512,
"grad_norm": 14.6875,
"learning_rate": 9.742812713296394e-05,
"loss": 0.0318359375,
"memory(GiB)": 43.05,
"step": 2275,
"train_speed(iter/s)": 0.62684
},
{
"epoch": 1.4776409591704471,
"grad_norm": 13.625,
"learning_rate": 9.741113653510677e-05,
"loss": 0.037890625,
"memory(GiB)": 43.05,
"step": 2280,
"train_speed(iter/s)": 0.626967
},
{
"epoch": 1.4808813998703823,
"grad_norm": 13.875,
"learning_rate": 9.739409149066472e-05,
"loss": 0.0322265625,
"memory(GiB)": 43.05,
"step": 2285,
"train_speed(iter/s)": 0.626882
},
{
"epoch": 1.4841218405703176,
"grad_norm": 0.95703125,
"learning_rate": 9.73769920192122e-05,
"loss": 0.0490234375,
"memory(GiB)": 43.05,
"step": 2290,
"train_speed(iter/s)": 0.626984
},
{
"epoch": 1.4873622812702527,
"grad_norm": 0.84765625,
"learning_rate": 9.73598381403861e-05,
"loss": 0.0859375,
"memory(GiB)": 43.05,
"step": 2295,
"train_speed(iter/s)": 0.627301
},
{
"epoch": 1.490602721970188,
"grad_norm": 1.25,
"learning_rate": 9.734262987388583e-05,
"loss": 0.05703125,
"memory(GiB)": 43.05,
"step": 2300,
"train_speed(iter/s)": 0.627792
},
{
"epoch": 1.4938431626701232,
"grad_norm": 0.6171875,
"learning_rate": 9.732536723947321e-05,
"loss": 0.0439453125,
"memory(GiB)": 43.05,
"step": 2305,
"train_speed(iter/s)": 0.627848
},
{
"epoch": 1.4970836033700583,
"grad_norm": 14.25,
"learning_rate": 9.73080502569725e-05,
"loss": 0.0638671875,
"memory(GiB)": 43.05,
"step": 2310,
"train_speed(iter/s)": 0.627722
},
{
"epoch": 1.5003240440699934,
"grad_norm": 0.90625,
"learning_rate": 9.729067894627042e-05,
"loss": 0.05546875,
"memory(GiB)": 43.05,
"step": 2315,
"train_speed(iter/s)": 0.627674
},
{
"epoch": 1.5035644847699285,
"grad_norm": 8.3125,
"learning_rate": 9.727325332731604e-05,
"loss": 0.0291015625,
"memory(GiB)": 43.05,
"step": 2320,
"train_speed(iter/s)": 0.627639
},
{
"epoch": 1.5068049254698639,
"grad_norm": 1.078125,
"learning_rate": 9.72557734201208e-05,
"loss": 0.0431640625,
"memory(GiB)": 43.05,
"step": 2325,
"train_speed(iter/s)": 0.62791
},
{
"epoch": 1.5100453661697992,
"grad_norm": 11.5625,
"learning_rate": 9.72382392447585e-05,
"loss": 0.04521484375,
"memory(GiB)": 43.05,
"step": 2330,
"train_speed(iter/s)": 0.628461
},
{
"epoch": 1.5132858068697344,
"grad_norm": 3.140625,
"learning_rate": 9.722065082136525e-05,
"loss": 0.05859375,
"memory(GiB)": 43.05,
"step": 2335,
"train_speed(iter/s)": 0.628836
},
{
"epoch": 1.5165262475696695,
"grad_norm": 2.4375,
"learning_rate": 9.720300817013945e-05,
"loss": 0.0302734375,
"memory(GiB)": 43.05,
"step": 2340,
"train_speed(iter/s)": 0.628933
},
{
"epoch": 1.5197666882696046,
"grad_norm": 0.76953125,
"learning_rate": 9.71853113113418e-05,
"loss": 0.046484375,
"memory(GiB)": 43.05,
"step": 2345,
"train_speed(iter/s)": 0.629481
},
{
"epoch": 1.5230071289695397,
"grad_norm": 5.15625,
"learning_rate": 9.716756026529523e-05,
"loss": 0.0541015625,
"memory(GiB)": 43.05,
"step": 2350,
"train_speed(iter/s)": 0.629513
},
{
"epoch": 1.526247569669475,
"grad_norm": 1.625,
"learning_rate": 9.71497550523849e-05,
"loss": 0.0458984375,
"memory(GiB)": 43.05,
"step": 2355,
"train_speed(iter/s)": 0.629561
},
{
"epoch": 1.5294880103694104,
"grad_norm": 3.59375,
"learning_rate": 9.713189569305818e-05,
"loss": 0.027734375,
"memory(GiB)": 43.05,
"step": 2360,
"train_speed(iter/s)": 0.630132
},
{
"epoch": 1.5327284510693455,
"grad_norm": 0.63671875,
"learning_rate": 9.711398220782464e-05,
"loss": 0.03359375,
"memory(GiB)": 43.05,
"step": 2365,
"train_speed(iter/s)": 0.630615
},
{
"epoch": 1.5359688917692806,
"grad_norm": 7.375,
"learning_rate": 9.709601461725597e-05,
"loss": 0.0453125,
"memory(GiB)": 43.05,
"step": 2370,
"train_speed(iter/s)": 0.631142
},
{
"epoch": 1.5392093324692158,
"grad_norm": 17.375,
"learning_rate": 9.7077992941986e-05,
"loss": 0.050390625,
"memory(GiB)": 43.05,
"step": 2375,
"train_speed(iter/s)": 0.631273
},
{
"epoch": 1.5424497731691509,
"grad_norm": 10.8125,
"learning_rate": 9.705991720271072e-05,
"loss": 0.042578125,
"memory(GiB)": 43.05,
"step": 2380,
"train_speed(iter/s)": 0.631619
},
{
"epoch": 1.5456902138690862,
"grad_norm": 2.828125,
"learning_rate": 9.704178742018816e-05,
"loss": 0.05703125,
"memory(GiB)": 43.05,
"step": 2385,
"train_speed(iter/s)": 0.631783
},
{
"epoch": 1.5489306545690213,
"grad_norm": 7.90625,
"learning_rate": 9.70236036152384e-05,
"loss": 0.052734375,
"memory(GiB)": 43.05,
"step": 2390,
"train_speed(iter/s)": 0.632404
},
{
"epoch": 1.5521710952689567,
"grad_norm": 2.3125,
"learning_rate": 9.70053658087436e-05,
"loss": 0.0517578125,
"memory(GiB)": 43.05,
"step": 2395,
"train_speed(iter/s)": 0.632371
},
{
"epoch": 1.5554115359688918,
"grad_norm": 11.9375,
"learning_rate": 9.69870740216479e-05,
"loss": 0.05,
"memory(GiB)": 43.05,
"step": 2400,
"train_speed(iter/s)": 0.63274
},
{
"epoch": 1.558651976668827,
"grad_norm": 12.6875,
"learning_rate": 9.696872827495747e-05,
"loss": 0.02734375,
"memory(GiB)": 43.05,
"step": 2405,
"train_speed(iter/s)": 0.633094
},
{
"epoch": 1.561892417368762,
"grad_norm": 6.1875,
"learning_rate": 9.695032858974042e-05,
"loss": 0.0451171875,
"memory(GiB)": 43.05,
"step": 2410,
"train_speed(iter/s)": 0.633711
},
{
"epoch": 1.5651328580686974,
"grad_norm": 10.125,
"learning_rate": 9.693187498712679e-05,
"loss": 0.0564453125,
"memory(GiB)": 43.05,
"step": 2415,
"train_speed(iter/s)": 0.634154
},
{
"epoch": 1.5683732987686325,
"grad_norm": 2.6875,
"learning_rate": 9.691336748830857e-05,
"loss": 0.03984375,
"memory(GiB)": 43.05,
"step": 2420,
"train_speed(iter/s)": 0.634504
},
{
"epoch": 1.5716137394685679,
"grad_norm": 2.53125,
"learning_rate": 9.689480611453963e-05,
"loss": 0.030078125,
"memory(GiB)": 43.05,
"step": 2425,
"train_speed(iter/s)": 0.634641
},
{
"epoch": 1.574854180168503,
"grad_norm": 0.828125,
"learning_rate": 9.687619088713571e-05,
"loss": 0.046875,
"memory(GiB)": 43.05,
"step": 2430,
"train_speed(iter/s)": 0.634905
},
{
"epoch": 1.578094620868438,
"grad_norm": 1.65625,
"learning_rate": 9.685752182747439e-05,
"loss": 0.0814453125,
"memory(GiB)": 43.05,
"step": 2435,
"train_speed(iter/s)": 0.635094
},
{
"epoch": 1.5813350615683732,
"grad_norm": 1.21875,
"learning_rate": 9.683879895699506e-05,
"loss": 0.0466796875,
"memory(GiB)": 43.05,
"step": 2440,
"train_speed(iter/s)": 0.635554
},
{
"epoch": 1.5845755022683083,
"grad_norm": 1.953125,
"learning_rate": 9.682002229719894e-05,
"loss": 0.04384765625,
"memory(GiB)": 43.05,
"step": 2445,
"train_speed(iter/s)": 0.635603
},
{
"epoch": 1.5878159429682437,
"grad_norm": 2.71875,
"learning_rate": 9.6801191869649e-05,
"loss": 0.0552734375,
"memory(GiB)": 43.05,
"step": 2450,
"train_speed(iter/s)": 0.636109
},
{
"epoch": 1.591056383668179,
"grad_norm": 2.921875,
"learning_rate": 9.678230769596996e-05,
"loss": 0.0421875,
"memory(GiB)": 43.05,
"step": 2455,
"train_speed(iter/s)": 0.636399
},
{
"epoch": 1.5942968243681142,
"grad_norm": 15.9375,
"learning_rate": 9.676336979784826e-05,
"loss": 0.045703125,
"memory(GiB)": 43.05,
"step": 2460,
"train_speed(iter/s)": 0.636494
},
{
"epoch": 1.5975372650680493,
"grad_norm": 12.8125,
"learning_rate": 9.674437819703202e-05,
"loss": 0.0341796875,
"memory(GiB)": 43.05,
"step": 2465,
"train_speed(iter/s)": 0.636654
},
{
"epoch": 1.6007777057679844,
"grad_norm": 16.875,
"learning_rate": 9.672533291533105e-05,
"loss": 0.04296875,
"memory(GiB)": 43.05,
"step": 2470,
"train_speed(iter/s)": 0.636867
},
{
"epoch": 1.6040181464679195,
"grad_norm": 3.703125,
"learning_rate": 9.670623397461684e-05,
"loss": 0.032421875,
"memory(GiB)": 43.05,
"step": 2475,
"train_speed(iter/s)": 0.636582
},
{
"epoch": 1.6072585871678549,
"grad_norm": 2.890625,
"learning_rate": 9.668708139682243e-05,
"loss": 0.0544921875,
"memory(GiB)": 43.05,
"step": 2480,
"train_speed(iter/s)": 0.636894
},
{
"epoch": 1.61049902786779,
"grad_norm": 12.1875,
"learning_rate": 9.666787520394251e-05,
"loss": 0.04296875,
"memory(GiB)": 43.05,
"step": 2485,
"train_speed(iter/s)": 0.637045
},
{
"epoch": 1.6137394685677253,
"grad_norm": 2.09375,
"learning_rate": 9.664861541803332e-05,
"loss": 0.03515625,
"memory(GiB)": 43.05,
"step": 2490,
"train_speed(iter/s)": 0.637455
},
{
"epoch": 1.6169799092676604,
"grad_norm": 10.0,
"learning_rate": 9.662930206121263e-05,
"loss": 0.030078125,
"memory(GiB)": 43.05,
"step": 2495,
"train_speed(iter/s)": 0.637851
},
{
"epoch": 1.6202203499675956,
"grad_norm": 4.71875,
"learning_rate": 9.660993515565979e-05,
"loss": 0.0486328125,
"memory(GiB)": 43.05,
"step": 2500,
"train_speed(iter/s)": 0.637773
},
{
"epoch": 1.6234607906675307,
"grad_norm": 15.3125,
"learning_rate": 9.659051472361559e-05,
"loss": 0.053515625,
"memory(GiB)": 43.05,
"step": 2505,
"train_speed(iter/s)": 0.638016
},
{
"epoch": 1.626701231367466,
"grad_norm": 2.71875,
"learning_rate": 9.657104078738228e-05,
"loss": 0.0615234375,
"memory(GiB)": 43.05,
"step": 2510,
"train_speed(iter/s)": 0.638155
},
{
"epoch": 1.6299416720674011,
"grad_norm": 3.03125,
"learning_rate": 9.655151336932362e-05,
"loss": 0.0408203125,
"memory(GiB)": 43.05,
"step": 2515,
"train_speed(iter/s)": 0.638142
},
{
"epoch": 1.6331821127673365,
"grad_norm": 16.125,
"learning_rate": 9.653193249186472e-05,
"loss": 0.05537109375,
"memory(GiB)": 43.05,
"step": 2520,
"train_speed(iter/s)": 0.638251
},
{
"epoch": 1.6364225534672716,
"grad_norm": 16.5,
"learning_rate": 9.651229817749212e-05,
"loss": 0.04453125,
"memory(GiB)": 43.05,
"step": 2525,
"train_speed(iter/s)": 0.638404
},
{
"epoch": 1.6396629941672067,
"grad_norm": 0.8984375,
"learning_rate": 9.64926104487537e-05,
"loss": 0.053125,
"memory(GiB)": 43.05,
"step": 2530,
"train_speed(iter/s)": 0.638731
},
{
"epoch": 1.6429034348671419,
"grad_norm": 12.5625,
"learning_rate": 9.647286932825872e-05,
"loss": 0.0494140625,
"memory(GiB)": 43.05,
"step": 2535,
"train_speed(iter/s)": 0.639032
},
{
"epoch": 1.646143875567077,
"grad_norm": 9.0,
"learning_rate": 9.64530748386777e-05,
"loss": 0.0568359375,
"memory(GiB)": 43.05,
"step": 2540,
"train_speed(iter/s)": 0.639515
},
{
"epoch": 1.6493843162670123,
"grad_norm": 9.75,
"learning_rate": 9.643322700274251e-05,
"loss": 0.0552734375,
"memory(GiB)": 43.05,
"step": 2545,
"train_speed(iter/s)": 0.639772
},
{
"epoch": 1.6526247569669477,
"grad_norm": 1.3203125,
"learning_rate": 9.641332584324625e-05,
"loss": 0.0642578125,
"memory(GiB)": 43.05,
"step": 2550,
"train_speed(iter/s)": 0.640097
},
{
"epoch": 1.6558651976668828,
"grad_norm": 0.55078125,
"learning_rate": 9.639337138304323e-05,
"loss": 0.04296875,
"memory(GiB)": 43.05,
"step": 2555,
"train_speed(iter/s)": 0.640597
},
{
"epoch": 1.659105638366818,
"grad_norm": 0.671875,
"learning_rate": 9.637336364504903e-05,
"loss": 0.037890625,
"memory(GiB)": 43.05,
"step": 2560,
"train_speed(iter/s)": 0.640832
},
{
"epoch": 1.662346079066753,
"grad_norm": 1.25,
"learning_rate": 9.635330265224038e-05,
"loss": 0.026953125,
"memory(GiB)": 43.05,
"step": 2565,
"train_speed(iter/s)": 0.641157
},
{
"epoch": 1.6655865197666881,
"grad_norm": 0.51953125,
"learning_rate": 9.633318842765515e-05,
"loss": 0.040625,
"memory(GiB)": 43.05,
"step": 2570,
"train_speed(iter/s)": 0.641182
},
{
"epoch": 1.6688269604666235,
"grad_norm": 2.265625,
"learning_rate": 9.631302099439237e-05,
"loss": 0.0537109375,
"memory(GiB)": 43.05,
"step": 2575,
"train_speed(iter/s)": 0.641614
},
{
"epoch": 1.6720674011665586,
"grad_norm": 1.5390625,
"learning_rate": 9.629280037561217e-05,
"loss": 0.041015625,
"memory(GiB)": 43.05,
"step": 2580,
"train_speed(iter/s)": 0.642187
},
{
"epoch": 1.675307841866494,
"grad_norm": 1.359375,
"learning_rate": 9.627252659453573e-05,
"loss": 0.0337890625,
"memory(GiB)": 43.05,
"step": 2585,
"train_speed(iter/s)": 0.64238
},
{
"epoch": 1.678548282566429,
"grad_norm": 12.0625,
"learning_rate": 9.625219967444537e-05,
"loss": 0.0537109375,
"memory(GiB)": 43.05,
"step": 2590,
"train_speed(iter/s)": 0.642289
},
{
"epoch": 1.6817887232663642,
"grad_norm": 3.25,
"learning_rate": 9.623181963868428e-05,
"loss": 0.043359375,
"memory(GiB)": 43.05,
"step": 2595,
"train_speed(iter/s)": 0.642606
},
{
"epoch": 1.6850291639662993,
"grad_norm": 0.98828125,
"learning_rate": 9.62113865106568e-05,
"loss": 0.0638671875,
"memory(GiB)": 43.05,
"step": 2600,
"train_speed(iter/s)": 0.643011
},
{
"epoch": 1.6882696046662347,
"grad_norm": 8.375,
"learning_rate": 9.619090031382815e-05,
"loss": 0.0548828125,
"memory(GiB)": 43.05,
"step": 2605,
"train_speed(iter/s)": 0.643071
},
{
"epoch": 1.6915100453661698,
"grad_norm": 3.25,
"learning_rate": 9.617036107172454e-05,
"loss": 0.0361328125,
"memory(GiB)": 43.05,
"step": 2610,
"train_speed(iter/s)": 0.643615
},
{
"epoch": 1.6947504860661051,
"grad_norm": 5.1875,
"learning_rate": 9.614976880793306e-05,
"loss": 0.0384765625,
"memory(GiB)": 43.05,
"step": 2615,
"train_speed(iter/s)": 0.64371
},
{
"epoch": 1.6979909267660402,
"grad_norm": 14.4375,
"learning_rate": 9.612912354610171e-05,
"loss": 0.037109375,
"memory(GiB)": 43.05,
"step": 2620,
"train_speed(iter/s)": 0.643613
},
{
"epoch": 1.7012313674659754,
"grad_norm": 0.578125,
"learning_rate": 9.610842530993935e-05,
"loss": 0.0671875,
"memory(GiB)": 43.05,
"step": 2625,
"train_speed(iter/s)": 0.644023
},
{
"epoch": 1.7044718081659105,
"grad_norm": 1.9296875,
"learning_rate": 9.608767412321568e-05,
"loss": 0.0455078125,
"memory(GiB)": 43.05,
"step": 2630,
"train_speed(iter/s)": 0.644424
},
{
"epoch": 1.7077122488658456,
"grad_norm": 8.1875,
"learning_rate": 9.606687000976123e-05,
"loss": 0.045703125,
"memory(GiB)": 43.05,
"step": 2635,
"train_speed(iter/s)": 0.644588
},
{
"epoch": 1.710952689565781,
"grad_norm": 3.5625,
"learning_rate": 9.604601299346722e-05,
"loss": 0.0435546875,
"memory(GiB)": 43.05,
"step": 2640,
"train_speed(iter/s)": 0.64475
},
{
"epoch": 1.7141931302657163,
"grad_norm": 3.3125,
"learning_rate": 9.602510309828574e-05,
"loss": 0.06328125,
"memory(GiB)": 43.05,
"step": 2645,
"train_speed(iter/s)": 0.645059
},
{
"epoch": 1.7174335709656514,
"grad_norm": 13.5,
"learning_rate": 9.600414034822954e-05,
"loss": 0.03359375,
"memory(GiB)": 43.05,
"step": 2650,
"train_speed(iter/s)": 0.645293
},
{
"epoch": 1.7206740116655865,
"grad_norm": 3.265625,
"learning_rate": 9.598312476737206e-05,
"loss": 0.0517578125,
"memory(GiB)": 43.05,
"step": 2655,
"train_speed(iter/s)": 0.645706
},
{
"epoch": 1.7239144523655217,
"grad_norm": 16.5,
"learning_rate": 9.596205637984746e-05,
"loss": 0.030859375,
"memory(GiB)": 43.05,
"step": 2660,
"train_speed(iter/s)": 0.645929
},
{
"epoch": 1.7271548930654568,
"grad_norm": 11.875,
"learning_rate": 9.59409352098505e-05,
"loss": 0.0455078125,
"memory(GiB)": 43.05,
"step": 2665,
"train_speed(iter/s)": 0.646384
},
{
"epoch": 1.7303953337653921,
"grad_norm": 13.5625,
"learning_rate": 9.591976128163658e-05,
"loss": 0.0359375,
"memory(GiB)": 43.05,
"step": 2670,
"train_speed(iter/s)": 0.646461
},
{
"epoch": 1.7336357744653272,
"grad_norm": 12.4375,
"learning_rate": 9.589853461952166e-05,
"loss": 0.0390625,
"memory(GiB)": 43.05,
"step": 2675,
"train_speed(iter/s)": 0.646645
},
{
"epoch": 1.7368762151652626,
"grad_norm": 1.03125,
"learning_rate": 9.58772552478823e-05,
"loss": 0.051953125,
"memory(GiB)": 43.05,
"step": 2680,
"train_speed(iter/s)": 0.646713
},
{
"epoch": 1.7401166558651977,
"grad_norm": 10.125,
"learning_rate": 9.585592319115553e-05,
"loss": 0.04609375,
"memory(GiB)": 43.05,
"step": 2685,
"train_speed(iter/s)": 0.646561
},
{
"epoch": 1.7433570965651328,
"grad_norm": 4.1875,
"learning_rate": 9.583453847383895e-05,
"loss": 0.0271484375,
"memory(GiB)": 43.05,
"step": 2690,
"train_speed(iter/s)": 0.647019
},
{
"epoch": 1.746597537265068,
"grad_norm": 2.21875,
"learning_rate": 9.58131011204906e-05,
"loss": 0.046484375,
"memory(GiB)": 43.05,
"step": 2695,
"train_speed(iter/s)": 0.647176
},
{
"epoch": 1.7498379779650033,
"grad_norm": 2.84375,
"learning_rate": 9.579161115572898e-05,
"loss": 0.0716796875,
"memory(GiB)": 43.05,
"step": 2700,
"train_speed(iter/s)": 0.647045
},
{
"epoch": 1.7530784186649384,
"grad_norm": 13.375,
"learning_rate": 9.577006860423297e-05,
"loss": 0.0478515625,
"memory(GiB)": 43.05,
"step": 2705,
"train_speed(iter/s)": 0.647214
},
{
"epoch": 1.7563188593648738,
"grad_norm": 12.25,
"learning_rate": 9.57484734907419e-05,
"loss": 0.046484375,
"memory(GiB)": 43.05,
"step": 2710,
"train_speed(iter/s)": 0.647505
},
{
"epoch": 1.7595593000648089,
"grad_norm": 9.6875,
"learning_rate": 9.572682584005541e-05,
"loss": 0.0439453125,
"memory(GiB)": 43.05,
"step": 2715,
"train_speed(iter/s)": 0.647532
},
{
"epoch": 1.762799740764744,
"grad_norm": 10.25,
"learning_rate": 9.570512567703352e-05,
"loss": 0.0455078125,
"memory(GiB)": 43.05,
"step": 2720,
"train_speed(iter/s)": 0.647616
},
{
"epoch": 1.7660401814646791,
"grad_norm": 4.03125,
"learning_rate": 9.568337302659651e-05,
"loss": 0.02890625,
"memory(GiB)": 43.05,
"step": 2725,
"train_speed(iter/s)": 0.647668
},
{
"epoch": 1.7692806221646142,
"grad_norm": 8.8125,
"learning_rate": 9.566156791372498e-05,
"loss": 0.043359375,
"memory(GiB)": 43.05,
"step": 2730,
"train_speed(iter/s)": 0.647617
},
{
"epoch": 1.7725210628645496,
"grad_norm": 2.828125,
"learning_rate": 9.563971036345973e-05,
"loss": 0.065625,
"memory(GiB)": 43.05,
"step": 2735,
"train_speed(iter/s)": 0.647826
},
{
"epoch": 1.775761503564485,
"grad_norm": 4.9375,
"learning_rate": 9.56178004009018e-05,
"loss": 0.033984375,
"memory(GiB)": 43.05,
"step": 2740,
"train_speed(iter/s)": 0.64777
},
{
"epoch": 1.77900194426442,
"grad_norm": 0.74609375,
"learning_rate": 9.559583805121246e-05,
"loss": 0.04697265625,
"memory(GiB)": 43.05,
"step": 2745,
"train_speed(iter/s)": 0.648228
},
{
"epoch": 1.7822423849643552,
"grad_norm": 1.875,
"learning_rate": 9.557382333961307e-05,
"loss": 0.053515625,
"memory(GiB)": 43.05,
"step": 2750,
"train_speed(iter/s)": 0.648208
},
{
"epoch": 1.7854828256642903,
"grad_norm": 11.75,
"learning_rate": 9.555175629138516e-05,
"loss": 0.0412109375,
"memory(GiB)": 43.05,
"step": 2755,
"train_speed(iter/s)": 0.648575
},
{
"epoch": 1.7887232663642254,
"grad_norm": 7.59375,
"learning_rate": 9.552963693187034e-05,
"loss": 0.05185546875,
"memory(GiB)": 43.05,
"step": 2760,
"train_speed(iter/s)": 0.648868
},
{
"epoch": 1.7919637070641607,
"grad_norm": 9.875,
"learning_rate": 9.550746528647036e-05,
"loss": 0.043359375,
"memory(GiB)": 43.05,
"step": 2765,
"train_speed(iter/s)": 0.649151
},
{
"epoch": 1.7952041477640959,
"grad_norm": 13.5625,
"learning_rate": 9.548524138064694e-05,
"loss": 0.033984375,
"memory(GiB)": 43.05,
"step": 2770,
"train_speed(iter/s)": 0.649594
},
{
"epoch": 1.7984445884640312,
"grad_norm": 1.2734375,
"learning_rate": 9.546296523992183e-05,
"loss": 0.0439453125,
"memory(GiB)": 43.05,
"step": 2775,
"train_speed(iter/s)": 0.650047
},
{
"epoch": 1.8016850291639663,
"grad_norm": 4.46875,
"learning_rate": 9.544063688987681e-05,
"loss": 0.0421875,
"memory(GiB)": 43.05,
"step": 2780,
"train_speed(iter/s)": 0.649979
},
{
"epoch": 1.8049254698639015,
"grad_norm": 8.0,
"learning_rate": 9.541825635615356e-05,
"loss": 0.039453125,
"memory(GiB)": 43.05,
"step": 2785,
"train_speed(iter/s)": 0.650359
},
{
"epoch": 1.8081659105638366,
"grad_norm": 0.59375,
"learning_rate": 9.539582366445372e-05,
"loss": 0.0490234375,
"memory(GiB)": 43.05,
"step": 2790,
"train_speed(iter/s)": 0.650716
},
{
"epoch": 1.811406351263772,
"grad_norm": 12.375,
"learning_rate": 9.537333884053883e-05,
"loss": 0.0673828125,
"memory(GiB)": 43.05,
"step": 2795,
"train_speed(iter/s)": 0.650829
},
{
"epoch": 1.814646791963707,
"grad_norm": 15.625,
"learning_rate": 9.535080191023026e-05,
"loss": 0.0251953125,
"memory(GiB)": 43.05,
"step": 2800,
"train_speed(iter/s)": 0.650751
},
{
"epoch": 1.8178872326636424,
"grad_norm": 14.25,
"learning_rate": 9.53282128994093e-05,
"loss": 0.037109375,
"memory(GiB)": 43.05,
"step": 2805,
"train_speed(iter/s)": 0.650893
},
{
"epoch": 1.8211276733635775,
"grad_norm": 1.1015625,
"learning_rate": 9.530557183401696e-05,
"loss": 0.038671875,
"memory(GiB)": 43.05,
"step": 2810,
"train_speed(iter/s)": 0.650918
},
{
"epoch": 1.8243681140635126,
"grad_norm": 2.9375,
"learning_rate": 9.528287874005406e-05,
"loss": 0.041015625,
"memory(GiB)": 43.05,
"step": 2815,
"train_speed(iter/s)": 0.650992
},
{
"epoch": 1.8276085547634477,
"grad_norm": 0.314453125,
"learning_rate": 9.526013364358118e-05,
"loss": 0.032421875,
"memory(GiB)": 43.05,
"step": 2820,
"train_speed(iter/s)": 0.651478
},
{
"epoch": 1.8308489954633829,
"grad_norm": 14.125,
"learning_rate": 9.523733657071864e-05,
"loss": 0.0431640625,
"memory(GiB)": 43.05,
"step": 2825,
"train_speed(iter/s)": 0.651322
},
{
"epoch": 1.8340894361633182,
"grad_norm": 12.6875,
"learning_rate": 9.521448754764639e-05,
"loss": 0.0404296875,
"memory(GiB)": 43.05,
"step": 2830,
"train_speed(iter/s)": 0.651614
},
{
"epoch": 1.8373298768632536,
"grad_norm": 2.3125,
"learning_rate": 9.519158660060409e-05,
"loss": 0.033984375,
"memory(GiB)": 43.05,
"step": 2835,
"train_speed(iter/s)": 0.651857
},
{
"epoch": 1.8405703175631887,
"grad_norm": 2.921875,
"learning_rate": 9.5168633755891e-05,
"loss": 0.0419921875,
"memory(GiB)": 43.05,
"step": 2840,
"train_speed(iter/s)": 0.651982
},
{
"epoch": 1.8438107582631238,
"grad_norm": 1.453125,
"learning_rate": 9.514562903986601e-05,
"loss": 0.0412109375,
"memory(GiB)": 43.05,
"step": 2845,
"train_speed(iter/s)": 0.652264
},
{
"epoch": 1.847051198963059,
"grad_norm": 7.8125,
"learning_rate": 9.512257247894754e-05,
"loss": 0.0283203125,
"memory(GiB)": 43.05,
"step": 2850,
"train_speed(iter/s)": 0.652446
},
{
"epoch": 1.850291639662994,
"grad_norm": 9.6875,
"learning_rate": 9.509946409961356e-05,
"loss": 0.0462890625,
"memory(GiB)": 43.05,
"step": 2855,
"train_speed(iter/s)": 0.652627
},
{
"epoch": 1.8535320803629294,
"grad_norm": 3.625,
"learning_rate": 9.50763039284016e-05,
"loss": 0.0333984375,
"memory(GiB)": 43.05,
"step": 2860,
"train_speed(iter/s)": 0.652836
},
{
"epoch": 1.8567725210628645,
"grad_norm": 12.5625,
"learning_rate": 9.505309199190857e-05,
"loss": 0.060546875,
"memory(GiB)": 43.05,
"step": 2865,
"train_speed(iter/s)": 0.653211
},
{
"epoch": 1.8600129617627998,
"grad_norm": 0.66796875,
"learning_rate": 9.50298283167909e-05,
"loss": 0.026171875,
"memory(GiB)": 43.05,
"step": 2870,
"train_speed(iter/s)": 0.653502
},
{
"epoch": 1.863253402462735,
"grad_norm": 11.875,
"learning_rate": 9.500651292976444e-05,
"loss": 0.0373046875,
"memory(GiB)": 43.05,
"step": 2875,
"train_speed(iter/s)": 0.65399
},
{
"epoch": 1.86649384316267,
"grad_norm": 9.3125,
"learning_rate": 9.498314585760436e-05,
"loss": 0.038671875,
"memory(GiB)": 43.05,
"step": 2880,
"train_speed(iter/s)": 0.654499
},
{
"epoch": 1.8697342838626052,
"grad_norm": 2.359375,
"learning_rate": 9.495972712714525e-05,
"loss": 0.0326171875,
"memory(GiB)": 43.05,
"step": 2885,
"train_speed(iter/s)": 0.654706
},
{
"epoch": 1.8729747245625405,
"grad_norm": 9.9375,
"learning_rate": 9.4936256765281e-05,
"loss": 0.0353515625,
"memory(GiB)": 43.05,
"step": 2890,
"train_speed(iter/s)": 0.655062
},
{
"epoch": 1.8762151652624757,
"grad_norm": 11.5,
"learning_rate": 9.491273479896479e-05,
"loss": 0.0236328125,
"memory(GiB)": 43.05,
"step": 2895,
"train_speed(iter/s)": 0.655055
},
{
"epoch": 1.879455605962411,
"grad_norm": 0.390625,
"learning_rate": 9.488916125520905e-05,
"loss": 0.0384765625,
"memory(GiB)": 43.05,
"step": 2900,
"train_speed(iter/s)": 0.655601
},
{
"epoch": 1.8826960466623461,
"grad_norm": 9.8125,
"learning_rate": 9.486553616108547e-05,
"loss": 0.0337890625,
"memory(GiB)": 43.05,
"step": 2905,
"train_speed(iter/s)": 0.65603
},
{
"epoch": 1.8859364873622813,
"grad_norm": 2.78125,
"learning_rate": 9.484185954372493e-05,
"loss": 0.0451171875,
"memory(GiB)": 43.05,
"step": 2910,
"train_speed(iter/s)": 0.656213
},
{
"epoch": 1.8891769280622164,
"grad_norm": 1.40625,
"learning_rate": 9.481813143031747e-05,
"loss": 0.0482421875,
"memory(GiB)": 43.05,
"step": 2915,
"train_speed(iter/s)": 0.65662
},
{
"epoch": 1.8924173687621515,
"grad_norm": 16.5,
"learning_rate": 9.479435184811229e-05,
"loss": 0.0302734375,
"memory(GiB)": 43.05,
"step": 2920,
"train_speed(iter/s)": 0.656523
},
{
"epoch": 1.8956578094620868,
"grad_norm": 15.625,
"learning_rate": 9.477052082441765e-05,
"loss": 0.0271484375,
"memory(GiB)": 43.05,
"step": 2925,
"train_speed(iter/s)": 0.656557
},
{
"epoch": 1.8988982501620222,
"grad_norm": 9.8125,
"learning_rate": 9.474663838660094e-05,
"loss": 0.0419921875,
"memory(GiB)": 43.05,
"step": 2930,
"train_speed(iter/s)": 0.656682
},
{
"epoch": 1.9021386908619573,
"grad_norm": 1.3046875,
"learning_rate": 9.472270456208855e-05,
"loss": 0.02890625,
"memory(GiB)": 43.05,
"step": 2935,
"train_speed(iter/s)": 0.656969
},
{
"epoch": 1.9053791315618924,
"grad_norm": 2.890625,
"learning_rate": 9.469871937836591e-05,
"loss": 0.038671875,
"memory(GiB)": 43.05,
"step": 2940,
"train_speed(iter/s)": 0.657051
},
{
"epoch": 1.9086195722618275,
"grad_norm": 0.77734375,
"learning_rate": 9.467468286297742e-05,
"loss": 0.0341796875,
"memory(GiB)": 43.05,
"step": 2945,
"train_speed(iter/s)": 0.657156
},
{
"epoch": 1.9118600129617627,
"grad_norm": 18.0,
"learning_rate": 9.465059504352643e-05,
"loss": 0.0607421875,
"memory(GiB)": 43.05,
"step": 2950,
"train_speed(iter/s)": 0.657068
},
{
"epoch": 1.915100453661698,
"grad_norm": 2.65625,
"learning_rate": 9.462645594767519e-05,
"loss": 0.041796875,
"memory(GiB)": 43.05,
"step": 2955,
"train_speed(iter/s)": 0.656706
},
{
"epoch": 1.9183408943616331,
"grad_norm": 0.474609375,
"learning_rate": 9.460226560314487e-05,
"loss": 0.04296875,
"memory(GiB)": 43.05,
"step": 2960,
"train_speed(iter/s)": 0.656903
},
{
"epoch": 1.9215813350615685,
"grad_norm": 13.8125,
"learning_rate": 9.457802403771548e-05,
"loss": 0.02890625,
"memory(GiB)": 43.05,
"step": 2965,
"train_speed(iter/s)": 0.657217
},
{
"epoch": 1.9248217757615036,
"grad_norm": 10.8125,
"learning_rate": 9.455373127922583e-05,
"loss": 0.0369140625,
"memory(GiB)": 43.05,
"step": 2970,
"train_speed(iter/s)": 0.657478
},
{
"epoch": 1.9280622164614387,
"grad_norm": 1.640625,
"learning_rate": 9.452938735557355e-05,
"loss": 0.04609375,
"memory(GiB)": 43.05,
"step": 2975,
"train_speed(iter/s)": 0.657538
},
{
"epoch": 1.9313026571613738,
"grad_norm": 1.6640625,
"learning_rate": 9.450499229471501e-05,
"loss": 0.0353515625,
"memory(GiB)": 43.05,
"step": 2980,
"train_speed(iter/s)": 0.657753
},
{
"epoch": 1.9345430978613092,
"grad_norm": 1.8828125,
"learning_rate": 9.448054612466532e-05,
"loss": 0.047265625,
"memory(GiB)": 43.05,
"step": 2985,
"train_speed(iter/s)": 0.657793
},
{
"epoch": 1.9377835385612443,
"grad_norm": 9.5,
"learning_rate": 9.445604887349827e-05,
"loss": 0.05703125,
"memory(GiB)": 43.05,
"step": 2990,
"train_speed(iter/s)": 0.658063
},
{
"epoch": 1.9410239792611796,
"grad_norm": 4.46875,
"learning_rate": 9.443150056934631e-05,
"loss": 0.0224609375,
"memory(GiB)": 43.05,
"step": 2995,
"train_speed(iter/s)": 0.658252
},
{
"epoch": 1.9442644199611148,
"grad_norm": 1.1015625,
"learning_rate": 9.440690124040051e-05,
"loss": 0.0509765625,
"memory(GiB)": 43.05,
"step": 3000,
"train_speed(iter/s)": 0.658655
},
{
"epoch": 1.9475048606610499,
"grad_norm": 2.484375,
"learning_rate": 9.438225091491057e-05,
"loss": 0.06015625,
"memory(GiB)": 43.05,
"step": 3005,
"train_speed(iter/s)": 0.65849
},
{
"epoch": 1.950745301360985,
"grad_norm": 7.71875,
"learning_rate": 9.435754962118474e-05,
"loss": 0.0361328125,
"memory(GiB)": 43.05,
"step": 3010,
"train_speed(iter/s)": 0.658902
},
{
"epoch": 1.9539857420609201,
"grad_norm": 3.78125,
"learning_rate": 9.433279738758977e-05,
"loss": 0.045703125,
"memory(GiB)": 43.05,
"step": 3015,
"train_speed(iter/s)": 0.659047
},
{
"epoch": 1.9572261827608555,
"grad_norm": 14.1875,
"learning_rate": 9.430799424255096e-05,
"loss": 0.0373046875,
"memory(GiB)": 43.05,
"step": 3020,
"train_speed(iter/s)": 0.658827
},
{
"epoch": 1.9604666234607908,
"grad_norm": 9.4375,
"learning_rate": 9.428314021455205e-05,
"loss": 0.0240234375,
"memory(GiB)": 43.05,
"step": 3025,
"train_speed(iter/s)": 0.658955
},
{
"epoch": 1.963707064160726,
"grad_norm": 3.859375,
"learning_rate": 9.42582353321352e-05,
"loss": 0.043359375,
"memory(GiB)": 43.05,
"step": 3030,
"train_speed(iter/s)": 0.658996
},
{
"epoch": 1.966947504860661,
"grad_norm": 3.890625,
"learning_rate": 9.423327962390098e-05,
"loss": 0.039453125,
"memory(GiB)": 43.05,
"step": 3035,
"train_speed(iter/s)": 0.659243
},
{
"epoch": 1.9701879455605962,
"grad_norm": 5.0,
"learning_rate": 9.420827311850836e-05,
"loss": 0.041015625,
"memory(GiB)": 43.05,
"step": 3040,
"train_speed(iter/s)": 0.659632
},
{
"epoch": 1.9734283862605313,
"grad_norm": 0.8359375,
"learning_rate": 9.41832158446746e-05,
"loss": 0.062890625,
"memory(GiB)": 43.05,
"step": 3045,
"train_speed(iter/s)": 0.659849
},
{
"epoch": 1.9766688269604666,
"grad_norm": 1.6953125,
"learning_rate": 9.415810783117528e-05,
"loss": 0.028515625,
"memory(GiB)": 43.05,
"step": 3050,
"train_speed(iter/s)": 0.660188
},
{
"epoch": 1.9799092676604018,
"grad_norm": 14.0625,
"learning_rate": 9.413294910684426e-05,
"loss": 0.03046875,
"memory(GiB)": 43.05,
"step": 3055,
"train_speed(iter/s)": 0.660519
},
{
"epoch": 1.983149708360337,
"grad_norm": 4.5625,
"learning_rate": 9.410773970057362e-05,
"loss": 0.0205078125,
"memory(GiB)": 43.05,
"step": 3060,
"train_speed(iter/s)": 0.6607
},
{
"epoch": 1.9863901490602722,
"grad_norm": 8.375,
"learning_rate": 9.408247964131364e-05,
"loss": 0.039453125,
"memory(GiB)": 43.05,
"step": 3065,
"train_speed(iter/s)": 0.66096
},
{
"epoch": 1.9896305897602073,
"grad_norm": 4.3125,
"learning_rate": 9.405716895807279e-05,
"loss": 0.0322265625,
"memory(GiB)": 43.05,
"step": 3070,
"train_speed(iter/s)": 0.661027
},
{
"epoch": 1.9928710304601425,
"grad_norm": 10.9375,
"learning_rate": 9.403180767991767e-05,
"loss": 0.046875,
"memory(GiB)": 43.05,
"step": 3075,
"train_speed(iter/s)": 0.661232
},
{
"epoch": 1.9961114711600778,
"grad_norm": 17.625,
"learning_rate": 9.400639583597296e-05,
"loss": 0.0486328125,
"memory(GiB)": 43.05,
"step": 3080,
"train_speed(iter/s)": 0.660635
},
{
"epoch": 1.999351911860013,
"grad_norm": 5.3125,
"learning_rate": 9.398093345542144e-05,
"loss": 0.039453125,
"memory(GiB)": 43.05,
"step": 3085,
"train_speed(iter/s)": 0.660788
},
{
"epoch": 2.0025923525599483,
"grad_norm": 5.03125,
"learning_rate": 9.395542056750391e-05,
"loss": 0.038671875,
"memory(GiB)": 43.05,
"step": 3090,
"train_speed(iter/s)": 0.660896
},
{
"epoch": 2.0058327932598834,
"grad_norm": 3.375,
"learning_rate": 9.392985720151915e-05,
"loss": 0.0544921875,
"memory(GiB)": 43.05,
"step": 3095,
"train_speed(iter/s)": 0.661222
},
{
"epoch": 2.0090732339598185,
"grad_norm": 12.8125,
"learning_rate": 9.390424338682396e-05,
"loss": 0.03798828125,
"memory(GiB)": 43.05,
"step": 3100,
"train_speed(iter/s)": 0.661405
},
{
"epoch": 2.0123136746597536,
"grad_norm": 1.015625,
"learning_rate": 9.387857915283304e-05,
"loss": 0.02734375,
"memory(GiB)": 43.05,
"step": 3105,
"train_speed(iter/s)": 0.661361
},
{
"epoch": 2.0155541153596888,
"grad_norm": 13.375,
"learning_rate": 9.385286452901902e-05,
"loss": 0.0447265625,
"memory(GiB)": 43.05,
"step": 3110,
"train_speed(iter/s)": 0.661828
},
{
"epoch": 2.0187945560596243,
"grad_norm": 2.828125,
"learning_rate": 9.382709954491235e-05,
"loss": 0.052734375,
"memory(GiB)": 43.05,
"step": 3115,
"train_speed(iter/s)": 0.661605
},
{
"epoch": 2.0220349967595594,
"grad_norm": 6.9375,
"learning_rate": 9.380128423010133e-05,
"loss": 0.03203125,
"memory(GiB)": 43.05,
"step": 3120,
"train_speed(iter/s)": 0.661861
},
{
"epoch": 2.0252754374594946,
"grad_norm": 1.671875,
"learning_rate": 9.377541861423211e-05,
"loss": 0.0291015625,
"memory(GiB)": 43.05,
"step": 3125,
"train_speed(iter/s)": 0.662028
},
{
"epoch": 2.0285158781594297,
"grad_norm": 1.2109375,
"learning_rate": 9.374950272700851e-05,
"loss": 0.03984375,
"memory(GiB)": 43.05,
"step": 3130,
"train_speed(iter/s)": 0.662475
},
{
"epoch": 2.031756318859365,
"grad_norm": 17.0,
"learning_rate": 9.37235365981922e-05,
"loss": 0.031640625,
"memory(GiB)": 43.05,
"step": 3135,
"train_speed(iter/s)": 0.662552
},
{
"epoch": 2.0349967595593,
"grad_norm": 7.75,
"learning_rate": 9.369752025760243e-05,
"loss": 0.0185546875,
"memory(GiB)": 43.05,
"step": 3140,
"train_speed(iter/s)": 0.662818
},
{
"epoch": 2.038237200259235,
"grad_norm": 0.97265625,
"learning_rate": 9.367145373511619e-05,
"loss": 0.026171875,
"memory(GiB)": 43.05,
"step": 3145,
"train_speed(iter/s)": 0.662597
},
{
"epoch": 2.0414776409591706,
"grad_norm": 2.1875,
"learning_rate": 9.364533706066807e-05,
"loss": 0.031640625,
"memory(GiB)": 43.05,
"step": 3150,
"train_speed(iter/s)": 0.662557
},
{
"epoch": 2.0447180816591057,
"grad_norm": 0.59765625,
"learning_rate": 9.361917026425025e-05,
"loss": 0.037109375,
"memory(GiB)": 43.05,
"step": 3155,
"train_speed(iter/s)": 0.662682
},
{
"epoch": 2.047958522359041,
"grad_norm": 10.1875,
"learning_rate": 9.35929533759125e-05,
"loss": 0.0455078125,
"memory(GiB)": 43.05,
"step": 3160,
"train_speed(iter/s)": 0.662887
},
{
"epoch": 2.051198963058976,
"grad_norm": 9.75,
"learning_rate": 9.356668642576205e-05,
"loss": 0.0470703125,
"memory(GiB)": 43.05,
"step": 3165,
"train_speed(iter/s)": 0.663122
},
{
"epoch": 2.054439403758911,
"grad_norm": 4.84375,
"learning_rate": 9.354036944396372e-05,
"loss": 0.022265625,
"memory(GiB)": 43.05,
"step": 3170,
"train_speed(iter/s)": 0.663428
},
{
"epoch": 2.057679844458846,
"grad_norm": 13.6875,
"learning_rate": 9.351400246073969e-05,
"loss": 0.0275390625,
"memory(GiB)": 43.05,
"step": 3175,
"train_speed(iter/s)": 0.663244
},
{
"epoch": 2.060920285158782,
"grad_norm": 11.125,
"learning_rate": 9.34875855063696e-05,
"loss": 0.04140625,
"memory(GiB)": 43.05,
"step": 3180,
"train_speed(iter/s)": 0.663705
},
{
"epoch": 2.064160725858717,
"grad_norm": 1.34375,
"learning_rate": 9.346111861119051e-05,
"loss": 0.048046875,
"memory(GiB)": 43.05,
"step": 3185,
"train_speed(iter/s)": 0.663841
},
{
"epoch": 2.067401166558652,
"grad_norm": 15.4375,
"learning_rate": 9.343460180559678e-05,
"loss": 0.01953125,
"memory(GiB)": 43.05,
"step": 3190,
"train_speed(iter/s)": 0.664087
},
{
"epoch": 2.070641607258587,
"grad_norm": 6.15625,
"learning_rate": 9.340803512004008e-05,
"loss": 0.0240234375,
"memory(GiB)": 43.05,
"step": 3195,
"train_speed(iter/s)": 0.664161
},
{
"epoch": 2.0738820479585223,
"grad_norm": 10.1875,
"learning_rate": 9.338141858502944e-05,
"loss": 0.02578125,
"memory(GiB)": 43.05,
"step": 3200,
"train_speed(iter/s)": 0.664059
},
{
"epoch": 2.0771224886584574,
"grad_norm": 12.875,
"learning_rate": 9.335475223113104e-05,
"loss": 0.0486328125,
"memory(GiB)": 43.05,
"step": 3205,
"train_speed(iter/s)": 0.664006
},
{
"epoch": 2.080362929358393,
"grad_norm": 3.984375,
"learning_rate": 9.332803608896835e-05,
"loss": 0.03984375,
"memory(GiB)": 43.05,
"step": 3210,
"train_speed(iter/s)": 0.664025
},
{
"epoch": 2.083603370058328,
"grad_norm": 1.78125,
"learning_rate": 9.330127018922194e-05,
"loss": 0.03828125,
"memory(GiB)": 43.05,
"step": 3215,
"train_speed(iter/s)": 0.664065
},
{
"epoch": 2.086843810758263,
"grad_norm": 3.09375,
"learning_rate": 9.32744545626296e-05,
"loss": 0.0404296875,
"memory(GiB)": 43.05,
"step": 3220,
"train_speed(iter/s)": 0.66432
},
{
"epoch": 2.0900842514581983,
"grad_norm": 5.15625,
"learning_rate": 9.324758923998617e-05,
"loss": 0.044921875,
"memory(GiB)": 43.05,
"step": 3225,
"train_speed(iter/s)": 0.664349
},
{
"epoch": 2.0933246921581334,
"grad_norm": 12.5,
"learning_rate": 9.32206742521436e-05,
"loss": 0.045703125,
"memory(GiB)": 43.05,
"step": 3230,
"train_speed(iter/s)": 0.664254
},
{
"epoch": 2.0965651328580686,
"grad_norm": 0.625,
"learning_rate": 9.319370963001084e-05,
"loss": 0.0208984375,
"memory(GiB)": 43.05,
"step": 3235,
"train_speed(iter/s)": 0.664253
},
{
"epoch": 2.0998055735580037,
"grad_norm": 6.84375,
"learning_rate": 9.316669540455386e-05,
"loss": 0.027734375,
"memory(GiB)": 43.05,
"step": 3240,
"train_speed(iter/s)": 0.664384
},
{
"epoch": 2.1030460142579392,
"grad_norm": 1.390625,
"learning_rate": 9.313963160679557e-05,
"loss": 0.0505859375,
"memory(GiB)": 43.05,
"step": 3245,
"train_speed(iter/s)": 0.664557
},
{
"epoch": 2.1062864549578744,
"grad_norm": 8.4375,
"learning_rate": 9.311251826781587e-05,
"loss": 0.03984375,
"memory(GiB)": 43.05,
"step": 3250,
"train_speed(iter/s)": 0.664798
},
{
"epoch": 2.1095268956578095,
"grad_norm": 5.40625,
"learning_rate": 9.308535541875146e-05,
"loss": 0.0369140625,
"memory(GiB)": 43.05,
"step": 3255,
"train_speed(iter/s)": 0.664872
},
{
"epoch": 2.1127673363577446,
"grad_norm": 10.25,
"learning_rate": 9.3058143090796e-05,
"loss": 0.0228515625,
"memory(GiB)": 43.05,
"step": 3260,
"train_speed(iter/s)": 0.665106
},
{
"epoch": 2.1160077770576797,
"grad_norm": 2.390625,
"learning_rate": 9.303088131519986e-05,
"loss": 0.03125,
"memory(GiB)": 43.05,
"step": 3265,
"train_speed(iter/s)": 0.665408
},
{
"epoch": 2.119248217757615,
"grad_norm": 13.3125,
"learning_rate": 9.300357012327031e-05,
"loss": 0.03046875,
"memory(GiB)": 43.05,
"step": 3270,
"train_speed(iter/s)": 0.665595
},
{
"epoch": 2.1224886584575504,
"grad_norm": 1.2734375,
"learning_rate": 9.297620954637126e-05,
"loss": 0.03046875,
"memory(GiB)": 43.05,
"step": 3275,
"train_speed(iter/s)": 0.665503
},
{
"epoch": 2.1257290991574855,
"grad_norm": 0.474609375,
"learning_rate": 9.294879961592342e-05,
"loss": 0.029296875,
"memory(GiB)": 43.05,
"step": 3280,
"train_speed(iter/s)": 0.665823
},
{
"epoch": 2.1289695398574207,
"grad_norm": 16.75,
"learning_rate": 9.292134036340414e-05,
"loss": 0.0357421875,
"memory(GiB)": 43.05,
"step": 3285,
"train_speed(iter/s)": 0.66599
},
{
"epoch": 2.1322099805573558,
"grad_norm": 7.53125,
"learning_rate": 9.28938318203474e-05,
"loss": 0.03671875,
"memory(GiB)": 43.05,
"step": 3290,
"train_speed(iter/s)": 0.66602
},
{
"epoch": 2.135450421257291,
"grad_norm": 13.3125,
"learning_rate": 9.286627401834385e-05,
"loss": 0.04140625,
"memory(GiB)": 43.05,
"step": 3295,
"train_speed(iter/s)": 0.666393
},
{
"epoch": 2.138690861957226,
"grad_norm": 12.4375,
"learning_rate": 9.283866698904059e-05,
"loss": 0.0455078125,
"memory(GiB)": 43.05,
"step": 3300,
"train_speed(iter/s)": 0.666836
},
{
"epoch": 2.141931302657161,
"grad_norm": 1.8828125,
"learning_rate": 9.281101076414133e-05,
"loss": 0.024609375,
"memory(GiB)": 43.05,
"step": 3305,
"train_speed(iter/s)": 0.666932
},
{
"epoch": 2.1451717433570967,
"grad_norm": 10.1875,
"learning_rate": 9.278330537540631e-05,
"loss": 0.03203125,
"memory(GiB)": 43.05,
"step": 3310,
"train_speed(iter/s)": 0.667343
},
{
"epoch": 2.148412184057032,
"grad_norm": 13.125,
"learning_rate": 9.275555085465215e-05,
"loss": 0.034765625,
"memory(GiB)": 43.05,
"step": 3315,
"train_speed(iter/s)": 0.667436
},
{
"epoch": 2.151652624756967,
"grad_norm": 0.71875,
"learning_rate": 9.272774723375195e-05,
"loss": 0.0556640625,
"memory(GiB)": 43.05,
"step": 3320,
"train_speed(iter/s)": 0.66775
},
{
"epoch": 2.154893065456902,
"grad_norm": 11.6875,
"learning_rate": 9.269989454463514e-05,
"loss": 0.0388671875,
"memory(GiB)": 43.05,
"step": 3325,
"train_speed(iter/s)": 0.667811
},
{
"epoch": 2.158133506156837,
"grad_norm": 10.25,
"learning_rate": 9.267199281928758e-05,
"loss": 0.0291015625,
"memory(GiB)": 43.05,
"step": 3330,
"train_speed(iter/s)": 0.668054
},
{
"epoch": 2.1613739468567728,
"grad_norm": 9.375,
"learning_rate": 9.264404208975136e-05,
"loss": 0.031640625,
"memory(GiB)": 43.05,
"step": 3335,
"train_speed(iter/s)": 0.668113
},
{
"epoch": 2.164614387556708,
"grad_norm": 2.296875,
"learning_rate": 9.26160423881249e-05,
"loss": 0.07109375,
"memory(GiB)": 43.05,
"step": 3340,
"train_speed(iter/s)": 0.668019
},
{
"epoch": 2.167854828256643,
"grad_norm": 7.28125,
"learning_rate": 9.258799374656286e-05,
"loss": 0.04296875,
"memory(GiB)": 43.05,
"step": 3345,
"train_speed(iter/s)": 0.668181
},
{
"epoch": 2.171095268956578,
"grad_norm": 12.375,
"learning_rate": 9.255989619727605e-05,
"loss": 0.0552734375,
"memory(GiB)": 43.05,
"step": 3350,
"train_speed(iter/s)": 0.668287
},
{
"epoch": 2.1743357096565132,
"grad_norm": 0.58984375,
"learning_rate": 9.25317497725315e-05,
"loss": 0.030859375,
"memory(GiB)": 43.05,
"step": 3355,
"train_speed(iter/s)": 0.668304
},
{
"epoch": 2.1775761503564484,
"grad_norm": 1.609375,
"learning_rate": 9.250355450465232e-05,
"loss": 0.020703125,
"memory(GiB)": 43.05,
"step": 3360,
"train_speed(iter/s)": 0.668607
},
{
"epoch": 2.1808165910563835,
"grad_norm": 6.28125,
"learning_rate": 9.247531042601777e-05,
"loss": 0.0376953125,
"memory(GiB)": 43.05,
"step": 3365,
"train_speed(iter/s)": 0.668622
},
{
"epoch": 2.184057031756319,
"grad_norm": 4.09375,
"learning_rate": 9.244701756906314e-05,
"loss": 0.0283203125,
"memory(GiB)": 43.05,
"step": 3370,
"train_speed(iter/s)": 0.66889
},
{
"epoch": 2.187297472456254,
"grad_norm": 2.921875,
"learning_rate": 9.241867596627969e-05,
"loss": 0.0302734375,
"memory(GiB)": 43.05,
"step": 3375,
"train_speed(iter/s)": 0.669121
},
{
"epoch": 2.1905379131561893,
"grad_norm": 10.1875,
"learning_rate": 9.239028565021472e-05,
"loss": 0.0314453125,
"memory(GiB)": 43.05,
"step": 3380,
"train_speed(iter/s)": 0.669339
},
{
"epoch": 2.1937783538561244,
"grad_norm": 12.0,
"learning_rate": 9.236184665347147e-05,
"loss": 0.02578125,
"memory(GiB)": 43.05,
"step": 3385,
"train_speed(iter/s)": 0.6695
},
{
"epoch": 2.1970187945560595,
"grad_norm": 11.9375,
"learning_rate": 9.233335900870906e-05,
"loss": 0.0359375,
"memory(GiB)": 43.05,
"step": 3390,
"train_speed(iter/s)": 0.669846
},
{
"epoch": 2.2002592352559946,
"grad_norm": 14.6875,
"learning_rate": 9.230482274864244e-05,
"loss": 0.04296875,
"memory(GiB)": 43.05,
"step": 3395,
"train_speed(iter/s)": 0.669786
},
{
"epoch": 2.20349967595593,
"grad_norm": 15.1875,
"learning_rate": 9.227623790604248e-05,
"loss": 0.0337890625,
"memory(GiB)": 43.05,
"step": 3400,
"train_speed(iter/s)": 0.669768
},
{
"epoch": 2.2067401166558653,
"grad_norm": 20.375,
"learning_rate": 9.224760451373575e-05,
"loss": 0.044921875,
"memory(GiB)": 43.05,
"step": 3405,
"train_speed(iter/s)": 0.669685
},
{
"epoch": 2.2099805573558005,
"grad_norm": 1.5078125,
"learning_rate": 9.221892260460467e-05,
"loss": 0.03828125,
"memory(GiB)": 43.05,
"step": 3410,
"train_speed(iter/s)": 0.670045
},
{
"epoch": 2.2132209980557356,
"grad_norm": 10.0625,
"learning_rate": 9.219019221158729e-05,
"loss": 0.047265625,
"memory(GiB)": 43.05,
"step": 3415,
"train_speed(iter/s)": 0.670012
},
{
"epoch": 2.2164614387556707,
"grad_norm": 12.5625,
"learning_rate": 9.216141336767738e-05,
"loss": 0.041015625,
"memory(GiB)": 43.05,
"step": 3420,
"train_speed(iter/s)": 0.669962
},
{
"epoch": 2.219701879455606,
"grad_norm": 4.375,
"learning_rate": 9.213258610592435e-05,
"loss": 0.03974609375,
"memory(GiB)": 43.05,
"step": 3425,
"train_speed(iter/s)": 0.67006
},
{
"epoch": 2.222942320155541,
"grad_norm": 3.265625,
"learning_rate": 9.210371045943318e-05,
"loss": 0.04013671875,
"memory(GiB)": 43.05,
"step": 3430,
"train_speed(iter/s)": 0.670476
},
{
"epoch": 2.2261827608554765,
"grad_norm": 13.875,
"learning_rate": 9.207478646136447e-05,
"loss": 0.0375,
"memory(GiB)": 43.05,
"step": 3435,
"train_speed(iter/s)": 0.670632
},
{
"epoch": 2.2294232015554116,
"grad_norm": 5.90625,
"learning_rate": 9.204581414493432e-05,
"loss": 0.0474609375,
"memory(GiB)": 43.05,
"step": 3440,
"train_speed(iter/s)": 0.670764
},
{
"epoch": 2.2326636422553467,
"grad_norm": 8.25,
"learning_rate": 9.201679354341428e-05,
"loss": 0.055078125,
"memory(GiB)": 43.05,
"step": 3445,
"train_speed(iter/s)": 0.671001
},
{
"epoch": 2.235904082955282,
"grad_norm": 16.0,
"learning_rate": 9.198772469013142e-05,
"loss": 0.0447265625,
"memory(GiB)": 43.05,
"step": 3450,
"train_speed(iter/s)": 0.67115
},
{
"epoch": 2.239144523655217,
"grad_norm": 0.49609375,
"learning_rate": 9.195860761846817e-05,
"loss": 0.0419921875,
"memory(GiB)": 43.05,
"step": 3455,
"train_speed(iter/s)": 0.671323
},
{
"epoch": 2.242384964355152,
"grad_norm": 1.796875,
"learning_rate": 9.192944236186236e-05,
"loss": 0.024609375,
"memory(GiB)": 43.05,
"step": 3460,
"train_speed(iter/s)": 0.671383
},
{
"epoch": 2.2456254050550877,
"grad_norm": 0.54296875,
"learning_rate": 9.190022895380714e-05,
"loss": 0.0396484375,
"memory(GiB)": 43.05,
"step": 3465,
"train_speed(iter/s)": 0.671342
},
{
"epoch": 2.248865845755023,
"grad_norm": 12.4375,
"learning_rate": 9.187096742785098e-05,
"loss": 0.0736328125,
"memory(GiB)": 43.05,
"step": 3470,
"train_speed(iter/s)": 0.671551
},
{
"epoch": 2.252106286454958,
"grad_norm": 18.0,
"learning_rate": 9.184165781759757e-05,
"loss": 0.0462890625,
"memory(GiB)": 43.05,
"step": 3475,
"train_speed(iter/s)": 0.671532
},
{
"epoch": 2.255346727154893,
"grad_norm": 2.578125,
"learning_rate": 9.181230015670583e-05,
"loss": 0.0369140625,
"memory(GiB)": 43.05,
"step": 3480,
"train_speed(iter/s)": 0.671873
},
{
"epoch": 2.258587167854828,
"grad_norm": 0.78515625,
"learning_rate": 9.178289447888992e-05,
"loss": 0.0486328125,
"memory(GiB)": 43.05,
"step": 3485,
"train_speed(iter/s)": 0.671924
},
{
"epoch": 2.2618276085547633,
"grad_norm": 10.6875,
"learning_rate": 9.175344081791906e-05,
"loss": 0.0341796875,
"memory(GiB)": 43.05,
"step": 3490,
"train_speed(iter/s)": 0.672205
},
{
"epoch": 2.2650680492546984,
"grad_norm": 1.5859375,
"learning_rate": 9.17239392076176e-05,
"loss": 0.04609375,
"memory(GiB)": 43.05,
"step": 3495,
"train_speed(iter/s)": 0.672161
},
{
"epoch": 2.268308489954634,
"grad_norm": 17.875,
"learning_rate": 9.169438968186499e-05,
"loss": 0.0375,
"memory(GiB)": 43.05,
"step": 3500,
"train_speed(iter/s)": 0.672368
},
{
"epoch": 2.271548930654569,
"grad_norm": 10.6875,
"learning_rate": 9.166479227459567e-05,
"loss": 0.0412109375,
"memory(GiB)": 43.05,
"step": 3505,
"train_speed(iter/s)": 0.67252
},
{
"epoch": 2.274789371354504,
"grad_norm": 2.96875,
"learning_rate": 9.163514701979904e-05,
"loss": 0.0337890625,
"memory(GiB)": 43.05,
"step": 3510,
"train_speed(iter/s)": 0.672228
},
{
"epoch": 2.2780298120544393,
"grad_norm": 8.5,
"learning_rate": 9.160545395151955e-05,
"loss": 0.0291015625,
"memory(GiB)": 43.05,
"step": 3515,
"train_speed(iter/s)": 0.67226
},
{
"epoch": 2.2812702527543745,
"grad_norm": 3.96875,
"learning_rate": 9.157571310385644e-05,
"loss": 0.0470703125,
"memory(GiB)": 43.05,
"step": 3520,
"train_speed(iter/s)": 0.672291
},
{
"epoch": 2.28451069345431,
"grad_norm": 10.5625,
"learning_rate": 9.154592451096388e-05,
"loss": 0.0546875,
"memory(GiB)": 43.05,
"step": 3525,
"train_speed(iter/s)": 0.672127
},
{
"epoch": 2.287751134154245,
"grad_norm": 11.4375,
"learning_rate": 9.151608820705087e-05,
"loss": 0.026171875,
"memory(GiB)": 43.05,
"step": 3530,
"train_speed(iter/s)": 0.672413
},
{
"epoch": 2.2909915748541803,
"grad_norm": 2.984375,
"learning_rate": 9.148620422638119e-05,
"loss": 0.041015625,
"memory(GiB)": 43.05,
"step": 3535,
"train_speed(iter/s)": 0.672445
},
{
"epoch": 2.2942320155541154,
"grad_norm": 9.875,
"learning_rate": 9.145627260327338e-05,
"loss": 0.0478515625,
"memory(GiB)": 43.05,
"step": 3540,
"train_speed(iter/s)": 0.6724
},
{
"epoch": 2.2974724562540505,
"grad_norm": 0.51953125,
"learning_rate": 9.142629337210066e-05,
"loss": 0.0470703125,
"memory(GiB)": 43.05,
"step": 3545,
"train_speed(iter/s)": 0.672421
},
{
"epoch": 2.3007128969539856,
"grad_norm": 10.0,
"learning_rate": 9.139626656729099e-05,
"loss": 0.034765625,
"memory(GiB)": 43.05,
"step": 3550,
"train_speed(iter/s)": 0.672505
},
{
"epoch": 2.3039533376539207,
"grad_norm": 6.78125,
"learning_rate": 9.136619222332687e-05,
"loss": 0.02265625,
"memory(GiB)": 43.05,
"step": 3555,
"train_speed(iter/s)": 0.672758
},
{
"epoch": 2.3071937783538563,
"grad_norm": 12.5625,
"learning_rate": 9.13360703747455e-05,
"loss": 0.045703125,
"memory(GiB)": 43.05,
"step": 3560,
"train_speed(iter/s)": 0.672888
},
{
"epoch": 2.3104342190537914,
"grad_norm": 11.1875,
"learning_rate": 9.130590105613854e-05,
"loss": 0.0228515625,
"memory(GiB)": 43.05,
"step": 3565,
"train_speed(iter/s)": 0.673289
},
{
"epoch": 2.3136746597537265,
"grad_norm": 0.6796875,
"learning_rate": 9.127568430215222e-05,
"loss": 0.0341796875,
"memory(GiB)": 43.05,
"step": 3570,
"train_speed(iter/s)": 0.67355
},
{
"epoch": 2.3169151004536617,
"grad_norm": 8.4375,
"learning_rate": 9.124542014748723e-05,
"loss": 0.0236328125,
"memory(GiB)": 43.05,
"step": 3575,
"train_speed(iter/s)": 0.673451
},
{
"epoch": 2.320155541153597,
"grad_norm": 1.53125,
"learning_rate": 9.121510862689868e-05,
"loss": 0.028515625,
"memory(GiB)": 43.05,
"step": 3580,
"train_speed(iter/s)": 0.673597
},
{
"epoch": 2.323395981853532,
"grad_norm": 10.875,
"learning_rate": 9.118474977519611e-05,
"loss": 0.0349609375,
"memory(GiB)": 43.05,
"step": 3585,
"train_speed(iter/s)": 0.673586
},
{
"epoch": 2.3266364225534675,
"grad_norm": 2.671875,
"learning_rate": 9.115434362724337e-05,
"loss": 0.0501953125,
"memory(GiB)": 43.05,
"step": 3590,
"train_speed(iter/s)": 0.673664
},
{
"epoch": 2.3298768632534026,
"grad_norm": 16.5,
"learning_rate": 9.112389021795865e-05,
"loss": 0.0392578125,
"memory(GiB)": 43.05,
"step": 3595,
"train_speed(iter/s)": 0.673618
},
{
"epoch": 2.3331173039533377,
"grad_norm": 10.375,
"learning_rate": 9.109338958231441e-05,
"loss": 0.0291015625,
"memory(GiB)": 43.05,
"step": 3600,
"train_speed(iter/s)": 0.673742
},
{
"epoch": 2.336357744653273,
"grad_norm": 3.3125,
"learning_rate": 9.106284175533737e-05,
"loss": 0.0326171875,
"memory(GiB)": 43.05,
"step": 3605,
"train_speed(iter/s)": 0.673942
},
{
"epoch": 2.339598185353208,
"grad_norm": 11.75,
"learning_rate": 9.10322467721084e-05,
"loss": 0.03203125,
"memory(GiB)": 43.05,
"step": 3610,
"train_speed(iter/s)": 0.674325
},
{
"epoch": 2.342838626053143,
"grad_norm": 2.5625,
"learning_rate": 9.100160466776252e-05,
"loss": 0.03203125,
"memory(GiB)": 43.05,
"step": 3615,
"train_speed(iter/s)": 0.674276
},
{
"epoch": 2.346079066753078,
"grad_norm": 6.9375,
"learning_rate": 9.097091547748893e-05,
"loss": 0.0228515625,
"memory(GiB)": 43.05,
"step": 3620,
"train_speed(iter/s)": 0.674489
},
{
"epoch": 2.3493195074530138,
"grad_norm": 10.5625,
"learning_rate": 9.094017923653084e-05,
"loss": 0.0455078125,
"memory(GiB)": 43.05,
"step": 3625,
"train_speed(iter/s)": 0.67489
},
{
"epoch": 2.352559948152949,
"grad_norm": 3.359375,
"learning_rate": 9.090939598018551e-05,
"loss": 0.0244140625,
"memory(GiB)": 43.05,
"step": 3630,
"train_speed(iter/s)": 0.675088
},
{
"epoch": 2.355800388852884,
"grad_norm": 11.4375,
"learning_rate": 9.08785657438042e-05,
"loss": 0.0556640625,
"memory(GiB)": 43.05,
"step": 3635,
"train_speed(iter/s)": 0.675286
},
{
"epoch": 2.359040829552819,
"grad_norm": 12.6875,
"learning_rate": 9.084768856279212e-05,
"loss": 0.03125,
"memory(GiB)": 43.05,
"step": 3640,
"train_speed(iter/s)": 0.675561
},
{
"epoch": 2.3622812702527543,
"grad_norm": 10.125,
"learning_rate": 9.081676447260838e-05,
"loss": 0.0291015625,
"memory(GiB)": 43.05,
"step": 3645,
"train_speed(iter/s)": 0.675396
},
{
"epoch": 2.3655217109526894,
"grad_norm": 10.1875,
"learning_rate": 9.078579350876597e-05,
"loss": 0.037109375,
"memory(GiB)": 43.05,
"step": 3650,
"train_speed(iter/s)": 0.675615
},
{
"epoch": 2.368762151652625,
"grad_norm": 1.2578125,
"learning_rate": 9.075477570683171e-05,
"loss": 0.0353515625,
"memory(GiB)": 43.05,
"step": 3655,
"train_speed(iter/s)": 0.675357
},
{
"epoch": 2.37200259235256,
"grad_norm": 4.125,
"learning_rate": 9.072371110242622e-05,
"loss": 0.0337890625,
"memory(GiB)": 43.05,
"step": 3660,
"train_speed(iter/s)": 0.675373
},
{
"epoch": 2.375243033052495,
"grad_norm": 9.6875,
"learning_rate": 9.069259973122382e-05,
"loss": 0.0751953125,
"memory(GiB)": 43.05,
"step": 3665,
"train_speed(iter/s)": 0.675584
},
{
"epoch": 2.3784834737524303,
"grad_norm": 3.484375,
"learning_rate": 9.066144162895258e-05,
"loss": 0.029296875,
"memory(GiB)": 43.05,
"step": 3670,
"train_speed(iter/s)": 0.675409
},
{
"epoch": 2.3817239144523654,
"grad_norm": 13.25,
"learning_rate": 9.063023683139425e-05,
"loss": 0.031640625,
"memory(GiB)": 43.05,
"step": 3675,
"train_speed(iter/s)": 0.675653
},
{
"epoch": 2.3849643551523005,
"grad_norm": 0.6796875,
"learning_rate": 9.059898537438415e-05,
"loss": 0.02578125,
"memory(GiB)": 43.05,
"step": 3680,
"train_speed(iter/s)": 0.675869
},
{
"epoch": 2.3882047958522357,
"grad_norm": 5.21875,
"learning_rate": 9.056768729381122e-05,
"loss": 0.052734375,
"memory(GiB)": 43.05,
"step": 3685,
"train_speed(iter/s)": 0.676039
},
{
"epoch": 2.3914452365521712,
"grad_norm": 15.3125,
"learning_rate": 9.053634262561794e-05,
"loss": 0.0232421875,
"memory(GiB)": 43.05,
"step": 3690,
"train_speed(iter/s)": 0.675961
},
{
"epoch": 2.3946856772521063,
"grad_norm": 4.90625,
"learning_rate": 9.050495140580029e-05,
"loss": 0.040625,
"memory(GiB)": 43.05,
"step": 3695,
"train_speed(iter/s)": 0.676292
},
{
"epoch": 2.3979261179520415,
"grad_norm": 15.375,
"learning_rate": 9.047351367040771e-05,
"loss": 0.0431640625,
"memory(GiB)": 43.05,
"step": 3700,
"train_speed(iter/s)": 0.676121
},
{
"epoch": 2.4011665586519766,
"grad_norm": 10.375,
"learning_rate": 9.044202945554302e-05,
"loss": 0.0333984375,
"memory(GiB)": 43.05,
"step": 3705,
"train_speed(iter/s)": 0.676301
},
{
"epoch": 2.4044069993519117,
"grad_norm": 14.625,
"learning_rate": 9.041049879736251e-05,
"loss": 0.0400390625,
"memory(GiB)": 43.05,
"step": 3710,
"train_speed(iter/s)": 0.676253
},
{
"epoch": 2.4076474400518473,
"grad_norm": 12.9375,
"learning_rate": 9.03789217320757e-05,
"loss": 0.0236328125,
"memory(GiB)": 43.05,
"step": 3715,
"train_speed(iter/s)": 0.676346
},
{
"epoch": 2.4108878807517824,
"grad_norm": 13.5625,
"learning_rate": 9.034729829594543e-05,
"loss": 0.0296875,
"memory(GiB)": 43.05,
"step": 3720,
"train_speed(iter/s)": 0.676561
},
{
"epoch": 2.4141283214517175,
"grad_norm": 4.75,
"learning_rate": 9.031562852528788e-05,
"loss": 0.0267578125,
"memory(GiB)": 43.05,
"step": 3725,
"train_speed(iter/s)": 0.67666
},
{
"epoch": 2.4173687621516526,
"grad_norm": 0.546875,
"learning_rate": 9.028391245647232e-05,
"loss": 0.03046875,
"memory(GiB)": 43.05,
"step": 3730,
"train_speed(iter/s)": 0.676936
},
{
"epoch": 2.4206092028515878,
"grad_norm": 0.65625,
"learning_rate": 9.025215012592123e-05,
"loss": 0.03212890625,
"memory(GiB)": 43.05,
"step": 3735,
"train_speed(iter/s)": 0.677255
},
{
"epoch": 2.423849643551523,
"grad_norm": 12.5,
"learning_rate": 9.022034157011028e-05,
"loss": 0.0435546875,
"memory(GiB)": 43.05,
"step": 3740,
"train_speed(iter/s)": 0.677439
},
{
"epoch": 2.427090084251458,
"grad_norm": 2.5,
"learning_rate": 9.018848682556812e-05,
"loss": 0.036328125,
"memory(GiB)": 43.05,
"step": 3745,
"train_speed(iter/s)": 0.677526
},
{
"epoch": 2.4303305249513936,
"grad_norm": 11.8125,
"learning_rate": 9.015658592887653e-05,
"loss": 0.0548828125,
"memory(GiB)": 43.05,
"step": 3750,
"train_speed(iter/s)": 0.677739
},
{
"epoch": 2.4335709656513287,
"grad_norm": 4.8125,
"learning_rate": 9.012463891667023e-05,
"loss": 0.024609375,
"memory(GiB)": 43.05,
"step": 3755,
"train_speed(iter/s)": 0.678015
},
{
"epoch": 2.436811406351264,
"grad_norm": 3.625,
"learning_rate": 9.009264582563691e-05,
"loss": 0.0564453125,
"memory(GiB)": 43.05,
"step": 3760,
"train_speed(iter/s)": 0.678127
},
{
"epoch": 2.440051847051199,
"grad_norm": 15.875,
"learning_rate": 9.006060669251723e-05,
"loss": 0.03125,
"memory(GiB)": 43.05,
"step": 3765,
"train_speed(iter/s)": 0.677994
},
{
"epoch": 2.443292287751134,
"grad_norm": 0.83203125,
"learning_rate": 9.002852155410466e-05,
"loss": 0.0708984375,
"memory(GiB)": 43.05,
"step": 3770,
"train_speed(iter/s)": 0.678195
},
{
"epoch": 2.446532728451069,
"grad_norm": 11.5625,
"learning_rate": 8.999639044724555e-05,
"loss": 0.041796875,
"memory(GiB)": 43.05,
"step": 3775,
"train_speed(iter/s)": 0.678062
},
{
"epoch": 2.4497731691510047,
"grad_norm": 2.65625,
"learning_rate": 8.996421340883898e-05,
"loss": 0.05234375,
"memory(GiB)": 43.05,
"step": 3780,
"train_speed(iter/s)": 0.678184
},
{
"epoch": 2.45301360985094,
"grad_norm": 8.6875,
"learning_rate": 8.993199047583682e-05,
"loss": 0.042578125,
"memory(GiB)": 43.05,
"step": 3785,
"train_speed(iter/s)": 0.678383
},
{
"epoch": 2.456254050550875,
"grad_norm": 12.1875,
"learning_rate": 8.989972168524367e-05,
"loss": 0.0357421875,
"memory(GiB)": 43.05,
"step": 3790,
"train_speed(iter/s)": 0.678511
},
{
"epoch": 2.45949449125081,
"grad_norm": 3.59375,
"learning_rate": 8.986740707411674e-05,
"loss": 0.03046875,
"memory(GiB)": 43.05,
"step": 3795,
"train_speed(iter/s)": 0.678768
},
{
"epoch": 2.462734931950745,
"grad_norm": 2.578125,
"learning_rate": 8.983504667956588e-05,
"loss": 0.0583984375,
"memory(GiB)": 43.05,
"step": 3800,
"train_speed(iter/s)": 0.67896
},
{
"epoch": 2.4659753726506803,
"grad_norm": 10.75,
"learning_rate": 8.980264053875353e-05,
"loss": 0.0341796875,
"memory(GiB)": 43.05,
"step": 3805,
"train_speed(iter/s)": 0.679142
},
{
"epoch": 2.4692158133506155,
"grad_norm": 8.9375,
"learning_rate": 8.977018868889463e-05,
"loss": 0.023046875,
"memory(GiB)": 43.05,
"step": 3810,
"train_speed(iter/s)": 0.679118
},
{
"epoch": 2.472456254050551,
"grad_norm": 15.375,
"learning_rate": 8.973769116725666e-05,
"loss": 0.0537109375,
"memory(GiB)": 43.05,
"step": 3815,
"train_speed(iter/s)": 0.679129
},
{
"epoch": 2.475696694750486,
"grad_norm": 9.0,
"learning_rate": 8.97051480111595e-05,
"loss": 0.0283203125,
"memory(GiB)": 43.05,
"step": 3820,
"train_speed(iter/s)": 0.679249
},
{
"epoch": 2.4789371354504213,
"grad_norm": 10.5625,
"learning_rate": 8.967255925797549e-05,
"loss": 0.030078125,
"memory(GiB)": 43.05,
"step": 3825,
"train_speed(iter/s)": 0.679519
},
{
"epoch": 2.4821775761503564,
"grad_norm": 2.0,
"learning_rate": 8.963992494512928e-05,
"loss": 0.0267578125,
"memory(GiB)": 43.05,
"step": 3830,
"train_speed(iter/s)": 0.679637
},
{
"epoch": 2.4854180168502915,
"grad_norm": 12.25,
"learning_rate": 8.960724511009787e-05,
"loss": 0.0296875,
"memory(GiB)": 43.05,
"step": 3835,
"train_speed(iter/s)": 0.679944
},
{
"epoch": 2.488658457550227,
"grad_norm": 3.53125,
"learning_rate": 8.957451979041052e-05,
"loss": 0.0517578125,
"memory(GiB)": 43.05,
"step": 3840,
"train_speed(iter/s)": 0.679987
},
{
"epoch": 2.491898898250162,
"grad_norm": 1.9296875,
"learning_rate": 8.954174902364876e-05,
"loss": 0.0484375,
"memory(GiB)": 43.05,
"step": 3845,
"train_speed(iter/s)": 0.679953
},
{
"epoch": 2.4951393389500973,
"grad_norm": 0.90625,
"learning_rate": 8.950893284744629e-05,
"loss": 0.0509765625,
"memory(GiB)": 43.05,
"step": 3850,
"train_speed(iter/s)": 0.679988
},
{
"epoch": 2.4983797796500324,
"grad_norm": 11.8125,
"learning_rate": 8.947607129948892e-05,
"loss": 0.036328125,
"memory(GiB)": 43.05,
"step": 3855,
"train_speed(iter/s)": 0.680303
},
{
"epoch": 2.5016202203499676,
"grad_norm": 15.125,
"learning_rate": 8.944316441751461e-05,
"loss": 0.0427734375,
"memory(GiB)": 43.05,
"step": 3860,
"train_speed(iter/s)": 0.680617
},
{
"epoch": 2.5048606610499027,
"grad_norm": 13.625,
"learning_rate": 8.94102122393134e-05,
"loss": 0.0349609375,
"memory(GiB)": 43.05,
"step": 3865,
"train_speed(iter/s)": 0.680536
},
{
"epoch": 2.508101101749838,
"grad_norm": 11.75,
"learning_rate": 8.937721480272729e-05,
"loss": 0.043359375,
"memory(GiB)": 43.05,
"step": 3870,
"train_speed(iter/s)": 0.680779
},
{
"epoch": 2.511341542449773,
"grad_norm": 10.3125,
"learning_rate": 8.934417214565029e-05,
"loss": 0.06328125,
"memory(GiB)": 43.05,
"step": 3875,
"train_speed(iter/s)": 0.680895
},
{
"epoch": 2.5145819831497085,
"grad_norm": 15.25,
"learning_rate": 8.931108430602834e-05,
"loss": 0.05546875,
"memory(GiB)": 43.05,
"step": 3880,
"train_speed(iter/s)": 0.680929
},
{
"epoch": 2.5178224238496436,
"grad_norm": 4.9375,
"learning_rate": 8.927795132185925e-05,
"loss": 0.038671875,
"memory(GiB)": 43.05,
"step": 3885,
"train_speed(iter/s)": 0.680959
},
{
"epoch": 2.5210628645495787,
"grad_norm": 11.9375,
"learning_rate": 8.924477323119269e-05,
"loss": 0.03671875,
"memory(GiB)": 43.05,
"step": 3890,
"train_speed(iter/s)": 0.681138
},
{
"epoch": 2.524303305249514,
"grad_norm": 13.4375,
"learning_rate": 8.921155007213012e-05,
"loss": 0.0451171875,
"memory(GiB)": 43.05,
"step": 3895,
"train_speed(iter/s)": 0.681258
},
{
"epoch": 2.527543745949449,
"grad_norm": 11.4375,
"learning_rate": 8.917828188282476e-05,
"loss": 0.05234375,
"memory(GiB)": 43.05,
"step": 3900,
"train_speed(iter/s)": 0.681412
},
{
"epoch": 2.5307841866493845,
"grad_norm": 11.875,
"learning_rate": 8.914496870148156e-05,
"loss": 0.0517578125,
"memory(GiB)": 43.05,
"step": 3905,
"train_speed(iter/s)": 0.681379
},
{
"epoch": 2.5340246273493197,
"grad_norm": 12.875,
"learning_rate": 8.911161056635711e-05,
"loss": 0.048828125,
"memory(GiB)": 43.05,
"step": 3910,
"train_speed(iter/s)": 0.681674
},
{
"epoch": 2.537265068049255,
"grad_norm": 11.0,
"learning_rate": 8.907820751575961e-05,
"loss": 0.054296875,
"memory(GiB)": 43.05,
"step": 3915,
"train_speed(iter/s)": 0.681966
},
{
"epoch": 2.54050550874919,
"grad_norm": 7.0,
"learning_rate": 8.90447595880489e-05,
"loss": 0.0283203125,
"memory(GiB)": 43.05,
"step": 3920,
"train_speed(iter/s)": 0.682085
},
{
"epoch": 2.543745949449125,
"grad_norm": 12.9375,
"learning_rate": 8.901126682163632e-05,
"loss": 0.0365234375,
"memory(GiB)": 43.05,
"step": 3925,
"train_speed(iter/s)": 0.682319
},
{
"epoch": 2.54698639014906,
"grad_norm": 11.9375,
"learning_rate": 8.897772925498471e-05,
"loss": 0.050390625,
"memory(GiB)": 43.05,
"step": 3930,
"train_speed(iter/s)": 0.682379
},
{
"epoch": 2.5502268308489953,
"grad_norm": 1.1328125,
"learning_rate": 8.894414692660833e-05,
"loss": 0.0220703125,
"memory(GiB)": 43.05,
"step": 3935,
"train_speed(iter/s)": 0.682567
},
{
"epoch": 2.5534672715489304,
"grad_norm": 3.828125,
"learning_rate": 8.891051987507288e-05,
"loss": 0.0375,
"memory(GiB)": 43.05,
"step": 3940,
"train_speed(iter/s)": 0.68293
},
{
"epoch": 2.556707712248866,
"grad_norm": 1.09375,
"learning_rate": 8.887684813899542e-05,
"loss": 0.0447265625,
"memory(GiB)": 43.05,
"step": 3945,
"train_speed(iter/s)": 0.682891
},
{
"epoch": 2.559948152948801,
"grad_norm": 9.375,
"learning_rate": 8.884313175704428e-05,
"loss": 0.0466796875,
"memory(GiB)": 43.05,
"step": 3950,
"train_speed(iter/s)": 0.683066
},
{
"epoch": 2.563188593648736,
"grad_norm": 5.28125,
"learning_rate": 8.880937076793913e-05,
"loss": 0.0275390625,
"memory(GiB)": 43.05,
"step": 3955,
"train_speed(iter/s)": 0.683213
},
{
"epoch": 2.5664290343486713,
"grad_norm": 4.84375,
"learning_rate": 8.877556521045083e-05,
"loss": 0.0189453125,
"memory(GiB)": 43.05,
"step": 3960,
"train_speed(iter/s)": 0.68347
},
{
"epoch": 2.569669475048607,
"grad_norm": 4.9375,
"learning_rate": 8.87417151234014e-05,
"loss": 0.0298828125,
"memory(GiB)": 43.05,
"step": 3965,
"train_speed(iter/s)": 0.68316
},
{
"epoch": 2.572909915748542,
"grad_norm": 12.1875,
"learning_rate": 8.8707820545664e-05,
"loss": 0.0419921875,
"memory(GiB)": 43.05,
"step": 3970,
"train_speed(iter/s)": 0.683271
},
{
"epoch": 2.576150356448477,
"grad_norm": 14.9375,
"learning_rate": 8.867388151616296e-05,
"loss": 0.041796875,
"memory(GiB)": 43.05,
"step": 3975,
"train_speed(iter/s)": 0.683173
},
{
"epoch": 2.5793907971484122,
"grad_norm": 13.875,
"learning_rate": 8.863989807387356e-05,
"loss": 0.035546875,
"memory(GiB)": 43.05,
"step": 3980,
"train_speed(iter/s)": 0.683093
},
{
"epoch": 2.5826312378483474,
"grad_norm": 15.375,
"learning_rate": 8.860587025782214e-05,
"loss": 0.0546875,
"memory(GiB)": 43.05,
"step": 3985,
"train_speed(iter/s)": 0.683032
},
{
"epoch": 2.5858716785482825,
"grad_norm": 11.6875,
"learning_rate": 8.857179810708598e-05,
"loss": 0.0220703125,
"memory(GiB)": 43.05,
"step": 3990,
"train_speed(iter/s)": 0.683199
},
{
"epoch": 2.5891121192482176,
"grad_norm": 2.421875,
"learning_rate": 8.853768166079328e-05,
"loss": 0.0435546875,
"memory(GiB)": 43.05,
"step": 3995,
"train_speed(iter/s)": 0.682988
},
{
"epoch": 2.5923525599481527,
"grad_norm": 0.6171875,
"learning_rate": 8.850352095812309e-05,
"loss": 0.0236328125,
"memory(GiB)": 43.05,
"step": 4000,
"train_speed(iter/s)": 0.683171
},
{
"epoch": 2.5955930006480883,
"grad_norm": 12.125,
"learning_rate": 8.84693160383053e-05,
"loss": 0.03359375,
"memory(GiB)": 43.05,
"step": 4005,
"train_speed(iter/s)": 0.613524
},
{
"epoch": 2.5988334413480234,
"grad_norm": 1.1171875,
"learning_rate": 8.84350669406206e-05,
"loss": 0.0375,
"memory(GiB)": 43.05,
"step": 4010,
"train_speed(iter/s)": 0.613712
},
{
"epoch": 2.6020738820479585,
"grad_norm": 12.8125,
"learning_rate": 8.840077370440039e-05,
"loss": 0.0294921875,
"memory(GiB)": 43.05,
"step": 4015,
"train_speed(iter/s)": 0.613548
},
{
"epoch": 2.6053143227478937,
"grad_norm": 12.0,
"learning_rate": 8.83664363690267e-05,
"loss": 0.041796875,
"memory(GiB)": 43.05,
"step": 4020,
"train_speed(iter/s)": 0.61374
},
{
"epoch": 2.6085547634478288,
"grad_norm": 0.8984375,
"learning_rate": 8.833205497393234e-05,
"loss": 0.0263671875,
"memory(GiB)": 43.05,
"step": 4025,
"train_speed(iter/s)": 0.614105
},
{
"epoch": 2.6117952041477643,
"grad_norm": 16.75,
"learning_rate": 8.82976295586006e-05,
"loss": 0.0185546875,
"memory(GiB)": 43.05,
"step": 4030,
"train_speed(iter/s)": 0.61416
},
{
"epoch": 2.6150356448476995,
"grad_norm": 16.0,
"learning_rate": 8.826316016256536e-05,
"loss": 0.025,
"memory(GiB)": 43.05,
"step": 4035,
"train_speed(iter/s)": 0.614079
},
{
"epoch": 2.6182760855476346,
"grad_norm": 12.1875,
"learning_rate": 8.822864682541103e-05,
"loss": 0.0345703125,
"memory(GiB)": 43.05,
"step": 4040,
"train_speed(iter/s)": 0.614213
},
{
"epoch": 2.6215165262475697,
"grad_norm": 1.734375,
"learning_rate": 8.819408958677244e-05,
"loss": 0.04765625,
"memory(GiB)": 43.05,
"step": 4045,
"train_speed(iter/s)": 0.614431
},
{
"epoch": 2.624756966947505,
"grad_norm": 8.25,
"learning_rate": 8.815948848633487e-05,
"loss": 0.0603515625,
"memory(GiB)": 43.05,
"step": 4050,
"train_speed(iter/s)": 0.61471
},
{
"epoch": 2.62799740764744,
"grad_norm": 2.796875,
"learning_rate": 8.812484356383396e-05,
"loss": 0.0298828125,
"memory(GiB)": 43.05,
"step": 4055,
"train_speed(iter/s)": 0.614926
},
{
"epoch": 2.631237848347375,
"grad_norm": 2.640625,
"learning_rate": 8.809015485905565e-05,
"loss": 0.0337890625,
"memory(GiB)": 43.05,
"step": 4060,
"train_speed(iter/s)": 0.615193
},
{
"epoch": 2.63447828904731,
"grad_norm": 1.1875,
"learning_rate": 8.805542241183622e-05,
"loss": 0.0298828125,
"memory(GiB)": 43.05,
"step": 4065,
"train_speed(iter/s)": 0.615423
},
{
"epoch": 2.6377187297472457,
"grad_norm": 12.8125,
"learning_rate": 8.80206462620621e-05,
"loss": 0.0322265625,
"memory(GiB)": 43.05,
"step": 4070,
"train_speed(iter/s)": 0.61546
},
{
"epoch": 2.640959170447181,
"grad_norm": 3.265625,
"learning_rate": 8.798582644967e-05,
"loss": 0.0375,
"memory(GiB)": 43.05,
"step": 4075,
"train_speed(iter/s)": 0.615646
},
{
"epoch": 2.644199611147116,
"grad_norm": 10.625,
"learning_rate": 8.795096301464669e-05,
"loss": 0.0298828125,
"memory(GiB)": 43.05,
"step": 4080,
"train_speed(iter/s)": 0.615999
},
{
"epoch": 2.647440051847051,
"grad_norm": 14.9375,
"learning_rate": 8.79160559970291e-05,
"loss": 0.0322265625,
"memory(GiB)": 43.05,
"step": 4085,
"train_speed(iter/s)": 0.616085
},
{
"epoch": 2.6506804925469862,
"grad_norm": 0.66015625,
"learning_rate": 8.788110543690416e-05,
"loss": 0.029296875,
"memory(GiB)": 43.05,
"step": 4090,
"train_speed(iter/s)": 0.616129
},
{
"epoch": 2.653920933246922,
"grad_norm": 15.1875,
"learning_rate": 8.784611137440881e-05,
"loss": 0.033203125,
"memory(GiB)": 43.05,
"step": 4095,
"train_speed(iter/s)": 0.616351
},
{
"epoch": 2.657161373946857,
"grad_norm": 0.91015625,
"learning_rate": 8.781107384972999e-05,
"loss": 0.037109375,
"memory(GiB)": 43.05,
"step": 4100,
"train_speed(iter/s)": 0.616673
},
{
"epoch": 2.660401814646792,
"grad_norm": 10.625,
"learning_rate": 8.777599290310454e-05,
"loss": 0.0484375,
"memory(GiB)": 43.05,
"step": 4105,
"train_speed(iter/s)": 0.616658
},
{
"epoch": 2.663642255346727,
"grad_norm": 14.9375,
"learning_rate": 8.77408685748191e-05,
"loss": 0.027734375,
"memory(GiB)": 43.05,
"step": 4110,
"train_speed(iter/s)": 0.616972
},
{
"epoch": 2.6668826960466623,
"grad_norm": 0.96484375,
"learning_rate": 8.77057009052102e-05,
"loss": 0.0513671875,
"memory(GiB)": 43.05,
"step": 4115,
"train_speed(iter/s)": 0.616998
},
{
"epoch": 2.6701231367465974,
"grad_norm": 1.203125,
"learning_rate": 8.767048993466413e-05,
"loss": 0.03984375,
"memory(GiB)": 43.05,
"step": 4120,
"train_speed(iter/s)": 0.617305
},
{
"epoch": 2.6733635774465325,
"grad_norm": 10.75,
"learning_rate": 8.763523570361691e-05,
"loss": 0.016796875,
"memory(GiB)": 43.05,
"step": 4125,
"train_speed(iter/s)": 0.617432
},
{
"epoch": 2.6766040181464676,
"grad_norm": 16.25,
"learning_rate": 8.75999382525542e-05,
"loss": 0.0404296875,
"memory(GiB)": 43.05,
"step": 4130,
"train_speed(iter/s)": 0.61743
},
{
"epoch": 2.679844458846403,
"grad_norm": 13.3125,
"learning_rate": 8.756459762201133e-05,
"loss": 0.03046875,
"memory(GiB)": 43.05,
"step": 4135,
"train_speed(iter/s)": 0.617385
},
{
"epoch": 2.6830848995463383,
"grad_norm": 14.1875,
"learning_rate": 8.752921385257322e-05,
"loss": 0.0578125,
"memory(GiB)": 43.05,
"step": 4140,
"train_speed(iter/s)": 0.617612
},
{
"epoch": 2.6863253402462735,
"grad_norm": 14.75,
"learning_rate": 8.749378698487429e-05,
"loss": 0.0392578125,
"memory(GiB)": 43.05,
"step": 4145,
"train_speed(iter/s)": 0.617707
},
{
"epoch": 2.6895657809462086,
"grad_norm": 0.984375,
"learning_rate": 8.745831705959852e-05,
"loss": 0.037109375,
"memory(GiB)": 43.05,
"step": 4150,
"train_speed(iter/s)": 0.61787
},
{
"epoch": 2.692806221646144,
"grad_norm": 2.859375,
"learning_rate": 8.74228041174793e-05,
"loss": 0.0361328125,
"memory(GiB)": 43.05,
"step": 4155,
"train_speed(iter/s)": 0.618179
},
{
"epoch": 2.6960466623460793,
"grad_norm": 3.34375,
"learning_rate": 8.738724819929938e-05,
"loss": 0.0419921875,
"memory(GiB)": 43.05,
"step": 4160,
"train_speed(iter/s)": 0.618286
},
{
"epoch": 2.6992871030460144,
"grad_norm": 11.0625,
"learning_rate": 8.735164934589092e-05,
"loss": 0.037109375,
"memory(GiB)": 43.05,
"step": 4165,
"train_speed(iter/s)": 0.618624
},
{
"epoch": 2.7025275437459495,
"grad_norm": 3.375,
"learning_rate": 8.731600759813538e-05,
"loss": 0.0177734375,
"memory(GiB)": 43.05,
"step": 4170,
"train_speed(iter/s)": 0.618786
},
{
"epoch": 2.7057679844458846,
"grad_norm": 4.28125,
"learning_rate": 8.728032299696348e-05,
"loss": 0.042578125,
"memory(GiB)": 43.05,
"step": 4175,
"train_speed(iter/s)": 0.618852
},
{
"epoch": 2.7090084251458197,
"grad_norm": 17.375,
"learning_rate": 8.724459558335512e-05,
"loss": 0.0630859375,
"memory(GiB)": 43.05,
"step": 4180,
"train_speed(iter/s)": 0.618825
},
{
"epoch": 2.712248865845755,
"grad_norm": 8.9375,
"learning_rate": 8.72088253983394e-05,
"loss": 0.0353515625,
"memory(GiB)": 43.05,
"step": 4185,
"train_speed(iter/s)": 0.618844
},
{
"epoch": 2.71548930654569,
"grad_norm": 12.25,
"learning_rate": 8.71730124829945e-05,
"loss": 0.0630859375,
"memory(GiB)": 43.05,
"step": 4190,
"train_speed(iter/s)": 0.618973
},
{
"epoch": 2.7187297472456255,
"grad_norm": 15.5625,
"learning_rate": 8.713715687844772e-05,
"loss": 0.0265625,
"memory(GiB)": 43.05,
"step": 4195,
"train_speed(iter/s)": 0.61914
},
{
"epoch": 2.7219701879455607,
"grad_norm": 1.5,
"learning_rate": 8.710125862587537e-05,
"loss": 0.034765625,
"memory(GiB)": 43.05,
"step": 4200,
"train_speed(iter/s)": 0.619254
},
{
"epoch": 2.725210628645496,
"grad_norm": 9.0,
"learning_rate": 8.706531776650271e-05,
"loss": 0.0296875,
"memory(GiB)": 43.05,
"step": 4205,
"train_speed(iter/s)": 0.619512
},
{
"epoch": 2.728451069345431,
"grad_norm": 4.90625,
"learning_rate": 8.702933434160395e-05,
"loss": 0.0244140625,
"memory(GiB)": 43.05,
"step": 4210,
"train_speed(iter/s)": 0.619649
},
{
"epoch": 2.731691510045366,
"grad_norm": 13.875,
"learning_rate": 8.699330839250217e-05,
"loss": 0.035546875,
"memory(GiB)": 43.05,
"step": 4215,
"train_speed(iter/s)": 0.619626
},
{
"epoch": 2.7349319507453016,
"grad_norm": 8.1875,
"learning_rate": 8.69572399605693e-05,
"loss": 0.048828125,
"memory(GiB)": 43.05,
"step": 4220,
"train_speed(iter/s)": 0.619676
},
{
"epoch": 2.7381723914452367,
"grad_norm": 12.1875,
"learning_rate": 8.692112908722607e-05,
"loss": 0.019921875,
"memory(GiB)": 43.05,
"step": 4225,
"train_speed(iter/s)": 0.61986
},
{
"epoch": 2.741412832145172,
"grad_norm": 3.15625,
"learning_rate": 8.68849758139419e-05,
"loss": 0.0443359375,
"memory(GiB)": 43.05,
"step": 4230,
"train_speed(iter/s)": 0.619941
},
{
"epoch": 2.744653272845107,
"grad_norm": 9.875,
"learning_rate": 8.684878018223497e-05,
"loss": 0.0369140625,
"memory(GiB)": 43.05,
"step": 4235,
"train_speed(iter/s)": 0.620209
},
{
"epoch": 2.747893713545042,
"grad_norm": 1.640625,
"learning_rate": 8.6812542233672e-05,
"loss": 0.03984375,
"memory(GiB)": 43.05,
"step": 4240,
"train_speed(iter/s)": 0.620321
},
{
"epoch": 2.751134154244977,
"grad_norm": 2.140625,
"learning_rate": 8.677626200986844e-05,
"loss": 0.0478515625,
"memory(GiB)": 43.05,
"step": 4245,
"train_speed(iter/s)": 0.620371
},
{
"epoch": 2.7543745949449123,
"grad_norm": 1.2578125,
"learning_rate": 8.673993955248818e-05,
"loss": 0.037109375,
"memory(GiB)": 43.05,
"step": 4250,
"train_speed(iter/s)": 0.620395
},
{
"epoch": 2.7576150356448474,
"grad_norm": 2.078125,
"learning_rate": 8.670357490324365e-05,
"loss": 0.0212890625,
"memory(GiB)": 43.05,
"step": 4255,
"train_speed(iter/s)": 0.620735
},
{
"epoch": 2.760855476344783,
"grad_norm": 3.859375,
"learning_rate": 8.666716810389577e-05,
"loss": 0.0431640625,
"memory(GiB)": 43.05,
"step": 4260,
"train_speed(iter/s)": 0.620936
},
{
"epoch": 2.764095917044718,
"grad_norm": 15.25,
"learning_rate": 8.663071919625378e-05,
"loss": 0.02578125,
"memory(GiB)": 43.05,
"step": 4265,
"train_speed(iter/s)": 0.621061
},
{
"epoch": 2.7673363577446533,
"grad_norm": 9.8125,
"learning_rate": 8.659422822217536e-05,
"loss": 0.03671875,
"memory(GiB)": 43.05,
"step": 4270,
"train_speed(iter/s)": 0.621354
},
{
"epoch": 2.7705767984445884,
"grad_norm": 5.15625,
"learning_rate": 8.655769522356646e-05,
"loss": 0.0458984375,
"memory(GiB)": 43.05,
"step": 4275,
"train_speed(iter/s)": 0.621649
},
{
"epoch": 2.7738172391445235,
"grad_norm": 1.015625,
"learning_rate": 8.652112024238129e-05,
"loss": 0.0392578125,
"memory(GiB)": 43.05,
"step": 4280,
"train_speed(iter/s)": 0.621851
},
{
"epoch": 2.777057679844459,
"grad_norm": 14.75,
"learning_rate": 8.648450332062226e-05,
"loss": 0.023828125,
"memory(GiB)": 43.05,
"step": 4285,
"train_speed(iter/s)": 0.621832
},
{
"epoch": 2.780298120544394,
"grad_norm": 5.25,
"learning_rate": 8.644784450033999e-05,
"loss": 0.0302734375,
"memory(GiB)": 43.05,
"step": 4290,
"train_speed(iter/s)": 0.621946
},
{
"epoch": 2.7835385612443293,
"grad_norm": 9.25,
"learning_rate": 8.641114382363318e-05,
"loss": 0.024609375,
"memory(GiB)": 43.05,
"step": 4295,
"train_speed(iter/s)": 0.62205
},
{
"epoch": 2.7867790019442644,
"grad_norm": 0.66796875,
"learning_rate": 8.637440133264858e-05,
"loss": 0.04287109375,
"memory(GiB)": 43.05,
"step": 4300,
"train_speed(iter/s)": 0.6219
},
{
"epoch": 2.7900194426441995,
"grad_norm": 2.03125,
"learning_rate": 8.633761706958102e-05,
"loss": 0.04609375,
"memory(GiB)": 43.05,
"step": 4305,
"train_speed(iter/s)": 0.622058
},
{
"epoch": 2.7932598833441347,
"grad_norm": 3.25,
"learning_rate": 8.630079107667324e-05,
"loss": 0.0390625,
"memory(GiB)": 43.05,
"step": 4310,
"train_speed(iter/s)": 0.622265
},
{
"epoch": 2.79650032404407,
"grad_norm": 9.9375,
"learning_rate": 8.626392339621595e-05,
"loss": 0.0154296875,
"memory(GiB)": 43.05,
"step": 4315,
"train_speed(iter/s)": 0.622562
},
{
"epoch": 2.7997407647440054,
"grad_norm": 1.4375,
"learning_rate": 8.622701407054769e-05,
"loss": 0.021875,
"memory(GiB)": 43.05,
"step": 4320,
"train_speed(iter/s)": 0.622627
},
{
"epoch": 2.8029812054439405,
"grad_norm": 14.5625,
"learning_rate": 8.619006314205484e-05,
"loss": 0.0330078125,
"memory(GiB)": 43.05,
"step": 4325,
"train_speed(iter/s)": 0.622871
},
{
"epoch": 2.8062216461438756,
"grad_norm": 0.423828125,
"learning_rate": 8.61530706531716e-05,
"loss": 0.0513671875,
"memory(GiB)": 43.05,
"step": 4330,
"train_speed(iter/s)": 0.62312
},
{
"epoch": 2.8094620868438107,
"grad_norm": 11.25,
"learning_rate": 8.611603664637983e-05,
"loss": 0.05146484375,
"memory(GiB)": 43.05,
"step": 4335,
"train_speed(iter/s)": 0.623218
},
{
"epoch": 2.812702527543746,
"grad_norm": 0.578125,
"learning_rate": 8.607896116420911e-05,
"loss": 0.0650390625,
"memory(GiB)": 43.05,
"step": 4340,
"train_speed(iter/s)": 0.623092
},
{
"epoch": 2.8159429682436814,
"grad_norm": 11.0,
"learning_rate": 8.60418442492366e-05,
"loss": 0.0603515625,
"memory(GiB)": 43.05,
"step": 4345,
"train_speed(iter/s)": 0.623057
},
{
"epoch": 2.8191834089436165,
"grad_norm": 2.65625,
"learning_rate": 8.600468594408715e-05,
"loss": 0.0490234375,
"memory(GiB)": 43.05,
"step": 4350,
"train_speed(iter/s)": 0.623127
},
{
"epoch": 2.8224238496435516,
"grad_norm": 11.375,
"learning_rate": 8.596748629143302e-05,
"loss": 0.05078125,
"memory(GiB)": 43.05,
"step": 4355,
"train_speed(iter/s)": 0.623246
},
{
"epoch": 2.8256642903434868,
"grad_norm": 2.859375,
"learning_rate": 8.593024533399403e-05,
"loss": 0.0470703125,
"memory(GiB)": 43.05,
"step": 4360,
"train_speed(iter/s)": 0.623443
},
{
"epoch": 2.828904731043422,
"grad_norm": 15.0625,
"learning_rate": 8.589296311453738e-05,
"loss": 0.0439453125,
"memory(GiB)": 43.05,
"step": 4365,
"train_speed(iter/s)": 0.623511
},
{
"epoch": 2.832145171743357,
"grad_norm": 2.453125,
"learning_rate": 8.585563967587773e-05,
"loss": 0.0328125,
"memory(GiB)": 43.05,
"step": 4370,
"train_speed(iter/s)": 0.623641
},
{
"epoch": 2.835385612443292,
"grad_norm": 1.9453125,
"learning_rate": 8.581827506087699e-05,
"loss": 0.0310546875,
"memory(GiB)": 43.05,
"step": 4375,
"train_speed(iter/s)": 0.623852
},
{
"epoch": 2.8386260531432272,
"grad_norm": 15.9375,
"learning_rate": 8.578086931244443e-05,
"loss": 0.0607421875,
"memory(GiB)": 43.05,
"step": 4380,
"train_speed(iter/s)": 0.623774
},
{
"epoch": 2.841866493843163,
"grad_norm": 1.609375,
"learning_rate": 8.574342247353648e-05,
"loss": 0.04921875,
"memory(GiB)": 43.05,
"step": 4385,
"train_speed(iter/s)": 0.62397
},
{
"epoch": 2.845106934543098,
"grad_norm": 6.875,
"learning_rate": 8.570593458715683e-05,
"loss": 0.0216796875,
"memory(GiB)": 43.05,
"step": 4390,
"train_speed(iter/s)": 0.624128
},
{
"epoch": 2.848347375243033,
"grad_norm": 10.9375,
"learning_rate": 8.566840569635629e-05,
"loss": 0.0353515625,
"memory(GiB)": 43.05,
"step": 4395,
"train_speed(iter/s)": 0.624316
},
{
"epoch": 2.851587815942968,
"grad_norm": 12.3125,
"learning_rate": 8.563083584423274e-05,
"loss": 0.0423828125,
"memory(GiB)": 43.05,
"step": 4400,
"train_speed(iter/s)": 0.62439
},
{
"epoch": 2.8548282566429033,
"grad_norm": 3.9375,
"learning_rate": 8.55932250739311e-05,
"loss": 0.0357421875,
"memory(GiB)": 43.05,
"step": 4405,
"train_speed(iter/s)": 0.624502
},
{
"epoch": 2.858068697342839,
"grad_norm": 0.703125,
"learning_rate": 8.555557342864329e-05,
"loss": 0.0498046875,
"memory(GiB)": 43.05,
"step": 4410,
"train_speed(iter/s)": 0.6247
},
{
"epoch": 2.861309138042774,
"grad_norm": 5.78125,
"learning_rate": 8.55178809516082e-05,
"loss": 0.0416015625,
"memory(GiB)": 43.05,
"step": 4415,
"train_speed(iter/s)": 0.62471
},
{
"epoch": 2.864549578742709,
"grad_norm": 12.0,
"learning_rate": 8.548014768611154e-05,
"loss": 0.037890625,
"memory(GiB)": 43.05,
"step": 4420,
"train_speed(iter/s)": 0.624841
},
{
"epoch": 2.8677900194426442,
"grad_norm": 14.5,
"learning_rate": 8.544237367548591e-05,
"loss": 0.025,
"memory(GiB)": 43.05,
"step": 4425,
"train_speed(iter/s)": 0.625088
},
{
"epoch": 2.8710304601425793,
"grad_norm": 0.416015625,
"learning_rate": 8.540455896311073e-05,
"loss": 0.02890625,
"memory(GiB)": 43.05,
"step": 4430,
"train_speed(iter/s)": 0.625286
},
{
"epoch": 2.8742709008425145,
"grad_norm": 7.84375,
"learning_rate": 8.536670359241208e-05,
"loss": 0.0458984375,
"memory(GiB)": 43.05,
"step": 4435,
"train_speed(iter/s)": 0.62535
},
{
"epoch": 2.8775113415424496,
"grad_norm": 15.3125,
"learning_rate": 8.532880760686281e-05,
"loss": 0.0205078125,
"memory(GiB)": 43.05,
"step": 4440,
"train_speed(iter/s)": 0.625458
},
{
"epoch": 2.8807517822423847,
"grad_norm": 2.78125,
"learning_rate": 8.529087104998235e-05,
"loss": 0.0357421875,
"memory(GiB)": 43.05,
"step": 4445,
"train_speed(iter/s)": 0.625444
},
{
"epoch": 2.8839922229423203,
"grad_norm": 5.375,
"learning_rate": 8.525289396533678e-05,
"loss": 0.0490234375,
"memory(GiB)": 43.05,
"step": 4450,
"train_speed(iter/s)": 0.625487
},
{
"epoch": 2.8872326636422554,
"grad_norm": 10.75,
"learning_rate": 8.521487639653866e-05,
"loss": 0.0416015625,
"memory(GiB)": 43.05,
"step": 4455,
"train_speed(iter/s)": 0.625693
},
{
"epoch": 2.8904731043421905,
"grad_norm": 4.28125,
"learning_rate": 8.517681838724709e-05,
"loss": 0.0384765625,
"memory(GiB)": 43.05,
"step": 4460,
"train_speed(iter/s)": 0.625807
},
{
"epoch": 2.8937135450421256,
"grad_norm": 1.6328125,
"learning_rate": 8.513871998116763e-05,
"loss": 0.01796875,
"memory(GiB)": 43.05,
"step": 4465,
"train_speed(iter/s)": 0.625957
},
{
"epoch": 2.8969539857420608,
"grad_norm": 10.3125,
"learning_rate": 8.510058122205213e-05,
"loss": 0.044921875,
"memory(GiB)": 43.05,
"step": 4470,
"train_speed(iter/s)": 0.626276
},
{
"epoch": 2.9001944264419963,
"grad_norm": 9.4375,
"learning_rate": 8.506240215369888e-05,
"loss": 0.0388671875,
"memory(GiB)": 43.05,
"step": 4475,
"train_speed(iter/s)": 0.626521
},
{
"epoch": 2.9034348671419314,
"grad_norm": 6.0625,
"learning_rate": 8.502418281995245e-05,
"loss": 0.0470703125,
"memory(GiB)": 43.05,
"step": 4480,
"train_speed(iter/s)": 0.626578
},
{
"epoch": 2.9066753078418666,
"grad_norm": 1.359375,
"learning_rate": 8.498592326470361e-05,
"loss": 0.0333984375,
"memory(GiB)": 43.05,
"step": 4485,
"train_speed(iter/s)": 0.626811
},
{
"epoch": 2.9099157485418017,
"grad_norm": 12.3125,
"learning_rate": 8.494762353188931e-05,
"loss": 0.04375,
"memory(GiB)": 43.05,
"step": 4490,
"train_speed(iter/s)": 0.627084
},
{
"epoch": 2.913156189241737,
"grad_norm": 12.6875,
"learning_rate": 8.490928366549272e-05,
"loss": 0.0671875,
"memory(GiB)": 43.05,
"step": 4495,
"train_speed(iter/s)": 0.626934
},
{
"epoch": 2.916396629941672,
"grad_norm": 10.9375,
"learning_rate": 8.487090370954301e-05,
"loss": 0.0318359375,
"memory(GiB)": 43.05,
"step": 4500,
"train_speed(iter/s)": 0.627216
},
{
"epoch": 2.919637070641607,
"grad_norm": 9.625,
"learning_rate": 8.483248370811545e-05,
"loss": 0.043359375,
"memory(GiB)": 43.05,
"step": 4505,
"train_speed(iter/s)": 0.627221
},
{
"epoch": 2.9228775113415426,
"grad_norm": 8.6875,
"learning_rate": 8.479402370533127e-05,
"loss": 0.0388671875,
"memory(GiB)": 43.05,
"step": 4510,
"train_speed(iter/s)": 0.627494
},
{
"epoch": 2.9261179520414777,
"grad_norm": 1.84375,
"learning_rate": 8.475552374535763e-05,
"loss": 0.0306640625,
"memory(GiB)": 43.05,
"step": 4515,
"train_speed(iter/s)": 0.627641
},
{
"epoch": 2.929358392741413,
"grad_norm": 12.375,
"learning_rate": 8.47169838724076e-05,
"loss": 0.0369140625,
"memory(GiB)": 43.05,
"step": 4520,
"train_speed(iter/s)": 0.627674
},
{
"epoch": 2.932598833441348,
"grad_norm": 2.359375,
"learning_rate": 8.467840413074007e-05,
"loss": 0.048046875,
"memory(GiB)": 43.05,
"step": 4525,
"train_speed(iter/s)": 0.627871
},
{
"epoch": 2.935839274141283,
"grad_norm": 13.1875,
"learning_rate": 8.463978456465971e-05,
"loss": 0.0341796875,
"memory(GiB)": 43.05,
"step": 4530,
"train_speed(iter/s)": 0.627937
},
{
"epoch": 2.9390797148412187,
"grad_norm": 0.8359375,
"learning_rate": 8.460112521851695e-05,
"loss": 0.0626953125,
"memory(GiB)": 43.05,
"step": 4535,
"train_speed(iter/s)": 0.628024
},
{
"epoch": 2.942320155541154,
"grad_norm": 10.1875,
"learning_rate": 8.456242613670788e-05,
"loss": 0.0517578125,
"memory(GiB)": 43.05,
"step": 4540,
"train_speed(iter/s)": 0.628172
},
{
"epoch": 2.945560596241089,
"grad_norm": 11.0,
"learning_rate": 8.452368736367422e-05,
"loss": 0.0330078125,
"memory(GiB)": 43.05,
"step": 4545,
"train_speed(iter/s)": 0.628269
},
{
"epoch": 2.948801036941024,
"grad_norm": 1.828125,
"learning_rate": 8.448490894390328e-05,
"loss": 0.0439453125,
"memory(GiB)": 43.05,
"step": 4550,
"train_speed(iter/s)": 0.628152
},
{
"epoch": 2.952041477640959,
"grad_norm": 13.5,
"learning_rate": 8.44460909219279e-05,
"loss": 0.03203125,
"memory(GiB)": 43.05,
"step": 4555,
"train_speed(iter/s)": 0.628304
},
{
"epoch": 2.9552819183408943,
"grad_norm": 18.375,
"learning_rate": 8.440723334232641e-05,
"loss": 0.0271484375,
"memory(GiB)": 43.05,
"step": 4560,
"train_speed(iter/s)": 0.628485
},
{
"epoch": 2.9585223590408294,
"grad_norm": 10.5,
"learning_rate": 8.436833624972255e-05,
"loss": 0.038671875,
"memory(GiB)": 43.05,
"step": 4565,
"train_speed(iter/s)": 0.628602
},
{
"epoch": 2.9617627997407645,
"grad_norm": 2.109375,
"learning_rate": 8.432939968878546e-05,
"loss": 0.03046875,
"memory(GiB)": 43.05,
"step": 4570,
"train_speed(iter/s)": 0.628877
},
{
"epoch": 2.9650032404407,
"grad_norm": 1.3671875,
"learning_rate": 8.429042370422953e-05,
"loss": 0.0259765625,
"memory(GiB)": 43.05,
"step": 4575,
"train_speed(iter/s)": 0.628863
},
{
"epoch": 2.968243681140635,
"grad_norm": 10.3125,
"learning_rate": 8.425140834081455e-05,
"loss": 0.0587890625,
"memory(GiB)": 43.05,
"step": 4580,
"train_speed(iter/s)": 0.62893
},
{
"epoch": 2.9714841218405703,
"grad_norm": 13.0625,
"learning_rate": 8.421235364334541e-05,
"loss": 0.037109375,
"memory(GiB)": 43.05,
"step": 4585,
"train_speed(iter/s)": 0.629116
},
{
"epoch": 2.9747245625405054,
"grad_norm": 16.125,
"learning_rate": 8.417325965667226e-05,
"loss": 0.0375,
"memory(GiB)": 43.05,
"step": 4590,
"train_speed(iter/s)": 0.629361
},
{
"epoch": 2.9779650032404406,
"grad_norm": 13.6875,
"learning_rate": 8.413412642569032e-05,
"loss": 0.03046875,
"memory(GiB)": 43.05,
"step": 4595,
"train_speed(iter/s)": 0.629532
},
{
"epoch": 2.981205443940376,
"grad_norm": 3.21875,
"learning_rate": 8.409495399533989e-05,
"loss": 0.041015625,
"memory(GiB)": 43.05,
"step": 4600,
"train_speed(iter/s)": 0.62976
},
{
"epoch": 2.9844458846403112,
"grad_norm": 1.4296875,
"learning_rate": 8.405574241060628e-05,
"loss": 0.054296875,
"memory(GiB)": 43.05,
"step": 4605,
"train_speed(iter/s)": 0.629953
},
{
"epoch": 2.9876863253402464,
"grad_norm": 15.5,
"learning_rate": 8.40164917165198e-05,
"loss": 0.05546875,
"memory(GiB)": 43.05,
"step": 4610,
"train_speed(iter/s)": 0.629931
},
{
"epoch": 2.9909267660401815,
"grad_norm": 0.0,
"learning_rate": 8.397720195815562e-05,
"loss": 0.022265625,
"memory(GiB)": 43.05,
"step": 4615,
"train_speed(iter/s)": 0.630124
},
{
"epoch": 2.9941672067401166,
"grad_norm": 0.7265625,
"learning_rate": 8.39378731806338e-05,
"loss": 0.04296875,
"memory(GiB)": 43.05,
"step": 4620,
"train_speed(iter/s)": 0.630453
},
{
"epoch": 2.9974076474400517,
"grad_norm": 9.25,
"learning_rate": 8.389850542911921e-05,
"loss": 0.0203125,
"memory(GiB)": 43.05,
"step": 4625,
"train_speed(iter/s)": 0.630426
},
{
"epoch": 3.000648088139987,
"grad_norm": 0.66015625,
"learning_rate": 8.38590987488215e-05,
"loss": 0.041796875,
"memory(GiB)": 43.05,
"step": 4630,
"train_speed(iter/s)": 0.630339
},
{
"epoch": 3.0038885288399224,
"grad_norm": 12.75,
"learning_rate": 8.381965318499493e-05,
"loss": 0.03984375,
"memory(GiB)": 43.05,
"step": 4635,
"train_speed(iter/s)": 0.630271
},
{
"epoch": 3.0071289695398575,
"grad_norm": 12.0,
"learning_rate": 8.378016878293855e-05,
"loss": 0.0349609375,
"memory(GiB)": 43.05,
"step": 4640,
"train_speed(iter/s)": 0.630251
},
{
"epoch": 3.0103694102397927,
"grad_norm": 3.734375,
"learning_rate": 8.374064558799593e-05,
"loss": 0.0376953125,
"memory(GiB)": 43.05,
"step": 4645,
"train_speed(iter/s)": 0.630261
},
{
"epoch": 3.0136098509397278,
"grad_norm": 17.375,
"learning_rate": 8.370108364555518e-05,
"loss": 0.0333984375,
"memory(GiB)": 43.05,
"step": 4650,
"train_speed(iter/s)": 0.630327
},
{
"epoch": 3.016850291639663,
"grad_norm": 0.92578125,
"learning_rate": 8.366148300104894e-05,
"loss": 0.0171875,
"memory(GiB)": 43.05,
"step": 4655,
"train_speed(iter/s)": 0.630595
},
{
"epoch": 3.020090732339598,
"grad_norm": 10.6875,
"learning_rate": 8.362184369995429e-05,
"loss": 0.0474609375,
"memory(GiB)": 43.05,
"step": 4660,
"train_speed(iter/s)": 0.630768
},
{
"epoch": 3.0233311730395336,
"grad_norm": 1.0234375,
"learning_rate": 8.358216578779271e-05,
"loss": 0.0275390625,
"memory(GiB)": 43.05,
"step": 4665,
"train_speed(iter/s)": 0.630757
},
{
"epoch": 3.0265716137394687,
"grad_norm": 4.46875,
"learning_rate": 8.354244931013e-05,
"loss": 0.026171875,
"memory(GiB)": 43.05,
"step": 4670,
"train_speed(iter/s)": 0.630896
},
{
"epoch": 3.029812054439404,
"grad_norm": 5.0625,
"learning_rate": 8.350269431257624e-05,
"loss": 0.022265625,
"memory(GiB)": 43.05,
"step": 4675,
"train_speed(iter/s)": 0.631085
},
{
"epoch": 3.033052495139339,
"grad_norm": 13.125,
"learning_rate": 8.346290084078579e-05,
"loss": 0.043359375,
"memory(GiB)": 43.05,
"step": 4680,
"train_speed(iter/s)": 0.630968
},
{
"epoch": 3.036292935839274,
"grad_norm": 5.78125,
"learning_rate": 8.342306894045715e-05,
"loss": 0.028515625,
"memory(GiB)": 43.05,
"step": 4685,
"train_speed(iter/s)": 0.631221
},
{
"epoch": 3.039533376539209,
"grad_norm": 15.125,
"learning_rate": 8.338319865733297e-05,
"loss": 0.0380859375,
"memory(GiB)": 43.05,
"step": 4690,
"train_speed(iter/s)": 0.631403
},
{
"epoch": 3.0427738172391443,
"grad_norm": 0.65234375,
"learning_rate": 8.334329003719998e-05,
"loss": 0.022265625,
"memory(GiB)": 43.05,
"step": 4695,
"train_speed(iter/s)": 0.631499
},
{
"epoch": 3.04601425793908,
"grad_norm": 1.15625,
"learning_rate": 8.330334312588895e-05,
"loss": 0.0423828125,
"memory(GiB)": 43.05,
"step": 4700,
"train_speed(iter/s)": 0.631581
},
{
"epoch": 3.049254698639015,
"grad_norm": 2.96875,
"learning_rate": 8.326335796927458e-05,
"loss": 0.031640625,
"memory(GiB)": 43.05,
"step": 4705,
"train_speed(iter/s)": 0.631734
},
{
"epoch": 3.05249513933895,
"grad_norm": 11.6875,
"learning_rate": 8.322333461327552e-05,
"loss": 0.021484375,
"memory(GiB)": 43.05,
"step": 4710,
"train_speed(iter/s)": 0.63169
},
{
"epoch": 3.0557355800388852,
"grad_norm": 10.625,
"learning_rate": 8.31832731038543e-05,
"loss": 0.0337890625,
"memory(GiB)": 43.05,
"step": 4715,
"train_speed(iter/s)": 0.631917
},
{
"epoch": 3.0589760207388204,
"grad_norm": 0.81640625,
"learning_rate": 8.314317348701723e-05,
"loss": 0.0302734375,
"memory(GiB)": 43.05,
"step": 4720,
"train_speed(iter/s)": 0.6321
},
{
"epoch": 3.0622164614387555,
"grad_norm": 7.5625,
"learning_rate": 8.310303580881442e-05,
"loss": 0.0375,
"memory(GiB)": 43.05,
"step": 4725,
"train_speed(iter/s)": 0.632284
},
{
"epoch": 3.065456902138691,
"grad_norm": 3.765625,
"learning_rate": 8.306286011533968e-05,
"loss": 0.0572265625,
"memory(GiB)": 43.05,
"step": 4730,
"train_speed(iter/s)": 0.632202
},
{
"epoch": 3.068697342838626,
"grad_norm": 4.09375,
"learning_rate": 8.302264645273042e-05,
"loss": 0.0330078125,
"memory(GiB)": 43.05,
"step": 4735,
"train_speed(iter/s)": 0.632424
},
{
"epoch": 3.0719377835385613,
"grad_norm": 0.86328125,
"learning_rate": 8.298239486716776e-05,
"loss": 0.0361328125,
"memory(GiB)": 43.05,
"step": 4740,
"train_speed(iter/s)": 0.632756
},
{
"epoch": 3.0751782242384964,
"grad_norm": 4.09375,
"learning_rate": 8.294210540487627e-05,
"loss": 0.032421875,
"memory(GiB)": 43.05,
"step": 4745,
"train_speed(iter/s)": 0.632749
},
{
"epoch": 3.0784186649384315,
"grad_norm": 13.8125,
"learning_rate": 8.290177811212407e-05,
"loss": 0.0388671875,
"memory(GiB)": 43.05,
"step": 4750,
"train_speed(iter/s)": 0.632882
},
{
"epoch": 3.0816591056383666,
"grad_norm": 0.90234375,
"learning_rate": 8.286141303522273e-05,
"loss": 0.0212890625,
"memory(GiB)": 43.05,
"step": 4755,
"train_speed(iter/s)": 0.633143
},
{
"epoch": 3.084899546338302,
"grad_norm": 1.1640625,
"learning_rate": 8.282101022052717e-05,
"loss": 0.0302734375,
"memory(GiB)": 43.05,
"step": 4760,
"train_speed(iter/s)": 0.633077
},
{
"epoch": 3.0881399870382373,
"grad_norm": 4.90625,
"learning_rate": 8.278056971443567e-05,
"loss": 0.0357421875,
"memory(GiB)": 43.05,
"step": 4765,
"train_speed(iter/s)": 0.63317
},
{
"epoch": 3.0913804277381725,
"grad_norm": 11.125,
"learning_rate": 8.274009156338982e-05,
"loss": 0.03671875,
"memory(GiB)": 43.05,
"step": 4770,
"train_speed(iter/s)": 0.633145
},
{
"epoch": 3.0946208684381076,
"grad_norm": 11.4375,
"learning_rate": 8.269957581387442e-05,
"loss": 0.0287109375,
"memory(GiB)": 43.05,
"step": 4775,
"train_speed(iter/s)": 0.633361
},
{
"epoch": 3.0978613091380427,
"grad_norm": 0.55859375,
"learning_rate": 8.265902251241741e-05,
"loss": 0.0236328125,
"memory(GiB)": 43.05,
"step": 4780,
"train_speed(iter/s)": 0.633194
},
{
"epoch": 3.101101749837978,
"grad_norm": 1.1015625,
"learning_rate": 8.261843170558991e-05,
"loss": 0.029296875,
"memory(GiB)": 43.05,
"step": 4785,
"train_speed(iter/s)": 0.633418
},
{
"epoch": 3.1043421905379134,
"grad_norm": 2.015625,
"learning_rate": 8.257780344000611e-05,
"loss": 0.0263671875,
"memory(GiB)": 43.05,
"step": 4790,
"train_speed(iter/s)": 0.63364
},
{
"epoch": 3.1075826312378485,
"grad_norm": 10.125,
"learning_rate": 8.253713776232317e-05,
"loss": 0.0271484375,
"memory(GiB)": 43.05,
"step": 4795,
"train_speed(iter/s)": 0.633929
},
{
"epoch": 3.1108230719377836,
"grad_norm": 10.0625,
"learning_rate": 8.249643471924124e-05,
"loss": 0.048828125,
"memory(GiB)": 43.05,
"step": 4800,
"train_speed(iter/s)": 0.634026
},
{
"epoch": 3.1140635126377187,
"grad_norm": 12.3125,
"learning_rate": 8.245569435750342e-05,
"loss": 0.0365234375,
"memory(GiB)": 43.05,
"step": 4805,
"train_speed(iter/s)": 0.634167
},
{
"epoch": 3.117303953337654,
"grad_norm": 9.625,
"learning_rate": 8.241491672389558e-05,
"loss": 0.058203125,
"memory(GiB)": 43.05,
"step": 4810,
"train_speed(iter/s)": 0.634336
},
{
"epoch": 3.120544394037589,
"grad_norm": 2.734375,
"learning_rate": 8.237410186524648e-05,
"loss": 0.042578125,
"memory(GiB)": 43.05,
"step": 4815,
"train_speed(iter/s)": 0.634383
},
{
"epoch": 3.123784834737524,
"grad_norm": 13.3125,
"learning_rate": 8.233324982842756e-05,
"loss": 0.0388671875,
"memory(GiB)": 43.05,
"step": 4820,
"train_speed(iter/s)": 0.634436
},
{
"epoch": 3.1270252754374597,
"grad_norm": 14.4375,
"learning_rate": 8.2292360660353e-05,
"loss": 0.0474609375,
"memory(GiB)": 43.05,
"step": 4825,
"train_speed(iter/s)": 0.634559
},
{
"epoch": 3.130265716137395,
"grad_norm": 4.15625,
"learning_rate": 8.22514344079796e-05,
"loss": 0.040234375,
"memory(GiB)": 43.05,
"step": 4830,
"train_speed(iter/s)": 0.634789
},
{
"epoch": 3.13350615683733,
"grad_norm": 11.9375,
"learning_rate": 8.221047111830677e-05,
"loss": 0.04609375,
"memory(GiB)": 43.05,
"step": 4835,
"train_speed(iter/s)": 0.634768
},
{
"epoch": 3.136746597537265,
"grad_norm": 12.8125,
"learning_rate": 8.216947083837643e-05,
"loss": 0.03642578125,
"memory(GiB)": 43.05,
"step": 4840,
"train_speed(iter/s)": 0.635024
},
{
"epoch": 3.1399870382372,
"grad_norm": 0.859375,
"learning_rate": 8.212843361527296e-05,
"loss": 0.02265625,
"memory(GiB)": 43.05,
"step": 4845,
"train_speed(iter/s)": 0.635157
},
{
"epoch": 3.1432274789371353,
"grad_norm": 9.9375,
"learning_rate": 8.208735949612323e-05,
"loss": 0.0357421875,
"memory(GiB)": 43.05,
"step": 4850,
"train_speed(iter/s)": 0.635384
},
{
"epoch": 3.146467919637071,
"grad_norm": 2.5625,
"learning_rate": 8.204624852809641e-05,
"loss": 0.0306640625,
"memory(GiB)": 43.05,
"step": 4855,
"train_speed(iter/s)": 0.635432
},
{
"epoch": 3.149708360337006,
"grad_norm": 0.5,
"learning_rate": 8.200510075840406e-05,
"loss": 0.0419921875,
"memory(GiB)": 43.05,
"step": 4860,
"train_speed(iter/s)": 0.635528
},
{
"epoch": 3.152948801036941,
"grad_norm": 10.5625,
"learning_rate": 8.196391623429992e-05,
"loss": 0.0251953125,
"memory(GiB)": 43.05,
"step": 4865,
"train_speed(iter/s)": 0.635744
},
{
"epoch": 3.156189241736876,
"grad_norm": 0.5546875,
"learning_rate": 8.192269500308001e-05,
"loss": 0.040234375,
"memory(GiB)": 43.05,
"step": 4870,
"train_speed(iter/s)": 0.635911
},
{
"epoch": 3.1594296824368113,
"grad_norm": 1.1171875,
"learning_rate": 8.188143711208246e-05,
"loss": 0.0306640625,
"memory(GiB)": 43.05,
"step": 4875,
"train_speed(iter/s)": 0.635898
},
{
"epoch": 3.1626701231367464,
"grad_norm": 9.625,
"learning_rate": 8.18401426086875e-05,
"loss": 0.0318359375,
"memory(GiB)": 43.05,
"step": 4880,
"train_speed(iter/s)": 0.63608
},
{
"epoch": 3.1659105638366816,
"grad_norm": 12.875,
"learning_rate": 8.179881154031748e-05,
"loss": 0.0287109375,
"memory(GiB)": 43.05,
"step": 4885,
"train_speed(iter/s)": 0.636136
},
{
"epoch": 3.169151004536617,
"grad_norm": 0.4375,
"learning_rate": 8.175744395443662e-05,
"loss": 0.0408203125,
"memory(GiB)": 43.05,
"step": 4890,
"train_speed(iter/s)": 0.636321
},
{
"epoch": 3.1723914452365523,
"grad_norm": 14.1875,
"learning_rate": 8.171603989855115e-05,
"loss": 0.03984375,
"memory(GiB)": 43.05,
"step": 4895,
"train_speed(iter/s)": 0.636546
},
{
"epoch": 3.1756318859364874,
"grad_norm": 6.28125,
"learning_rate": 8.167459942020919e-05,
"loss": 0.034765625,
"memory(GiB)": 43.05,
"step": 4900,
"train_speed(iter/s)": 0.636452
},
{
"epoch": 3.1788723266364225,
"grad_norm": 2.953125,
"learning_rate": 8.163312256700067e-05,
"loss": 0.04375,
"memory(GiB)": 43.05,
"step": 4905,
"train_speed(iter/s)": 0.636629
},
{
"epoch": 3.1821127673363576,
"grad_norm": 4.15625,
"learning_rate": 8.159160938655726e-05,
"loss": 0.0263671875,
"memory(GiB)": 43.05,
"step": 4910,
"train_speed(iter/s)": 0.636829
},
{
"epoch": 3.1853532080362927,
"grad_norm": 10.8125,
"learning_rate": 8.155005992655238e-05,
"loss": 0.0357421875,
"memory(GiB)": 43.05,
"step": 4915,
"train_speed(iter/s)": 0.637
},
{
"epoch": 3.1885936487362283,
"grad_norm": 1.8828125,
"learning_rate": 8.150847423470114e-05,
"loss": 0.0306640625,
"memory(GiB)": 43.05,
"step": 4920,
"train_speed(iter/s)": 0.637005
},
{
"epoch": 3.1918340894361634,
"grad_norm": 14.5,
"learning_rate": 8.14668523587602e-05,
"loss": 0.031640625,
"memory(GiB)": 43.05,
"step": 4925,
"train_speed(iter/s)": 0.637222
},
{
"epoch": 3.1950745301360985,
"grad_norm": 12.75,
"learning_rate": 8.142519434652782e-05,
"loss": 0.0369140625,
"memory(GiB)": 43.05,
"step": 4930,
"train_speed(iter/s)": 0.637378
},
{
"epoch": 3.1983149708360337,
"grad_norm": 10.5625,
"learning_rate": 8.138350024584373e-05,
"loss": 0.02890625,
"memory(GiB)": 43.05,
"step": 4935,
"train_speed(iter/s)": 0.63747
},
{
"epoch": 3.201555411535969,
"grad_norm": 9.3125,
"learning_rate": 8.134177010458914e-05,
"loss": 0.049609375,
"memory(GiB)": 43.05,
"step": 4940,
"train_speed(iter/s)": 0.637558
},
{
"epoch": 3.204795852235904,
"grad_norm": 14.25,
"learning_rate": 8.130000397068658e-05,
"loss": 0.0328125,
"memory(GiB)": 43.05,
"step": 4945,
"train_speed(iter/s)": 0.637705
},
{
"epoch": 3.2080362929358395,
"grad_norm": 0.7421875,
"learning_rate": 8.12582018921e-05,
"loss": 0.0408203125,
"memory(GiB)": 43.05,
"step": 4950,
"train_speed(iter/s)": 0.63785
},
{
"epoch": 3.2112767336357746,
"grad_norm": 10.0625,
"learning_rate": 8.121636391683456e-05,
"loss": 0.0400390625,
"memory(GiB)": 43.05,
"step": 4955,
"train_speed(iter/s)": 0.638016
},
{
"epoch": 3.2145171743357097,
"grad_norm": 11.0,
"learning_rate": 8.117449009293668e-05,
"loss": 0.0396484375,
"memory(GiB)": 43.05,
"step": 4960,
"train_speed(iter/s)": 0.638147
},
{
"epoch": 3.217757615035645,
"grad_norm": 17.25,
"learning_rate": 8.113258046849392e-05,
"loss": 0.051953125,
"memory(GiB)": 43.05,
"step": 4965,
"train_speed(iter/s)": 0.638285
},
{
"epoch": 3.22099805573558,
"grad_norm": 11.1875,
"learning_rate": 8.109063509163501e-05,
"loss": 0.0357421875,
"memory(GiB)": 43.05,
"step": 4970,
"train_speed(iter/s)": 0.638338
},
{
"epoch": 3.224238496435515,
"grad_norm": 3.609375,
"learning_rate": 8.104865401052965e-05,
"loss": 0.0365234375,
"memory(GiB)": 43.05,
"step": 4975,
"train_speed(iter/s)": 0.638632
},
{
"epoch": 3.2274789371354506,
"grad_norm": 2.390625,
"learning_rate": 8.100663727338863e-05,
"loss": 0.0435546875,
"memory(GiB)": 43.05,
"step": 4980,
"train_speed(iter/s)": 0.63887
},
{
"epoch": 3.2307193778353858,
"grad_norm": 17.875,
"learning_rate": 8.096458492846362e-05,
"loss": 0.0318359375,
"memory(GiB)": 43.05,
"step": 4985,
"train_speed(iter/s)": 0.638997
},
{
"epoch": 3.233959818535321,
"grad_norm": 8.4375,
"learning_rate": 8.092249702404724e-05,
"loss": 0.04765625,
"memory(GiB)": 43.05,
"step": 4990,
"train_speed(iter/s)": 0.639111
},
{
"epoch": 3.237200259235256,
"grad_norm": 10.8125,
"learning_rate": 8.088037360847287e-05,
"loss": 0.021875,
"memory(GiB)": 43.05,
"step": 4995,
"train_speed(iter/s)": 0.639087
},
{
"epoch": 3.240440699935191,
"grad_norm": 15.75,
"learning_rate": 8.083821473011477e-05,
"loss": 0.01875,
"memory(GiB)": 43.05,
"step": 5000,
"train_speed(iter/s)": 0.639051
},
{
"epoch": 3.2436811406351262,
"grad_norm": 13.5,
"learning_rate": 8.079602043738783e-05,
"loss": 0.04765625,
"memory(GiB)": 43.05,
"step": 5005,
"train_speed(iter/s)": 0.639135
},
{
"epoch": 3.2469215813350614,
"grad_norm": 1.90625,
"learning_rate": 8.075379077874768e-05,
"loss": 0.02265625,
"memory(GiB)": 43.05,
"step": 5010,
"train_speed(iter/s)": 0.639406
},
{
"epoch": 3.250162022034997,
"grad_norm": 2.21875,
"learning_rate": 8.07115258026905e-05,
"loss": 0.044140625,
"memory(GiB)": 43.05,
"step": 5015,
"train_speed(iter/s)": 0.639572
},
{
"epoch": 3.253402462734932,
"grad_norm": 2.734375,
"learning_rate": 8.066922555775311e-05,
"loss": 0.0341796875,
"memory(GiB)": 43.05,
"step": 5020,
"train_speed(iter/s)": 0.639713
},
{
"epoch": 3.256642903434867,
"grad_norm": 1.5703125,
"learning_rate": 8.062689009251277e-05,
"loss": 0.0458984375,
"memory(GiB)": 43.05,
"step": 5025,
"train_speed(iter/s)": 0.640003
},
{
"epoch": 3.2598833441348023,
"grad_norm": 14.125,
"learning_rate": 8.058451945558719e-05,
"loss": 0.04375,
"memory(GiB)": 43.05,
"step": 5030,
"train_speed(iter/s)": 0.640001
},
{
"epoch": 3.2631237848347374,
"grad_norm": 3.25,
"learning_rate": 8.054211369563447e-05,
"loss": 0.030859375,
"memory(GiB)": 43.05,
"step": 5035,
"train_speed(iter/s)": 0.640051
},
{
"epoch": 3.2663642255346725,
"grad_norm": 13.3125,
"learning_rate": 8.049967286135309e-05,
"loss": 0.0419921875,
"memory(GiB)": 43.05,
"step": 5040,
"train_speed(iter/s)": 0.640257
},
{
"epoch": 3.269604666234608,
"grad_norm": 3.59375,
"learning_rate": 8.045719700148177e-05,
"loss": 0.0234375,
"memory(GiB)": 43.05,
"step": 5045,
"train_speed(iter/s)": 0.640452
},
{
"epoch": 3.2728451069345432,
"grad_norm": 5.75,
"learning_rate": 8.041468616479945e-05,
"loss": 0.03388671875,
"memory(GiB)": 43.05,
"step": 5050,
"train_speed(iter/s)": 0.640688
},
{
"epoch": 3.2760855476344783,
"grad_norm": 0.578125,
"learning_rate": 8.037214040012528e-05,
"loss": 0.0310546875,
"memory(GiB)": 43.05,
"step": 5055,
"train_speed(iter/s)": 0.640637
},
{
"epoch": 3.2793259883344135,
"grad_norm": 13.5625,
"learning_rate": 8.032955975631847e-05,
"loss": 0.0416015625,
"memory(GiB)": 43.05,
"step": 5060,
"train_speed(iter/s)": 0.640924
},
{
"epoch": 3.2825664290343486,
"grad_norm": 12.0,
"learning_rate": 8.028694428227828e-05,
"loss": 0.0423828125,
"memory(GiB)": 43.05,
"step": 5065,
"train_speed(iter/s)": 0.641007
},
{
"epoch": 3.2858068697342837,
"grad_norm": 7.4375,
"learning_rate": 8.0244294026944e-05,
"loss": 0.01630859375,
"memory(GiB)": 43.05,
"step": 5070,
"train_speed(iter/s)": 0.641295
},
{
"epoch": 3.289047310434219,
"grad_norm": 2.84375,
"learning_rate": 8.02016090392949e-05,
"loss": 0.0328125,
"memory(GiB)": 43.05,
"step": 5075,
"train_speed(iter/s)": 0.641411
},
{
"epoch": 3.2922877511341544,
"grad_norm": 1.921875,
"learning_rate": 8.015888936835003e-05,
"loss": 0.0287109375,
"memory(GiB)": 43.05,
"step": 5080,
"train_speed(iter/s)": 0.641628
},
{
"epoch": 3.2955281918340895,
"grad_norm": 0.78125,
"learning_rate": 8.011613506316838e-05,
"loss": 0.0296875,
"memory(GiB)": 43.05,
"step": 5085,
"train_speed(iter/s)": 0.64172
},
{
"epoch": 3.2987686325340246,
"grad_norm": 1.640625,
"learning_rate": 8.007334617284864e-05,
"loss": 0.0298828125,
"memory(GiB)": 43.05,
"step": 5090,
"train_speed(iter/s)": 0.64188
},
{
"epoch": 3.3020090732339598,
"grad_norm": 3.078125,
"learning_rate": 8.003052274652924e-05,
"loss": 0.0345703125,
"memory(GiB)": 43.05,
"step": 5095,
"train_speed(iter/s)": 0.642041
},
{
"epoch": 3.305249513933895,
"grad_norm": 2.359375,
"learning_rate": 7.998766483338831e-05,
"loss": 0.040234375,
"memory(GiB)": 43.05,
"step": 5100,
"train_speed(iter/s)": 0.642165
},
{
"epoch": 3.3084899546338304,
"grad_norm": 13.875,
"learning_rate": 7.99447724826435e-05,
"loss": 0.0416015625,
"memory(GiB)": 43.05,
"step": 5105,
"train_speed(iter/s)": 0.641982
},
{
"epoch": 3.3117303953337656,
"grad_norm": 7.09375,
"learning_rate": 7.990184574355209e-05,
"loss": 0.0294921875,
"memory(GiB)": 43.05,
"step": 5110,
"train_speed(iter/s)": 0.642198
},
{
"epoch": 3.3149708360337007,
"grad_norm": 13.25,
"learning_rate": 7.98588846654108e-05,
"loss": 0.045703125,
"memory(GiB)": 43.05,
"step": 5115,
"train_speed(iter/s)": 0.641561
},
{
"epoch": 3.318211276733636,
"grad_norm": 3.59375,
"learning_rate": 7.981588929755581e-05,
"loss": 0.0322265625,
"memory(GiB)": 43.05,
"step": 5120,
"train_speed(iter/s)": 0.641573
},
{
"epoch": 3.321451717433571,
"grad_norm": 3.703125,
"learning_rate": 7.977285968936266e-05,
"loss": 0.0357421875,
"memory(GiB)": 43.05,
"step": 5125,
"train_speed(iter/s)": 0.641596
},
{
"epoch": 3.324692158133506,
"grad_norm": 16.375,
"learning_rate": 7.972979589024624e-05,
"loss": 0.0478515625,
"memory(GiB)": 43.05,
"step": 5130,
"train_speed(iter/s)": 0.641786
},
{
"epoch": 3.327932598833441,
"grad_norm": 5.40625,
"learning_rate": 7.968669794966067e-05,
"loss": 0.030859375,
"memory(GiB)": 43.05,
"step": 5135,
"train_speed(iter/s)": 0.641917
},
{
"epoch": 3.3311730395333763,
"grad_norm": 0.828125,
"learning_rate": 7.96435659170993e-05,
"loss": 0.0396484375,
"memory(GiB)": 43.05,
"step": 5140,
"train_speed(iter/s)": 0.641959
},
{
"epoch": 3.334413480233312,
"grad_norm": 1.8359375,
"learning_rate": 7.960039984209462e-05,
"loss": 0.0490234375,
"memory(GiB)": 43.05,
"step": 5145,
"train_speed(iter/s)": 0.642049
},
{
"epoch": 3.337653920933247,
"grad_norm": 1.0,
"learning_rate": 7.955719977421823e-05,
"loss": 0.01875,
"memory(GiB)": 43.05,
"step": 5150,
"train_speed(iter/s)": 0.642241
},
{
"epoch": 3.340894361633182,
"grad_norm": 7.59375,
"learning_rate": 7.951396576308074e-05,
"loss": 0.0263671875,
"memory(GiB)": 43.05,
"step": 5155,
"train_speed(iter/s)": 0.642317
},
{
"epoch": 3.344134802333117,
"grad_norm": 0.734375,
"learning_rate": 7.947069785833176e-05,
"loss": 0.0193359375,
"memory(GiB)": 43.05,
"step": 5160,
"train_speed(iter/s)": 0.642442
},
{
"epoch": 3.3473752430330523,
"grad_norm": 0.439453125,
"learning_rate": 7.942739610965984e-05,
"loss": 0.0349609375,
"memory(GiB)": 43.05,
"step": 5165,
"train_speed(iter/s)": 0.642501
},
{
"epoch": 3.350615683732988,
"grad_norm": 0.6015625,
"learning_rate": 7.938406056679234e-05,
"loss": 0.040234375,
"memory(GiB)": 43.05,
"step": 5170,
"train_speed(iter/s)": 0.642582
},
{
"epoch": 3.353856124432923,
"grad_norm": 8.9375,
"learning_rate": 7.93406912794955e-05,
"loss": 0.027734375,
"memory(GiB)": 43.05,
"step": 5175,
"train_speed(iter/s)": 0.642524
},
{
"epoch": 3.357096565132858,
"grad_norm": 10.125,
"learning_rate": 7.929728829757426e-05,
"loss": 0.03671875,
"memory(GiB)": 43.05,
"step": 5180,
"train_speed(iter/s)": 0.642761
},
{
"epoch": 3.3603370058327933,
"grad_norm": 8.6875,
"learning_rate": 7.925385167087225e-05,
"loss": 0.0423828125,
"memory(GiB)": 43.05,
"step": 5185,
"train_speed(iter/s)": 0.642755
},
{
"epoch": 3.3635774465327284,
"grad_norm": 14.625,
"learning_rate": 7.92103814492718e-05,
"loss": 0.025390625,
"memory(GiB)": 43.05,
"step": 5190,
"train_speed(iter/s)": 0.642756
},
{
"epoch": 3.3668178872326635,
"grad_norm": 2.28125,
"learning_rate": 7.916687768269374e-05,
"loss": 0.0416015625,
"memory(GiB)": 43.05,
"step": 5195,
"train_speed(iter/s)": 0.642826
},
{
"epoch": 3.3700583279325986,
"grad_norm": 1.921875,
"learning_rate": 7.912334042109747e-05,
"loss": 0.034765625,
"memory(GiB)": 43.05,
"step": 5200,
"train_speed(iter/s)": 0.642835
},
{
"epoch": 3.373298768632534,
"grad_norm": 2.203125,
"learning_rate": 7.907976971448091e-05,
"loss": 0.0302734375,
"memory(GiB)": 43.05,
"step": 5205,
"train_speed(iter/s)": 0.64264
},
{
"epoch": 3.3765392093324693,
"grad_norm": 9.5,
"learning_rate": 7.903616561288021e-05,
"loss": 0.017578125,
"memory(GiB)": 43.05,
"step": 5210,
"train_speed(iter/s)": 0.642804
},
{
"epoch": 3.3797796500324044,
"grad_norm": 17.0,
"learning_rate": 7.899252816637007e-05,
"loss": 0.0271484375,
"memory(GiB)": 43.05,
"step": 5215,
"train_speed(iter/s)": 0.642852
},
{
"epoch": 3.3830200907323396,
"grad_norm": 6.75,
"learning_rate": 7.894885742506337e-05,
"loss": 0.0482421875,
"memory(GiB)": 43.05,
"step": 5220,
"train_speed(iter/s)": 0.642941
},
{
"epoch": 3.3862605314322747,
"grad_norm": 10.5625,
"learning_rate": 7.890515343911127e-05,
"loss": 0.0416015625,
"memory(GiB)": 43.05,
"step": 5225,
"train_speed(iter/s)": 0.642969
},
{
"epoch": 3.38950097213221,
"grad_norm": 0.7890625,
"learning_rate": 7.886141625870307e-05,
"loss": 0.03984375,
"memory(GiB)": 43.05,
"step": 5230,
"train_speed(iter/s)": 0.643161
},
{
"epoch": 3.3927414128321454,
"grad_norm": 10.25,
"learning_rate": 7.881764593406622e-05,
"loss": 0.0423828125,
"memory(GiB)": 43.05,
"step": 5235,
"train_speed(iter/s)": 0.643358
},
{
"epoch": 3.3959818535320805,
"grad_norm": 11.625,
"learning_rate": 7.87738425154662e-05,
"loss": 0.0306640625,
"memory(GiB)": 43.05,
"step": 5240,
"train_speed(iter/s)": 0.643484
},
{
"epoch": 3.3992222942320156,
"grad_norm": 12.75,
"learning_rate": 7.873000605320659e-05,
"loss": 0.0470703125,
"memory(GiB)": 43.05,
"step": 5245,
"train_speed(iter/s)": 0.643764
},
{
"epoch": 3.4024627349319507,
"grad_norm": 11.375,
"learning_rate": 7.868613659762878e-05,
"loss": 0.0392578125,
"memory(GiB)": 43.05,
"step": 5250,
"train_speed(iter/s)": 0.643963
},
{
"epoch": 3.405703175631886,
"grad_norm": 1.34375,
"learning_rate": 7.864223419911211e-05,
"loss": 0.0298828125,
"memory(GiB)": 43.05,
"step": 5255,
"train_speed(iter/s)": 0.644049
},
{
"epoch": 3.408943616331821,
"grad_norm": 14.0,
"learning_rate": 7.859829890807382e-05,
"loss": 0.028125,
"memory(GiB)": 43.05,
"step": 5260,
"train_speed(iter/s)": 0.644215
},
{
"epoch": 3.412184057031756,
"grad_norm": 15.5,
"learning_rate": 7.855433077496882e-05,
"loss": 0.03984375,
"memory(GiB)": 43.05,
"step": 5265,
"train_speed(iter/s)": 0.644236
},
{
"epoch": 3.4154244977316917,
"grad_norm": 14.1875,
"learning_rate": 7.851032985028976e-05,
"loss": 0.0263671875,
"memory(GiB)": 43.05,
"step": 5270,
"train_speed(iter/s)": 0.64435
},
{
"epoch": 3.4186649384316268,
"grad_norm": 0.5625,
"learning_rate": 7.846629618456702e-05,
"loss": 0.0599609375,
"memory(GiB)": 43.05,
"step": 5275,
"train_speed(iter/s)": 0.644495
},
{
"epoch": 3.421905379131562,
"grad_norm": 15.0625,
"learning_rate": 7.842222982836847e-05,
"loss": 0.046484375,
"memory(GiB)": 43.05,
"step": 5280,
"train_speed(iter/s)": 0.644542
},
{
"epoch": 3.425145819831497,
"grad_norm": 0.57421875,
"learning_rate": 7.837813083229957e-05,
"loss": 0.0306640625,
"memory(GiB)": 43.05,
"step": 5285,
"train_speed(iter/s)": 0.644519
},
{
"epoch": 3.428386260531432,
"grad_norm": 3.453125,
"learning_rate": 7.833399924700331e-05,
"loss": 0.036328125,
"memory(GiB)": 43.05,
"step": 5290,
"train_speed(iter/s)": 0.644511
},
{
"epoch": 3.4316267012313677,
"grad_norm": 17.375,
"learning_rate": 7.828983512316006e-05,
"loss": 0.0470703125,
"memory(GiB)": 43.05,
"step": 5295,
"train_speed(iter/s)": 0.644495
},
{
"epoch": 3.434867141931303,
"grad_norm": 0.55859375,
"learning_rate": 7.824563851148752e-05,
"loss": 0.0453125,
"memory(GiB)": 43.05,
"step": 5300,
"train_speed(iter/s)": 0.644602
},
{
"epoch": 3.438107582631238,
"grad_norm": 5.90625,
"learning_rate": 7.820140946274076e-05,
"loss": 0.0369140625,
"memory(GiB)": 43.05,
"step": 5305,
"train_speed(iter/s)": 0.644735
},
{
"epoch": 3.441348023331173,
"grad_norm": 4.375,
"learning_rate": 7.815714802771211e-05,
"loss": 0.0375,
"memory(GiB)": 43.05,
"step": 5310,
"train_speed(iter/s)": 0.644901
},
{
"epoch": 3.444588464031108,
"grad_norm": 2.171875,
"learning_rate": 7.811285425723101e-05,
"loss": 0.031640625,
"memory(GiB)": 43.05,
"step": 5315,
"train_speed(iter/s)": 0.644867
},
{
"epoch": 3.4478289047310433,
"grad_norm": 0.57421875,
"learning_rate": 7.806852820216412e-05,
"loss": 0.016796875,
"memory(GiB)": 43.05,
"step": 5320,
"train_speed(iter/s)": 0.645099
},
{
"epoch": 3.4510693454309784,
"grad_norm": 1.4453125,
"learning_rate": 7.802416991341512e-05,
"loss": 0.062109375,
"memory(GiB)": 43.05,
"step": 5325,
"train_speed(iter/s)": 0.645222
},
{
"epoch": 3.454309786130914,
"grad_norm": 13.125,
"learning_rate": 7.797977944192476e-05,
"loss": 0.05546875,
"memory(GiB)": 43.05,
"step": 5330,
"train_speed(iter/s)": 0.645274
},
{
"epoch": 3.457550226830849,
"grad_norm": 0.412109375,
"learning_rate": 7.79353568386707e-05,
"loss": 0.0306640625,
"memory(GiB)": 43.05,
"step": 5335,
"train_speed(iter/s)": 0.645431
},
{
"epoch": 3.4607906675307842,
"grad_norm": 0.484375,
"learning_rate": 7.78909021546675e-05,
"loss": 0.0251953125,
"memory(GiB)": 43.05,
"step": 5340,
"train_speed(iter/s)": 0.645563
},
{
"epoch": 3.4640311082307194,
"grad_norm": 14.0625,
"learning_rate": 7.784641544096658e-05,
"loss": 0.0375,
"memory(GiB)": 43.05,
"step": 5345,
"train_speed(iter/s)": 0.64551
},
{
"epoch": 3.4672715489306545,
"grad_norm": 1.421875,
"learning_rate": 7.780189674865616e-05,
"loss": 0.0322265625,
"memory(GiB)": 43.05,
"step": 5350,
"train_speed(iter/s)": 0.645597
},
{
"epoch": 3.4705119896305896,
"grad_norm": 1.1640625,
"learning_rate": 7.775734612886116e-05,
"loss": 0.0298828125,
"memory(GiB)": 43.05,
"step": 5355,
"train_speed(iter/s)": 0.645617
},
{
"epoch": 3.473752430330525,
"grad_norm": 2.078125,
"learning_rate": 7.771276363274316e-05,
"loss": 0.034375,
"memory(GiB)": 43.05,
"step": 5360,
"train_speed(iter/s)": 0.645662
},
{
"epoch": 3.4769928710304603,
"grad_norm": 0.89453125,
"learning_rate": 7.766814931150035e-05,
"loss": 0.024609375,
"memory(GiB)": 43.05,
"step": 5365,
"train_speed(iter/s)": 0.645772
},
{
"epoch": 3.4802333117303954,
"grad_norm": 0.59765625,
"learning_rate": 7.76235032163675e-05,
"loss": 0.0486328125,
"memory(GiB)": 43.05,
"step": 5370,
"train_speed(iter/s)": 0.645739
},
{
"epoch": 3.4834737524303305,
"grad_norm": 10.6875,
"learning_rate": 7.757882539861582e-05,
"loss": 0.021875,
"memory(GiB)": 43.05,
"step": 5375,
"train_speed(iter/s)": 0.645837
},
{
"epoch": 3.4867141931302656,
"grad_norm": 4.03125,
"learning_rate": 7.753411590955299e-05,
"loss": 0.046484375,
"memory(GiB)": 43.05,
"step": 5380,
"train_speed(iter/s)": 0.646036
},
{
"epoch": 3.4899546338302008,
"grad_norm": 8.1875,
"learning_rate": 7.7489374800523e-05,
"loss": 0.033984375,
"memory(GiB)": 43.05,
"step": 5385,
"train_speed(iter/s)": 0.646104
},
{
"epoch": 3.493195074530136,
"grad_norm": 4.03125,
"learning_rate": 7.744460212290625e-05,
"loss": 0.0357421875,
"memory(GiB)": 43.05,
"step": 5390,
"train_speed(iter/s)": 0.646305
},
{
"epoch": 3.4964355152300715,
"grad_norm": 12.5,
"learning_rate": 7.739979792811933e-05,
"loss": 0.0310546875,
"memory(GiB)": 43.05,
"step": 5395,
"train_speed(iter/s)": 0.646219
},
{
"epoch": 3.4996759559300066,
"grad_norm": 7.46875,
"learning_rate": 7.735496226761499e-05,
"loss": 0.0298828125,
"memory(GiB)": 43.05,
"step": 5400,
"train_speed(iter/s)": 0.646368
},
{
"epoch": 3.5029163966299417,
"grad_norm": 1.0234375,
"learning_rate": 7.73100951928822e-05,
"loss": 0.034375,
"memory(GiB)": 43.05,
"step": 5405,
"train_speed(iter/s)": 0.646371
},
{
"epoch": 3.506156837329877,
"grad_norm": 1.8046875,
"learning_rate": 7.726519675544597e-05,
"loss": 0.036328125,
"memory(GiB)": 43.05,
"step": 5410,
"train_speed(iter/s)": 0.646382
},
{
"epoch": 3.509397278029812,
"grad_norm": 5.53125,
"learning_rate": 7.722026700686727e-05,
"loss": 0.0404296875,
"memory(GiB)": 43.05,
"step": 5415,
"train_speed(iter/s)": 0.646589
},
{
"epoch": 3.5126377187297475,
"grad_norm": 13.0625,
"learning_rate": 7.717530599874311e-05,
"loss": 0.0271484375,
"memory(GiB)": 43.05,
"step": 5420,
"train_speed(iter/s)": 0.646705
},
{
"epoch": 3.5158781594296826,
"grad_norm": 1.65625,
"learning_rate": 7.71303137827064e-05,
"loss": 0.056640625,
"memory(GiB)": 43.05,
"step": 5425,
"train_speed(iter/s)": 0.64682
},
{
"epoch": 3.5191186001296177,
"grad_norm": 2.953125,
"learning_rate": 7.708529041042581e-05,
"loss": 0.0337890625,
"memory(GiB)": 43.05,
"step": 5430,
"train_speed(iter/s)": 0.647051
},
{
"epoch": 3.522359040829553,
"grad_norm": 1.9375,
"learning_rate": 7.704023593360583e-05,
"loss": 0.0412109375,
"memory(GiB)": 43.05,
"step": 5435,
"train_speed(iter/s)": 0.64715
},
{
"epoch": 3.525599481529488,
"grad_norm": 0.58984375,
"learning_rate": 7.69951504039867e-05,
"loss": 0.0578125,
"memory(GiB)": 43.05,
"step": 5440,
"train_speed(iter/s)": 0.647371
},
{
"epoch": 3.528839922229423,
"grad_norm": 1.3828125,
"learning_rate": 7.69500338733443e-05,
"loss": 0.033984375,
"memory(GiB)": 43.05,
"step": 5445,
"train_speed(iter/s)": 0.647472
},
{
"epoch": 3.5320803629293582,
"grad_norm": 14.25,
"learning_rate": 7.690488639349008e-05,
"loss": 0.0421875,
"memory(GiB)": 43.05,
"step": 5450,
"train_speed(iter/s)": 0.647435
},
{
"epoch": 3.5353208036292934,
"grad_norm": 10.0625,
"learning_rate": 7.685970801627108e-05,
"loss": 0.04765625,
"memory(GiB)": 43.05,
"step": 5455,
"train_speed(iter/s)": 0.647517
},
{
"epoch": 3.538561244329229,
"grad_norm": 12.75,
"learning_rate": 7.681449879356979e-05,
"loss": 0.03671875,
"memory(GiB)": 43.05,
"step": 5460,
"train_speed(iter/s)": 0.647619
},
{
"epoch": 3.541801685029164,
"grad_norm": 10.3125,
"learning_rate": 7.676925877730413e-05,
"loss": 0.030078125,
"memory(GiB)": 43.05,
"step": 5465,
"train_speed(iter/s)": 0.647615
},
{
"epoch": 3.545042125729099,
"grad_norm": 12.25,
"learning_rate": 7.67239880194274e-05,
"loss": 0.021875,
"memory(GiB)": 43.05,
"step": 5470,
"train_speed(iter/s)": 0.647878
},
{
"epoch": 3.5482825664290343,
"grad_norm": 16.375,
"learning_rate": 7.66786865719282e-05,
"loss": 0.0330078125,
"memory(GiB)": 43.05,
"step": 5475,
"train_speed(iter/s)": 0.648057
},
{
"epoch": 3.5515230071289694,
"grad_norm": 14.3125,
"learning_rate": 7.663335448683035e-05,
"loss": 0.032421875,
"memory(GiB)": 43.05,
"step": 5480,
"train_speed(iter/s)": 0.648051
},
{
"epoch": 3.554763447828905,
"grad_norm": 12.625,
"learning_rate": 7.658799181619284e-05,
"loss": 0.0263671875,
"memory(GiB)": 43.05,
"step": 5485,
"train_speed(iter/s)": 0.64819
},
{
"epoch": 3.55800388852884,
"grad_norm": 1.0703125,
"learning_rate": 7.654259861210987e-05,
"loss": 0.040625,
"memory(GiB)": 43.05,
"step": 5490,
"train_speed(iter/s)": 0.648173
},
{
"epoch": 3.561244329228775,
"grad_norm": 2.09375,
"learning_rate": 7.64971749267106e-05,
"loss": 0.03671875,
"memory(GiB)": 43.05,
"step": 5495,
"train_speed(iter/s)": 0.648254
},
{
"epoch": 3.5644847699287103,
"grad_norm": 0.6875,
"learning_rate": 7.645172081215926e-05,
"loss": 0.0533203125,
"memory(GiB)": 43.05,
"step": 5500,
"train_speed(iter/s)": 0.648366
},
{
"epoch": 3.5677252106286454,
"grad_norm": 14.375,
"learning_rate": 7.640623632065502e-05,
"loss": 0.051171875,
"memory(GiB)": 43.05,
"step": 5505,
"train_speed(iter/s)": 0.648634
},
{
"epoch": 3.5709656513285806,
"grad_norm": 2.6875,
"learning_rate": 7.63607215044319e-05,
"loss": 0.02421875,
"memory(GiB)": 43.05,
"step": 5510,
"train_speed(iter/s)": 0.648752
},
{
"epoch": 3.5742060920285157,
"grad_norm": 16.5,
"learning_rate": 7.631517641575875e-05,
"loss": 0.062109375,
"memory(GiB)": 43.05,
"step": 5515,
"train_speed(iter/s)": 0.648739
},
{
"epoch": 3.577446532728451,
"grad_norm": 11.75,
"learning_rate": 7.626960110693923e-05,
"loss": 0.036328125,
"memory(GiB)": 43.05,
"step": 5520,
"train_speed(iter/s)": 0.648853
},
{
"epoch": 3.5806869734283864,
"grad_norm": 0.66796875,
"learning_rate": 7.622399563031168e-05,
"loss": 0.0458984375,
"memory(GiB)": 43.05,
"step": 5525,
"train_speed(iter/s)": 0.648817
},
{
"epoch": 3.5839274141283215,
"grad_norm": 9.4375,
"learning_rate": 7.617836003824905e-05,
"loss": 0.0259765625,
"memory(GiB)": 43.05,
"step": 5530,
"train_speed(iter/s)": 0.648813
},
{
"epoch": 3.5871678548282566,
"grad_norm": 15.625,
"learning_rate": 7.613269438315892e-05,
"loss": 0.033203125,
"memory(GiB)": 43.05,
"step": 5535,
"train_speed(iter/s)": 0.648733
},
{
"epoch": 3.5904082955281917,
"grad_norm": 3.40625,
"learning_rate": 7.608699871748338e-05,
"loss": 0.02265625,
"memory(GiB)": 43.05,
"step": 5540,
"train_speed(iter/s)": 0.648878
},
{
"epoch": 3.593648736228127,
"grad_norm": 12.1875,
"learning_rate": 7.604127309369897e-05,
"loss": 0.0439453125,
"memory(GiB)": 43.05,
"step": 5545,
"train_speed(iter/s)": 0.648849
},
{
"epoch": 3.5968891769280624,
"grad_norm": 0.8203125,
"learning_rate": 7.599551756431665e-05,
"loss": 0.0345703125,
"memory(GiB)": 43.05,
"step": 5550,
"train_speed(iter/s)": 0.649039
},
{
"epoch": 3.6001296176279975,
"grad_norm": 8.0,
"learning_rate": 7.594973218188172e-05,
"loss": 0.0275390625,
"memory(GiB)": 43.05,
"step": 5555,
"train_speed(iter/s)": 0.649097
},
{
"epoch": 3.6033700583279327,
"grad_norm": 11.1875,
"learning_rate": 7.590391699897375e-05,
"loss": 0.0462890625,
"memory(GiB)": 43.05,
"step": 5560,
"train_speed(iter/s)": 0.649188
},
{
"epoch": 3.606610499027868,
"grad_norm": 3.65625,
"learning_rate": 7.585807206820656e-05,
"loss": 0.028515625,
"memory(GiB)": 43.05,
"step": 5565,
"train_speed(iter/s)": 0.6493
},
{
"epoch": 3.609850939727803,
"grad_norm": 1.796875,
"learning_rate": 7.581219744222812e-05,
"loss": 0.0404296875,
"memory(GiB)": 43.05,
"step": 5570,
"train_speed(iter/s)": 0.649432
},
{
"epoch": 3.613091380427738,
"grad_norm": 4.46875,
"learning_rate": 7.576629317372047e-05,
"loss": 0.025390625,
"memory(GiB)": 43.05,
"step": 5575,
"train_speed(iter/s)": 0.649619
},
{
"epoch": 3.616331821127673,
"grad_norm": 1.21875,
"learning_rate": 7.572035931539975e-05,
"loss": 0.060546875,
"memory(GiB)": 43.05,
"step": 5580,
"train_speed(iter/s)": 0.649764
},
{
"epoch": 3.6195722618276087,
"grad_norm": 2.421875,
"learning_rate": 7.567439592001604e-05,
"loss": 0.02890625,
"memory(GiB)": 43.05,
"step": 5585,
"train_speed(iter/s)": 0.649826
},
{
"epoch": 3.622812702527544,
"grad_norm": 10.6875,
"learning_rate": 7.562840304035334e-05,
"loss": 0.01513671875,
"memory(GiB)": 43.05,
"step": 5590,
"train_speed(iter/s)": 0.650042
},
{
"epoch": 3.626053143227479,
"grad_norm": 1.78125,
"learning_rate": 7.558238072922952e-05,
"loss": 0.0298828125,
"memory(GiB)": 43.05,
"step": 5595,
"train_speed(iter/s)": 0.65019
},
{
"epoch": 3.629293583927414,
"grad_norm": 2.953125,
"learning_rate": 7.553632903949626e-05,
"loss": 0.0345703125,
"memory(GiB)": 43.05,
"step": 5600,
"train_speed(iter/s)": 0.650337
},
{
"epoch": 3.632534024627349,
"grad_norm": 15.625,
"learning_rate": 7.549024802403897e-05,
"loss": 0.0248046875,
"memory(GiB)": 43.05,
"step": 5605,
"train_speed(iter/s)": 0.650391
},
{
"epoch": 3.6357744653272848,
"grad_norm": 1.5234375,
"learning_rate": 7.544413773577673e-05,
"loss": 0.021875,
"memory(GiB)": 43.05,
"step": 5610,
"train_speed(iter/s)": 0.650397
},
{
"epoch": 3.63901490602722,
"grad_norm": 8.5625,
"learning_rate": 7.539799822766223e-05,
"loss": 0.025390625,
"memory(GiB)": 43.05,
"step": 5615,
"train_speed(iter/s)": 0.650585
},
{
"epoch": 3.642255346727155,
"grad_norm": 15.9375,
"learning_rate": 7.535182955268173e-05,
"loss": 0.03125,
"memory(GiB)": 43.05,
"step": 5620,
"train_speed(iter/s)": 0.650687
},
{
"epoch": 3.64549578742709,
"grad_norm": 3.765625,
"learning_rate": 7.530563176385499e-05,
"loss": 0.0380859375,
"memory(GiB)": 43.05,
"step": 5625,
"train_speed(iter/s)": 0.650823
},
{
"epoch": 3.6487362281270252,
"grad_norm": 10.9375,
"learning_rate": 7.525940491423519e-05,
"loss": 0.0388671875,
"memory(GiB)": 43.05,
"step": 5630,
"train_speed(iter/s)": 0.651083
},
{
"epoch": 3.6519766688269604,
"grad_norm": 0.515625,
"learning_rate": 7.521314905690888e-05,
"loss": 0.0384765625,
"memory(GiB)": 43.05,
"step": 5635,
"train_speed(iter/s)": 0.651231
},
{
"epoch": 3.6552171095268955,
"grad_norm": 17.75,
"learning_rate": 7.516686424499595e-05,
"loss": 0.030859375,
"memory(GiB)": 43.05,
"step": 5640,
"train_speed(iter/s)": 0.651137
},
{
"epoch": 3.6584575502268306,
"grad_norm": 4.09375,
"learning_rate": 7.51205505316495e-05,
"loss": 0.0265625,
"memory(GiB)": 43.05,
"step": 5645,
"train_speed(iter/s)": 0.651233
},
{
"epoch": 3.661697990926766,
"grad_norm": 3.28125,
"learning_rate": 7.507420797005588e-05,
"loss": 0.0427734375,
"memory(GiB)": 43.05,
"step": 5650,
"train_speed(iter/s)": 0.651156
},
{
"epoch": 3.6649384316267013,
"grad_norm": 12.25,
"learning_rate": 7.502783661343449e-05,
"loss": 0.055078125,
"memory(GiB)": 43.05,
"step": 5655,
"train_speed(iter/s)": 0.651298
},
{
"epoch": 3.6681788723266364,
"grad_norm": 10.3125,
"learning_rate": 7.498143651503787e-05,
"loss": 0.0236328125,
"memory(GiB)": 43.05,
"step": 5660,
"train_speed(iter/s)": 0.651328
},
{
"epoch": 3.6714193130265715,
"grad_norm": 13.25,
"learning_rate": 7.493500772815149e-05,
"loss": 0.026171875,
"memory(GiB)": 43.05,
"step": 5665,
"train_speed(iter/s)": 0.65134
},
{
"epoch": 3.6746597537265067,
"grad_norm": 1.765625,
"learning_rate": 7.488855030609387e-05,
"loss": 0.034765625,
"memory(GiB)": 43.05,
"step": 5670,
"train_speed(iter/s)": 0.651437
},
{
"epoch": 3.6779001944264422,
"grad_norm": 11.0625,
"learning_rate": 7.484206430221634e-05,
"loss": 0.0361328125,
"memory(GiB)": 43.05,
"step": 5675,
"train_speed(iter/s)": 0.651513
},
{
"epoch": 3.6811406351263773,
"grad_norm": 1.875,
"learning_rate": 7.479554976990306e-05,
"loss": 0.0236328125,
"memory(GiB)": 43.05,
"step": 5680,
"train_speed(iter/s)": 0.651771
},
{
"epoch": 3.6843810758263125,
"grad_norm": 4.90625,
"learning_rate": 7.474900676257094e-05,
"loss": 0.039453125,
"memory(GiB)": 43.05,
"step": 5685,
"train_speed(iter/s)": 0.651846
},
{
"epoch": 3.6876215165262476,
"grad_norm": 8.5,
"learning_rate": 7.470243533366966e-05,
"loss": 0.034375,
"memory(GiB)": 43.05,
"step": 5690,
"train_speed(iter/s)": 0.651883
},
{
"epoch": 3.6908619572261827,
"grad_norm": 4.6875,
"learning_rate": 7.465583553668144e-05,
"loss": 0.0361328125,
"memory(GiB)": 43.05,
"step": 5695,
"train_speed(iter/s)": 0.652021
},
{
"epoch": 3.694102397926118,
"grad_norm": 12.0,
"learning_rate": 7.460920742512118e-05,
"loss": 0.046875,
"memory(GiB)": 43.05,
"step": 5700,
"train_speed(iter/s)": 0.652161
},
{
"epoch": 3.697342838626053,
"grad_norm": 3.421875,
"learning_rate": 7.45625510525362e-05,
"loss": 0.040625,
"memory(GiB)": 43.05,
"step": 5705,
"train_speed(iter/s)": 0.652185
},
{
"epoch": 3.700583279325988,
"grad_norm": 12.125,
"learning_rate": 7.451586647250635e-05,
"loss": 0.0333984375,
"memory(GiB)": 43.05,
"step": 5710,
"train_speed(iter/s)": 0.652326
},
{
"epoch": 3.7038237200259236,
"grad_norm": 8.6875,
"learning_rate": 7.446915373864384e-05,
"loss": 0.0380859375,
"memory(GiB)": 43.05,
"step": 5715,
"train_speed(iter/s)": 0.652367
},
{
"epoch": 3.7070641607258588,
"grad_norm": 5.21875,
"learning_rate": 7.442241290459318e-05,
"loss": 0.03125,
"memory(GiB)": 43.05,
"step": 5720,
"train_speed(iter/s)": 0.65226
},
{
"epoch": 3.710304601425794,
"grad_norm": 0.63671875,
"learning_rate": 7.437564402403123e-05,
"loss": 0.0220703125,
"memory(GiB)": 43.05,
"step": 5725,
"train_speed(iter/s)": 0.652272
},
{
"epoch": 3.713545042125729,
"grad_norm": 5.875,
"learning_rate": 7.4328847150667e-05,
"loss": 0.036328125,
"memory(GiB)": 43.05,
"step": 5730,
"train_speed(iter/s)": 0.652303
},
{
"epoch": 3.7167854828256646,
"grad_norm": 1.9765625,
"learning_rate": 7.428202233824164e-05,
"loss": 0.0318359375,
"memory(GiB)": 43.05,
"step": 5735,
"train_speed(iter/s)": 0.652285
},
{
"epoch": 3.7200259235255997,
"grad_norm": 6.0625,
"learning_rate": 7.423516964052844e-05,
"loss": 0.0306640625,
"memory(GiB)": 43.05,
"step": 5740,
"train_speed(iter/s)": 0.652526
},
{
"epoch": 3.723266364225535,
"grad_norm": 13.375,
"learning_rate": 7.418828911133263e-05,
"loss": 0.030859375,
"memory(GiB)": 43.05,
"step": 5745,
"train_speed(iter/s)": 0.652675
},
{
"epoch": 3.72650680492547,
"grad_norm": 1.984375,
"learning_rate": 7.414138080449149e-05,
"loss": 0.0375,
"memory(GiB)": 43.05,
"step": 5750,
"train_speed(iter/s)": 0.652823
},
{
"epoch": 3.729747245625405,
"grad_norm": 10.125,
"learning_rate": 7.409444477387416e-05,
"loss": 0.026171875,
"memory(GiB)": 43.05,
"step": 5755,
"train_speed(iter/s)": 0.652839
},
{
"epoch": 3.73298768632534,
"grad_norm": 4.15625,
"learning_rate": 7.404748107338157e-05,
"loss": 0.0228515625,
"memory(GiB)": 43.05,
"step": 5760,
"train_speed(iter/s)": 0.652819
},
{
"epoch": 3.7362281270252753,
"grad_norm": 13.375,
"learning_rate": 7.400048975694653e-05,
"loss": 0.0314453125,
"memory(GiB)": 43.05,
"step": 5765,
"train_speed(iter/s)": 0.653069
},
{
"epoch": 3.7394685677252104,
"grad_norm": 3.9375,
"learning_rate": 7.395347087853349e-05,
"loss": 0.0380859375,
"memory(GiB)": 43.05,
"step": 5770,
"train_speed(iter/s)": 0.653094
},
{
"epoch": 3.742709008425146,
"grad_norm": 2.84375,
"learning_rate": 7.390642449213852e-05,
"loss": 0.03515625,
"memory(GiB)": 43.05,
"step": 5775,
"train_speed(iter/s)": 0.653279
},
{
"epoch": 3.745949449125081,
"grad_norm": 13.5625,
"learning_rate": 7.385935065178941e-05,
"loss": 0.039453125,
"memory(GiB)": 43.05,
"step": 5780,
"train_speed(iter/s)": 0.653289
},
{
"epoch": 3.749189889825016,
"grad_norm": 9.875,
"learning_rate": 7.381224941154535e-05,
"loss": 0.03359375,
"memory(GiB)": 43.05,
"step": 5785,
"train_speed(iter/s)": 0.653497
},
{
"epoch": 3.7524303305249513,
"grad_norm": 3.5625,
"learning_rate": 7.376512082549702e-05,
"loss": 0.0384765625,
"memory(GiB)": 43.05,
"step": 5790,
"train_speed(iter/s)": 0.653597
},
{
"epoch": 3.7556707712248865,
"grad_norm": 0.7421875,
"learning_rate": 7.371796494776659e-05,
"loss": 0.028125,
"memory(GiB)": 43.05,
"step": 5795,
"train_speed(iter/s)": 0.653667
},
{
"epoch": 3.758911211924822,
"grad_norm": 1.8828125,
"learning_rate": 7.367078183250746e-05,
"loss": 0.04453125,
"memory(GiB)": 43.05,
"step": 5800,
"train_speed(iter/s)": 0.653779
},
{
"epoch": 3.762151652624757,
"grad_norm": 4.03125,
"learning_rate": 7.362357153390436e-05,
"loss": 0.0296875,
"memory(GiB)": 43.05,
"step": 5805,
"train_speed(iter/s)": 0.653809
},
{
"epoch": 3.7653920933246923,
"grad_norm": 5.84375,
"learning_rate": 7.357633410617324e-05,
"loss": 0.0265625,
"memory(GiB)": 43.05,
"step": 5810,
"train_speed(iter/s)": 0.65388
},
{
"epoch": 3.7686325340246274,
"grad_norm": 1.7734375,
"learning_rate": 7.352906960356122e-05,
"loss": 0.027734375,
"memory(GiB)": 43.05,
"step": 5815,
"train_speed(iter/s)": 0.653861
},
{
"epoch": 3.7718729747245625,
"grad_norm": 0.5859375,
"learning_rate": 7.348177808034646e-05,
"loss": 0.038671875,
"memory(GiB)": 43.05,
"step": 5820,
"train_speed(iter/s)": 0.654005
},
{
"epoch": 3.7751134154244976,
"grad_norm": 3.96875,
"learning_rate": 7.34344595908382e-05,
"loss": 0.039453125,
"memory(GiB)": 43.05,
"step": 5825,
"train_speed(iter/s)": 0.654134
},
{
"epoch": 3.7783538561244328,
"grad_norm": 2.03125,
"learning_rate": 7.338711418937663e-05,
"loss": 0.0326171875,
"memory(GiB)": 43.05,
"step": 5830,
"train_speed(iter/s)": 0.654145
},
{
"epoch": 3.781594296824368,
"grad_norm": 11.3125,
"learning_rate": 7.333974193033281e-05,
"loss": 0.05390625,
"memory(GiB)": 43.05,
"step": 5835,
"train_speed(iter/s)": 0.654288
},
{
"epoch": 3.7848347375243034,
"grad_norm": 5.15625,
"learning_rate": 7.329234286810876e-05,
"loss": 0.0458984375,
"memory(GiB)": 43.05,
"step": 5840,
"train_speed(iter/s)": 0.654426
},
{
"epoch": 3.7880751782242386,
"grad_norm": 13.125,
"learning_rate": 7.324491705713712e-05,
"loss": 0.030078125,
"memory(GiB)": 43.05,
"step": 5845,
"train_speed(iter/s)": 0.654567
},
{
"epoch": 3.7913156189241737,
"grad_norm": 10.6875,
"learning_rate": 7.319746455188135e-05,
"loss": 0.025,
"memory(GiB)": 43.05,
"step": 5850,
"train_speed(iter/s)": 0.654819
},
{
"epoch": 3.794556059624109,
"grad_norm": 16.5,
"learning_rate": 7.314998540683556e-05,
"loss": 0.041796875,
"memory(GiB)": 43.05,
"step": 5855,
"train_speed(iter/s)": 0.654923
},
{
"epoch": 3.797796500324044,
"grad_norm": 11.5,
"learning_rate": 7.310247967652442e-05,
"loss": 0.0333984375,
"memory(GiB)": 43.05,
"step": 5860,
"train_speed(iter/s)": 0.654971
},
{
"epoch": 3.8010369410239795,
"grad_norm": 11.0,
"learning_rate": 7.305494741550313e-05,
"loss": 0.0619140625,
"memory(GiB)": 43.05,
"step": 5865,
"train_speed(iter/s)": 0.654998
},
{
"epoch": 3.8042773817239146,
"grad_norm": 0.60546875,
"learning_rate": 7.30073886783574e-05,
"loss": 0.0326171875,
"memory(GiB)": 43.05,
"step": 5870,
"train_speed(iter/s)": 0.655134
},
{
"epoch": 3.8075178224238497,
"grad_norm": 12.1875,
"learning_rate": 7.29598035197033e-05,
"loss": 0.02578125,
"memory(GiB)": 43.05,
"step": 5875,
"train_speed(iter/s)": 0.654989
},
{
"epoch": 3.810758263123785,
"grad_norm": 5.6875,
"learning_rate": 7.29121919941873e-05,
"loss": 0.024609375,
"memory(GiB)": 43.05,
"step": 5880,
"train_speed(iter/s)": 0.655133
},
{
"epoch": 3.81399870382372,
"grad_norm": 0.78125,
"learning_rate": 7.286455415648607e-05,
"loss": 0.0404296875,
"memory(GiB)": 43.05,
"step": 5885,
"train_speed(iter/s)": 0.655036
},
{
"epoch": 3.817239144523655,
"grad_norm": 1.9765625,
"learning_rate": 7.281689006130653e-05,
"loss": 0.0310546875,
"memory(GiB)": 43.05,
"step": 5890,
"train_speed(iter/s)": 0.655153
},
{
"epoch": 3.82047958522359,
"grad_norm": 2.4375,
"learning_rate": 7.276919976338579e-05,
"loss": 0.0392578125,
"memory(GiB)": 43.05,
"step": 5895,
"train_speed(iter/s)": 0.655211
},
{
"epoch": 3.8237200259235253,
"grad_norm": 11.625,
"learning_rate": 7.2721483317491e-05,
"loss": 0.0291015625,
"memory(GiB)": 43.05,
"step": 5900,
"train_speed(iter/s)": 0.655411
},
{
"epoch": 3.826960466623461,
"grad_norm": 2.484375,
"learning_rate": 7.267374077841937e-05,
"loss": 0.0521484375,
"memory(GiB)": 43.05,
"step": 5905,
"train_speed(iter/s)": 0.65545
},
{
"epoch": 3.830200907323396,
"grad_norm": 15.25,
"learning_rate": 7.262597220099807e-05,
"loss": 0.03828125,
"memory(GiB)": 43.05,
"step": 5910,
"train_speed(iter/s)": 0.655409
},
{
"epoch": 3.833441348023331,
"grad_norm": 12.875,
"learning_rate": 7.257817764008417e-05,
"loss": 0.0267578125,
"memory(GiB)": 43.05,
"step": 5915,
"train_speed(iter/s)": 0.655582
},
{
"epoch": 3.8366817887232663,
"grad_norm": 11.4375,
"learning_rate": 7.253035715056456e-05,
"loss": 0.02421875,
"memory(GiB)": 43.05,
"step": 5920,
"train_speed(iter/s)": 0.655653
},
{
"epoch": 3.839922229423202,
"grad_norm": 1.6953125,
"learning_rate": 7.248251078735592e-05,
"loss": 0.03125,
"memory(GiB)": 43.05,
"step": 5925,
"train_speed(iter/s)": 0.655897
},
{
"epoch": 3.843162670123137,
"grad_norm": 15.125,
"learning_rate": 7.243463860540467e-05,
"loss": 0.0408203125,
"memory(GiB)": 43.05,
"step": 5930,
"train_speed(iter/s)": 0.655988
},
{
"epoch": 3.846403110823072,
"grad_norm": 17.0,
"learning_rate": 7.238674065968683e-05,
"loss": 0.021875,
"memory(GiB)": 43.05,
"step": 5935,
"train_speed(iter/s)": 0.656155
},
{
"epoch": 3.849643551523007,
"grad_norm": 13.625,
"learning_rate": 7.233881700520805e-05,
"loss": 0.0185546875,
"memory(GiB)": 43.05,
"step": 5940,
"train_speed(iter/s)": 0.656297
},
{
"epoch": 3.8528839922229423,
"grad_norm": 13.8125,
"learning_rate": 7.229086769700348e-05,
"loss": 0.033984375,
"memory(GiB)": 43.05,
"step": 5945,
"train_speed(iter/s)": 0.656436
},
{
"epoch": 3.8561244329228774,
"grad_norm": 10.0,
"learning_rate": 7.224289279013773e-05,
"loss": 0.018359375,
"memory(GiB)": 43.05,
"step": 5950,
"train_speed(iter/s)": 0.656551
},
{
"epoch": 3.8593648736228126,
"grad_norm": 0.61328125,
"learning_rate": 7.219489233970485e-05,
"loss": 0.031640625,
"memory(GiB)": 43.05,
"step": 5955,
"train_speed(iter/s)": 0.656645
},
{
"epoch": 3.8626053143227477,
"grad_norm": 3.53125,
"learning_rate": 7.214686640082815e-05,
"loss": 0.0296875,
"memory(GiB)": 43.05,
"step": 5960,
"train_speed(iter/s)": 0.656492
},
{
"epoch": 3.8658457550226832,
"grad_norm": 10.5,
"learning_rate": 7.209881502866024e-05,
"loss": 0.0294921875,
"memory(GiB)": 43.05,
"step": 5965,
"train_speed(iter/s)": 0.65655
},
{
"epoch": 3.8690861957226184,
"grad_norm": 11.375,
"learning_rate": 7.205073827838298e-05,
"loss": 0.0298828125,
"memory(GiB)": 43.05,
"step": 5970,
"train_speed(iter/s)": 0.656575
},
{
"epoch": 3.8723266364225535,
"grad_norm": 12.4375,
"learning_rate": 7.200263620520732e-05,
"loss": 0.031640625,
"memory(GiB)": 43.05,
"step": 5975,
"train_speed(iter/s)": 0.656699
},
{
"epoch": 3.8755670771224886,
"grad_norm": 14.875,
"learning_rate": 7.195450886437334e-05,
"loss": 0.03671875,
"memory(GiB)": 43.05,
"step": 5980,
"train_speed(iter/s)": 0.65679
},
{
"epoch": 3.8788075178224237,
"grad_norm": 10.5625,
"learning_rate": 7.190635631115007e-05,
"loss": 0.026171875,
"memory(GiB)": 43.05,
"step": 5985,
"train_speed(iter/s)": 0.656961
},
{
"epoch": 3.8820479585223593,
"grad_norm": 4.21875,
"learning_rate": 7.185817860083555e-05,
"loss": 0.0390625,
"memory(GiB)": 43.05,
"step": 5990,
"train_speed(iter/s)": 0.657
},
{
"epoch": 3.8852883992222944,
"grad_norm": 4.90625,
"learning_rate": 7.18099757887567e-05,
"loss": 0.0306640625,
"memory(GiB)": 43.05,
"step": 5995,
"train_speed(iter/s)": 0.657169
},
{
"epoch": 3.8885288399222295,
"grad_norm": 10.375,
"learning_rate": 7.176174793026924e-05,
"loss": 0.0435546875,
"memory(GiB)": 43.05,
"step": 6000,
"train_speed(iter/s)": 0.65731
},
{
"epoch": 3.8917692806221647,
"grad_norm": 8.6875,
"learning_rate": 7.171349508075768e-05,
"loss": 0.028515625,
"memory(GiB)": 43.05,
"step": 6005,
"train_speed(iter/s)": 0.61235
},
{
"epoch": 3.8950097213220998,
"grad_norm": 15.0,
"learning_rate": 7.166521729563523e-05,
"loss": 0.039453125,
"memory(GiB)": 43.05,
"step": 6010,
"train_speed(iter/s)": 0.612313
},
{
"epoch": 3.898250162022035,
"grad_norm": 9.375,
"learning_rate": 7.161691463034374e-05,
"loss": 0.03515625,
"memory(GiB)": 43.05,
"step": 6015,
"train_speed(iter/s)": 0.612442
},
{
"epoch": 3.90149060272197,
"grad_norm": 2.015625,
"learning_rate": 7.156858714035356e-05,
"loss": 0.0435546875,
"memory(GiB)": 43.05,
"step": 6020,
"train_speed(iter/s)": 0.612558
},
{
"epoch": 3.904731043421905,
"grad_norm": 2.421875,
"learning_rate": 7.152023488116368e-05,
"loss": 0.0357421875,
"memory(GiB)": 43.05,
"step": 6025,
"train_speed(iter/s)": 0.61267
},
{
"epoch": 3.9079714841218407,
"grad_norm": 4.0625,
"learning_rate": 7.147185790830144e-05,
"loss": 0.030078125,
"memory(GiB)": 43.05,
"step": 6030,
"train_speed(iter/s)": 0.612772
},
{
"epoch": 3.911211924821776,
"grad_norm": 1.7109375,
"learning_rate": 7.142345627732255e-05,
"loss": 0.0447265625,
"memory(GiB)": 43.05,
"step": 6035,
"train_speed(iter/s)": 0.612813
},
{
"epoch": 3.914452365521711,
"grad_norm": 14.75,
"learning_rate": 7.137503004381111e-05,
"loss": 0.0228515625,
"memory(GiB)": 43.05,
"step": 6040,
"train_speed(iter/s)": 0.612797
},
{
"epoch": 3.917692806221646,
"grad_norm": 13.0625,
"learning_rate": 7.132657926337942e-05,
"loss": 0.0373046875,
"memory(GiB)": 43.05,
"step": 6045,
"train_speed(iter/s)": 0.612879
},
{
"epoch": 3.920933246921581,
"grad_norm": 11.375,
"learning_rate": 7.127810399166798e-05,
"loss": 0.04609375,
"memory(GiB)": 43.05,
"step": 6050,
"train_speed(iter/s)": 0.613019
},
{
"epoch": 3.9241736876215167,
"grad_norm": 3.046875,
"learning_rate": 7.122960428434544e-05,
"loss": 0.0408203125,
"memory(GiB)": 43.05,
"step": 6055,
"train_speed(iter/s)": 0.610811
},
{
"epoch": 3.927414128321452,
"grad_norm": 13.6875,
"learning_rate": 7.118108019710847e-05,
"loss": 0.031640625,
"memory(GiB)": 43.05,
"step": 6060,
"train_speed(iter/s)": 0.610559
},
{
"epoch": 3.930654569021387,
"grad_norm": 8.3125,
"learning_rate": 7.113253178568176e-05,
"loss": 0.0484375,
"memory(GiB)": 43.05,
"step": 6065,
"train_speed(iter/s)": 0.6108
},
{
"epoch": 3.933895009721322,
"grad_norm": 13.875,
"learning_rate": 7.108395910581793e-05,
"loss": 0.039453125,
"memory(GiB)": 43.05,
"step": 6070,
"train_speed(iter/s)": 0.610517
},
{
"epoch": 3.9371354504212572,
"grad_norm": 4.96875,
"learning_rate": 7.10353622132975e-05,
"loss": 0.0298828125,
"memory(GiB)": 43.05,
"step": 6075,
"train_speed(iter/s)": 0.610727
},
{
"epoch": 3.9403758911211924,
"grad_norm": 1.4609375,
"learning_rate": 7.098674116392873e-05,
"loss": 0.0255859375,
"memory(GiB)": 43.05,
"step": 6080,
"train_speed(iter/s)": 0.610834
},
{
"epoch": 3.9436163318211275,
"grad_norm": 2.078125,
"learning_rate": 7.093809601354769e-05,
"loss": 0.02734375,
"memory(GiB)": 43.05,
"step": 6085,
"train_speed(iter/s)": 0.610838
},
{
"epoch": 3.9468567725210626,
"grad_norm": 0.63671875,
"learning_rate": 7.08894268180181e-05,
"loss": 0.048828125,
"memory(GiB)": 43.05,
"step": 6090,
"train_speed(iter/s)": 0.610448
},
{
"epoch": 3.950097213220998,
"grad_norm": 10.5625,
"learning_rate": 7.084073363323124e-05,
"loss": 0.0283203125,
"memory(GiB)": 43.05,
"step": 6095,
"train_speed(iter/s)": 0.610269
},
{
"epoch": 3.9533376539209333,
"grad_norm": 11.0625,
"learning_rate": 7.079201651510602e-05,
"loss": 0.037890625,
"memory(GiB)": 43.05,
"step": 6100,
"train_speed(iter/s)": 0.610368
},
{
"epoch": 3.9565780946208684,
"grad_norm": 12.0625,
"learning_rate": 7.074327551958883e-05,
"loss": 0.04453125,
"memory(GiB)": 43.05,
"step": 6105,
"train_speed(iter/s)": 0.610338
},
{
"epoch": 3.9598185353208035,
"grad_norm": 6.9375,
"learning_rate": 7.069451070265342e-05,
"loss": 0.02578125,
"memory(GiB)": 43.05,
"step": 6110,
"train_speed(iter/s)": 0.610249
},
{
"epoch": 3.963058976020739,
"grad_norm": 1.390625,
"learning_rate": 7.064572212030097e-05,
"loss": 0.026171875,
"memory(GiB)": 43.05,
"step": 6115,
"train_speed(iter/s)": 0.610338
},
{
"epoch": 3.966299416720674,
"grad_norm": 12.625,
"learning_rate": 7.059690982855988e-05,
"loss": 0.028515625,
"memory(GiB)": 43.05,
"step": 6120,
"train_speed(iter/s)": 0.610286
},
{
"epoch": 3.9695398574206093,
"grad_norm": 4.4375,
"learning_rate": 7.054807388348579e-05,
"loss": 0.019140625,
"memory(GiB)": 43.05,
"step": 6125,
"train_speed(iter/s)": 0.610403
},
{
"epoch": 3.9727802981205445,
"grad_norm": 5.65625,
"learning_rate": 7.049921434116158e-05,
"loss": 0.0181640625,
"memory(GiB)": 43.05,
"step": 6130,
"train_speed(iter/s)": 0.610555
},
{
"epoch": 3.9760207388204796,
"grad_norm": 2.734375,
"learning_rate": 7.045033125769713e-05,
"loss": 0.0212890625,
"memory(GiB)": 43.05,
"step": 6135,
"train_speed(iter/s)": 0.610652
},
{
"epoch": 3.9792611795204147,
"grad_norm": 2.28125,
"learning_rate": 7.04014246892294e-05,
"loss": 0.0404296875,
"memory(GiB)": 43.05,
"step": 6140,
"train_speed(iter/s)": 0.610823
},
{
"epoch": 3.98250162022035,
"grad_norm": 2.390625,
"learning_rate": 7.035249469192236e-05,
"loss": 0.0384765625,
"memory(GiB)": 43.05,
"step": 6145,
"train_speed(iter/s)": 0.610933
},
{
"epoch": 3.985742060920285,
"grad_norm": 11.375,
"learning_rate": 7.030354132196678e-05,
"loss": 0.021484375,
"memory(GiB)": 43.05,
"step": 6150,
"train_speed(iter/s)": 0.611014
},
{
"epoch": 3.9889825016202205,
"grad_norm": 0.76171875,
"learning_rate": 7.025456463558039e-05,
"loss": 0.0302734375,
"memory(GiB)": 43.05,
"step": 6155,
"train_speed(iter/s)": 0.610982
},
{
"epoch": 3.9922229423201556,
"grad_norm": 2.390625,
"learning_rate": 7.020556468900761e-05,
"loss": 0.0421875,
"memory(GiB)": 43.05,
"step": 6160,
"train_speed(iter/s)": 0.610946
},
{
"epoch": 3.9954633830200907,
"grad_norm": 1.1015625,
"learning_rate": 7.01565415385196e-05,
"loss": 0.021875,
"memory(GiB)": 43.05,
"step": 6165,
"train_speed(iter/s)": 0.611032
},
{
"epoch": 3.998703823720026,
"grad_norm": 6.84375,
"learning_rate": 7.010749524041417e-05,
"loss": 0.0248046875,
"memory(GiB)": 43.05,
"step": 6170,
"train_speed(iter/s)": 0.611242
},
{
"epoch": 4.001944264419961,
"grad_norm": 6.84375,
"learning_rate": 7.005842585101575e-05,
"loss": 0.0287109375,
"memory(GiB)": 43.05,
"step": 6175,
"train_speed(iter/s)": 0.611346
},
{
"epoch": 4.0051847051198965,
"grad_norm": 13.5625,
"learning_rate": 7.00093334266752e-05,
"loss": 0.04453125,
"memory(GiB)": 43.05,
"step": 6180,
"train_speed(iter/s)": 0.61137
},
{
"epoch": 4.008425145819832,
"grad_norm": 2.265625,
"learning_rate": 6.996021802376991e-05,
"loss": 0.0287109375,
"memory(GiB)": 43.05,
"step": 6185,
"train_speed(iter/s)": 0.611606
},
{
"epoch": 4.011665586519767,
"grad_norm": 0.98046875,
"learning_rate": 6.991107969870363e-05,
"loss": 0.034375,
"memory(GiB)": 43.05,
"step": 6190,
"train_speed(iter/s)": 0.611784
},
{
"epoch": 4.014906027219702,
"grad_norm": 14.3125,
"learning_rate": 6.986191850790641e-05,
"loss": 0.038671875,
"memory(GiB)": 43.05,
"step": 6195,
"train_speed(iter/s)": 0.611802
},
{
"epoch": 4.018146467919637,
"grad_norm": 1.4375,
"learning_rate": 6.981273450783462e-05,
"loss": 0.0341796875,
"memory(GiB)": 43.05,
"step": 6200,
"train_speed(iter/s)": 0.611746
},
{
"epoch": 4.021386908619572,
"grad_norm": 0.99609375,
"learning_rate": 6.976352775497075e-05,
"loss": 0.03515625,
"memory(GiB)": 43.05,
"step": 6205,
"train_speed(iter/s)": 0.611922
},
{
"epoch": 4.024627349319507,
"grad_norm": 14.4375,
"learning_rate": 6.971429830582347e-05,
"loss": 0.0322265625,
"memory(GiB)": 43.05,
"step": 6210,
"train_speed(iter/s)": 0.611948
},
{
"epoch": 4.027867790019442,
"grad_norm": 3.328125,
"learning_rate": 6.966504621692753e-05,
"loss": 0.0228515625,
"memory(GiB)": 43.05,
"step": 6215,
"train_speed(iter/s)": 0.612064
},
{
"epoch": 4.0311082307193775,
"grad_norm": 2.78125,
"learning_rate": 6.961577154484363e-05,
"loss": 0.0474609375,
"memory(GiB)": 43.05,
"step": 6220,
"train_speed(iter/s)": 0.61229
},
{
"epoch": 4.034348671419313,
"grad_norm": 10.75,
"learning_rate": 6.956647434615841e-05,
"loss": 0.0251953125,
"memory(GiB)": 43.05,
"step": 6225,
"train_speed(iter/s)": 0.612516
},
{
"epoch": 4.037589112119249,
"grad_norm": 3.078125,
"learning_rate": 6.951715467748442e-05,
"loss": 0.023828125,
"memory(GiB)": 43.05,
"step": 6230,
"train_speed(iter/s)": 0.612694
},
{
"epoch": 4.040829552819184,
"grad_norm": 14.5,
"learning_rate": 6.946781259545996e-05,
"loss": 0.025,
"memory(GiB)": 43.05,
"step": 6235,
"train_speed(iter/s)": 0.612841
},
{
"epoch": 4.044069993519119,
"grad_norm": 2.671875,
"learning_rate": 6.941844815674912e-05,
"loss": 0.0462890625,
"memory(GiB)": 43.05,
"step": 6240,
"train_speed(iter/s)": 0.612874
},
{
"epoch": 4.047310434219054,
"grad_norm": 3.640625,
"learning_rate": 6.936906141804164e-05,
"loss": 0.03515625,
"memory(GiB)": 43.05,
"step": 6245,
"train_speed(iter/s)": 0.612924
},
{
"epoch": 4.050550874918989,
"grad_norm": 9.9375,
"learning_rate": 6.931965243605286e-05,
"loss": 0.01796875,
"memory(GiB)": 43.05,
"step": 6250,
"train_speed(iter/s)": 0.613039
},
{
"epoch": 4.053791315618924,
"grad_norm": 13.125,
"learning_rate": 6.927022126752368e-05,
"loss": 0.03046875,
"memory(GiB)": 43.05,
"step": 6255,
"train_speed(iter/s)": 0.613154
},
{
"epoch": 4.057031756318859,
"grad_norm": 8.5625,
"learning_rate": 6.922076796922049e-05,
"loss": 0.0171875,
"memory(GiB)": 43.05,
"step": 6260,
"train_speed(iter/s)": 0.613357
},
{
"epoch": 4.0602721970187945,
"grad_norm": 9.4375,
"learning_rate": 6.917129259793506e-05,
"loss": 0.02265625,
"memory(GiB)": 43.05,
"step": 6265,
"train_speed(iter/s)": 0.613563
},
{
"epoch": 4.06351263771873,
"grad_norm": 14.0,
"learning_rate": 6.912179521048452e-05,
"loss": 0.0455078125,
"memory(GiB)": 43.05,
"step": 6270,
"train_speed(iter/s)": 0.613557
},
{
"epoch": 4.066753078418665,
"grad_norm": 13.375,
"learning_rate": 6.90722758637113e-05,
"loss": 0.022265625,
"memory(GiB)": 43.05,
"step": 6275,
"train_speed(iter/s)": 0.613633
},
{
"epoch": 4.0699935191186,
"grad_norm": 3.484375,
"learning_rate": 6.902273461448305e-05,
"loss": 0.0474609375,
"memory(GiB)": 43.05,
"step": 6280,
"train_speed(iter/s)": 0.613476
},
{
"epoch": 4.073233959818535,
"grad_norm": 8.625,
"learning_rate": 6.897317151969254e-05,
"loss": 0.0416015625,
"memory(GiB)": 43.05,
"step": 6285,
"train_speed(iter/s)": 0.61347
},
{
"epoch": 4.07647440051847,
"grad_norm": 1.171875,
"learning_rate": 6.892358663625766e-05,
"loss": 0.0294921875,
"memory(GiB)": 43.05,
"step": 6290,
"train_speed(iter/s)": 0.613416
},
{
"epoch": 4.079714841218406,
"grad_norm": 16.375,
"learning_rate": 6.887398002112129e-05,
"loss": 0.036328125,
"memory(GiB)": 43.05,
"step": 6295,
"train_speed(iter/s)": 0.613491
},
{
"epoch": 4.082955281918341,
"grad_norm": 9.1875,
"learning_rate": 6.88243517312513e-05,
"loss": 0.0330078125,
"memory(GiB)": 43.05,
"step": 6300,
"train_speed(iter/s)": 0.613596
},
{
"epoch": 4.086195722618276,
"grad_norm": 10.0625,
"learning_rate": 6.877470182364042e-05,
"loss": 0.0255859375,
"memory(GiB)": 43.05,
"step": 6305,
"train_speed(iter/s)": 0.613591
},
{
"epoch": 4.0894361633182115,
"grad_norm": 9.25,
"learning_rate": 6.872503035530626e-05,
"loss": 0.0349609375,
"memory(GiB)": 43.05,
"step": 6310,
"train_speed(iter/s)": 0.613738
},
{
"epoch": 4.092676604018147,
"grad_norm": 1.9921875,
"learning_rate": 6.867533738329113e-05,
"loss": 0.0357421875,
"memory(GiB)": 43.05,
"step": 6315,
"train_speed(iter/s)": 0.613884
},
{
"epoch": 4.095917044718082,
"grad_norm": 14.75,
"learning_rate": 6.862562296466208e-05,
"loss": 0.03203125,
"memory(GiB)": 43.05,
"step": 6320,
"train_speed(iter/s)": 0.613993
},
{
"epoch": 4.099157485418017,
"grad_norm": 9.875,
"learning_rate": 6.857588715651072e-05,
"loss": 0.033203125,
"memory(GiB)": 43.05,
"step": 6325,
"train_speed(iter/s)": 0.614134
},
{
"epoch": 4.102397926117952,
"grad_norm": 0.51953125,
"learning_rate": 6.852613001595329e-05,
"loss": 0.0244140625,
"memory(GiB)": 43.05,
"step": 6330,
"train_speed(iter/s)": 0.614223
},
{
"epoch": 4.105638366817887,
"grad_norm": 2.359375,
"learning_rate": 6.847635160013051e-05,
"loss": 0.030078125,
"memory(GiB)": 43.05,
"step": 6335,
"train_speed(iter/s)": 0.614396
},
{
"epoch": 4.108878807517822,
"grad_norm": 4.59375,
"learning_rate": 6.842655196620753e-05,
"loss": 0.0181640625,
"memory(GiB)": 43.05,
"step": 6340,
"train_speed(iter/s)": 0.614538
},
{
"epoch": 4.112119248217757,
"grad_norm": 11.3125,
"learning_rate": 6.837673117137388e-05,
"loss": 0.0259765625,
"memory(GiB)": 43.05,
"step": 6345,
"train_speed(iter/s)": 0.614612
},
{
"epoch": 4.115359688917692,
"grad_norm": 10.25,
"learning_rate": 6.832688927284336e-05,
"loss": 0.021484375,
"memory(GiB)": 43.05,
"step": 6350,
"train_speed(iter/s)": 0.6147
},
{
"epoch": 4.118600129617628,
"grad_norm": 1.0703125,
"learning_rate": 6.827702632785402e-05,
"loss": 0.0279296875,
"memory(GiB)": 43.05,
"step": 6355,
"train_speed(iter/s)": 0.614879
},
{
"epoch": 4.121840570317564,
"grad_norm": 7.96875,
"learning_rate": 6.822714239366811e-05,
"loss": 0.0302734375,
"memory(GiB)": 43.05,
"step": 6360,
"train_speed(iter/s)": 0.615015
},
{
"epoch": 4.125081011017499,
"grad_norm": 14.0,
"learning_rate": 6.817723752757195e-05,
"loss": 0.033984375,
"memory(GiB)": 43.05,
"step": 6365,
"train_speed(iter/s)": 0.615194
},
{
"epoch": 4.128321451717434,
"grad_norm": 13.4375,
"learning_rate": 6.812731178687587e-05,
"loss": 0.0330078125,
"memory(GiB)": 43.05,
"step": 6370,
"train_speed(iter/s)": 0.61525
},
{
"epoch": 4.131561892417369,
"grad_norm": 10.9375,
"learning_rate": 6.807736522891424e-05,
"loss": 0.048828125,
"memory(GiB)": 43.05,
"step": 6375,
"train_speed(iter/s)": 0.61541
},
{
"epoch": 4.134802333117304,
"grad_norm": 8.5,
"learning_rate": 6.802739791104529e-05,
"loss": 0.037890625,
"memory(GiB)": 43.05,
"step": 6380,
"train_speed(iter/s)": 0.615546
},
{
"epoch": 4.138042773817239,
"grad_norm": 14.8125,
"learning_rate": 6.79774098906511e-05,
"loss": 0.03125,
"memory(GiB)": 43.05,
"step": 6385,
"train_speed(iter/s)": 0.615397
},
{
"epoch": 4.141283214517174,
"grad_norm": 0.88671875,
"learning_rate": 6.792740122513755e-05,
"loss": 0.035546875,
"memory(GiB)": 43.05,
"step": 6390,
"train_speed(iter/s)": 0.615513
},
{
"epoch": 4.144523655217109,
"grad_norm": 0.56640625,
"learning_rate": 6.78773719719342e-05,
"loss": 0.0458984375,
"memory(GiB)": 43.05,
"step": 6395,
"train_speed(iter/s)": 0.615534
},
{
"epoch": 4.1477640959170445,
"grad_norm": 10.5,
"learning_rate": 6.782732218849424e-05,
"loss": 0.0482421875,
"memory(GiB)": 43.05,
"step": 6400,
"train_speed(iter/s)": 0.615526
},
{
"epoch": 4.15100453661698,
"grad_norm": 14.0625,
"learning_rate": 6.777725193229448e-05,
"loss": 0.02275390625,
"memory(GiB)": 43.05,
"step": 6405,
"train_speed(iter/s)": 0.615642
},
{
"epoch": 4.154244977316915,
"grad_norm": 13.0625,
"learning_rate": 6.772716126083521e-05,
"loss": 0.0296875,
"memory(GiB)": 43.05,
"step": 6410,
"train_speed(iter/s)": 0.615752
},
{
"epoch": 4.15748541801685,
"grad_norm": 13.0625,
"learning_rate": 6.767705023164016e-05,
"loss": 0.033984375,
"memory(GiB)": 43.05,
"step": 6415,
"train_speed(iter/s)": 0.615863
},
{
"epoch": 4.160725858716786,
"grad_norm": 0.625,
"learning_rate": 6.762691890225647e-05,
"loss": 0.029296875,
"memory(GiB)": 43.05,
"step": 6420,
"train_speed(iter/s)": 0.616061
},
{
"epoch": 4.163966299416721,
"grad_norm": 1.7265625,
"learning_rate": 6.757676733025456e-05,
"loss": 0.0314453125,
"memory(GiB)": 43.05,
"step": 6425,
"train_speed(iter/s)": 0.616173
},
{
"epoch": 4.167206740116656,
"grad_norm": 6.375,
"learning_rate": 6.752659557322812e-05,
"loss": 0.035546875,
"memory(GiB)": 43.05,
"step": 6430,
"train_speed(iter/s)": 0.616291
},
{
"epoch": 4.170447180816591,
"grad_norm": 4.1875,
"learning_rate": 6.747640368879401e-05,
"loss": 0.0322265625,
"memory(GiB)": 43.05,
"step": 6435,
"train_speed(iter/s)": 0.616452
},
{
"epoch": 4.173687621516526,
"grad_norm": 2.171875,
"learning_rate": 6.742619173459218e-05,
"loss": 0.0275390625,
"memory(GiB)": 43.05,
"step": 6440,
"train_speed(iter/s)": 0.61667
},
{
"epoch": 4.1769280622164615,
"grad_norm": 0.87890625,
"learning_rate": 6.737595976828568e-05,
"loss": 0.0306640625,
"memory(GiB)": 43.05,
"step": 6445,
"train_speed(iter/s)": 0.616679
},
{
"epoch": 4.180168502916397,
"grad_norm": 16.25,
"learning_rate": 6.732570784756051e-05,
"loss": 0.0279296875,
"memory(GiB)": 43.05,
"step": 6450,
"train_speed(iter/s)": 0.616704
},
{
"epoch": 4.183408943616332,
"grad_norm": 1.1171875,
"learning_rate": 6.727543603012559e-05,
"loss": 0.0328125,
"memory(GiB)": 43.05,
"step": 6455,
"train_speed(iter/s)": 0.616801
},
{
"epoch": 4.186649384316267,
"grad_norm": 10.5,
"learning_rate": 6.722514437371267e-05,
"loss": 0.017578125,
"memory(GiB)": 43.05,
"step": 6460,
"train_speed(iter/s)": 0.616851
},
{
"epoch": 4.189889825016202,
"grad_norm": 3.765625,
"learning_rate": 6.717483293607633e-05,
"loss": 0.032421875,
"memory(GiB)": 43.05,
"step": 6465,
"train_speed(iter/s)": 0.616955
},
{
"epoch": 4.193130265716137,
"grad_norm": 12.0,
"learning_rate": 6.71245017749938e-05,
"loss": 0.0259765625,
"memory(GiB)": 43.05,
"step": 6470,
"train_speed(iter/s)": 0.616829
},
{
"epoch": 4.196370706416072,
"grad_norm": 4.71875,
"learning_rate": 6.707415094826505e-05,
"loss": 0.037109375,
"memory(GiB)": 43.05,
"step": 6475,
"train_speed(iter/s)": 0.61682
},
{
"epoch": 4.199611147116007,
"grad_norm": 14.5,
"learning_rate": 6.702378051371254e-05,
"loss": 0.0185546875,
"memory(GiB)": 43.05,
"step": 6480,
"train_speed(iter/s)": 0.616836
},
{
"epoch": 4.202851587815943,
"grad_norm": 2.828125,
"learning_rate": 6.697339052918131e-05,
"loss": 0.033203125,
"memory(GiB)": 43.05,
"step": 6485,
"train_speed(iter/s)": 0.61684
},
{
"epoch": 4.2060920285158785,
"grad_norm": 2.3125,
"learning_rate": 6.692298105253883e-05,
"loss": 0.0447265625,
"memory(GiB)": 43.05,
"step": 6490,
"train_speed(iter/s)": 0.616881
},
{
"epoch": 4.209332469215814,
"grad_norm": 13.75,
"learning_rate": 6.687255214167496e-05,
"loss": 0.0390625,
"memory(GiB)": 43.05,
"step": 6495,
"train_speed(iter/s)": 0.616836
},
{
"epoch": 4.212572909915749,
"grad_norm": 1.6328125,
"learning_rate": 6.682210385450185e-05,
"loss": 0.0279296875,
"memory(GiB)": 43.05,
"step": 6500,
"train_speed(iter/s)": 0.617034
},
{
"epoch": 4.215813350615684,
"grad_norm": 3.109375,
"learning_rate": 6.677163624895393e-05,
"loss": 0.0240234375,
"memory(GiB)": 43.05,
"step": 6505,
"train_speed(iter/s)": 0.6172
},
{
"epoch": 4.219053791315619,
"grad_norm": 16.0,
"learning_rate": 6.672114938298785e-05,
"loss": 0.040625,
"memory(GiB)": 43.05,
"step": 6510,
"train_speed(iter/s)": 0.617218
},
{
"epoch": 4.222294232015554,
"grad_norm": 1.6875,
"learning_rate": 6.667064331458228e-05,
"loss": 0.0439453125,
"memory(GiB)": 43.05,
"step": 6515,
"train_speed(iter/s)": 0.61736
},
{
"epoch": 4.225534672715489,
"grad_norm": 15.875,
"learning_rate": 6.662011810173806e-05,
"loss": 0.0361328125,
"memory(GiB)": 43.05,
"step": 6520,
"train_speed(iter/s)": 0.617442
},
{
"epoch": 4.228775113415424,
"grad_norm": 6.90625,
"learning_rate": 6.656957380247792e-05,
"loss": 0.0234375,
"memory(GiB)": 43.05,
"step": 6525,
"train_speed(iter/s)": 0.617564
},
{
"epoch": 4.2320155541153595,
"grad_norm": 11.375,
"learning_rate": 6.651901047484654e-05,
"loss": 0.0296875,
"memory(GiB)": 43.05,
"step": 6530,
"train_speed(iter/s)": 0.61773
},
{
"epoch": 4.235255994815295,
"grad_norm": 3.796875,
"learning_rate": 6.646842817691047e-05,
"loss": 0.0259765625,
"memory(GiB)": 43.05,
"step": 6535,
"train_speed(iter/s)": 0.617777
},
{
"epoch": 4.23849643551523,
"grad_norm": 3.515625,
"learning_rate": 6.641782696675805e-05,
"loss": 0.0412109375,
"memory(GiB)": 43.05,
"step": 6540,
"train_speed(iter/s)": 0.617863
},
{
"epoch": 4.241736876215166,
"grad_norm": 2.21875,
"learning_rate": 6.636720690249928e-05,
"loss": 0.0314453125,
"memory(GiB)": 43.05,
"step": 6545,
"train_speed(iter/s)": 0.617916
},
{
"epoch": 4.244977316915101,
"grad_norm": 10.0,
"learning_rate": 6.631656804226589e-05,
"loss": 0.0263671875,
"memory(GiB)": 43.05,
"step": 6550,
"train_speed(iter/s)": 0.617946
},
{
"epoch": 4.248217757615036,
"grad_norm": 1.046875,
"learning_rate": 6.626591044421113e-05,
"loss": 0.026171875,
"memory(GiB)": 43.05,
"step": 6555,
"train_speed(iter/s)": 0.618107
},
{
"epoch": 4.251458198314971,
"grad_norm": 0.6328125,
"learning_rate": 6.621523416650983e-05,
"loss": 0.02890625,
"memory(GiB)": 43.05,
"step": 6560,
"train_speed(iter/s)": 0.618259
},
{
"epoch": 4.254698639014906,
"grad_norm": 13.125,
"learning_rate": 6.616453926735821e-05,
"loss": 0.0169921875,
"memory(GiB)": 43.05,
"step": 6565,
"train_speed(iter/s)": 0.618288
},
{
"epoch": 4.257939079714841,
"grad_norm": 2.796875,
"learning_rate": 6.611382580497389e-05,
"loss": 0.018359375,
"memory(GiB)": 43.05,
"step": 6570,
"train_speed(iter/s)": 0.618287
},
{
"epoch": 4.261179520414776,
"grad_norm": 9.125,
"learning_rate": 6.606309383759586e-05,
"loss": 0.03125,
"memory(GiB)": 43.05,
"step": 6575,
"train_speed(iter/s)": 0.618423
},
{
"epoch": 4.2644199611147116,
"grad_norm": 0.48828125,
"learning_rate": 6.60123434234843e-05,
"loss": 0.02734375,
"memory(GiB)": 43.05,
"step": 6580,
"train_speed(iter/s)": 0.618644
},
{
"epoch": 4.267660401814647,
"grad_norm": 1.0390625,
"learning_rate": 6.596157462092059e-05,
"loss": 0.0298828125,
"memory(GiB)": 43.05,
"step": 6585,
"train_speed(iter/s)": 0.618697
},
{
"epoch": 4.270900842514582,
"grad_norm": 3.984375,
"learning_rate": 6.591078748820725e-05,
"loss": 0.030859375,
"memory(GiB)": 43.05,
"step": 6590,
"train_speed(iter/s)": 0.618772
},
{
"epoch": 4.274141283214517,
"grad_norm": 18.125,
"learning_rate": 6.585998208366781e-05,
"loss": 0.03671875,
"memory(GiB)": 43.05,
"step": 6595,
"train_speed(iter/s)": 0.618836
},
{
"epoch": 4.277381723914452,
"grad_norm": 16.25,
"learning_rate": 6.580915846564683e-05,
"loss": 0.0462890625,
"memory(GiB)": 43.05,
"step": 6600,
"train_speed(iter/s)": 0.618855
},
{
"epoch": 4.280622164614387,
"grad_norm": 2.515625,
"learning_rate": 6.575831669250976e-05,
"loss": 0.0326171875,
"memory(GiB)": 43.05,
"step": 6605,
"train_speed(iter/s)": 0.618769
},
{
"epoch": 4.283862605314322,
"grad_norm": 11.3125,
"learning_rate": 6.570745682264288e-05,
"loss": 0.0205078125,
"memory(GiB)": 43.05,
"step": 6610,
"train_speed(iter/s)": 0.618837
},
{
"epoch": 4.287103046014258,
"grad_norm": 5.0,
"learning_rate": 6.565657891445326e-05,
"loss": 0.04375,
"memory(GiB)": 43.05,
"step": 6615,
"train_speed(iter/s)": 0.618891
},
{
"epoch": 4.290343486714193,
"grad_norm": 11.125,
"learning_rate": 6.560568302636877e-05,
"loss": 0.02109375,
"memory(GiB)": 43.05,
"step": 6620,
"train_speed(iter/s)": 0.618961
},
{
"epoch": 4.2935839274141285,
"grad_norm": 0.515625,
"learning_rate": 6.555476921683781e-05,
"loss": 0.0408203125,
"memory(GiB)": 43.05,
"step": 6625,
"train_speed(iter/s)": 0.619123
},
{
"epoch": 4.296824368114064,
"grad_norm": 14.0625,
"learning_rate": 6.55038375443294e-05,
"loss": 0.030859375,
"memory(GiB)": 43.05,
"step": 6630,
"train_speed(iter/s)": 0.619174
},
{
"epoch": 4.300064808813999,
"grad_norm": 10.5,
"learning_rate": 6.545288806733309e-05,
"loss": 0.036328125,
"memory(GiB)": 43.05,
"step": 6635,
"train_speed(iter/s)": 0.619255
},
{
"epoch": 4.303305249513934,
"grad_norm": 15.3125,
"learning_rate": 6.540192084435886e-05,
"loss": 0.0330078125,
"memory(GiB)": 43.05,
"step": 6640,
"train_speed(iter/s)": 0.619438
},
{
"epoch": 4.306545690213869,
"grad_norm": 8.9375,
"learning_rate": 6.535093593393708e-05,
"loss": 0.0228515625,
"memory(GiB)": 43.05,
"step": 6645,
"train_speed(iter/s)": 0.619513
},
{
"epoch": 4.309786130913804,
"grad_norm": 1.0546875,
"learning_rate": 6.529993339461846e-05,
"loss": 0.0435546875,
"memory(GiB)": 43.05,
"step": 6650,
"train_speed(iter/s)": 0.619554
},
{
"epoch": 4.313026571613739,
"grad_norm": 9.625,
"learning_rate": 6.52489132849739e-05,
"loss": 0.0208984375,
"memory(GiB)": 43.05,
"step": 6655,
"train_speed(iter/s)": 0.619772
},
{
"epoch": 4.316267012313674,
"grad_norm": 6.25,
"learning_rate": 6.519787566359448e-05,
"loss": 0.023828125,
"memory(GiB)": 43.05,
"step": 6660,
"train_speed(iter/s)": 0.619866
},
{
"epoch": 4.3195074530136095,
"grad_norm": 8.125,
"learning_rate": 6.514682058909146e-05,
"loss": 0.0326171875,
"memory(GiB)": 43.05,
"step": 6665,
"train_speed(iter/s)": 0.620005
},
{
"epoch": 4.3227478937135455,
"grad_norm": 0.8046875,
"learning_rate": 6.509574812009606e-05,
"loss": 0.0634765625,
"memory(GiB)": 43.05,
"step": 6670,
"train_speed(iter/s)": 0.620046
},
{
"epoch": 4.325988334413481,
"grad_norm": 11.125,
"learning_rate": 6.504465831525949e-05,
"loss": 0.0134765625,
"memory(GiB)": 43.05,
"step": 6675,
"train_speed(iter/s)": 0.620261
},
{
"epoch": 4.329228775113416,
"grad_norm": 12.5,
"learning_rate": 6.499355123325296e-05,
"loss": 0.0427734375,
"memory(GiB)": 43.05,
"step": 6680,
"train_speed(iter/s)": 0.620359
},
{
"epoch": 4.332469215813351,
"grad_norm": 0.7578125,
"learning_rate": 6.49424269327674e-05,
"loss": 0.048828125,
"memory(GiB)": 43.05,
"step": 6685,
"train_speed(iter/s)": 0.620398
},
{
"epoch": 4.335709656513286,
"grad_norm": 3.53125,
"learning_rate": 6.489128547251357e-05,
"loss": 0.027734375,
"memory(GiB)": 43.05,
"step": 6690,
"train_speed(iter/s)": 0.620478
},
{
"epoch": 4.338950097213221,
"grad_norm": 2.171875,
"learning_rate": 6.484012691122194e-05,
"loss": 0.016015625,
"memory(GiB)": 43.05,
"step": 6695,
"train_speed(iter/s)": 0.620475
},
{
"epoch": 4.342190537913156,
"grad_norm": 10.1875,
"learning_rate": 6.47889513076426e-05,
"loss": 0.0240234375,
"memory(GiB)": 43.05,
"step": 6700,
"train_speed(iter/s)": 0.620606
},
{
"epoch": 4.345430978613091,
"grad_norm": 0.94921875,
"learning_rate": 6.473775872054521e-05,
"loss": 0.030078125,
"memory(GiB)": 43.05,
"step": 6705,
"train_speed(iter/s)": 0.620706
},
{
"epoch": 4.3486714193130265,
"grad_norm": 11.5625,
"learning_rate": 6.468654920871897e-05,
"loss": 0.04296875,
"memory(GiB)": 43.05,
"step": 6710,
"train_speed(iter/s)": 0.62086
},
{
"epoch": 4.351911860012962,
"grad_norm": 1.140625,
"learning_rate": 6.463532283097247e-05,
"loss": 0.0203125,
"memory(GiB)": 43.05,
"step": 6715,
"train_speed(iter/s)": 0.620984
},
{
"epoch": 4.355152300712897,
"grad_norm": 6.75,
"learning_rate": 6.458407964613369e-05,
"loss": 0.0154296875,
"memory(GiB)": 43.05,
"step": 6720,
"train_speed(iter/s)": 0.620961
},
{
"epoch": 4.358392741412832,
"grad_norm": 0.9140625,
"learning_rate": 6.453281971304993e-05,
"loss": 0.030078125,
"memory(GiB)": 43.05,
"step": 6725,
"train_speed(iter/s)": 0.621143
},
{
"epoch": 4.361633182112767,
"grad_norm": 3.90625,
"learning_rate": 6.448154309058767e-05,
"loss": 0.0236328125,
"memory(GiB)": 43.05,
"step": 6730,
"train_speed(iter/s)": 0.621279
},
{
"epoch": 4.364873622812702,
"grad_norm": 0.5703125,
"learning_rate": 6.443024983763262e-05,
"loss": 0.037109375,
"memory(GiB)": 43.05,
"step": 6735,
"train_speed(iter/s)": 0.621353
},
{
"epoch": 4.368114063512638,
"grad_norm": 3.625,
"learning_rate": 6.437894001308953e-05,
"loss": 0.0373046875,
"memory(GiB)": 43.05,
"step": 6740,
"train_speed(iter/s)": 0.621461
},
{
"epoch": 4.371354504212573,
"grad_norm": 1.5078125,
"learning_rate": 6.432761367588223e-05,
"loss": 0.037109375,
"memory(GiB)": 43.05,
"step": 6745,
"train_speed(iter/s)": 0.621395
},
{
"epoch": 4.374594944912508,
"grad_norm": 17.875,
"learning_rate": 6.427627088495349e-05,
"loss": 0.04765625,
"memory(GiB)": 43.05,
"step": 6750,
"train_speed(iter/s)": 0.621476
},
{
"epoch": 4.3778353856124435,
"grad_norm": 0.515625,
"learning_rate": 6.422491169926495e-05,
"loss": 0.0384765625,
"memory(GiB)": 43.05,
"step": 6755,
"train_speed(iter/s)": 0.621498
},
{
"epoch": 4.381075826312379,
"grad_norm": 14.8125,
"learning_rate": 6.417353617779715e-05,
"loss": 0.0451171875,
"memory(GiB)": 43.05,
"step": 6760,
"train_speed(iter/s)": 0.621524
},
{
"epoch": 4.384316267012314,
"grad_norm": 5.15625,
"learning_rate": 6.41221443795493e-05,
"loss": 0.0400390625,
"memory(GiB)": 43.05,
"step": 6765,
"train_speed(iter/s)": 0.621575
},
{
"epoch": 4.387556707712249,
"grad_norm": 4.75,
"learning_rate": 6.407073636353937e-05,
"loss": 0.0232421875,
"memory(GiB)": 43.05,
"step": 6770,
"train_speed(iter/s)": 0.621704
},
{
"epoch": 4.390797148412184,
"grad_norm": 14.6875,
"learning_rate": 6.401931218880393e-05,
"loss": 0.046875,
"memory(GiB)": 43.05,
"step": 6775,
"train_speed(iter/s)": 0.621833
},
{
"epoch": 4.394037589112119,
"grad_norm": 15.3125,
"learning_rate": 6.396787191439808e-05,
"loss": 0.04140625,
"memory(GiB)": 43.05,
"step": 6780,
"train_speed(iter/s)": 0.621942
},
{
"epoch": 4.397278029812054,
"grad_norm": 17.0,
"learning_rate": 6.391641559939549e-05,
"loss": 0.046484375,
"memory(GiB)": 43.05,
"step": 6785,
"train_speed(iter/s)": 0.622018
},
{
"epoch": 4.400518470511989,
"grad_norm": 15.25,
"learning_rate": 6.386494330288815e-05,
"loss": 0.04375,
"memory(GiB)": 43.05,
"step": 6790,
"train_speed(iter/s)": 0.622035
},
{
"epoch": 4.403758911211924,
"grad_norm": 5.5,
"learning_rate": 6.381345508398647e-05,
"loss": 0.033203125,
"memory(GiB)": 43.05,
"step": 6795,
"train_speed(iter/s)": 0.6221
},
{
"epoch": 4.40699935191186,
"grad_norm": 0.80859375,
"learning_rate": 6.376195100181911e-05,
"loss": 0.0322265625,
"memory(GiB)": 43.05,
"step": 6800,
"train_speed(iter/s)": 0.622253
},
{
"epoch": 4.4102397926117956,
"grad_norm": 8.875,
"learning_rate": 6.371043111553296e-05,
"loss": 0.0361328125,
"memory(GiB)": 43.05,
"step": 6805,
"train_speed(iter/s)": 0.622384
},
{
"epoch": 4.413480233311731,
"grad_norm": 7.9375,
"learning_rate": 6.365889548429309e-05,
"loss": 0.023046875,
"memory(GiB)": 43.05,
"step": 6810,
"train_speed(iter/s)": 0.622483
},
{
"epoch": 4.416720674011666,
"grad_norm": 11.4375,
"learning_rate": 6.360734416728261e-05,
"loss": 0.0298828125,
"memory(GiB)": 43.05,
"step": 6815,
"train_speed(iter/s)": 0.622554
},
{
"epoch": 4.419961114711601,
"grad_norm": 13.75,
"learning_rate": 6.355577722370264e-05,
"loss": 0.0263671875,
"memory(GiB)": 43.05,
"step": 6820,
"train_speed(iter/s)": 0.622643
},
{
"epoch": 4.423201555411536,
"grad_norm": 12.0625,
"learning_rate": 6.35041947127723e-05,
"loss": 0.0294921875,
"memory(GiB)": 43.05,
"step": 6825,
"train_speed(iter/s)": 0.622739
},
{
"epoch": 4.426441996111471,
"grad_norm": 18.125,
"learning_rate": 6.345259669372849e-05,
"loss": 0.0328125,
"memory(GiB)": 43.05,
"step": 6830,
"train_speed(iter/s)": 0.62278
},
{
"epoch": 4.429682436811406,
"grad_norm": 6.53125,
"learning_rate": 6.340098322582603e-05,
"loss": 0.03046875,
"memory(GiB)": 43.05,
"step": 6835,
"train_speed(iter/s)": 0.622944
},
{
"epoch": 4.432922877511341,
"grad_norm": 3.578125,
"learning_rate": 6.334935436833741e-05,
"loss": 0.03125,
"memory(GiB)": 43.05,
"step": 6840,
"train_speed(iter/s)": 0.623049
},
{
"epoch": 4.4361633182112765,
"grad_norm": 12.0,
"learning_rate": 6.329771018055281e-05,
"loss": 0.026171875,
"memory(GiB)": 43.05,
"step": 6845,
"train_speed(iter/s)": 0.623179
},
{
"epoch": 4.439403758911212,
"grad_norm": 17.0,
"learning_rate": 6.324605072178002e-05,
"loss": 0.0263671875,
"memory(GiB)": 43.05,
"step": 6850,
"train_speed(iter/s)": 0.623055
},
{
"epoch": 4.442644199611147,
"grad_norm": 10.8125,
"learning_rate": 6.319437605134437e-05,
"loss": 0.025390625,
"memory(GiB)": 43.05,
"step": 6855,
"train_speed(iter/s)": 0.62318
},
{
"epoch": 4.445884640311082,
"grad_norm": 1.6953125,
"learning_rate": 6.314268622858866e-05,
"loss": 0.026171875,
"memory(GiB)": 43.05,
"step": 6860,
"train_speed(iter/s)": 0.623258
},
{
"epoch": 4.449125081011018,
"grad_norm": 13.8125,
"learning_rate": 6.309098131287308e-05,
"loss": 0.031640625,
"memory(GiB)": 43.05,
"step": 6865,
"train_speed(iter/s)": 0.623343
},
{
"epoch": 4.452365521710953,
"grad_norm": 8.375,
"learning_rate": 6.303926136357517e-05,
"loss": 0.0255859375,
"memory(GiB)": 43.05,
"step": 6870,
"train_speed(iter/s)": 0.623556
},
{
"epoch": 4.455605962410888,
"grad_norm": 4.25,
"learning_rate": 6.298752644008967e-05,
"loss": 0.0216796875,
"memory(GiB)": 43.05,
"step": 6875,
"train_speed(iter/s)": 0.623629
},
{
"epoch": 4.458846403110823,
"grad_norm": 0.470703125,
"learning_rate": 6.293577660182863e-05,
"loss": 0.0326171875,
"memory(GiB)": 43.05,
"step": 6880,
"train_speed(iter/s)": 0.623641
},
{
"epoch": 4.462086843810758,
"grad_norm": 13.6875,
"learning_rate": 6.288401190822116e-05,
"loss": 0.0345703125,
"memory(GiB)": 43.05,
"step": 6885,
"train_speed(iter/s)": 0.623644
},
{
"epoch": 4.4653272845106935,
"grad_norm": 0.890625,
"learning_rate": 6.283223241871338e-05,
"loss": 0.0376953125,
"memory(GiB)": 43.05,
"step": 6890,
"train_speed(iter/s)": 0.623819
},
{
"epoch": 4.468567725210629,
"grad_norm": 5.625,
"learning_rate": 6.278043819276853e-05,
"loss": 0.0328125,
"memory(GiB)": 43.05,
"step": 6895,
"train_speed(iter/s)": 0.623808
},
{
"epoch": 4.471808165910564,
"grad_norm": 10.25,
"learning_rate": 6.272862928986668e-05,
"loss": 0.01796875,
"memory(GiB)": 43.05,
"step": 6900,
"train_speed(iter/s)": 0.623992
},
{
"epoch": 4.475048606610499,
"grad_norm": 10.8125,
"learning_rate": 6.267680576950473e-05,
"loss": 0.028515625,
"memory(GiB)": 43.05,
"step": 6905,
"train_speed(iter/s)": 0.62414
},
{
"epoch": 4.478289047310434,
"grad_norm": 13.1875,
"learning_rate": 6.262496769119646e-05,
"loss": 0.02578125,
"memory(GiB)": 43.05,
"step": 6910,
"train_speed(iter/s)": 0.624303
},
{
"epoch": 4.481529488010369,
"grad_norm": 0.84765625,
"learning_rate": 6.257311511447232e-05,
"loss": 0.019921875,
"memory(GiB)": 43.05,
"step": 6915,
"train_speed(iter/s)": 0.62434
},
{
"epoch": 4.484769928710304,
"grad_norm": 15.6875,
"learning_rate": 6.252124809887938e-05,
"loss": 0.0318359375,
"memory(GiB)": 43.05,
"step": 6920,
"train_speed(iter/s)": 0.624398
},
{
"epoch": 4.48801036941024,
"grad_norm": 9.1875,
"learning_rate": 6.246936670398136e-05,
"loss": 0.0259765625,
"memory(GiB)": 43.05,
"step": 6925,
"train_speed(iter/s)": 0.624461
},
{
"epoch": 4.491250810110175,
"grad_norm": 0.6796875,
"learning_rate": 6.241747098935843e-05,
"loss": 0.0326171875,
"memory(GiB)": 43.05,
"step": 6930,
"train_speed(iter/s)": 0.624671
},
{
"epoch": 4.4944912508101105,
"grad_norm": 8.4375,
"learning_rate": 6.236556101460724e-05,
"loss": 0.033984375,
"memory(GiB)": 43.05,
"step": 6935,
"train_speed(iter/s)": 0.624793
},
{
"epoch": 4.497731691510046,
"grad_norm": 16.25,
"learning_rate": 6.23136368393408e-05,
"loss": 0.03984375,
"memory(GiB)": 43.05,
"step": 6940,
"train_speed(iter/s)": 0.62492
},
{
"epoch": 4.500972132209981,
"grad_norm": 2.015625,
"learning_rate": 6.226169852318842e-05,
"loss": 0.0333984375,
"memory(GiB)": 43.05,
"step": 6945,
"train_speed(iter/s)": 0.625014
},
{
"epoch": 4.504212572909916,
"grad_norm": 9.625,
"learning_rate": 6.22097461257957e-05,
"loss": 0.0330078125,
"memory(GiB)": 43.05,
"step": 6950,
"train_speed(iter/s)": 0.625087
},
{
"epoch": 4.507453013609851,
"grad_norm": 10.875,
"learning_rate": 6.215777970682435e-05,
"loss": 0.0365234375,
"memory(GiB)": 43.05,
"step": 6955,
"train_speed(iter/s)": 0.625181
},
{
"epoch": 4.510693454309786,
"grad_norm": 2.890625,
"learning_rate": 6.210579932595219e-05,
"loss": 0.028515625,
"memory(GiB)": 43.05,
"step": 6960,
"train_speed(iter/s)": 0.625165
},
{
"epoch": 4.513933895009721,
"grad_norm": 16.25,
"learning_rate": 6.205380504287314e-05,
"loss": 0.041796875,
"memory(GiB)": 43.05,
"step": 6965,
"train_speed(iter/s)": 0.625297
},
{
"epoch": 4.517174335709656,
"grad_norm": 5.375,
"learning_rate": 6.2001796917297e-05,
"loss": 0.0404296875,
"memory(GiB)": 43.05,
"step": 6970,
"train_speed(iter/s)": 0.625372
},
{
"epoch": 4.520414776409591,
"grad_norm": 1.375,
"learning_rate": 6.19497750089495e-05,
"loss": 0.024609375,
"memory(GiB)": 43.05,
"step": 6975,
"train_speed(iter/s)": 0.625461
},
{
"epoch": 4.523655217109527,
"grad_norm": 8.5625,
"learning_rate": 6.18977393775722e-05,
"loss": 0.038671875,
"memory(GiB)": 43.05,
"step": 6980,
"train_speed(iter/s)": 0.625638
},
{
"epoch": 4.526895657809462,
"grad_norm": 1.953125,
"learning_rate": 6.184569008292242e-05,
"loss": 0.01875,
"memory(GiB)": 43.05,
"step": 6985,
"train_speed(iter/s)": 0.625769
},
{
"epoch": 4.530136098509397,
"grad_norm": 5.3125,
"learning_rate": 6.179362718477319e-05,
"loss": 0.0279296875,
"memory(GiB)": 43.05,
"step": 6990,
"train_speed(iter/s)": 0.625847
},
{
"epoch": 4.533376539209333,
"grad_norm": 1.8125,
"learning_rate": 6.174155074291312e-05,
"loss": 0.0201171875,
"memory(GiB)": 43.05,
"step": 6995,
"train_speed(iter/s)": 0.626021
},
{
"epoch": 4.536616979909268,
"grad_norm": 2.03125,
"learning_rate": 6.168946081714642e-05,
"loss": 0.021875,
"memory(GiB)": 43.05,
"step": 7000,
"train_speed(iter/s)": 0.626232
},
{
"epoch": 4.539857420609203,
"grad_norm": 1.0234375,
"learning_rate": 6.163735746729272e-05,
"loss": 0.0318359375,
"memory(GiB)": 43.05,
"step": 7005,
"train_speed(iter/s)": 0.626324
},
{
"epoch": 4.543097861309138,
"grad_norm": 8.0625,
"learning_rate": 6.158524075318715e-05,
"loss": 0.0349609375,
"memory(GiB)": 43.05,
"step": 7010,
"train_speed(iter/s)": 0.626345
},
{
"epoch": 4.546338302009073,
"grad_norm": 14.8125,
"learning_rate": 6.153311073468011e-05,
"loss": 0.0279296875,
"memory(GiB)": 43.05,
"step": 7015,
"train_speed(iter/s)": 0.626445
},
{
"epoch": 4.549578742709008,
"grad_norm": 14.3125,
"learning_rate": 6.148096747163734e-05,
"loss": 0.029296875,
"memory(GiB)": 43.05,
"step": 7020,
"train_speed(iter/s)": 0.626427
},
{
"epoch": 4.5528191834089435,
"grad_norm": 11.625,
"learning_rate": 6.142881102393973e-05,
"loss": 0.0345703125,
"memory(GiB)": 43.05,
"step": 7025,
"train_speed(iter/s)": 0.62628
},
{
"epoch": 4.556059624108879,
"grad_norm": 2.84375,
"learning_rate": 6.137664145148339e-05,
"loss": 0.02626953125,
"memory(GiB)": 43.05,
"step": 7030,
"train_speed(iter/s)": 0.62635
},
{
"epoch": 4.559300064808814,
"grad_norm": 6.875,
"learning_rate": 6.132445881417941e-05,
"loss": 0.0146484375,
"memory(GiB)": 43.05,
"step": 7035,
"train_speed(iter/s)": 0.626468
},
{
"epoch": 4.562540505508749,
"grad_norm": 14.4375,
"learning_rate": 6.127226317195396e-05,
"loss": 0.0490234375,
"memory(GiB)": 43.05,
"step": 7040,
"train_speed(iter/s)": 0.626465
},
{
"epoch": 4.565780946208684,
"grad_norm": 4.15625,
"learning_rate": 6.122005458474808e-05,
"loss": 0.026171875,
"memory(GiB)": 43.05,
"step": 7045,
"train_speed(iter/s)": 0.626539
},
{
"epoch": 4.56902138690862,
"grad_norm": 7.125,
"learning_rate": 6.116783311251775e-05,
"loss": 0.027734375,
"memory(GiB)": 43.05,
"step": 7050,
"train_speed(iter/s)": 0.626607
},
{
"epoch": 4.572261827608555,
"grad_norm": 5.8125,
"learning_rate": 6.111559881523371e-05,
"loss": 0.028125,
"memory(GiB)": 43.05,
"step": 7055,
"train_speed(iter/s)": 0.626759
},
{
"epoch": 4.57550226830849,
"grad_norm": 10.3125,
"learning_rate": 6.106335175288139e-05,
"loss": 0.0408203125,
"memory(GiB)": 43.05,
"step": 7060,
"train_speed(iter/s)": 0.626856
},
{
"epoch": 4.578742709008425,
"grad_norm": 13.5,
"learning_rate": 6.101109198546093e-05,
"loss": 0.0265625,
"memory(GiB)": 43.05,
"step": 7065,
"train_speed(iter/s)": 0.627055
},
{
"epoch": 4.5819831497083605,
"grad_norm": 3.609375,
"learning_rate": 6.095881957298706e-05,
"loss": 0.0318359375,
"memory(GiB)": 43.05,
"step": 7070,
"train_speed(iter/s)": 0.627056
},
{
"epoch": 4.585223590408296,
"grad_norm": 7.6875,
"learning_rate": 6.0906534575488994e-05,
"loss": 0.0314453125,
"memory(GiB)": 43.05,
"step": 7075,
"train_speed(iter/s)": 0.627143
},
{
"epoch": 4.588464031108231,
"grad_norm": 13.25,
"learning_rate": 6.0854237053010424e-05,
"loss": 0.024609375,
"memory(GiB)": 43.05,
"step": 7080,
"train_speed(iter/s)": 0.627199
},
{
"epoch": 4.591704471808166,
"grad_norm": 0.6015625,
"learning_rate": 6.080192706560944e-05,
"loss": 0.0228515625,
"memory(GiB)": 43.05,
"step": 7085,
"train_speed(iter/s)": 0.627417
},
{
"epoch": 4.594944912508101,
"grad_norm": 16.375,
"learning_rate": 6.074960467335842e-05,
"loss": 0.0494140625,
"memory(GiB)": 43.05,
"step": 7090,
"train_speed(iter/s)": 0.627538
},
{
"epoch": 4.598185353208036,
"grad_norm": 13.5,
"learning_rate": 6.0697269936343994e-05,
"loss": 0.0310546875,
"memory(GiB)": 43.05,
"step": 7095,
"train_speed(iter/s)": 0.627544
},
{
"epoch": 4.601425793907971,
"grad_norm": 2.109375,
"learning_rate": 6.064492291466698e-05,
"loss": 0.0251953125,
"memory(GiB)": 43.05,
"step": 7100,
"train_speed(iter/s)": 0.627636
},
{
"epoch": 4.604666234607906,
"grad_norm": 14.6875,
"learning_rate": 6.059256366844228e-05,
"loss": 0.0271484375,
"memory(GiB)": 43.05,
"step": 7105,
"train_speed(iter/s)": 0.627726
},
{
"epoch": 4.6079066753078415,
"grad_norm": 10.0625,
"learning_rate": 6.054019225779888e-05,
"loss": 0.0333984375,
"memory(GiB)": 43.05,
"step": 7110,
"train_speed(iter/s)": 0.627781
},
{
"epoch": 4.611147116007777,
"grad_norm": 13.4375,
"learning_rate": 6.048780874287967e-05,
"loss": 0.023828125,
"memory(GiB)": 43.05,
"step": 7115,
"train_speed(iter/s)": 0.62782
},
{
"epoch": 4.614387556707713,
"grad_norm": 15.9375,
"learning_rate": 6.0435413183841484e-05,
"loss": 0.03515625,
"memory(GiB)": 43.05,
"step": 7120,
"train_speed(iter/s)": 0.627882
},
{
"epoch": 4.617627997407648,
"grad_norm": 2.703125,
"learning_rate": 6.0383005640855006e-05,
"loss": 0.027734375,
"memory(GiB)": 43.05,
"step": 7125,
"train_speed(iter/s)": 0.627954
},
{
"epoch": 4.620868438107583,
"grad_norm": 0.7890625,
"learning_rate": 6.0330586174104644e-05,
"loss": 0.041796875,
"memory(GiB)": 43.05,
"step": 7130,
"train_speed(iter/s)": 0.628013
},
{
"epoch": 4.624108878807518,
"grad_norm": 11.4375,
"learning_rate": 6.027815484378848e-05,
"loss": 0.0224609375,
"memory(GiB)": 43.05,
"step": 7135,
"train_speed(iter/s)": 0.628186
},
{
"epoch": 4.627349319507453,
"grad_norm": 2.34375,
"learning_rate": 6.0225711710118296e-05,
"loss": 0.0380859375,
"memory(GiB)": 43.05,
"step": 7140,
"train_speed(iter/s)": 0.628313
},
{
"epoch": 4.630589760207388,
"grad_norm": 0.75,
"learning_rate": 6.0173256833319336e-05,
"loss": 0.029296875,
"memory(GiB)": 43.05,
"step": 7145,
"train_speed(iter/s)": 0.628297
},
{
"epoch": 4.633830200907323,
"grad_norm": 4.90625,
"learning_rate": 6.012079027363041e-05,
"loss": 0.0333984375,
"memory(GiB)": 43.05,
"step": 7150,
"train_speed(iter/s)": 0.628393
},
{
"epoch": 4.6370706416072585,
"grad_norm": 12.0625,
"learning_rate": 6.006831209130372e-05,
"loss": 0.02421875,
"memory(GiB)": 43.05,
"step": 7155,
"train_speed(iter/s)": 0.628479
},
{
"epoch": 4.640311082307194,
"grad_norm": 5.21875,
"learning_rate": 6.00158223466048e-05,
"loss": 0.0314453125,
"memory(GiB)": 43.05,
"step": 7160,
"train_speed(iter/s)": 0.628405
},
{
"epoch": 4.643551523007129,
"grad_norm": 7.5,
"learning_rate": 5.9963321099812445e-05,
"loss": 0.0251953125,
"memory(GiB)": 43.05,
"step": 7165,
"train_speed(iter/s)": 0.628501
},
{
"epoch": 4.646791963707064,
"grad_norm": 9.8125,
"learning_rate": 5.991080841121871e-05,
"loss": 0.0298828125,
"memory(GiB)": 43.05,
"step": 7170,
"train_speed(iter/s)": 0.628517
},
{
"epoch": 4.650032404407,
"grad_norm": 11.6875,
"learning_rate": 5.9858284341128756e-05,
"loss": 0.0232421875,
"memory(GiB)": 43.05,
"step": 7175,
"train_speed(iter/s)": 0.628643
},
{
"epoch": 4.653272845106935,
"grad_norm": 1.03125,
"learning_rate": 5.980574894986081e-05,
"loss": 0.041015625,
"memory(GiB)": 43.05,
"step": 7180,
"train_speed(iter/s)": 0.628817
},
{
"epoch": 4.65651328580687,
"grad_norm": 1.4921875,
"learning_rate": 5.975320229774612e-05,
"loss": 0.0423828125,
"memory(GiB)": 43.05,
"step": 7185,
"train_speed(iter/s)": 0.628729
},
{
"epoch": 4.659753726506805,
"grad_norm": 2.765625,
"learning_rate": 5.9700644445128874e-05,
"loss": 0.0416015625,
"memory(GiB)": 43.05,
"step": 7190,
"train_speed(iter/s)": 0.628877
},
{
"epoch": 4.66299416720674,
"grad_norm": 3.03125,
"learning_rate": 5.964807545236607e-05,
"loss": 0.03046875,
"memory(GiB)": 43.05,
"step": 7195,
"train_speed(iter/s)": 0.628933
},
{
"epoch": 4.666234607906675,
"grad_norm": 1.8046875,
"learning_rate": 5.959549537982757e-05,
"loss": 0.02734375,
"memory(GiB)": 43.05,
"step": 7200,
"train_speed(iter/s)": 0.628988
},
{
"epoch": 4.669475048606611,
"grad_norm": 1.28125,
"learning_rate": 5.954290428789592e-05,
"loss": 0.0326171875,
"memory(GiB)": 43.05,
"step": 7205,
"train_speed(iter/s)": 0.629186
},
{
"epoch": 4.672715489306546,
"grad_norm": 4.03125,
"learning_rate": 5.94903022369663e-05,
"loss": 0.02890625,
"memory(GiB)": 43.05,
"step": 7210,
"train_speed(iter/s)": 0.629256
},
{
"epoch": 4.675955930006481,
"grad_norm": 12.8125,
"learning_rate": 5.943768928744651e-05,
"loss": 0.033984375,
"memory(GiB)": 43.05,
"step": 7215,
"train_speed(iter/s)": 0.629209
},
{
"epoch": 4.679196370706416,
"grad_norm": 11.625,
"learning_rate": 5.938506549975688e-05,
"loss": 0.044140625,
"memory(GiB)": 43.05,
"step": 7220,
"train_speed(iter/s)": 0.629147
},
{
"epoch": 4.682436811406351,
"grad_norm": 1.2421875,
"learning_rate": 5.933243093433015e-05,
"loss": 0.0314453125,
"memory(GiB)": 43.05,
"step": 7225,
"train_speed(iter/s)": 0.629296
},
{
"epoch": 4.685677252106286,
"grad_norm": 9.875,
"learning_rate": 5.9279785651611455e-05,
"loss": 0.0244140625,
"memory(GiB)": 43.05,
"step": 7230,
"train_speed(iter/s)": 0.629301
},
{
"epoch": 4.688917692806221,
"grad_norm": 17.5,
"learning_rate": 5.9227129712058207e-05,
"loss": 0.0376953125,
"memory(GiB)": 43.05,
"step": 7235,
"train_speed(iter/s)": 0.629337
},
{
"epoch": 4.692158133506156,
"grad_norm": 0.734375,
"learning_rate": 5.9174463176140115e-05,
"loss": 0.034765625,
"memory(GiB)": 43.05,
"step": 7240,
"train_speed(iter/s)": 0.629481
},
{
"epoch": 4.695398574206092,
"grad_norm": 8.625,
"learning_rate": 5.912178610433902e-05,
"loss": 0.0431640625,
"memory(GiB)": 43.05,
"step": 7245,
"train_speed(iter/s)": 0.629593
},
{
"epoch": 4.6986390149060275,
"grad_norm": 2.75,
"learning_rate": 5.906909855714884e-05,
"loss": 0.0396484375,
"memory(GiB)": 43.05,
"step": 7250,
"train_speed(iter/s)": 0.629641
},
{
"epoch": 4.701879455605963,
"grad_norm": 12.6875,
"learning_rate": 5.901640059507557e-05,
"loss": 0.0259765625,
"memory(GiB)": 43.05,
"step": 7255,
"train_speed(iter/s)": 0.629732
},
{
"epoch": 4.705119896305898,
"grad_norm": 2.96875,
"learning_rate": 5.896369227863715e-05,
"loss": 0.04921875,
"memory(GiB)": 43.05,
"step": 7260,
"train_speed(iter/s)": 0.629872
},
{
"epoch": 4.708360337005833,
"grad_norm": 3.640625,
"learning_rate": 5.891097366836339e-05,
"loss": 0.0271484375,
"memory(GiB)": 43.05,
"step": 7265,
"train_speed(iter/s)": 0.628331
},
{
"epoch": 4.711600777705768,
"grad_norm": 12.75,
"learning_rate": 5.885824482479596e-05,
"loss": 0.0353515625,
"memory(GiB)": 43.05,
"step": 7270,
"train_speed(iter/s)": 0.62829
},
{
"epoch": 4.714841218405703,
"grad_norm": 8.1875,
"learning_rate": 5.880550580848824e-05,
"loss": 0.0375,
"memory(GiB)": 43.05,
"step": 7275,
"train_speed(iter/s)": 0.628428
},
{
"epoch": 4.718081659105638,
"grad_norm": 3.15625,
"learning_rate": 5.875275668000529e-05,
"loss": 0.034765625,
"memory(GiB)": 43.05,
"step": 7280,
"train_speed(iter/s)": 0.628185
},
{
"epoch": 4.721322099805573,
"grad_norm": 10.8125,
"learning_rate": 5.8699997499923855e-05,
"loss": 0.043359375,
"memory(GiB)": 43.05,
"step": 7285,
"train_speed(iter/s)": 0.628275
},
{
"epoch": 4.7245625405055085,
"grad_norm": 4.78125,
"learning_rate": 5.8647228328832135e-05,
"loss": 0.0259765625,
"memory(GiB)": 43.05,
"step": 7290,
"train_speed(iter/s)": 0.628342
},
{
"epoch": 4.727802981205444,
"grad_norm": 15.875,
"learning_rate": 5.859444922732985e-05,
"loss": 0.040234375,
"memory(GiB)": 43.05,
"step": 7295,
"train_speed(iter/s)": 0.628399
},
{
"epoch": 4.731043421905379,
"grad_norm": 0.8515625,
"learning_rate": 5.854166025602812e-05,
"loss": 0.02392578125,
"memory(GiB)": 43.05,
"step": 7300,
"train_speed(iter/s)": 0.628513
},
{
"epoch": 4.734283862605315,
"grad_norm": 0.953125,
"learning_rate": 5.84888614755494e-05,
"loss": 0.0353515625,
"memory(GiB)": 43.05,
"step": 7305,
"train_speed(iter/s)": 0.628291
},
{
"epoch": 4.73752430330525,
"grad_norm": 10.4375,
"learning_rate": 5.8436052946527365e-05,
"loss": 0.01865234375,
"memory(GiB)": 43.05,
"step": 7310,
"train_speed(iter/s)": 0.628451
},
{
"epoch": 4.740764744005185,
"grad_norm": 0.921875,
"learning_rate": 5.838323472960696e-05,
"loss": 0.0236328125,
"memory(GiB)": 43.05,
"step": 7315,
"train_speed(iter/s)": 0.628483
},
{
"epoch": 4.74400518470512,
"grad_norm": 2.84375,
"learning_rate": 5.833040688544422e-05,
"loss": 0.025390625,
"memory(GiB)": 43.05,
"step": 7320,
"train_speed(iter/s)": 0.628453
},
{
"epoch": 4.747245625405055,
"grad_norm": 7.09375,
"learning_rate": 5.827756947470622e-05,
"loss": 0.0337890625,
"memory(GiB)": 43.05,
"step": 7325,
"train_speed(iter/s)": 0.628404
},
{
"epoch": 4.75048606610499,
"grad_norm": 11.625,
"learning_rate": 5.822472255807106e-05,
"loss": 0.0328125,
"memory(GiB)": 43.05,
"step": 7330,
"train_speed(iter/s)": 0.628528
},
{
"epoch": 4.7537265068049255,
"grad_norm": 0.484375,
"learning_rate": 5.817186619622771e-05,
"loss": 0.0478515625,
"memory(GiB)": 43.05,
"step": 7335,
"train_speed(iter/s)": 0.628664
},
{
"epoch": 4.756966947504861,
"grad_norm": 13.6875,
"learning_rate": 5.811900044987601e-05,
"loss": 0.0240234375,
"memory(GiB)": 43.05,
"step": 7340,
"train_speed(iter/s)": 0.628769
},
{
"epoch": 4.760207388204796,
"grad_norm": 0.58984375,
"learning_rate": 5.8066125379726576e-05,
"loss": 0.0365234375,
"memory(GiB)": 43.05,
"step": 7345,
"train_speed(iter/s)": 0.62888
},
{
"epoch": 4.763447828904731,
"grad_norm": 10.4375,
"learning_rate": 5.801324104650074e-05,
"loss": 0.040234375,
"memory(GiB)": 43.05,
"step": 7350,
"train_speed(iter/s)": 0.62902
},
{
"epoch": 4.766688269604666,
"grad_norm": 1.5078125,
"learning_rate": 5.796034751093047e-05,
"loss": 0.0384765625,
"memory(GiB)": 43.05,
"step": 7355,
"train_speed(iter/s)": 0.629189
},
{
"epoch": 4.769928710304601,
"grad_norm": 10.0625,
"learning_rate": 5.7907444833758295e-05,
"loss": 0.039453125,
"memory(GiB)": 43.05,
"step": 7360,
"train_speed(iter/s)": 0.629383
},
{
"epoch": 4.773169151004536,
"grad_norm": 9.9375,
"learning_rate": 5.7854533075737224e-05,
"loss": 0.0298828125,
"memory(GiB)": 43.05,
"step": 7365,
"train_speed(iter/s)": 0.629448
},
{
"epoch": 4.776409591704471,
"grad_norm": 2.890625,
"learning_rate": 5.7801612297630734e-05,
"loss": 0.0333984375,
"memory(GiB)": 43.05,
"step": 7370,
"train_speed(iter/s)": 0.629513
},
{
"epoch": 4.779650032404407,
"grad_norm": 2.046875,
"learning_rate": 5.774868256021264e-05,
"loss": 0.035546875,
"memory(GiB)": 43.05,
"step": 7375,
"train_speed(iter/s)": 0.629642
},
{
"epoch": 4.7828904731043425,
"grad_norm": 8.5625,
"learning_rate": 5.769574392426702e-05,
"loss": 0.0314453125,
"memory(GiB)": 43.05,
"step": 7380,
"train_speed(iter/s)": 0.629671
},
{
"epoch": 4.786130913804278,
"grad_norm": 4.71875,
"learning_rate": 5.764279645058822e-05,
"loss": 0.0322265625,
"memory(GiB)": 43.05,
"step": 7385,
"train_speed(iter/s)": 0.629703
},
{
"epoch": 4.789371354504213,
"grad_norm": 4.25,
"learning_rate": 5.75898401999807e-05,
"loss": 0.03671875,
"memory(GiB)": 43.05,
"step": 7390,
"train_speed(iter/s)": 0.629715
},
{
"epoch": 4.792611795204148,
"grad_norm": 2.375,
"learning_rate": 5.7536875233259036e-05,
"loss": 0.041015625,
"memory(GiB)": 43.05,
"step": 7395,
"train_speed(iter/s)": 0.629774
},
{
"epoch": 4.795852235904083,
"grad_norm": 12.4375,
"learning_rate": 5.748390161124776e-05,
"loss": 0.0509765625,
"memory(GiB)": 43.05,
"step": 7400,
"train_speed(iter/s)": 0.62985
},
{
"epoch": 4.799092676604018,
"grad_norm": 6.40625,
"learning_rate": 5.7430919394781394e-05,
"loss": 0.0251953125,
"memory(GiB)": 43.05,
"step": 7405,
"train_speed(iter/s)": 0.629995
},
{
"epoch": 4.802333117303953,
"grad_norm": 5.0,
"learning_rate": 5.737792864470428e-05,
"loss": 0.0263671875,
"memory(GiB)": 43.05,
"step": 7410,
"train_speed(iter/s)": 0.630111
},
{
"epoch": 4.805573558003888,
"grad_norm": 11.5,
"learning_rate": 5.732492942187061e-05,
"loss": 0.0173828125,
"memory(GiB)": 43.05,
"step": 7415,
"train_speed(iter/s)": 0.630205
},
{
"epoch": 4.808813998703823,
"grad_norm": 3.40625,
"learning_rate": 5.7271921787144276e-05,
"loss": 0.0265625,
"memory(GiB)": 43.05,
"step": 7420,
"train_speed(iter/s)": 0.630346
},
{
"epoch": 4.8120544394037585,
"grad_norm": 1.6953125,
"learning_rate": 5.7218905801398846e-05,
"loss": 0.0306640625,
"memory(GiB)": 43.05,
"step": 7425,
"train_speed(iter/s)": 0.630399
},
{
"epoch": 4.8152948801036946,
"grad_norm": 9.875,
"learning_rate": 5.716588152551747e-05,
"loss": 0.026171875,
"memory(GiB)": 43.05,
"step": 7430,
"train_speed(iter/s)": 0.63043
},
{
"epoch": 4.81853532080363,
"grad_norm": 13.8125,
"learning_rate": 5.711284902039282e-05,
"loss": 0.03125,
"memory(GiB)": 43.05,
"step": 7435,
"train_speed(iter/s)": 0.630488
},
{
"epoch": 4.821775761503565,
"grad_norm": 8.75,
"learning_rate": 5.7059808346927e-05,
"loss": 0.02421875,
"memory(GiB)": 43.05,
"step": 7440,
"train_speed(iter/s)": 0.630683
},
{
"epoch": 4.8250162022035,
"grad_norm": 15.25,
"learning_rate": 5.7006759566031535e-05,
"loss": 0.0333984375,
"memory(GiB)": 43.05,
"step": 7445,
"train_speed(iter/s)": 0.630663
},
{
"epoch": 4.828256642903435,
"grad_norm": 0.490234375,
"learning_rate": 5.695370273862721e-05,
"loss": 0.04140625,
"memory(GiB)": 43.05,
"step": 7450,
"train_speed(iter/s)": 0.63078
},
{
"epoch": 4.83149708360337,
"grad_norm": 4.40625,
"learning_rate": 5.6900637925644106e-05,
"loss": 0.019921875,
"memory(GiB)": 43.05,
"step": 7455,
"train_speed(iter/s)": 0.630894
},
{
"epoch": 4.834737524303305,
"grad_norm": 12.0,
"learning_rate": 5.6847565188021445e-05,
"loss": 0.0310546875,
"memory(GiB)": 43.05,
"step": 7460,
"train_speed(iter/s)": 0.631061
},
{
"epoch": 4.83797796500324,
"grad_norm": 3.0625,
"learning_rate": 5.6794484586707545e-05,
"loss": 0.04921875,
"memory(GiB)": 43.05,
"step": 7465,
"train_speed(iter/s)": 0.631133
},
{
"epoch": 4.8412184057031755,
"grad_norm": 16.25,
"learning_rate": 5.6741396182659735e-05,
"loss": 0.0232421875,
"memory(GiB)": 43.05,
"step": 7470,
"train_speed(iter/s)": 0.63119
},
{
"epoch": 4.844458846403111,
"grad_norm": 3.9375,
"learning_rate": 5.6688300036844365e-05,
"loss": 0.025390625,
"memory(GiB)": 43.05,
"step": 7475,
"train_speed(iter/s)": 0.631266
},
{
"epoch": 4.847699287103046,
"grad_norm": 0.5546875,
"learning_rate": 5.663519621023663e-05,
"loss": 0.0234375,
"memory(GiB)": 43.05,
"step": 7480,
"train_speed(iter/s)": 0.631362
},
{
"epoch": 4.850939727802981,
"grad_norm": 1.9609375,
"learning_rate": 5.658208476382053e-05,
"loss": 0.021484375,
"memory(GiB)": 43.05,
"step": 7485,
"train_speed(iter/s)": 0.63139
},
{
"epoch": 4.854180168502916,
"grad_norm": 3.1875,
"learning_rate": 5.65289657585889e-05,
"loss": 0.0185546875,
"memory(GiB)": 43.05,
"step": 7490,
"train_speed(iter/s)": 0.631378
},
{
"epoch": 4.857420609202851,
"grad_norm": 1.9453125,
"learning_rate": 5.647583925554314e-05,
"loss": 0.02578125,
"memory(GiB)": 43.05,
"step": 7495,
"train_speed(iter/s)": 0.631432
},
{
"epoch": 4.860661049902787,
"grad_norm": 4.21875,
"learning_rate": 5.642270531569336e-05,
"loss": 0.0404296875,
"memory(GiB)": 43.05,
"step": 7500,
"train_speed(iter/s)": 0.631482
},
{
"epoch": 4.863901490602722,
"grad_norm": 1.09375,
"learning_rate": 5.636956400005815e-05,
"loss": 0.031640625,
"memory(GiB)": 43.05,
"step": 7505,
"train_speed(iter/s)": 0.631579
},
{
"epoch": 4.867141931302657,
"grad_norm": 11.5625,
"learning_rate": 5.6316415369664575e-05,
"loss": 0.0328125,
"memory(GiB)": 43.05,
"step": 7510,
"train_speed(iter/s)": 0.631606
},
{
"epoch": 4.8703823720025925,
"grad_norm": 13.5,
"learning_rate": 5.6263259485548134e-05,
"loss": 0.028515625,
"memory(GiB)": 43.05,
"step": 7515,
"train_speed(iter/s)": 0.631747
},
{
"epoch": 4.873622812702528,
"grad_norm": 3.375,
"learning_rate": 5.621009640875262e-05,
"loss": 0.0185546875,
"memory(GiB)": 43.05,
"step": 7520,
"train_speed(iter/s)": 0.631913
},
{
"epoch": 4.876863253402463,
"grad_norm": 5.1875,
"learning_rate": 5.615692620033012e-05,
"loss": 0.0302734375,
"memory(GiB)": 43.05,
"step": 7525,
"train_speed(iter/s)": 0.631943
},
{
"epoch": 4.880103694102398,
"grad_norm": 12.8125,
"learning_rate": 5.610374892134088e-05,
"loss": 0.0390625,
"memory(GiB)": 43.05,
"step": 7530,
"train_speed(iter/s)": 0.631878
},
{
"epoch": 4.883344134802333,
"grad_norm": 5.125,
"learning_rate": 5.60505646328533e-05,
"loss": 0.0255859375,
"memory(GiB)": 43.05,
"step": 7535,
"train_speed(iter/s)": 0.631941
},
{
"epoch": 4.886584575502268,
"grad_norm": 10.25,
"learning_rate": 5.599737339594376e-05,
"loss": 0.03046875,
"memory(GiB)": 43.05,
"step": 7540,
"train_speed(iter/s)": 0.632037
},
{
"epoch": 4.889825016202203,
"grad_norm": 17.75,
"learning_rate": 5.594417527169673e-05,
"loss": 0.0486328125,
"memory(GiB)": 43.05,
"step": 7545,
"train_speed(iter/s)": 0.632007
},
{
"epoch": 4.893065456902138,
"grad_norm": 10.3125,
"learning_rate": 5.589097032120447e-05,
"loss": 0.0482421875,
"memory(GiB)": 43.05,
"step": 7550,
"train_speed(iter/s)": 0.632156
},
{
"epoch": 4.896305897602074,
"grad_norm": 11.625,
"learning_rate": 5.583775860556717e-05,
"loss": 0.03671875,
"memory(GiB)": 43.05,
"step": 7555,
"train_speed(iter/s)": 0.632135
},
{
"epoch": 4.8995463383020095,
"grad_norm": 15.5,
"learning_rate": 5.578454018589274e-05,
"loss": 0.015625,
"memory(GiB)": 43.05,
"step": 7560,
"train_speed(iter/s)": 0.632263
},
{
"epoch": 4.902786779001945,
"grad_norm": 18.0,
"learning_rate": 5.5731315123296834e-05,
"loss": 0.0294921875,
"memory(GiB)": 43.05,
"step": 7565,
"train_speed(iter/s)": 0.632362
},
{
"epoch": 4.90602721970188,
"grad_norm": 3.671875,
"learning_rate": 5.5678083478902655e-05,
"loss": 0.028515625,
"memory(GiB)": 43.05,
"step": 7570,
"train_speed(iter/s)": 0.632384
},
{
"epoch": 4.909267660401815,
"grad_norm": 8.9375,
"learning_rate": 5.562484531384107e-05,
"loss": 0.04375,
"memory(GiB)": 43.05,
"step": 7575,
"train_speed(iter/s)": 0.632468
},
{
"epoch": 4.91250810110175,
"grad_norm": 11.5,
"learning_rate": 5.5571600689250335e-05,
"loss": 0.0271484375,
"memory(GiB)": 43.05,
"step": 7580,
"train_speed(iter/s)": 0.632557
},
{
"epoch": 4.915748541801685,
"grad_norm": 2.828125,
"learning_rate": 5.551834966627617e-05,
"loss": 0.0388671875,
"memory(GiB)": 43.05,
"step": 7585,
"train_speed(iter/s)": 0.632647
},
{
"epoch": 4.91898898250162,
"grad_norm": 1.1484375,
"learning_rate": 5.5465092306071666e-05,
"loss": 0.0298828125,
"memory(GiB)": 43.05,
"step": 7590,
"train_speed(iter/s)": 0.632737
},
{
"epoch": 4.922229423201555,
"grad_norm": 1.6953125,
"learning_rate": 5.541182866979715e-05,
"loss": 0.026953125,
"memory(GiB)": 43.05,
"step": 7595,
"train_speed(iter/s)": 0.632828
},
{
"epoch": 4.92546986390149,
"grad_norm": 2.578125,
"learning_rate": 5.5358558818620176e-05,
"loss": 0.0244140625,
"memory(GiB)": 43.05,
"step": 7600,
"train_speed(iter/s)": 0.633014
},
{
"epoch": 4.928710304601426,
"grad_norm": 3.75,
"learning_rate": 5.530528281371544e-05,
"loss": 0.0365234375,
"memory(GiB)": 43.05,
"step": 7605,
"train_speed(iter/s)": 0.633075
},
{
"epoch": 4.931950745301361,
"grad_norm": 1.8828125,
"learning_rate": 5.5252000716264686e-05,
"loss": 0.0177734375,
"memory(GiB)": 43.05,
"step": 7610,
"train_speed(iter/s)": 0.63301
},
{
"epoch": 4.935191186001296,
"grad_norm": 11.375,
"learning_rate": 5.5198712587456655e-05,
"loss": 0.0380859375,
"memory(GiB)": 43.05,
"step": 7615,
"train_speed(iter/s)": 0.633044
},
{
"epoch": 4.938431626701231,
"grad_norm": 14.0,
"learning_rate": 5.514541848848704e-05,
"loss": 0.0322265625,
"memory(GiB)": 43.05,
"step": 7620,
"train_speed(iter/s)": 0.633114
},
{
"epoch": 4.941672067401167,
"grad_norm": 6.71875,
"learning_rate": 5.5092118480558386e-05,
"loss": 0.026171875,
"memory(GiB)": 43.05,
"step": 7625,
"train_speed(iter/s)": 0.633226
},
{
"epoch": 4.944912508101102,
"grad_norm": 12.0,
"learning_rate": 5.503881262487999e-05,
"loss": 0.0181640625,
"memory(GiB)": 43.05,
"step": 7630,
"train_speed(iter/s)": 0.633247
},
{
"epoch": 4.948152948801037,
"grad_norm": 2.28125,
"learning_rate": 5.4985500982667903e-05,
"loss": 0.0236328125,
"memory(GiB)": 43.05,
"step": 7635,
"train_speed(iter/s)": 0.633361
},
{
"epoch": 4.951393389500972,
"grad_norm": 14.25,
"learning_rate": 5.4932183615144785e-05,
"loss": 0.0421875,
"memory(GiB)": 43.05,
"step": 7640,
"train_speed(iter/s)": 0.633469
},
{
"epoch": 4.954633830200907,
"grad_norm": 9.625,
"learning_rate": 5.4878860583539915e-05,
"loss": 0.0265625,
"memory(GiB)": 43.05,
"step": 7645,
"train_speed(iter/s)": 0.633526
},
{
"epoch": 4.9578742709008425,
"grad_norm": 10.1875,
"learning_rate": 5.482553194908905e-05,
"loss": 0.027734375,
"memory(GiB)": 43.05,
"step": 7650,
"train_speed(iter/s)": 0.633593
},
{
"epoch": 4.961114711600778,
"grad_norm": 3.84375,
"learning_rate": 5.477219777303435e-05,
"loss": 0.0388671875,
"memory(GiB)": 43.05,
"step": 7655,
"train_speed(iter/s)": 0.633688
},
{
"epoch": 4.964355152300713,
"grad_norm": 4.84375,
"learning_rate": 5.4718858116624416e-05,
"loss": 0.031640625,
"memory(GiB)": 43.05,
"step": 7660,
"train_speed(iter/s)": 0.633655
},
{
"epoch": 4.967595593000648,
"grad_norm": 1.9296875,
"learning_rate": 5.466551304111408e-05,
"loss": 0.0173828125,
"memory(GiB)": 43.05,
"step": 7665,
"train_speed(iter/s)": 0.633793
},
{
"epoch": 4.970836033700583,
"grad_norm": 0.458984375,
"learning_rate": 5.461216260776442e-05,
"loss": 0.0291015625,
"memory(GiB)": 43.05,
"step": 7670,
"train_speed(iter/s)": 0.633856
},
{
"epoch": 4.974076474400518,
"grad_norm": 9.5625,
"learning_rate": 5.455880687784266e-05,
"loss": 0.028515625,
"memory(GiB)": 43.05,
"step": 7675,
"train_speed(iter/s)": 0.633919
},
{
"epoch": 4.977316915100454,
"grad_norm": 10.9375,
"learning_rate": 5.450544591262212e-05,
"loss": 0.037890625,
"memory(GiB)": 43.05,
"step": 7680,
"train_speed(iter/s)": 0.633972
},
{
"epoch": 4.980557355800389,
"grad_norm": 0.74609375,
"learning_rate": 5.44520797733821e-05,
"loss": 0.026171875,
"memory(GiB)": 43.05,
"step": 7685,
"train_speed(iter/s)": 0.633993
},
{
"epoch": 4.983797796500324,
"grad_norm": 10.6875,
"learning_rate": 5.4398708521407895e-05,
"loss": 0.019140625,
"memory(GiB)": 43.05,
"step": 7690,
"train_speed(iter/s)": 0.634048
},
{
"epoch": 4.9870382372002595,
"grad_norm": 3.5,
"learning_rate": 5.434533221799062e-05,
"loss": 0.04453125,
"memory(GiB)": 43.05,
"step": 7695,
"train_speed(iter/s)": 0.634109
},
{
"epoch": 4.990278677900195,
"grad_norm": 3.3125,
"learning_rate": 5.429195092442721e-05,
"loss": 0.019140625,
"memory(GiB)": 43.05,
"step": 7700,
"train_speed(iter/s)": 0.634221
},
{
"epoch": 4.99351911860013,
"grad_norm": 16.0,
"learning_rate": 5.423856470202036e-05,
"loss": 0.0333984375,
"memory(GiB)": 43.05,
"step": 7705,
"train_speed(iter/s)": 0.634178
},
{
"epoch": 4.996759559300065,
"grad_norm": 14.4375,
"learning_rate": 5.4185173612078365e-05,
"loss": 0.0275390625,
"memory(GiB)": 43.05,
"step": 7710,
"train_speed(iter/s)": 0.634191
},
{
"epoch": 5.0,
"grad_norm": 14.0625,
"learning_rate": 5.413177771591515e-05,
"loss": 0.0271484375,
"memory(GiB)": 43.05,
"step": 7715,
"train_speed(iter/s)": 0.634304
},
{
"epoch": 5.003240440699935,
"grad_norm": 5.71875,
"learning_rate": 5.407837707485015e-05,
"loss": 0.0359375,
"memory(GiB)": 43.05,
"step": 7720,
"train_speed(iter/s)": 0.634305
},
{
"epoch": 5.00648088139987,
"grad_norm": 10.3125,
"learning_rate": 5.402497175020828e-05,
"loss": 0.0263671875,
"memory(GiB)": 43.05,
"step": 7725,
"train_speed(iter/s)": 0.634454
},
{
"epoch": 5.009721322099805,
"grad_norm": 12.125,
"learning_rate": 5.397156180331976e-05,
"loss": 0.0263671875,
"memory(GiB)": 43.05,
"step": 7730,
"train_speed(iter/s)": 0.634515
},
{
"epoch": 5.0129617627997405,
"grad_norm": 1.6875,
"learning_rate": 5.39181472955202e-05,
"loss": 0.0529296875,
"memory(GiB)": 43.05,
"step": 7735,
"train_speed(iter/s)": 0.63457
},
{
"epoch": 5.016202203499676,
"grad_norm": 11.4375,
"learning_rate": 5.386472828815039e-05,
"loss": 0.0578125,
"memory(GiB)": 43.05,
"step": 7740,
"train_speed(iter/s)": 0.634627
},
{
"epoch": 5.019442644199611,
"grad_norm": 10.5,
"learning_rate": 5.38113048425563e-05,
"loss": 0.0404296875,
"memory(GiB)": 43.05,
"step": 7745,
"train_speed(iter/s)": 0.634651
},
{
"epoch": 5.022683084899547,
"grad_norm": 0.64453125,
"learning_rate": 5.375787702008903e-05,
"loss": 0.02890625,
"memory(GiB)": 43.05,
"step": 7750,
"train_speed(iter/s)": 0.63465
},
{
"epoch": 5.025923525599482,
"grad_norm": 13.8125,
"learning_rate": 5.370444488210465e-05,
"loss": 0.0427734375,
"memory(GiB)": 43.05,
"step": 7755,
"train_speed(iter/s)": 0.633812
},
{
"epoch": 5.029163966299417,
"grad_norm": 10.4375,
"learning_rate": 5.365100848996425e-05,
"loss": 0.0173828125,
"memory(GiB)": 43.05,
"step": 7760,
"train_speed(iter/s)": 0.633824
},
{
"epoch": 5.032404406999352,
"grad_norm": 1.1953125,
"learning_rate": 5.359756790503375e-05,
"loss": 0.03359375,
"memory(GiB)": 43.05,
"step": 7765,
"train_speed(iter/s)": 0.633735
},
{
"epoch": 5.035644847699287,
"grad_norm": 4.84375,
"learning_rate": 5.354412318868391e-05,
"loss": 0.01875,
"memory(GiB)": 43.05,
"step": 7770,
"train_speed(iter/s)": 0.633869
},
{
"epoch": 5.038885288399222,
"grad_norm": 3.65625,
"learning_rate": 5.349067440229024e-05,
"loss": 0.0171875,
"memory(GiB)": 43.05,
"step": 7775,
"train_speed(iter/s)": 0.633922
},
{
"epoch": 5.0421257290991575,
"grad_norm": 14.5,
"learning_rate": 5.343722160723292e-05,
"loss": 0.03125,
"memory(GiB)": 43.05,
"step": 7780,
"train_speed(iter/s)": 0.634081
},
{
"epoch": 5.045366169799093,
"grad_norm": 10.0,
"learning_rate": 5.33837648648967e-05,
"loss": 0.01640625,
"memory(GiB)": 43.05,
"step": 7785,
"train_speed(iter/s)": 0.634081
},
{
"epoch": 5.048606610499028,
"grad_norm": 1.3984375,
"learning_rate": 5.333030423667092e-05,
"loss": 0.0158203125,
"memory(GiB)": 43.05,
"step": 7790,
"train_speed(iter/s)": 0.634146
},
{
"epoch": 5.051847051198963,
"grad_norm": 16.25,
"learning_rate": 5.327683978394935e-05,
"loss": 0.0267578125,
"memory(GiB)": 43.05,
"step": 7795,
"train_speed(iter/s)": 0.63421
},
{
"epoch": 5.055087491898898,
"grad_norm": 3.328125,
"learning_rate": 5.322337156813014e-05,
"loss": 0.0232421875,
"memory(GiB)": 43.05,
"step": 7800,
"train_speed(iter/s)": 0.634236
},
{
"epoch": 5.058327932598833,
"grad_norm": 12.0,
"learning_rate": 5.31698996506158e-05,
"loss": 0.0236328125,
"memory(GiB)": 43.05,
"step": 7805,
"train_speed(iter/s)": 0.634233
},
{
"epoch": 5.061568373298769,
"grad_norm": 13.625,
"learning_rate": 5.3116424092813063e-05,
"loss": 0.0185546875,
"memory(GiB)": 43.05,
"step": 7810,
"train_speed(iter/s)": 0.634187
},
{
"epoch": 5.064808813998704,
"grad_norm": 1.9765625,
"learning_rate": 5.306294495613284e-05,
"loss": 0.0318359375,
"memory(GiB)": 43.05,
"step": 7815,
"train_speed(iter/s)": 0.634316
},
{
"epoch": 5.068049254698639,
"grad_norm": 2.015625,
"learning_rate": 5.3009462301990174e-05,
"loss": 0.0412109375,
"memory(GiB)": 43.05,
"step": 7820,
"train_speed(iter/s)": 0.634324
},
{
"epoch": 5.071289695398574,
"grad_norm": 14.375,
"learning_rate": 5.295597619180411e-05,
"loss": 0.0138671875,
"memory(GiB)": 43.05,
"step": 7825,
"train_speed(iter/s)": 0.63442
},
{
"epoch": 5.07453013609851,
"grad_norm": 2.453125,
"learning_rate": 5.290248668699771e-05,
"loss": 0.0236328125,
"memory(GiB)": 43.05,
"step": 7830,
"train_speed(iter/s)": 0.634437
},
{
"epoch": 5.077770576798445,
"grad_norm": 10.9375,
"learning_rate": 5.284899384899791e-05,
"loss": 0.028125,
"memory(GiB)": 43.05,
"step": 7835,
"train_speed(iter/s)": 0.634457
},
{
"epoch": 5.08101101749838,
"grad_norm": 12.9375,
"learning_rate": 5.279549773923547e-05,
"loss": 0.0240234375,
"memory(GiB)": 43.05,
"step": 7840,
"train_speed(iter/s)": 0.63448
},
{
"epoch": 5.084251458198315,
"grad_norm": 10.5,
"learning_rate": 5.274199841914489e-05,
"loss": 0.02421875,
"memory(GiB)": 43.05,
"step": 7845,
"train_speed(iter/s)": 0.634588
},
{
"epoch": 5.08749189889825,
"grad_norm": 0.41015625,
"learning_rate": 5.268849595016441e-05,
"loss": 0.0267578125,
"memory(GiB)": 43.05,
"step": 7850,
"train_speed(iter/s)": 0.634681
},
{
"epoch": 5.090732339598185,
"grad_norm": 12.875,
"learning_rate": 5.263499039373583e-05,
"loss": 0.033203125,
"memory(GiB)": 43.05,
"step": 7855,
"train_speed(iter/s)": 0.63476
},
{
"epoch": 5.09397278029812,
"grad_norm": 12.25,
"learning_rate": 5.2581481811304534e-05,
"loss": 0.0267578125,
"memory(GiB)": 43.05,
"step": 7860,
"train_speed(iter/s)": 0.63484
},
{
"epoch": 5.097213220998055,
"grad_norm": 4.34375,
"learning_rate": 5.252797026431937e-05,
"loss": 0.0162109375,
"memory(GiB)": 43.05,
"step": 7865,
"train_speed(iter/s)": 0.634867
},
{
"epoch": 5.1004536616979905,
"grad_norm": 6.84375,
"learning_rate": 5.247445581423257e-05,
"loss": 0.0330078125,
"memory(GiB)": 43.05,
"step": 7870,
"train_speed(iter/s)": 0.634923
},
{
"epoch": 5.1036941023979265,
"grad_norm": 2.59375,
"learning_rate": 5.242093852249973e-05,
"loss": 0.0330078125,
"memory(GiB)": 43.05,
"step": 7875,
"train_speed(iter/s)": 0.635026
},
{
"epoch": 5.106934543097862,
"grad_norm": 2.046875,
"learning_rate": 5.236741845057971e-05,
"loss": 0.0298828125,
"memory(GiB)": 43.05,
"step": 7880,
"train_speed(iter/s)": 0.635112
},
{
"epoch": 5.110174983797797,
"grad_norm": 15.6875,
"learning_rate": 5.2313895659934516e-05,
"loss": 0.030078125,
"memory(GiB)": 43.05,
"step": 7885,
"train_speed(iter/s)": 0.635123
},
{
"epoch": 5.113415424497732,
"grad_norm": 0.60546875,
"learning_rate": 5.226037021202932e-05,
"loss": 0.0388671875,
"memory(GiB)": 43.05,
"step": 7890,
"train_speed(iter/s)": 0.635182
},
{
"epoch": 5.116655865197667,
"grad_norm": 7.0,
"learning_rate": 5.220684216833236e-05,
"loss": 0.0224609375,
"memory(GiB)": 43.05,
"step": 7895,
"train_speed(iter/s)": 0.635271
},
{
"epoch": 5.119896305897602,
"grad_norm": 2.109375,
"learning_rate": 5.215331159031479e-05,
"loss": 0.0298828125,
"memory(GiB)": 43.05,
"step": 7900,
"train_speed(iter/s)": 0.635228
},
{
"epoch": 5.123136746597537,
"grad_norm": 8.625,
"learning_rate": 5.209977853945076e-05,
"loss": 0.0373046875,
"memory(GiB)": 43.05,
"step": 7905,
"train_speed(iter/s)": 0.635387
},
{
"epoch": 5.126377187297472,
"grad_norm": 12.75,
"learning_rate": 5.204624307721719e-05,
"loss": 0.041015625,
"memory(GiB)": 43.05,
"step": 7910,
"train_speed(iter/s)": 0.635298
},
{
"epoch": 5.1296176279974075,
"grad_norm": 11.0625,
"learning_rate": 5.1992705265093775e-05,
"loss": 0.0326171875,
"memory(GiB)": 43.05,
"step": 7915,
"train_speed(iter/s)": 0.635456
},
{
"epoch": 5.132858068697343,
"grad_norm": 0.6875,
"learning_rate": 5.1939165164562974e-05,
"loss": 0.0267578125,
"memory(GiB)": 43.05,
"step": 7920,
"train_speed(iter/s)": 0.635471
},
{
"epoch": 5.136098509397278,
"grad_norm": 12.625,
"learning_rate": 5.188562283710977e-05,
"loss": 0.024609375,
"memory(GiB)": 43.05,
"step": 7925,
"train_speed(iter/s)": 0.635584
},
{
"epoch": 5.139338950097213,
"grad_norm": 9.9375,
"learning_rate": 5.1832078344221804e-05,
"loss": 0.027734375,
"memory(GiB)": 43.05,
"step": 7930,
"train_speed(iter/s)": 0.635608
},
{
"epoch": 5.142579390797148,
"grad_norm": 1.3125,
"learning_rate": 5.177853174738915e-05,
"loss": 0.016796875,
"memory(GiB)": 43.05,
"step": 7935,
"train_speed(iter/s)": 0.635558
},
{
"epoch": 5.145819831497084,
"grad_norm": 0.6328125,
"learning_rate": 5.1724983108104305e-05,
"loss": 0.0517578125,
"memory(GiB)": 43.05,
"step": 7940,
"train_speed(iter/s)": 0.63554
},
{
"epoch": 5.149060272197019,
"grad_norm": 0.734375,
"learning_rate": 5.1671432487862106e-05,
"loss": 0.02890625,
"memory(GiB)": 43.05,
"step": 7945,
"train_speed(iter/s)": 0.635652
},
{
"epoch": 5.152300712896954,
"grad_norm": 0.953125,
"learning_rate": 5.1617879948159684e-05,
"loss": 0.0203125,
"memory(GiB)": 43.05,
"step": 7950,
"train_speed(iter/s)": 0.635712
},
{
"epoch": 5.155541153596889,
"grad_norm": 3.40625,
"learning_rate": 5.156432555049636e-05,
"loss": 0.0244140625,
"memory(GiB)": 43.05,
"step": 7955,
"train_speed(iter/s)": 0.635774
},
{
"epoch": 5.1587815942968245,
"grad_norm": 13.5625,
"learning_rate": 5.151076935637359e-05,
"loss": 0.0244140625,
"memory(GiB)": 43.05,
"step": 7960,
"train_speed(iter/s)": 0.635854
},
{
"epoch": 5.16202203499676,
"grad_norm": 6.59375,
"learning_rate": 5.1457211427294914e-05,
"loss": 0.0220703125,
"memory(GiB)": 43.05,
"step": 7965,
"train_speed(iter/s)": 0.635986
},
{
"epoch": 5.165262475696695,
"grad_norm": 11.5625,
"learning_rate": 5.140365182476583e-05,
"loss": 0.0232421875,
"memory(GiB)": 43.05,
"step": 7970,
"train_speed(iter/s)": 0.635978
},
{
"epoch": 5.16850291639663,
"grad_norm": 1.3671875,
"learning_rate": 5.1350090610293765e-05,
"loss": 0.0359375,
"memory(GiB)": 43.05,
"step": 7975,
"train_speed(iter/s)": 0.636059
},
{
"epoch": 5.171743357096565,
"grad_norm": 11.625,
"learning_rate": 5.129652784538803e-05,
"loss": 0.020703125,
"memory(GiB)": 43.05,
"step": 7980,
"train_speed(iter/s)": 0.636064
},
{
"epoch": 5.1749837977965,
"grad_norm": 7.6875,
"learning_rate": 5.124296359155968e-05,
"loss": 0.0140625,
"memory(GiB)": 43.05,
"step": 7985,
"train_speed(iter/s)": 0.636167
},
{
"epoch": 5.178224238496435,
"grad_norm": 14.0625,
"learning_rate": 5.118939791032148e-05,
"loss": 0.0130859375,
"memory(GiB)": 43.05,
"step": 7990,
"train_speed(iter/s)": 0.636255
},
{
"epoch": 5.18146467919637,
"grad_norm": 4.90625,
"learning_rate": 5.113583086318786e-05,
"loss": 0.03125,
"memory(GiB)": 43.05,
"step": 7995,
"train_speed(iter/s)": 0.636389
},
{
"epoch": 5.1847051198963054,
"grad_norm": 13.0,
"learning_rate": 5.108226251167483e-05,
"loss": 0.03359375,
"memory(GiB)": 43.05,
"step": 8000,
"train_speed(iter/s)": 0.6365
},
{
"epoch": 5.1879455605962415,
"grad_norm": 11.0,
"learning_rate": 5.1028692917299825e-05,
"loss": 0.054296875,
"memory(GiB)": 43.05,
"step": 8005,
"train_speed(iter/s)": 0.604173
},
{
"epoch": 5.191186001296177,
"grad_norm": 2.125,
"learning_rate": 5.097512214158179e-05,
"loss": 0.05,
"memory(GiB)": 43.05,
"step": 8010,
"train_speed(iter/s)": 0.604167
},
{
"epoch": 5.194426441996112,
"grad_norm": 2.71875,
"learning_rate": 5.0921550246040974e-05,
"loss": 0.0228515625,
"memory(GiB)": 43.05,
"step": 8015,
"train_speed(iter/s)": 0.604346
},
{
"epoch": 5.197666882696047,
"grad_norm": 4.8125,
"learning_rate": 5.0867977292198935e-05,
"loss": 0.029296875,
"memory(GiB)": 43.05,
"step": 8020,
"train_speed(iter/s)": 0.604425
},
{
"epoch": 5.200907323395982,
"grad_norm": 13.5625,
"learning_rate": 5.0814403341578444e-05,
"loss": 0.03359375,
"memory(GiB)": 43.05,
"step": 8025,
"train_speed(iter/s)": 0.604499
},
{
"epoch": 5.204147764095917,
"grad_norm": 14.8125,
"learning_rate": 5.076082845570342e-05,
"loss": 0.0259765625,
"memory(GiB)": 43.05,
"step": 8030,
"train_speed(iter/s)": 0.604584
},
{
"epoch": 5.207388204795852,
"grad_norm": 15.25,
"learning_rate": 5.070725269609884e-05,
"loss": 0.02265625,
"memory(GiB)": 43.05,
"step": 8035,
"train_speed(iter/s)": 0.604694
},
{
"epoch": 5.210628645495787,
"grad_norm": 11.875,
"learning_rate": 5.065367612429071e-05,
"loss": 0.022265625,
"memory(GiB)": 43.05,
"step": 8040,
"train_speed(iter/s)": 0.604773
},
{
"epoch": 5.213869086195722,
"grad_norm": 1.140625,
"learning_rate": 5.060009880180592e-05,
"loss": 0.0322265625,
"memory(GiB)": 43.05,
"step": 8045,
"train_speed(iter/s)": 0.604909
},
{
"epoch": 5.2171095268956575,
"grad_norm": 13.5,
"learning_rate": 5.054652079017229e-05,
"loss": 0.041796875,
"memory(GiB)": 43.05,
"step": 8050,
"train_speed(iter/s)": 0.604926
},
{
"epoch": 5.220349967595593,
"grad_norm": 0.59765625,
"learning_rate": 5.049294215091839e-05,
"loss": 0.0349609375,
"memory(GiB)": 43.05,
"step": 8055,
"train_speed(iter/s)": 0.605073
},
{
"epoch": 5.223590408295528,
"grad_norm": 4.96875,
"learning_rate": 5.04393629455735e-05,
"loss": 0.016796875,
"memory(GiB)": 43.05,
"step": 8060,
"train_speed(iter/s)": 0.605121
},
{
"epoch": 5.226830848995464,
"grad_norm": 10.75,
"learning_rate": 5.038578323566757e-05,
"loss": 0.0408203125,
"memory(GiB)": 43.05,
"step": 8065,
"train_speed(iter/s)": 0.605194
},
{
"epoch": 5.230071289695399,
"grad_norm": 2.265625,
"learning_rate": 5.0332203082731165e-05,
"loss": 0.0162109375,
"memory(GiB)": 43.05,
"step": 8070,
"train_speed(iter/s)": 0.605244
},
{
"epoch": 5.233311730395334,
"grad_norm": 0.6171875,
"learning_rate": 5.027862254829527e-05,
"loss": 0.0330078125,
"memory(GiB)": 43.05,
"step": 8075,
"train_speed(iter/s)": 0.605317
},
{
"epoch": 5.236552171095269,
"grad_norm": 1.3359375,
"learning_rate": 5.0225041693891396e-05,
"loss": 0.0201171875,
"memory(GiB)": 43.05,
"step": 8080,
"train_speed(iter/s)": 0.604723
},
{
"epoch": 5.239792611795204,
"grad_norm": 1.421875,
"learning_rate": 5.0171460581051364e-05,
"loss": 0.018359375,
"memory(GiB)": 43.05,
"step": 8085,
"train_speed(iter/s)": 0.60472
},
{
"epoch": 5.243033052495139,
"grad_norm": 3.890625,
"learning_rate": 5.011787927130732e-05,
"loss": 0.0240234375,
"memory(GiB)": 43.05,
"step": 8090,
"train_speed(iter/s)": 0.604836
},
{
"epoch": 5.2462734931950745,
"grad_norm": 12.0,
"learning_rate": 5.006429782619162e-05,
"loss": 0.0244140625,
"memory(GiB)": 43.05,
"step": 8095,
"train_speed(iter/s)": 0.604937
},
{
"epoch": 5.24951393389501,
"grad_norm": 1.03125,
"learning_rate": 5.0010716307236796e-05,
"loss": 0.042578125,
"memory(GiB)": 43.05,
"step": 8100,
"train_speed(iter/s)": 0.604993
},
{
"epoch": 5.252754374594945,
"grad_norm": 8.25,
"learning_rate": 4.995713477597546e-05,
"loss": 0.0296875,
"memory(GiB)": 43.05,
"step": 8105,
"train_speed(iter/s)": 0.605031
},
{
"epoch": 5.25599481529488,
"grad_norm": 1.46875,
"learning_rate": 4.990355329394019e-05,
"loss": 0.0310546875,
"memory(GiB)": 43.05,
"step": 8110,
"train_speed(iter/s)": 0.60515
},
{
"epoch": 5.259235255994815,
"grad_norm": 7.1875,
"learning_rate": 4.984997192266359e-05,
"loss": 0.0318359375,
"memory(GiB)": 43.05,
"step": 8115,
"train_speed(iter/s)": 0.605281
},
{
"epoch": 5.26247569669475,
"grad_norm": 7.4375,
"learning_rate": 4.9796390723678085e-05,
"loss": 0.0224609375,
"memory(GiB)": 43.05,
"step": 8120,
"train_speed(iter/s)": 0.605418
},
{
"epoch": 5.265716137394685,
"grad_norm": 1.8984375,
"learning_rate": 4.97428097585159e-05,
"loss": 0.016796875,
"memory(GiB)": 43.05,
"step": 8125,
"train_speed(iter/s)": 0.605487
},
{
"epoch": 5.268956578094621,
"grad_norm": 1.015625,
"learning_rate": 4.968922908870901e-05,
"loss": 0.01875,
"memory(GiB)": 43.05,
"step": 8130,
"train_speed(iter/s)": 0.605636
},
{
"epoch": 5.272197018794556,
"grad_norm": 1.4921875,
"learning_rate": 4.9635648775789075e-05,
"loss": 0.0244140625,
"memory(GiB)": 43.05,
"step": 8135,
"train_speed(iter/s)": 0.605722
},
{
"epoch": 5.2754374594944915,
"grad_norm": 17.0,
"learning_rate": 4.958206888128726e-05,
"loss": 0.0482421875,
"memory(GiB)": 43.05,
"step": 8140,
"train_speed(iter/s)": 0.605755
},
{
"epoch": 5.278677900194427,
"grad_norm": 10.125,
"learning_rate": 4.9528489466734326e-05,
"loss": 0.02890625,
"memory(GiB)": 43.05,
"step": 8145,
"train_speed(iter/s)": 0.605841
},
{
"epoch": 5.281918340894362,
"grad_norm": 9.25,
"learning_rate": 4.947491059366049e-05,
"loss": 0.0255859375,
"memory(GiB)": 43.05,
"step": 8150,
"train_speed(iter/s)": 0.606016
},
{
"epoch": 5.285158781594297,
"grad_norm": 2.703125,
"learning_rate": 4.942133232359527e-05,
"loss": 0.0390625,
"memory(GiB)": 43.05,
"step": 8155,
"train_speed(iter/s)": 0.60617
},
{
"epoch": 5.288399222294232,
"grad_norm": 0.81640625,
"learning_rate": 4.9367754718067566e-05,
"loss": 0.0251953125,
"memory(GiB)": 43.05,
"step": 8160,
"train_speed(iter/s)": 0.606285
},
{
"epoch": 5.291639662994167,
"grad_norm": 3.765625,
"learning_rate": 4.93141778386055e-05,
"loss": 0.0326171875,
"memory(GiB)": 43.05,
"step": 8165,
"train_speed(iter/s)": 0.60636
},
{
"epoch": 5.294880103694102,
"grad_norm": 10.1875,
"learning_rate": 4.9260601746736315e-05,
"loss": 0.0349609375,
"memory(GiB)": 43.05,
"step": 8170,
"train_speed(iter/s)": 0.606432
},
{
"epoch": 5.298120544394037,
"grad_norm": 1.453125,
"learning_rate": 4.92070265039864e-05,
"loss": 0.0314453125,
"memory(GiB)": 43.05,
"step": 8175,
"train_speed(iter/s)": 0.606522
},
{
"epoch": 5.3013609850939725,
"grad_norm": 1.5546875,
"learning_rate": 4.9153452171881184e-05,
"loss": 0.0458984375,
"memory(GiB)": 43.05,
"step": 8180,
"train_speed(iter/s)": 0.606545
},
{
"epoch": 5.304601425793908,
"grad_norm": 1.03125,
"learning_rate": 4.909987881194497e-05,
"loss": 0.03359375,
"memory(GiB)": 43.05,
"step": 8185,
"train_speed(iter/s)": 0.606695
},
{
"epoch": 5.307841866493844,
"grad_norm": 0.65625,
"learning_rate": 4.9046306485701e-05,
"loss": 0.02890625,
"memory(GiB)": 43.05,
"step": 8190,
"train_speed(iter/s)": 0.606869
},
{
"epoch": 5.311082307193779,
"grad_norm": 10.875,
"learning_rate": 4.899273525467135e-05,
"loss": 0.0248046875,
"memory(GiB)": 43.05,
"step": 8195,
"train_speed(iter/s)": 0.606882
},
{
"epoch": 5.314322747893714,
"grad_norm": 4.65625,
"learning_rate": 4.893916518037678e-05,
"loss": 0.03046875,
"memory(GiB)": 43.05,
"step": 8200,
"train_speed(iter/s)": 0.606937
},
{
"epoch": 5.317563188593649,
"grad_norm": 8.9375,
"learning_rate": 4.888559632433677e-05,
"loss": 0.0322265625,
"memory(GiB)": 43.05,
"step": 8205,
"train_speed(iter/s)": 0.607074
},
{
"epoch": 5.320803629293584,
"grad_norm": 16.0,
"learning_rate": 4.88320287480694e-05,
"loss": 0.02109375,
"memory(GiB)": 43.05,
"step": 8210,
"train_speed(iter/s)": 0.607103
},
{
"epoch": 5.324044069993519,
"grad_norm": 3.890625,
"learning_rate": 4.8778462513091214e-05,
"loss": 0.0421875,
"memory(GiB)": 43.05,
"step": 8215,
"train_speed(iter/s)": 0.607146
},
{
"epoch": 5.327284510693454,
"grad_norm": 3.984375,
"learning_rate": 4.872489768091729e-05,
"loss": 0.028515625,
"memory(GiB)": 43.05,
"step": 8220,
"train_speed(iter/s)": 0.607161
},
{
"epoch": 5.330524951393389,
"grad_norm": 2.1875,
"learning_rate": 4.867133431306108e-05,
"loss": 0.027734375,
"memory(GiB)": 43.05,
"step": 8225,
"train_speed(iter/s)": 0.607205
},
{
"epoch": 5.333765392093325,
"grad_norm": 12.25,
"learning_rate": 4.8617772471034335e-05,
"loss": 0.0294921875,
"memory(GiB)": 43.05,
"step": 8230,
"train_speed(iter/s)": 0.607379
},
{
"epoch": 5.33700583279326,
"grad_norm": 1.640625,
"learning_rate": 4.856421221634705e-05,
"loss": 0.0220703125,
"memory(GiB)": 43.05,
"step": 8235,
"train_speed(iter/s)": 0.607384
},
{
"epoch": 5.340246273493195,
"grad_norm": 1.1953125,
"learning_rate": 4.851065361050743e-05,
"loss": 0.0146484375,
"memory(GiB)": 43.05,
"step": 8240,
"train_speed(iter/s)": 0.607513
},
{
"epoch": 5.34348671419313,
"grad_norm": 1.4296875,
"learning_rate": 4.845709671502178e-05,
"loss": 0.032421875,
"memory(GiB)": 43.05,
"step": 8245,
"train_speed(iter/s)": 0.607557
},
{
"epoch": 5.346727154893065,
"grad_norm": 11.0625,
"learning_rate": 4.840354159139438e-05,
"loss": 0.0556640625,
"memory(GiB)": 43.05,
"step": 8250,
"train_speed(iter/s)": 0.607691
},
{
"epoch": 5.349967595593001,
"grad_norm": 2.375,
"learning_rate": 4.8349988301127555e-05,
"loss": 0.0259765625,
"memory(GiB)": 43.05,
"step": 8255,
"train_speed(iter/s)": 0.607817
},
{
"epoch": 5.353208036292936,
"grad_norm": 1.3828125,
"learning_rate": 4.82964369057215e-05,
"loss": 0.0388671875,
"memory(GiB)": 43.05,
"step": 8260,
"train_speed(iter/s)": 0.607898
},
{
"epoch": 5.356448476992871,
"grad_norm": 3.578125,
"learning_rate": 4.8242887466674194e-05,
"loss": 0.0326171875,
"memory(GiB)": 43.05,
"step": 8265,
"train_speed(iter/s)": 0.608052
},
{
"epoch": 5.359688917692806,
"grad_norm": 6.9375,
"learning_rate": 4.818934004548142e-05,
"loss": 0.0294921875,
"memory(GiB)": 43.05,
"step": 8270,
"train_speed(iter/s)": 0.608099
},
{
"epoch": 5.3629293583927415,
"grad_norm": 1.7578125,
"learning_rate": 4.8135794703636643e-05,
"loss": 0.034765625,
"memory(GiB)": 43.05,
"step": 8275,
"train_speed(iter/s)": 0.608232
},
{
"epoch": 5.366169799092677,
"grad_norm": 2.21875,
"learning_rate": 4.808225150263088e-05,
"loss": 0.034375,
"memory(GiB)": 43.05,
"step": 8280,
"train_speed(iter/s)": 0.60837
},
{
"epoch": 5.369410239792612,
"grad_norm": 8.0,
"learning_rate": 4.802871050395276e-05,
"loss": 0.015234375,
"memory(GiB)": 43.05,
"step": 8285,
"train_speed(iter/s)": 0.608458
},
{
"epoch": 5.372650680492547,
"grad_norm": 15.25,
"learning_rate": 4.797517176908836e-05,
"loss": 0.0275390625,
"memory(GiB)": 43.05,
"step": 8290,
"train_speed(iter/s)": 0.60859
},
{
"epoch": 5.375891121192482,
"grad_norm": 0.66015625,
"learning_rate": 4.792163535952113e-05,
"loss": 0.016796875,
"memory(GiB)": 43.05,
"step": 8295,
"train_speed(iter/s)": 0.608648
},
{
"epoch": 5.379131561892417,
"grad_norm": 6.3125,
"learning_rate": 4.786810133673188e-05,
"loss": 0.0291015625,
"memory(GiB)": 43.05,
"step": 8300,
"train_speed(iter/s)": 0.608736
},
{
"epoch": 5.382372002592352,
"grad_norm": 0.84375,
"learning_rate": 4.781456976219869e-05,
"loss": 0.021875,
"memory(GiB)": 43.05,
"step": 8305,
"train_speed(iter/s)": 0.608824
},
{
"epoch": 5.385612443292287,
"grad_norm": 2.484375,
"learning_rate": 4.776104069739677e-05,
"loss": 0.0388671875,
"memory(GiB)": 43.05,
"step": 8310,
"train_speed(iter/s)": 0.608935
},
{
"epoch": 5.388852883992223,
"grad_norm": 15.125,
"learning_rate": 4.770751420379852e-05,
"loss": 0.0490234375,
"memory(GiB)": 43.05,
"step": 8315,
"train_speed(iter/s)": 0.608995
},
{
"epoch": 5.3920933246921585,
"grad_norm": 2.15625,
"learning_rate": 4.7653990342873354e-05,
"loss": 0.0373046875,
"memory(GiB)": 43.05,
"step": 8320,
"train_speed(iter/s)": 0.609089
},
{
"epoch": 5.395333765392094,
"grad_norm": 0.578125,
"learning_rate": 4.7600469176087634e-05,
"loss": 0.03359375,
"memory(GiB)": 43.05,
"step": 8325,
"train_speed(iter/s)": 0.609144
},
{
"epoch": 5.398574206092029,
"grad_norm": 12.375,
"learning_rate": 4.754695076490467e-05,
"loss": 0.02578125,
"memory(GiB)": 43.05,
"step": 8330,
"train_speed(iter/s)": 0.609278
},
{
"epoch": 5.401814646791964,
"grad_norm": 8.5,
"learning_rate": 4.7493435170784615e-05,
"loss": 0.0353515625,
"memory(GiB)": 43.05,
"step": 8335,
"train_speed(iter/s)": 0.609252
},
{
"epoch": 5.405055087491899,
"grad_norm": 5.125,
"learning_rate": 4.7439922455184325e-05,
"loss": 0.0337890625,
"memory(GiB)": 43.05,
"step": 8340,
"train_speed(iter/s)": 0.60928
},
{
"epoch": 5.408295528191834,
"grad_norm": 3.859375,
"learning_rate": 4.738641267955742e-05,
"loss": 0.0220703125,
"memory(GiB)": 43.05,
"step": 8345,
"train_speed(iter/s)": 0.609361
},
{
"epoch": 5.411535968891769,
"grad_norm": 12.1875,
"learning_rate": 4.7332905905354136e-05,
"loss": 0.0232421875,
"memory(GiB)": 43.05,
"step": 8350,
"train_speed(iter/s)": 0.609534
},
{
"epoch": 5.414776409591704,
"grad_norm": 2.0625,
"learning_rate": 4.727940219402119e-05,
"loss": 0.0193359375,
"memory(GiB)": 43.05,
"step": 8355,
"train_speed(iter/s)": 0.609561
},
{
"epoch": 5.4180168502916395,
"grad_norm": 12.875,
"learning_rate": 4.722590160700186e-05,
"loss": 0.050390625,
"memory(GiB)": 43.05,
"step": 8360,
"train_speed(iter/s)": 0.609583
},
{
"epoch": 5.421257290991575,
"grad_norm": 3.375,
"learning_rate": 4.717240420573581e-05,
"loss": 0.0208984375,
"memory(GiB)": 43.05,
"step": 8365,
"train_speed(iter/s)": 0.609734
},
{
"epoch": 5.42449773169151,
"grad_norm": 11.375,
"learning_rate": 4.711891005165904e-05,
"loss": 0.0361328125,
"memory(GiB)": 43.05,
"step": 8370,
"train_speed(iter/s)": 0.609765
},
{
"epoch": 5.427738172391445,
"grad_norm": 11.5625,
"learning_rate": 4.706541920620383e-05,
"loss": 0.03125,
"memory(GiB)": 43.05,
"step": 8375,
"train_speed(iter/s)": 0.609934
},
{
"epoch": 5.43097861309138,
"grad_norm": 13.25,
"learning_rate": 4.701193173079867e-05,
"loss": 0.0310546875,
"memory(GiB)": 43.05,
"step": 8380,
"train_speed(iter/s)": 0.610021
},
{
"epoch": 5.434219053791316,
"grad_norm": 13.375,
"learning_rate": 4.695844768686812e-05,
"loss": 0.02265625,
"memory(GiB)": 43.05,
"step": 8385,
"train_speed(iter/s)": 0.610153
},
{
"epoch": 5.437459494491251,
"grad_norm": 4.71875,
"learning_rate": 4.690496713583289e-05,
"loss": 0.0294921875,
"memory(GiB)": 43.05,
"step": 8390,
"train_speed(iter/s)": 0.610216
},
{
"epoch": 5.440699935191186,
"grad_norm": 16.375,
"learning_rate": 4.685149013910961e-05,
"loss": 0.028515625,
"memory(GiB)": 43.05,
"step": 8395,
"train_speed(iter/s)": 0.610328
},
{
"epoch": 5.443940375891121,
"grad_norm": 14.625,
"learning_rate": 4.6798016758110865e-05,
"loss": 0.0201171875,
"memory(GiB)": 43.05,
"step": 8400,
"train_speed(iter/s)": 0.610501
},
{
"epoch": 5.4471808165910565,
"grad_norm": 10.0,
"learning_rate": 4.674454705424506e-05,
"loss": 0.05,
"memory(GiB)": 43.05,
"step": 8405,
"train_speed(iter/s)": 0.610602
},
{
"epoch": 5.450421257290992,
"grad_norm": 13.1875,
"learning_rate": 4.6691081088916436e-05,
"loss": 0.0357421875,
"memory(GiB)": 43.05,
"step": 8410,
"train_speed(iter/s)": 0.610583
},
{
"epoch": 5.453661697990927,
"grad_norm": 12.125,
"learning_rate": 4.663761892352483e-05,
"loss": 0.0208984375,
"memory(GiB)": 43.05,
"step": 8415,
"train_speed(iter/s)": 0.610661
},
{
"epoch": 5.456902138690862,
"grad_norm": 10.3125,
"learning_rate": 4.6584160619465814e-05,
"loss": 0.0291015625,
"memory(GiB)": 43.05,
"step": 8420,
"train_speed(iter/s)": 0.610726
},
{
"epoch": 5.460142579390797,
"grad_norm": 15.0625,
"learning_rate": 4.653070623813051e-05,
"loss": 0.0310546875,
"memory(GiB)": 43.05,
"step": 8425,
"train_speed(iter/s)": 0.610783
},
{
"epoch": 5.463383020090732,
"grad_norm": 3.796875,
"learning_rate": 4.6477255840905484e-05,
"loss": 0.0259765625,
"memory(GiB)": 43.05,
"step": 8430,
"train_speed(iter/s)": 0.610706
},
{
"epoch": 5.466623460790667,
"grad_norm": 11.5625,
"learning_rate": 4.642380948917279e-05,
"loss": 0.0154296875,
"memory(GiB)": 43.05,
"step": 8435,
"train_speed(iter/s)": 0.610804
},
{
"epoch": 5.469863901490602,
"grad_norm": 6.71875,
"learning_rate": 4.637036724430981e-05,
"loss": 0.028125,
"memory(GiB)": 43.05,
"step": 8440,
"train_speed(iter/s)": 0.610933
},
{
"epoch": 5.473104342190538,
"grad_norm": 6.5,
"learning_rate": 4.6316929167689176e-05,
"loss": 0.027734375,
"memory(GiB)": 43.05,
"step": 8445,
"train_speed(iter/s)": 0.611038
},
{
"epoch": 5.476344782890473,
"grad_norm": 1.4375,
"learning_rate": 4.626349532067879e-05,
"loss": 0.0197265625,
"memory(GiB)": 43.05,
"step": 8450,
"train_speed(iter/s)": 0.611129
},
{
"epoch": 5.479585223590409,
"grad_norm": 3.0625,
"learning_rate": 4.621006576464168e-05,
"loss": 0.026953125,
"memory(GiB)": 43.05,
"step": 8455,
"train_speed(iter/s)": 0.61121
},
{
"epoch": 5.482825664290344,
"grad_norm": 8.6875,
"learning_rate": 4.61566405609359e-05,
"loss": 0.025390625,
"memory(GiB)": 43.05,
"step": 8460,
"train_speed(iter/s)": 0.611307
},
{
"epoch": 5.486066104990279,
"grad_norm": 1.4921875,
"learning_rate": 4.610321977091458e-05,
"loss": 0.0322265625,
"memory(GiB)": 43.05,
"step": 8465,
"train_speed(iter/s)": 0.61141
},
{
"epoch": 5.489306545690214,
"grad_norm": 11.25,
"learning_rate": 4.6049803455925725e-05,
"loss": 0.0193359375,
"memory(GiB)": 43.05,
"step": 8470,
"train_speed(iter/s)": 0.61143
},
{
"epoch": 5.492546986390149,
"grad_norm": 2.328125,
"learning_rate": 4.5996391677312225e-05,
"loss": 0.0279296875,
"memory(GiB)": 43.05,
"step": 8475,
"train_speed(iter/s)": 0.611595
},
{
"epoch": 5.495787427090084,
"grad_norm": 12.9375,
"learning_rate": 4.594298449641175e-05,
"loss": 0.01875,
"memory(GiB)": 43.05,
"step": 8480,
"train_speed(iter/s)": 0.611601
},
{
"epoch": 5.499027867790019,
"grad_norm": 8.75,
"learning_rate": 4.588958197455673e-05,
"loss": 0.0294921875,
"memory(GiB)": 43.05,
"step": 8485,
"train_speed(iter/s)": 0.61175
},
{
"epoch": 5.502268308489954,
"grad_norm": 13.5,
"learning_rate": 4.583618417307416e-05,
"loss": 0.0390625,
"memory(GiB)": 43.05,
"step": 8490,
"train_speed(iter/s)": 0.611858
},
{
"epoch": 5.5055087491898895,
"grad_norm": 17.625,
"learning_rate": 4.578279115328569e-05,
"loss": 0.037109375,
"memory(GiB)": 43.05,
"step": 8495,
"train_speed(iter/s)": 0.611954
},
{
"epoch": 5.508749189889825,
"grad_norm": 13.4375,
"learning_rate": 4.572940297650747e-05,
"loss": 0.037109375,
"memory(GiB)": 43.05,
"step": 8500,
"train_speed(iter/s)": 0.612014
},
{
"epoch": 5.51198963058976,
"grad_norm": 1.2109375,
"learning_rate": 4.567601970405004e-05,
"loss": 0.0345703125,
"memory(GiB)": 43.05,
"step": 8505,
"train_speed(iter/s)": 0.612165
},
{
"epoch": 5.515230071289696,
"grad_norm": 3.984375,
"learning_rate": 4.5622641397218355e-05,
"loss": 0.020703125,
"memory(GiB)": 43.05,
"step": 8510,
"train_speed(iter/s)": 0.612217
},
{
"epoch": 5.518470511989631,
"grad_norm": 10.3125,
"learning_rate": 4.556926811731165e-05,
"loss": 0.023046875,
"memory(GiB)": 43.05,
"step": 8515,
"train_speed(iter/s)": 0.612292
},
{
"epoch": 5.521710952689566,
"grad_norm": 5.09375,
"learning_rate": 4.5515899925623415e-05,
"loss": 0.0314453125,
"memory(GiB)": 43.05,
"step": 8520,
"train_speed(iter/s)": 0.612312
},
{
"epoch": 5.524951393389501,
"grad_norm": 3.1875,
"learning_rate": 4.546253688344122e-05,
"loss": 0.0189453125,
"memory(GiB)": 43.05,
"step": 8525,
"train_speed(iter/s)": 0.612402
},
{
"epoch": 5.528191834089436,
"grad_norm": 11.625,
"learning_rate": 4.540917905204681e-05,
"loss": 0.031640625,
"memory(GiB)": 43.05,
"step": 8530,
"train_speed(iter/s)": 0.61249
},
{
"epoch": 5.531432274789371,
"grad_norm": 0.7265625,
"learning_rate": 4.53558264927159e-05,
"loss": 0.0216796875,
"memory(GiB)": 43.05,
"step": 8535,
"train_speed(iter/s)": 0.612517
},
{
"epoch": 5.5346727154893065,
"grad_norm": 3.796875,
"learning_rate": 4.530247926671816e-05,
"loss": 0.0283203125,
"memory(GiB)": 43.05,
"step": 8540,
"train_speed(iter/s)": 0.612561
},
{
"epoch": 5.537913156189242,
"grad_norm": 16.125,
"learning_rate": 4.524913743531712e-05,
"loss": 0.0498046875,
"memory(GiB)": 43.05,
"step": 8545,
"train_speed(iter/s)": 0.612591
},
{
"epoch": 5.541153596889177,
"grad_norm": 0.494140625,
"learning_rate": 4.519580105977017e-05,
"loss": 0.030859375,
"memory(GiB)": 43.05,
"step": 8550,
"train_speed(iter/s)": 0.612707
},
{
"epoch": 5.544394037589112,
"grad_norm": 12.5,
"learning_rate": 4.514247020132835e-05,
"loss": 0.0404296875,
"memory(GiB)": 43.05,
"step": 8555,
"train_speed(iter/s)": 0.612856
},
{
"epoch": 5.547634478289047,
"grad_norm": 3.21875,
"learning_rate": 4.508914492123642e-05,
"loss": 0.030859375,
"memory(GiB)": 43.05,
"step": 8560,
"train_speed(iter/s)": 0.612854
},
{
"epoch": 5.550874918988983,
"grad_norm": 13.0,
"learning_rate": 4.503582528073272e-05,
"loss": 0.02421875,
"memory(GiB)": 43.05,
"step": 8565,
"train_speed(iter/s)": 0.613024
},
{
"epoch": 5.554115359688918,
"grad_norm": 1.3828125,
"learning_rate": 4.4982511341049124e-05,
"loss": 0.02578125,
"memory(GiB)": 43.05,
"step": 8570,
"train_speed(iter/s)": 0.613083
},
{
"epoch": 5.557355800388853,
"grad_norm": 4.15625,
"learning_rate": 4.492920316341095e-05,
"loss": 0.0294921875,
"memory(GiB)": 43.05,
"step": 8575,
"train_speed(iter/s)": 0.613224
},
{
"epoch": 5.560596241088788,
"grad_norm": 8.3125,
"learning_rate": 4.487590080903692e-05,
"loss": 0.0248046875,
"memory(GiB)": 43.05,
"step": 8580,
"train_speed(iter/s)": 0.613218
},
{
"epoch": 5.5638366817887235,
"grad_norm": 14.5625,
"learning_rate": 4.482260433913899e-05,
"loss": 0.02109375,
"memory(GiB)": 43.05,
"step": 8585,
"train_speed(iter/s)": 0.613258
},
{
"epoch": 5.567077122488659,
"grad_norm": 3.5625,
"learning_rate": 4.476931381492247e-05,
"loss": 0.0537109375,
"memory(GiB)": 43.05,
"step": 8590,
"train_speed(iter/s)": 0.613228
},
{
"epoch": 5.570317563188594,
"grad_norm": 17.5,
"learning_rate": 4.471602929758577e-05,
"loss": 0.0251953125,
"memory(GiB)": 43.05,
"step": 8595,
"train_speed(iter/s)": 0.61316
},
{
"epoch": 5.573558003888529,
"grad_norm": 2.265625,
"learning_rate": 4.466275084832041e-05,
"loss": 0.0279296875,
"memory(GiB)": 43.05,
"step": 8600,
"train_speed(iter/s)": 0.613196
},
{
"epoch": 5.576798444588464,
"grad_norm": 0.431640625,
"learning_rate": 4.460947852831096e-05,
"loss": 0.0416015625,
"memory(GiB)": 43.05,
"step": 8605,
"train_speed(iter/s)": 0.613363
},
{
"epoch": 5.580038885288399,
"grad_norm": 16.0,
"learning_rate": 4.455621239873498e-05,
"loss": 0.0216796875,
"memory(GiB)": 43.05,
"step": 8610,
"train_speed(iter/s)": 0.61345
},
{
"epoch": 5.583279325988334,
"grad_norm": 14.75,
"learning_rate": 4.450295252076282e-05,
"loss": 0.0220703125,
"memory(GiB)": 43.05,
"step": 8615,
"train_speed(iter/s)": 0.6135
},
{
"epoch": 5.586519766688269,
"grad_norm": 11.0625,
"learning_rate": 4.444969895555774e-05,
"loss": 0.038671875,
"memory(GiB)": 43.05,
"step": 8620,
"train_speed(iter/s)": 0.613576
},
{
"epoch": 5.5897602073882044,
"grad_norm": 1.078125,
"learning_rate": 4.4396451764275755e-05,
"loss": 0.030859375,
"memory(GiB)": 43.05,
"step": 8625,
"train_speed(iter/s)": 0.613598
},
{
"epoch": 5.59300064808814,
"grad_norm": 2.546875,
"learning_rate": 4.4343211008065484e-05,
"loss": 0.0333984375,
"memory(GiB)": 43.05,
"step": 8630,
"train_speed(iter/s)": 0.613615
},
{
"epoch": 5.596241088788075,
"grad_norm": 2.890625,
"learning_rate": 4.428997674806822e-05,
"loss": 0.0564453125,
"memory(GiB)": 43.05,
"step": 8635,
"train_speed(iter/s)": 0.613688
},
{
"epoch": 5.599481529488011,
"grad_norm": 3.1875,
"learning_rate": 4.423674904541779e-05,
"loss": 0.041015625,
"memory(GiB)": 43.05,
"step": 8640,
"train_speed(iter/s)": 0.613749
},
{
"epoch": 5.602721970187946,
"grad_norm": 2.46875,
"learning_rate": 4.4183527961240455e-05,
"loss": 0.0345703125,
"memory(GiB)": 43.05,
"step": 8645,
"train_speed(iter/s)": 0.613872
},
{
"epoch": 5.605962410887881,
"grad_norm": 8.9375,
"learning_rate": 4.413031355665492e-05,
"loss": 0.026171875,
"memory(GiB)": 43.05,
"step": 8650,
"train_speed(iter/s)": 0.613911
},
{
"epoch": 5.609202851587816,
"grad_norm": 9.125,
"learning_rate": 4.407710589277221e-05,
"loss": 0.03359375,
"memory(GiB)": 43.05,
"step": 8655,
"train_speed(iter/s)": 0.613996
},
{
"epoch": 5.612443292287751,
"grad_norm": 14.25,
"learning_rate": 4.402390503069556e-05,
"loss": 0.0314453125,
"memory(GiB)": 43.05,
"step": 8660,
"train_speed(iter/s)": 0.613954
},
{
"epoch": 5.615683732987686,
"grad_norm": 10.625,
"learning_rate": 4.3970711031520446e-05,
"loss": 0.0271484375,
"memory(GiB)": 43.05,
"step": 8665,
"train_speed(iter/s)": 0.61412
},
{
"epoch": 5.618924173687621,
"grad_norm": 0.81640625,
"learning_rate": 4.391752395633446e-05,
"loss": 0.024609375,
"memory(GiB)": 43.05,
"step": 8670,
"train_speed(iter/s)": 0.614246
},
{
"epoch": 5.6221646143875565,
"grad_norm": 1.4765625,
"learning_rate": 4.386434386621722e-05,
"loss": 0.0380859375,
"memory(GiB)": 43.05,
"step": 8675,
"train_speed(iter/s)": 0.614211
},
{
"epoch": 5.625405055087492,
"grad_norm": 1.3671875,
"learning_rate": 4.381117082224033e-05,
"loss": 0.0181640625,
"memory(GiB)": 43.05,
"step": 8680,
"train_speed(iter/s)": 0.614312
},
{
"epoch": 5.628645495787427,
"grad_norm": 1.125,
"learning_rate": 4.375800488546733e-05,
"loss": 0.03125,
"memory(GiB)": 43.05,
"step": 8685,
"train_speed(iter/s)": 0.614351
},
{
"epoch": 5.631885936487362,
"grad_norm": 15.9375,
"learning_rate": 4.370484611695354e-05,
"loss": 0.033984375,
"memory(GiB)": 43.05,
"step": 8690,
"train_speed(iter/s)": 0.614385
},
{
"epoch": 5.635126377187298,
"grad_norm": 1.6796875,
"learning_rate": 4.365169457774609e-05,
"loss": 0.0294921875,
"memory(GiB)": 43.05,
"step": 8695,
"train_speed(iter/s)": 0.614455
},
{
"epoch": 5.638366817887233,
"grad_norm": 12.375,
"learning_rate": 4.3598550328883814e-05,
"loss": 0.0271484375,
"memory(GiB)": 43.05,
"step": 8700,
"train_speed(iter/s)": 0.614552
},
{
"epoch": 5.641607258587168,
"grad_norm": 1.375,
"learning_rate": 4.354541343139714e-05,
"loss": 0.0330078125,
"memory(GiB)": 43.05,
"step": 8705,
"train_speed(iter/s)": 0.614735
},
{
"epoch": 5.644847699287103,
"grad_norm": 16.125,
"learning_rate": 4.349228394630808e-05,
"loss": 0.025,
"memory(GiB)": 43.05,
"step": 8710,
"train_speed(iter/s)": 0.614743
},
{
"epoch": 5.648088139987038,
"grad_norm": 11.375,
"learning_rate": 4.3439161934630156e-05,
"loss": 0.026171875,
"memory(GiB)": 43.05,
"step": 8715,
"train_speed(iter/s)": 0.614913
},
{
"epoch": 5.6513285806869735,
"grad_norm": 17.125,
"learning_rate": 4.338604745736822e-05,
"loss": 0.0359375,
"memory(GiB)": 43.05,
"step": 8720,
"train_speed(iter/s)": 0.615012
},
{
"epoch": 5.654569021386909,
"grad_norm": 13.25,
"learning_rate": 4.3332940575518565e-05,
"loss": 0.0357421875,
"memory(GiB)": 43.05,
"step": 8725,
"train_speed(iter/s)": 0.615076
},
{
"epoch": 5.657809462086844,
"grad_norm": 12.125,
"learning_rate": 4.327984135006873e-05,
"loss": 0.0197265625,
"memory(GiB)": 43.05,
"step": 8730,
"train_speed(iter/s)": 0.61514
},
{
"epoch": 5.661049902786779,
"grad_norm": 1.59375,
"learning_rate": 4.3226749841997436e-05,
"loss": 0.0240234375,
"memory(GiB)": 43.05,
"step": 8735,
"train_speed(iter/s)": 0.615215
},
{
"epoch": 5.664290343486714,
"grad_norm": 3.625,
"learning_rate": 4.317366611227458e-05,
"loss": 0.0322265625,
"memory(GiB)": 43.05,
"step": 8740,
"train_speed(iter/s)": 0.615171
},
{
"epoch": 5.667530784186649,
"grad_norm": 7.78125,
"learning_rate": 4.3120590221861125e-05,
"loss": 0.03359375,
"memory(GiB)": 43.05,
"step": 8745,
"train_speed(iter/s)": 0.615263
},
{
"epoch": 5.670771224886584,
"grad_norm": 11.625,
"learning_rate": 4.3067522231708974e-05,
"loss": 0.027734375,
"memory(GiB)": 43.05,
"step": 8750,
"train_speed(iter/s)": 0.615425
},
{
"epoch": 5.674011665586519,
"grad_norm": 5.65625,
"learning_rate": 4.301446220276102e-05,
"loss": 0.0095703125,
"memory(GiB)": 43.05,
"step": 8755,
"train_speed(iter/s)": 0.615447
},
{
"epoch": 5.6772521062864545,
"grad_norm": 4.875,
"learning_rate": 4.2961410195951e-05,
"loss": 0.02265625,
"memory(GiB)": 43.05,
"step": 8760,
"train_speed(iter/s)": 0.615449
},
{
"epoch": 5.6804925469863905,
"grad_norm": 3.15625,
"learning_rate": 4.2908366272203414e-05,
"loss": 0.039453125,
"memory(GiB)": 43.05,
"step": 8765,
"train_speed(iter/s)": 0.61553
},
{
"epoch": 5.683732987686326,
"grad_norm": 1.9765625,
"learning_rate": 4.285533049243351e-05,
"loss": 0.0318359375,
"memory(GiB)": 43.05,
"step": 8770,
"train_speed(iter/s)": 0.615572
},
{
"epoch": 5.686973428386261,
"grad_norm": 2.1875,
"learning_rate": 4.280230291754718e-05,
"loss": 0.0396484375,
"memory(GiB)": 43.05,
"step": 8775,
"train_speed(iter/s)": 0.61563
},
{
"epoch": 5.690213869086196,
"grad_norm": 11.6875,
"learning_rate": 4.274928360844086e-05,
"loss": 0.0291015625,
"memory(GiB)": 43.05,
"step": 8780,
"train_speed(iter/s)": 0.615709
},
{
"epoch": 5.693454309786131,
"grad_norm": 3.078125,
"learning_rate": 4.269627262600151e-05,
"loss": 0.0283203125,
"memory(GiB)": 43.05,
"step": 8785,
"train_speed(iter/s)": 0.615769
},
{
"epoch": 5.696694750486066,
"grad_norm": 11.4375,
"learning_rate": 4.264327003110657e-05,
"loss": 0.021875,
"memory(GiB)": 43.05,
"step": 8790,
"train_speed(iter/s)": 0.615899
},
{
"epoch": 5.699935191186001,
"grad_norm": 5.53125,
"learning_rate": 4.2590275884623805e-05,
"loss": 0.0216796875,
"memory(GiB)": 43.05,
"step": 8795,
"train_speed(iter/s)": 0.61597
},
{
"epoch": 5.703175631885936,
"grad_norm": 0.640625,
"learning_rate": 4.253729024741125e-05,
"loss": 0.0177734375,
"memory(GiB)": 43.05,
"step": 8800,
"train_speed(iter/s)": 0.616019
},
{
"epoch": 5.7064160725858715,
"grad_norm": 0.6484375,
"learning_rate": 4.248431318031724e-05,
"loss": 0.04375,
"memory(GiB)": 43.05,
"step": 8805,
"train_speed(iter/s)": 0.616098
},
{
"epoch": 5.709656513285807,
"grad_norm": 1.09375,
"learning_rate": 4.2431344744180225e-05,
"loss": 0.0197265625,
"memory(GiB)": 43.05,
"step": 8810,
"train_speed(iter/s)": 0.616157
},
{
"epoch": 5.712896953985742,
"grad_norm": 13.0,
"learning_rate": 4.2378384999828736e-05,
"loss": 0.030859375,
"memory(GiB)": 43.05,
"step": 8815,
"train_speed(iter/s)": 0.616015
},
{
"epoch": 5.716137394685678,
"grad_norm": 15.625,
"learning_rate": 4.2325434008081344e-05,
"loss": 0.03046875,
"memory(GiB)": 43.05,
"step": 8820,
"train_speed(iter/s)": 0.616157
},
{
"epoch": 5.719377835385613,
"grad_norm": 1.5546875,
"learning_rate": 4.2272491829746585e-05,
"loss": 0.026953125,
"memory(GiB)": 43.05,
"step": 8825,
"train_speed(iter/s)": 0.616221
},
{
"epoch": 5.722618276085548,
"grad_norm": 13.125,
"learning_rate": 4.22195585256228e-05,
"loss": 0.0322265625,
"memory(GiB)": 43.05,
"step": 8830,
"train_speed(iter/s)": 0.616236
},
{
"epoch": 5.725858716785483,
"grad_norm": 13.75,
"learning_rate": 4.216663415649823e-05,
"loss": 0.0287109375,
"memory(GiB)": 43.05,
"step": 8835,
"train_speed(iter/s)": 0.616401
},
{
"epoch": 5.729099157485418,
"grad_norm": 4.15625,
"learning_rate": 4.21137187831508e-05,
"loss": 0.0486328125,
"memory(GiB)": 43.05,
"step": 8840,
"train_speed(iter/s)": 0.61642
},
{
"epoch": 5.732339598185353,
"grad_norm": 1.5390625,
"learning_rate": 4.206081246634811e-05,
"loss": 0.0279296875,
"memory(GiB)": 43.05,
"step": 8845,
"train_speed(iter/s)": 0.616472
},
{
"epoch": 5.7355800388852884,
"grad_norm": 4.71875,
"learning_rate": 4.200791526684738e-05,
"loss": 0.03125,
"memory(GiB)": 43.05,
"step": 8850,
"train_speed(iter/s)": 0.616534
},
{
"epoch": 5.738820479585224,
"grad_norm": 0.515625,
"learning_rate": 4.195502724539536e-05,
"loss": 0.01953125,
"memory(GiB)": 43.05,
"step": 8855,
"train_speed(iter/s)": 0.616599
},
{
"epoch": 5.742060920285159,
"grad_norm": 11.0625,
"learning_rate": 4.190214846272821e-05,
"loss": 0.029296875,
"memory(GiB)": 43.05,
"step": 8860,
"train_speed(iter/s)": 0.616742
},
{
"epoch": 5.745301360985094,
"grad_norm": 7.6875,
"learning_rate": 4.184927897957154e-05,
"loss": 0.02265625,
"memory(GiB)": 43.05,
"step": 8865,
"train_speed(iter/s)": 0.616703
},
{
"epoch": 5.748541801685029,
"grad_norm": 16.625,
"learning_rate": 4.179641885664026e-05,
"loss": 0.0322265625,
"memory(GiB)": 43.05,
"step": 8870,
"train_speed(iter/s)": 0.616676
},
{
"epoch": 5.751782242384964,
"grad_norm": 6.40625,
"learning_rate": 4.1743568154638526e-05,
"loss": 0.0205078125,
"memory(GiB)": 43.05,
"step": 8875,
"train_speed(iter/s)": 0.616644
},
{
"epoch": 5.755022683084899,
"grad_norm": 6.96875,
"learning_rate": 4.169072693425967e-05,
"loss": 0.0244140625,
"memory(GiB)": 43.05,
"step": 8880,
"train_speed(iter/s)": 0.616723
},
{
"epoch": 5.758263123784834,
"grad_norm": 10.0625,
"learning_rate": 4.1637895256186175e-05,
"loss": 0.028125,
"memory(GiB)": 43.05,
"step": 8885,
"train_speed(iter/s)": 0.61676
},
{
"epoch": 5.76150356448477,
"grad_norm": 2.0625,
"learning_rate": 4.158507318108949e-05,
"loss": 0.03125,
"memory(GiB)": 43.05,
"step": 8890,
"train_speed(iter/s)": 0.616858
},
{
"epoch": 5.764744005184705,
"grad_norm": 4.53125,
"learning_rate": 4.153226076963011e-05,
"loss": 0.0236328125,
"memory(GiB)": 43.05,
"step": 8895,
"train_speed(iter/s)": 0.616854
},
{
"epoch": 5.7679844458846405,
"grad_norm": 2.046875,
"learning_rate": 4.1479458082457383e-05,
"loss": 0.05078125,
"memory(GiB)": 43.05,
"step": 8900,
"train_speed(iter/s)": 0.616931
},
{
"epoch": 5.771224886584576,
"grad_norm": 0.54296875,
"learning_rate": 4.142666518020952e-05,
"loss": 0.0154296875,
"memory(GiB)": 43.05,
"step": 8905,
"train_speed(iter/s)": 0.61703
},
{
"epoch": 5.774465327284511,
"grad_norm": 11.5,
"learning_rate": 4.137388212351348e-05,
"loss": 0.0359375,
"memory(GiB)": 43.05,
"step": 8910,
"train_speed(iter/s)": 0.617053
},
{
"epoch": 5.777705767984446,
"grad_norm": 1.9765625,
"learning_rate": 4.1321108972984946e-05,
"loss": 0.0234375,
"memory(GiB)": 43.05,
"step": 8915,
"train_speed(iter/s)": 0.617197
},
{
"epoch": 5.780946208684381,
"grad_norm": 2.515625,
"learning_rate": 4.1268345789228155e-05,
"loss": 0.0310546875,
"memory(GiB)": 43.05,
"step": 8920,
"train_speed(iter/s)": 0.617315
},
{
"epoch": 5.784186649384316,
"grad_norm": 3.578125,
"learning_rate": 4.121559263283596e-05,
"loss": 0.023828125,
"memory(GiB)": 43.05,
"step": 8925,
"train_speed(iter/s)": 0.617482
},
{
"epoch": 5.787427090084251,
"grad_norm": 2.578125,
"learning_rate": 4.1162849564389693e-05,
"loss": 0.02578125,
"memory(GiB)": 43.05,
"step": 8930,
"train_speed(iter/s)": 0.617643
},
{
"epoch": 5.790667530784186,
"grad_norm": 8.0625,
"learning_rate": 4.111011664445907e-05,
"loss": 0.0353515625,
"memory(GiB)": 43.05,
"step": 8935,
"train_speed(iter/s)": 0.61776
},
{
"epoch": 5.7939079714841215,
"grad_norm": 1.7421875,
"learning_rate": 4.105739393360218e-05,
"loss": 0.0236328125,
"memory(GiB)": 43.05,
"step": 8940,
"train_speed(iter/s)": 0.617825
},
{
"epoch": 5.7971484121840575,
"grad_norm": 7.21875,
"learning_rate": 4.10046814923654e-05,
"loss": 0.041015625,
"memory(GiB)": 43.05,
"step": 8945,
"train_speed(iter/s)": 0.617923
},
{
"epoch": 5.800388852883993,
"grad_norm": 0.859375,
"learning_rate": 4.095197938128325e-05,
"loss": 0.026171875,
"memory(GiB)": 43.05,
"step": 8950,
"train_speed(iter/s)": 0.618049
},
{
"epoch": 5.803629293583928,
"grad_norm": 13.625,
"learning_rate": 4.0899287660878444e-05,
"loss": 0.0333984375,
"memory(GiB)": 43.05,
"step": 8955,
"train_speed(iter/s)": 0.61808
},
{
"epoch": 5.806869734283863,
"grad_norm": 14.875,
"learning_rate": 4.084660639166178e-05,
"loss": 0.030078125,
"memory(GiB)": 43.05,
"step": 8960,
"train_speed(iter/s)": 0.61814
},
{
"epoch": 5.810110174983798,
"grad_norm": 14.1875,
"learning_rate": 4.079393563413197e-05,
"loss": 0.0212890625,
"memory(GiB)": 43.05,
"step": 8965,
"train_speed(iter/s)": 0.618267
},
{
"epoch": 5.813350615683733,
"grad_norm": 10.8125,
"learning_rate": 4.074127544877574e-05,
"loss": 0.0208984375,
"memory(GiB)": 43.05,
"step": 8970,
"train_speed(iter/s)": 0.618328
},
{
"epoch": 5.816591056383668,
"grad_norm": 0.71875,
"learning_rate": 4.068862589606765e-05,
"loss": 0.0263671875,
"memory(GiB)": 43.05,
"step": 8975,
"train_speed(iter/s)": 0.618373
},
{
"epoch": 5.819831497083603,
"grad_norm": 9.1875,
"learning_rate": 4.063598703647002e-05,
"loss": 0.023828125,
"memory(GiB)": 43.05,
"step": 8980,
"train_speed(iter/s)": 0.618401
},
{
"epoch": 5.8230719377835385,
"grad_norm": 3.09375,
"learning_rate": 4.0583358930432916e-05,
"loss": 0.0294921875,
"memory(GiB)": 43.05,
"step": 8985,
"train_speed(iter/s)": 0.618533
},
{
"epoch": 5.826312378483474,
"grad_norm": 10.1875,
"learning_rate": 4.0530741638394076e-05,
"loss": 0.03046875,
"memory(GiB)": 43.05,
"step": 8990,
"train_speed(iter/s)": 0.618508
},
{
"epoch": 5.829552819183409,
"grad_norm": 0.57421875,
"learning_rate": 4.0478135220778755e-05,
"loss": 0.029296875,
"memory(GiB)": 43.05,
"step": 8995,
"train_speed(iter/s)": 0.618612
},
{
"epoch": 5.832793259883344,
"grad_norm": 4.4375,
"learning_rate": 4.042553973799977e-05,
"loss": 0.0375,
"memory(GiB)": 43.05,
"step": 9000,
"train_speed(iter/s)": 0.618586
},
{
"epoch": 5.836033700583279,
"grad_norm": 10.3125,
"learning_rate": 4.03729552504574e-05,
"loss": 0.0330078125,
"memory(GiB)": 43.05,
"step": 9005,
"train_speed(iter/s)": 0.618742
},
{
"epoch": 5.839274141283214,
"grad_norm": 0.56640625,
"learning_rate": 4.032038181853922e-05,
"loss": 0.04111328125,
"memory(GiB)": 43.05,
"step": 9010,
"train_speed(iter/s)": 0.618801
},
{
"epoch": 5.842514581983149,
"grad_norm": 8.1875,
"learning_rate": 4.026781950262018e-05,
"loss": 0.035546875,
"memory(GiB)": 43.05,
"step": 9015,
"train_speed(iter/s)": 0.618804
},
{
"epoch": 5.845755022683085,
"grad_norm": 18.375,
"learning_rate": 4.0215268363062465e-05,
"loss": 0.0251953125,
"memory(GiB)": 43.05,
"step": 9020,
"train_speed(iter/s)": 0.618882
},
{
"epoch": 5.84899546338302,
"grad_norm": 0.69921875,
"learning_rate": 4.0162728460215346e-05,
"loss": 0.0232421875,
"memory(GiB)": 43.05,
"step": 9025,
"train_speed(iter/s)": 0.618868
},
{
"epoch": 5.8522359040829555,
"grad_norm": 10.0,
"learning_rate": 4.0110199854415264e-05,
"loss": 0.0345703125,
"memory(GiB)": 43.05,
"step": 9030,
"train_speed(iter/s)": 0.619029
},
{
"epoch": 5.855476344782891,
"grad_norm": 2.828125,
"learning_rate": 4.005768260598569e-05,
"loss": 0.0240234375,
"memory(GiB)": 43.05,
"step": 9035,
"train_speed(iter/s)": 0.619114
},
{
"epoch": 5.858716785482826,
"grad_norm": 12.375,
"learning_rate": 4.0005176775237e-05,
"loss": 0.0240234375,
"memory(GiB)": 43.05,
"step": 9040,
"train_speed(iter/s)": 0.619198
},
{
"epoch": 5.861957226182761,
"grad_norm": 14.25,
"learning_rate": 3.99526824224665e-05,
"loss": 0.0240234375,
"memory(GiB)": 43.05,
"step": 9045,
"train_speed(iter/s)": 0.619316
},
{
"epoch": 5.865197666882696,
"grad_norm": 1.5390625,
"learning_rate": 3.990019960795831e-05,
"loss": 0.0337890625,
"memory(GiB)": 43.05,
"step": 9050,
"train_speed(iter/s)": 0.619371
},
{
"epoch": 5.868438107582631,
"grad_norm": 8.4375,
"learning_rate": 3.984772839198327e-05,
"loss": 0.0177734375,
"memory(GiB)": 43.05,
"step": 9055,
"train_speed(iter/s)": 0.619431
},
{
"epoch": 5.871678548282566,
"grad_norm": 2.453125,
"learning_rate": 3.979526883479892e-05,
"loss": 0.0162109375,
"memory(GiB)": 43.05,
"step": 9060,
"train_speed(iter/s)": 0.619449
},
{
"epoch": 5.874918988982501,
"grad_norm": 10.1875,
"learning_rate": 3.9742820996649435e-05,
"loss": 0.0314453125,
"memory(GiB)": 43.05,
"step": 9065,
"train_speed(iter/s)": 0.619509
},
{
"epoch": 5.878159429682436,
"grad_norm": 13.8125,
"learning_rate": 3.9690384937765495e-05,
"loss": 0.0216796875,
"memory(GiB)": 43.05,
"step": 9070,
"train_speed(iter/s)": 0.619525
},
{
"epoch": 5.881399870382372,
"grad_norm": 8.125,
"learning_rate": 3.9637960718364265e-05,
"loss": 0.02734375,
"memory(GiB)": 43.05,
"step": 9075,
"train_speed(iter/s)": 0.619648
},
{
"epoch": 5.884640311082308,
"grad_norm": 1.921875,
"learning_rate": 3.958554839864932e-05,
"loss": 0.0291015625,
"memory(GiB)": 43.05,
"step": 9080,
"train_speed(iter/s)": 0.619621
},
{
"epoch": 5.887880751782243,
"grad_norm": 1.9765625,
"learning_rate": 3.953314803881058e-05,
"loss": 0.026953125,
"memory(GiB)": 43.05,
"step": 9085,
"train_speed(iter/s)": 0.619703
},
{
"epoch": 5.891121192482178,
"grad_norm": 2.359375,
"learning_rate": 3.94807596990242e-05,
"loss": 0.0427734375,
"memory(GiB)": 43.05,
"step": 9090,
"train_speed(iter/s)": 0.619781
},
{
"epoch": 5.894361633182113,
"grad_norm": 0.6015625,
"learning_rate": 3.942838343945253e-05,
"loss": 0.0537109375,
"memory(GiB)": 43.05,
"step": 9095,
"train_speed(iter/s)": 0.619794
},
{
"epoch": 5.897602073882048,
"grad_norm": 10.9375,
"learning_rate": 3.93760193202441e-05,
"loss": 0.0302734375,
"memory(GiB)": 43.05,
"step": 9100,
"train_speed(iter/s)": 0.619792
},
{
"epoch": 5.900842514581983,
"grad_norm": 11.3125,
"learning_rate": 3.932366740153343e-05,
"loss": 0.0234375,
"memory(GiB)": 43.05,
"step": 9105,
"train_speed(iter/s)": 0.619931
},
{
"epoch": 5.904082955281918,
"grad_norm": 6.3125,
"learning_rate": 3.927132774344107e-05,
"loss": 0.012890625,
"memory(GiB)": 43.05,
"step": 9110,
"train_speed(iter/s)": 0.620064
},
{
"epoch": 5.907323395981853,
"grad_norm": 16.625,
"learning_rate": 3.9219000406073516e-05,
"loss": 0.0359375,
"memory(GiB)": 43.05,
"step": 9115,
"train_speed(iter/s)": 0.620009
},
{
"epoch": 5.9105638366817885,
"grad_norm": 11.375,
"learning_rate": 3.916668544952302e-05,
"loss": 0.0318359375,
"memory(GiB)": 43.05,
"step": 9120,
"train_speed(iter/s)": 0.619989
},
{
"epoch": 5.913804277381724,
"grad_norm": 4.59375,
"learning_rate": 3.911438293386771e-05,
"loss": 0.03203125,
"memory(GiB)": 43.05,
"step": 9125,
"train_speed(iter/s)": 0.620071
},
{
"epoch": 5.917044718081659,
"grad_norm": 2.1875,
"learning_rate": 3.9062092919171414e-05,
"loss": 0.046875,
"memory(GiB)": 43.05,
"step": 9130,
"train_speed(iter/s)": 0.620124
},
{
"epoch": 5.920285158781594,
"grad_norm": 2.640625,
"learning_rate": 3.9009815465483536e-05,
"loss": 0.0322265625,
"memory(GiB)": 43.05,
"step": 9135,
"train_speed(iter/s)": 0.620183
},
{
"epoch": 5.923525599481529,
"grad_norm": 1.7890625,
"learning_rate": 3.895755063283912e-05,
"loss": 0.025,
"memory(GiB)": 43.05,
"step": 9140,
"train_speed(iter/s)": 0.620345
},
{
"epoch": 5.926766040181465,
"grad_norm": 4.0625,
"learning_rate": 3.8905298481258726e-05,
"loss": 0.0365234375,
"memory(GiB)": 43.05,
"step": 9145,
"train_speed(iter/s)": 0.620408
},
{
"epoch": 5.9300064808814,
"grad_norm": 15.5625,
"learning_rate": 3.8853059070748275e-05,
"loss": 0.0294921875,
"memory(GiB)": 43.05,
"step": 9150,
"train_speed(iter/s)": 0.620542
},
{
"epoch": 5.933246921581335,
"grad_norm": 0.66015625,
"learning_rate": 3.880083246129914e-05,
"loss": 0.019140625,
"memory(GiB)": 43.05,
"step": 9155,
"train_speed(iter/s)": 0.620592
},
{
"epoch": 5.93648736228127,
"grad_norm": 2.203125,
"learning_rate": 3.8748618712887966e-05,
"loss": 0.0259765625,
"memory(GiB)": 43.05,
"step": 9160,
"train_speed(iter/s)": 0.620562
},
{
"epoch": 5.9397278029812055,
"grad_norm": 2.4375,
"learning_rate": 3.869641788547658e-05,
"loss": 0.0169921875,
"memory(GiB)": 43.05,
"step": 9165,
"train_speed(iter/s)": 0.620596
},
{
"epoch": 5.942968243681141,
"grad_norm": 9.3125,
"learning_rate": 3.864423003901203e-05,
"loss": 0.0423828125,
"memory(GiB)": 43.05,
"step": 9170,
"train_speed(iter/s)": 0.620719
},
{
"epoch": 5.946208684381076,
"grad_norm": 3.328125,
"learning_rate": 3.8592055233426454e-05,
"loss": 0.038671875,
"memory(GiB)": 43.05,
"step": 9175,
"train_speed(iter/s)": 0.620788
},
{
"epoch": 5.949449125081011,
"grad_norm": 15.875,
"learning_rate": 3.853989352863698e-05,
"loss": 0.0234375,
"memory(GiB)": 43.05,
"step": 9180,
"train_speed(iter/s)": 0.620866
},
{
"epoch": 5.952689565780946,
"grad_norm": 2.875,
"learning_rate": 3.8487744984545705e-05,
"loss": 0.01484375,
"memory(GiB)": 43.05,
"step": 9185,
"train_speed(iter/s)": 0.620868
},
{
"epoch": 5.955930006480881,
"grad_norm": 1.4921875,
"learning_rate": 3.843560966103965e-05,
"loss": 0.0193359375,
"memory(GiB)": 43.05,
"step": 9190,
"train_speed(iter/s)": 0.620895
},
{
"epoch": 5.959170447180816,
"grad_norm": 6.53125,
"learning_rate": 3.838348761799058e-05,
"loss": 0.0224609375,
"memory(GiB)": 43.05,
"step": 9195,
"train_speed(iter/s)": 0.620994
},
{
"epoch": 5.962410887880752,
"grad_norm": 1.1796875,
"learning_rate": 3.833137891525506e-05,
"loss": 0.0439453125,
"memory(GiB)": 43.05,
"step": 9200,
"train_speed(iter/s)": 0.621117
},
{
"epoch": 5.965651328580687,
"grad_norm": 1.8671875,
"learning_rate": 3.827928361267433e-05,
"loss": 0.034375,
"memory(GiB)": 43.05,
"step": 9205,
"train_speed(iter/s)": 0.621276
},
{
"epoch": 5.9688917692806225,
"grad_norm": 5.0,
"learning_rate": 3.8227201770074225e-05,
"loss": 0.01875,
"memory(GiB)": 43.05,
"step": 9210,
"train_speed(iter/s)": 0.621345
},
{
"epoch": 5.972132209980558,
"grad_norm": 11.625,
"learning_rate": 3.8175133447265146e-05,
"loss": 0.029296875,
"memory(GiB)": 43.05,
"step": 9215,
"train_speed(iter/s)": 0.621345
},
{
"epoch": 5.975372650680493,
"grad_norm": 6.71875,
"learning_rate": 3.812307870404197e-05,
"loss": 0.0380859375,
"memory(GiB)": 43.05,
"step": 9220,
"train_speed(iter/s)": 0.621503
},
{
"epoch": 5.978613091380428,
"grad_norm": 12.6875,
"learning_rate": 3.807103760018392e-05,
"loss": 0.040625,
"memory(GiB)": 43.05,
"step": 9225,
"train_speed(iter/s)": 0.621577
},
{
"epoch": 5.981853532080363,
"grad_norm": 7.21875,
"learning_rate": 3.801901019545463e-05,
"loss": 0.0498046875,
"memory(GiB)": 43.05,
"step": 9230,
"train_speed(iter/s)": 0.621586
},
{
"epoch": 5.985093972780298,
"grad_norm": 8.75,
"learning_rate": 3.796699654960197e-05,
"loss": 0.0251953125,
"memory(GiB)": 43.05,
"step": 9235,
"train_speed(iter/s)": 0.621643
},
{
"epoch": 5.988334413480233,
"grad_norm": 10.6875,
"learning_rate": 3.791499672235799e-05,
"loss": 0.0095703125,
"memory(GiB)": 43.05,
"step": 9240,
"train_speed(iter/s)": 0.621758
},
{
"epoch": 5.991574854180168,
"grad_norm": 13.1875,
"learning_rate": 3.786301077343892e-05,
"loss": 0.0376953125,
"memory(GiB)": 43.05,
"step": 9245,
"train_speed(iter/s)": 0.62188
},
{
"epoch": 5.9948152948801035,
"grad_norm": 2.484375,
"learning_rate": 3.781103876254503e-05,
"loss": 0.030859375,
"memory(GiB)": 43.05,
"step": 9250,
"train_speed(iter/s)": 0.621971
},
{
"epoch": 5.998055735580039,
"grad_norm": 3.515625,
"learning_rate": 3.775908074936053e-05,
"loss": 0.02421875,
"memory(GiB)": 43.05,
"step": 9255,
"train_speed(iter/s)": 0.62198
},
{
"epoch": 6.001296176279974,
"grad_norm": 8.0625,
"learning_rate": 3.770713679355364e-05,
"loss": 0.0357421875,
"memory(GiB)": 43.05,
"step": 9260,
"train_speed(iter/s)": 0.622038
},
{
"epoch": 6.00453661697991,
"grad_norm": 12.0625,
"learning_rate": 3.765520695477642e-05,
"loss": 0.0326171875,
"memory(GiB)": 43.05,
"step": 9265,
"train_speed(iter/s)": 0.622113
},
{
"epoch": 6.007777057679845,
"grad_norm": 0.66796875,
"learning_rate": 3.760329129266464e-05,
"loss": 0.0255859375,
"memory(GiB)": 43.05,
"step": 9270,
"train_speed(iter/s)": 0.622179
},
{
"epoch": 6.01101749837978,
"grad_norm": 9.0625,
"learning_rate": 3.755138986683788e-05,
"loss": 0.0140625,
"memory(GiB)": 43.05,
"step": 9275,
"train_speed(iter/s)": 0.622171
},
{
"epoch": 6.014257939079715,
"grad_norm": 6.625,
"learning_rate": 3.749950273689935e-05,
"loss": 0.028125,
"memory(GiB)": 43.05,
"step": 9280,
"train_speed(iter/s)": 0.622064
},
{
"epoch": 6.01749837977965,
"grad_norm": 12.9375,
"learning_rate": 3.7447629962435816e-05,
"loss": 0.0259765625,
"memory(GiB)": 43.05,
"step": 9285,
"train_speed(iter/s)": 0.622156
},
{
"epoch": 6.020738820479585,
"grad_norm": 3.265625,
"learning_rate": 3.739577160301756e-05,
"loss": 0.0173828125,
"memory(GiB)": 43.05,
"step": 9290,
"train_speed(iter/s)": 0.622216
},
{
"epoch": 6.02397926117952,
"grad_norm": 9.0625,
"learning_rate": 3.734392771819837e-05,
"loss": 0.0181640625,
"memory(GiB)": 43.05,
"step": 9295,
"train_speed(iter/s)": 0.622264
},
{
"epoch": 6.0272197018794555,
"grad_norm": 13.75,
"learning_rate": 3.729209836751531e-05,
"loss": 0.041796875,
"memory(GiB)": 43.05,
"step": 9300,
"train_speed(iter/s)": 0.622218
},
{
"epoch": 6.030460142579391,
"grad_norm": 14.1875,
"learning_rate": 3.7240283610488836e-05,
"loss": 0.0361328125,
"memory(GiB)": 43.05,
"step": 9305,
"train_speed(iter/s)": 0.62234
},
{
"epoch": 6.033700583279326,
"grad_norm": 1.21875,
"learning_rate": 3.718848350662262e-05,
"loss": 0.0189453125,
"memory(GiB)": 43.05,
"step": 9310,
"train_speed(iter/s)": 0.622437
},
{
"epoch": 6.036941023979261,
"grad_norm": 16.875,
"learning_rate": 3.713669811540349e-05,
"loss": 0.02890625,
"memory(GiB)": 43.05,
"step": 9315,
"train_speed(iter/s)": 0.622473
},
{
"epoch": 6.040181464679196,
"grad_norm": 0.73046875,
"learning_rate": 3.70849274963014e-05,
"loss": 0.01875,
"memory(GiB)": 43.05,
"step": 9320,
"train_speed(iter/s)": 0.622534
},
{
"epoch": 6.043421905379131,
"grad_norm": 5.15625,
"learning_rate": 3.7033171708769324e-05,
"loss": 0.02890625,
"memory(GiB)": 43.05,
"step": 9325,
"train_speed(iter/s)": 0.622625
},
{
"epoch": 6.046662346079067,
"grad_norm": 8.25,
"learning_rate": 3.698143081224323e-05,
"loss": 0.0318359375,
"memory(GiB)": 43.05,
"step": 9330,
"train_speed(iter/s)": 0.622703
},
{
"epoch": 6.049902786779002,
"grad_norm": 5.6875,
"learning_rate": 3.692970486614195e-05,
"loss": 0.0279296875,
"memory(GiB)": 43.05,
"step": 9335,
"train_speed(iter/s)": 0.622734
},
{
"epoch": 6.053143227478937,
"grad_norm": 12.25,
"learning_rate": 3.687799392986714e-05,
"loss": 0.0294921875,
"memory(GiB)": 43.05,
"step": 9340,
"train_speed(iter/s)": 0.622792
},
{
"epoch": 6.0563836681788725,
"grad_norm": 4.28125,
"learning_rate": 3.6826298062803296e-05,
"loss": 0.0205078125,
"memory(GiB)": 43.05,
"step": 9345,
"train_speed(iter/s)": 0.622846
},
{
"epoch": 6.059624108878808,
"grad_norm": 9.8125,
"learning_rate": 3.677461732431751e-05,
"loss": 0.040625,
"memory(GiB)": 43.05,
"step": 9350,
"train_speed(iter/s)": 0.62294
},
{
"epoch": 6.062864549578743,
"grad_norm": 9.875,
"learning_rate": 3.672295177375955e-05,
"loss": 0.044921875,
"memory(GiB)": 43.05,
"step": 9355,
"train_speed(iter/s)": 0.622956
},
{
"epoch": 6.066104990278678,
"grad_norm": 0.7890625,
"learning_rate": 3.6671301470461776e-05,
"loss": 0.02578125,
"memory(GiB)": 43.05,
"step": 9360,
"train_speed(iter/s)": 0.623076
},
{
"epoch": 6.069345430978613,
"grad_norm": 4.59375,
"learning_rate": 3.661966647373895e-05,
"loss": 0.0291015625,
"memory(GiB)": 43.05,
"step": 9365,
"train_speed(iter/s)": 0.623143
},
{
"epoch": 6.072585871678548,
"grad_norm": 7.0,
"learning_rate": 3.6568046842888326e-05,
"loss": 0.017578125,
"memory(GiB)": 43.05,
"step": 9370,
"train_speed(iter/s)": 0.623216
},
{
"epoch": 6.075826312378483,
"grad_norm": 12.5,
"learning_rate": 3.6516442637189496e-05,
"loss": 0.02890625,
"memory(GiB)": 43.05,
"step": 9375,
"train_speed(iter/s)": 0.623271
},
{
"epoch": 6.079066753078418,
"grad_norm": 2.21875,
"learning_rate": 3.646485391590433e-05,
"loss": 0.033984375,
"memory(GiB)": 43.05,
"step": 9380,
"train_speed(iter/s)": 0.623314
},
{
"epoch": 6.0823071937783535,
"grad_norm": 2.328125,
"learning_rate": 3.64132807382769e-05,
"loss": 0.0302734375,
"memory(GiB)": 43.05,
"step": 9385,
"train_speed(iter/s)": 0.623427
},
{
"epoch": 6.085547634478289,
"grad_norm": 0.53125,
"learning_rate": 3.6361723163533504e-05,
"loss": 0.0203125,
"memory(GiB)": 43.05,
"step": 9390,
"train_speed(iter/s)": 0.623582
},
{
"epoch": 6.088788075178225,
"grad_norm": 7.0625,
"learning_rate": 3.631018125088239e-05,
"loss": 0.020703125,
"memory(GiB)": 43.05,
"step": 9395,
"train_speed(iter/s)": 0.623702
},
{
"epoch": 6.09202851587816,
"grad_norm": 15.0,
"learning_rate": 3.625865505951394e-05,
"loss": 0.0244140625,
"memory(GiB)": 43.05,
"step": 9400,
"train_speed(iter/s)": 0.62378
},
{
"epoch": 6.095268956578095,
"grad_norm": 11.375,
"learning_rate": 3.620714464860043e-05,
"loss": 0.03671875,
"memory(GiB)": 43.05,
"step": 9405,
"train_speed(iter/s)": 0.623872
},
{
"epoch": 6.09850939727803,
"grad_norm": 2.09375,
"learning_rate": 3.615565007729601e-05,
"loss": 0.025,
"memory(GiB)": 43.05,
"step": 9410,
"train_speed(iter/s)": 0.623985
},
{
"epoch": 6.101749837977965,
"grad_norm": 1.1953125,
"learning_rate": 3.6104171404736655e-05,
"loss": 0.0330078125,
"memory(GiB)": 43.05,
"step": 9415,
"train_speed(iter/s)": 0.624073
},
{
"epoch": 6.1049902786779,
"grad_norm": 11.5625,
"learning_rate": 3.6052708690040096e-05,
"loss": 0.0310546875,
"memory(GiB)": 43.05,
"step": 9420,
"train_speed(iter/s)": 0.624186
},
{
"epoch": 6.108230719377835,
"grad_norm": 9.6875,
"learning_rate": 3.600126199230568e-05,
"loss": 0.03203125,
"memory(GiB)": 43.05,
"step": 9425,
"train_speed(iter/s)": 0.624106
},
{
"epoch": 6.1114711600777705,
"grad_norm": 14.125,
"learning_rate": 3.5949831370614425e-05,
"loss": 0.0388671875,
"memory(GiB)": 43.05,
"step": 9430,
"train_speed(iter/s)": 0.624121
},
{
"epoch": 6.114711600777706,
"grad_norm": 9.25,
"learning_rate": 3.589841688402887e-05,
"loss": 0.034375,
"memory(GiB)": 43.05,
"step": 9435,
"train_speed(iter/s)": 0.62415
},
{
"epoch": 6.117952041477641,
"grad_norm": 0.55078125,
"learning_rate": 3.5847018591593e-05,
"loss": 0.0251953125,
"memory(GiB)": 43.05,
"step": 9440,
"train_speed(iter/s)": 0.62424
},
{
"epoch": 6.121192482177576,
"grad_norm": 8.8125,
"learning_rate": 3.57956365523322e-05,
"loss": 0.02265625,
"memory(GiB)": 43.05,
"step": 9445,
"train_speed(iter/s)": 0.624293
},
{
"epoch": 6.124432922877511,
"grad_norm": 15.4375,
"learning_rate": 3.574427082525326e-05,
"loss": 0.0205078125,
"memory(GiB)": 43.05,
"step": 9450,
"train_speed(iter/s)": 0.62428
},
{
"epoch": 6.127673363577447,
"grad_norm": 1.4765625,
"learning_rate": 3.569292146934413e-05,
"loss": 0.02421875,
"memory(GiB)": 43.05,
"step": 9455,
"train_speed(iter/s)": 0.624332
},
{
"epoch": 6.130913804277382,
"grad_norm": 0.66015625,
"learning_rate": 3.564158854357406e-05,
"loss": 0.0216796875,
"memory(GiB)": 43.05,
"step": 9460,
"train_speed(iter/s)": 0.624278
},
{
"epoch": 6.134154244977317,
"grad_norm": 0.60546875,
"learning_rate": 3.559027210689338e-05,
"loss": 0.033203125,
"memory(GiB)": 43.05,
"step": 9465,
"train_speed(iter/s)": 0.624252
},
{
"epoch": 6.137394685677252,
"grad_norm": 4.59375,
"learning_rate": 3.553897221823347e-05,
"loss": 0.0166015625,
"memory(GiB)": 43.05,
"step": 9470,
"train_speed(iter/s)": 0.624385
},
{
"epoch": 6.1406351263771874,
"grad_norm": 13.75,
"learning_rate": 3.5487688936506735e-05,
"loss": 0.0185546875,
"memory(GiB)": 43.05,
"step": 9475,
"train_speed(iter/s)": 0.624414
},
{
"epoch": 6.143875567077123,
"grad_norm": 12.5,
"learning_rate": 3.543642232060652e-05,
"loss": 0.03515625,
"memory(GiB)": 43.05,
"step": 9480,
"train_speed(iter/s)": 0.624493
},
{
"epoch": 6.147116007777058,
"grad_norm": 1.7421875,
"learning_rate": 3.538517242940699e-05,
"loss": 0.0341796875,
"memory(GiB)": 43.05,
"step": 9485,
"train_speed(iter/s)": 0.624511
},
{
"epoch": 6.150356448476993,
"grad_norm": 1.078125,
"learning_rate": 3.5333939321763135e-05,
"loss": 0.047265625,
"memory(GiB)": 43.05,
"step": 9490,
"train_speed(iter/s)": 0.624664
},
{
"epoch": 6.153596889176928,
"grad_norm": 9.5625,
"learning_rate": 3.528272305651069e-05,
"loss": 0.051953125,
"memory(GiB)": 43.05,
"step": 9495,
"train_speed(iter/s)": 0.624756
},
{
"epoch": 6.156837329876863,
"grad_norm": 1.1015625,
"learning_rate": 3.523152369246596e-05,
"loss": 0.0138671875,
"memory(GiB)": 43.05,
"step": 9500,
"train_speed(iter/s)": 0.624885
},
{
"epoch": 6.160077770576798,
"grad_norm": 0.640625,
"learning_rate": 3.5180341288425945e-05,
"loss": 0.033203125,
"memory(GiB)": 43.05,
"step": 9505,
"train_speed(iter/s)": 0.624944
},
{
"epoch": 6.163318211276733,
"grad_norm": 7.1875,
"learning_rate": 3.512917590316812e-05,
"loss": 0.0203125,
"memory(GiB)": 43.05,
"step": 9510,
"train_speed(iter/s)": 0.625037
},
{
"epoch": 6.166558651976668,
"grad_norm": 0.703125,
"learning_rate": 3.5078027595450405e-05,
"loss": 0.02578125,
"memory(GiB)": 43.05,
"step": 9515,
"train_speed(iter/s)": 0.62499
},
{
"epoch": 6.169799092676604,
"grad_norm": 12.3125,
"learning_rate": 3.502689642401114e-05,
"loss": 0.021875,
"memory(GiB)": 43.05,
"step": 9520,
"train_speed(iter/s)": 0.625114
},
{
"epoch": 6.1730395333765395,
"grad_norm": 8.3125,
"learning_rate": 3.497578244756897e-05,
"loss": 0.0236328125,
"memory(GiB)": 43.05,
"step": 9525,
"train_speed(iter/s)": 0.625097
},
{
"epoch": 6.176279974076475,
"grad_norm": 0.5546875,
"learning_rate": 3.492468572482278e-05,
"loss": 0.04248046875,
"memory(GiB)": 43.05,
"step": 9530,
"train_speed(iter/s)": 0.625122
},
{
"epoch": 6.17952041477641,
"grad_norm": 0.625,
"learning_rate": 3.487360631445165e-05,
"loss": 0.01953125,
"memory(GiB)": 43.05,
"step": 9535,
"train_speed(iter/s)": 0.625178
},
{
"epoch": 6.182760855476345,
"grad_norm": 13.0,
"learning_rate": 3.4822544275114805e-05,
"loss": 0.0453125,
"memory(GiB)": 43.05,
"step": 9540,
"train_speed(iter/s)": 0.625186
},
{
"epoch": 6.18600129617628,
"grad_norm": 12.5,
"learning_rate": 3.477149966545147e-05,
"loss": 0.0228515625,
"memory(GiB)": 43.05,
"step": 9545,
"train_speed(iter/s)": 0.625236
},
{
"epoch": 6.189241736876215,
"grad_norm": 12.75,
"learning_rate": 3.4720472544080905e-05,
"loss": 0.019140625,
"memory(GiB)": 43.05,
"step": 9550,
"train_speed(iter/s)": 0.625287
},
{
"epoch": 6.19248217757615,
"grad_norm": 1.765625,
"learning_rate": 3.4669462969602274e-05,
"loss": 0.0259765625,
"memory(GiB)": 43.05,
"step": 9555,
"train_speed(iter/s)": 0.625367
},
{
"epoch": 6.195722618276085,
"grad_norm": 8.375,
"learning_rate": 3.461847100059454e-05,
"loss": 0.0345703125,
"memory(GiB)": 43.05,
"step": 9560,
"train_speed(iter/s)": 0.625374
},
{
"epoch": 6.1989630589760205,
"grad_norm": 12.8125,
"learning_rate": 3.456749669561651e-05,
"loss": 0.04140625,
"memory(GiB)": 43.05,
"step": 9565,
"train_speed(iter/s)": 0.625358
},
{
"epoch": 6.202203499675956,
"grad_norm": 5.03125,
"learning_rate": 3.4516540113206695e-05,
"loss": 0.02421875,
"memory(GiB)": 43.05,
"step": 9570,
"train_speed(iter/s)": 0.625508
},
{
"epoch": 6.205443940375891,
"grad_norm": 11.75,
"learning_rate": 3.446560131188323e-05,
"loss": 0.0287109375,
"memory(GiB)": 43.05,
"step": 9575,
"train_speed(iter/s)": 0.625637
},
{
"epoch": 6.208684381075827,
"grad_norm": 1.09375,
"learning_rate": 3.4414680350143843e-05,
"loss": 0.0419921875,
"memory(GiB)": 43.05,
"step": 9580,
"train_speed(iter/s)": 0.625628
},
{
"epoch": 6.211924821775762,
"grad_norm": 1.203125,
"learning_rate": 3.4363777286465806e-05,
"loss": 0.033203125,
"memory(GiB)": 43.05,
"step": 9585,
"train_speed(iter/s)": 0.625683
},
{
"epoch": 6.215165262475697,
"grad_norm": 12.0,
"learning_rate": 3.431289217930575e-05,
"loss": 0.0359375,
"memory(GiB)": 43.05,
"step": 9590,
"train_speed(iter/s)": 0.625761
},
{
"epoch": 6.218405703175632,
"grad_norm": 2.515625,
"learning_rate": 3.426202508709976e-05,
"loss": 0.0357421875,
"memory(GiB)": 43.05,
"step": 9595,
"train_speed(iter/s)": 0.625794
},
{
"epoch": 6.221646143875567,
"grad_norm": 11.5625,
"learning_rate": 3.421117606826324e-05,
"loss": 0.036328125,
"memory(GiB)": 43.05,
"step": 9600,
"train_speed(iter/s)": 0.62594
},
{
"epoch": 6.224886584575502,
"grad_norm": 2.65625,
"learning_rate": 3.4160345181190805e-05,
"loss": 0.0298828125,
"memory(GiB)": 43.05,
"step": 9605,
"train_speed(iter/s)": 0.62599
},
{
"epoch": 6.2281270252754375,
"grad_norm": 14.625,
"learning_rate": 3.4109532484256234e-05,
"loss": 0.0287109375,
"memory(GiB)": 43.05,
"step": 9610,
"train_speed(iter/s)": 0.626052
},
{
"epoch": 6.231367465975373,
"grad_norm": 1.3203125,
"learning_rate": 3.405873803581244e-05,
"loss": 0.0369140625,
"memory(GiB)": 43.05,
"step": 9615,
"train_speed(iter/s)": 0.626201
},
{
"epoch": 6.234607906675308,
"grad_norm": 12.375,
"learning_rate": 3.400796189419141e-05,
"loss": 0.0201171875,
"memory(GiB)": 43.05,
"step": 9620,
"train_speed(iter/s)": 0.62628
},
{
"epoch": 6.237848347375243,
"grad_norm": 1.9140625,
"learning_rate": 3.3957204117704035e-05,
"loss": 0.0248046875,
"memory(GiB)": 43.05,
"step": 9625,
"train_speed(iter/s)": 0.626313
},
{
"epoch": 6.241088788075178,
"grad_norm": 0.921875,
"learning_rate": 3.390646476464017e-05,
"loss": 0.0318359375,
"memory(GiB)": 43.05,
"step": 9630,
"train_speed(iter/s)": 0.626415
},
{
"epoch": 6.244329228775113,
"grad_norm": 3.71875,
"learning_rate": 3.385574389326852e-05,
"loss": 0.029296875,
"memory(GiB)": 43.05,
"step": 9635,
"train_speed(iter/s)": 0.626423
},
{
"epoch": 6.247569669475048,
"grad_norm": 0.71875,
"learning_rate": 3.3805041561836505e-05,
"loss": 0.02890625,
"memory(GiB)": 43.05,
"step": 9640,
"train_speed(iter/s)": 0.626412
},
{
"epoch": 6.250810110174983,
"grad_norm": 5.03125,
"learning_rate": 3.375435782857032e-05,
"loss": 0.0197265625,
"memory(GiB)": 43.05,
"step": 9645,
"train_speed(iter/s)": 0.626515
},
{
"epoch": 6.254050550874919,
"grad_norm": 13.0625,
"learning_rate": 3.370369275167476e-05,
"loss": 0.0302734375,
"memory(GiB)": 43.05,
"step": 9650,
"train_speed(iter/s)": 0.626461
},
{
"epoch": 6.2572909915748545,
"grad_norm": 11.625,
"learning_rate": 3.365304638933322e-05,
"loss": 0.0185546875,
"memory(GiB)": 43.05,
"step": 9655,
"train_speed(iter/s)": 0.626524
},
{
"epoch": 6.26053143227479,
"grad_norm": 10.625,
"learning_rate": 3.360241879970759e-05,
"loss": 0.0291015625,
"memory(GiB)": 43.05,
"step": 9660,
"train_speed(iter/s)": 0.626677
},
{
"epoch": 6.263771872974725,
"grad_norm": 6.84375,
"learning_rate": 3.355181004093823e-05,
"loss": 0.032421875,
"memory(GiB)": 43.05,
"step": 9665,
"train_speed(iter/s)": 0.626686
},
{
"epoch": 6.26701231367466,
"grad_norm": 7.875,
"learning_rate": 3.3501220171143785e-05,
"loss": 0.0283203125,
"memory(GiB)": 43.05,
"step": 9670,
"train_speed(iter/s)": 0.626702
},
{
"epoch": 6.270252754374595,
"grad_norm": 9.6875,
"learning_rate": 3.345064924842133e-05,
"loss": 0.0169921875,
"memory(GiB)": 43.05,
"step": 9675,
"train_speed(iter/s)": 0.62685
},
{
"epoch": 6.27349319507453,
"grad_norm": 5.28125,
"learning_rate": 3.340009733084611e-05,
"loss": 0.0291015625,
"memory(GiB)": 43.05,
"step": 9680,
"train_speed(iter/s)": 0.626918
},
{
"epoch": 6.276733635774465,
"grad_norm": 15.5,
"learning_rate": 3.334956447647155e-05,
"loss": 0.0228515625,
"memory(GiB)": 43.05,
"step": 9685,
"train_speed(iter/s)": 0.626933
},
{
"epoch": 6.2799740764744,
"grad_norm": 14.8125,
"learning_rate": 3.32990507433292e-05,
"loss": 0.0298828125,
"memory(GiB)": 43.05,
"step": 9690,
"train_speed(iter/s)": 0.627045
},
{
"epoch": 6.283214517174335,
"grad_norm": 4.15625,
"learning_rate": 3.324855618942865e-05,
"loss": 0.023046875,
"memory(GiB)": 43.05,
"step": 9695,
"train_speed(iter/s)": 0.627127
},
{
"epoch": 6.2864549578742706,
"grad_norm": 1.3046875,
"learning_rate": 3.319808087275743e-05,
"loss": 0.0318359375,
"memory(GiB)": 43.05,
"step": 9700,
"train_speed(iter/s)": 0.627239
},
{
"epoch": 6.289695398574206,
"grad_norm": 0.7109375,
"learning_rate": 3.314762485128102e-05,
"loss": 0.0203125,
"memory(GiB)": 43.05,
"step": 9705,
"train_speed(iter/s)": 0.627349
},
{
"epoch": 6.292935839274142,
"grad_norm": 10.5,
"learning_rate": 3.309718818294275e-05,
"loss": 0.028515625,
"memory(GiB)": 43.05,
"step": 9710,
"train_speed(iter/s)": 0.62746
},
{
"epoch": 6.296176279974077,
"grad_norm": 1.9375,
"learning_rate": 3.304677092566367e-05,
"loss": 0.0375,
"memory(GiB)": 43.05,
"step": 9715,
"train_speed(iter/s)": 0.627494
},
{
"epoch": 6.299416720674012,
"grad_norm": 3.578125,
"learning_rate": 3.299637313734258e-05,
"loss": 0.037109375,
"memory(GiB)": 43.05,
"step": 9720,
"train_speed(iter/s)": 0.627573
},
{
"epoch": 6.302657161373947,
"grad_norm": 10.3125,
"learning_rate": 3.294599487585594e-05,
"loss": 0.0275390625,
"memory(GiB)": 43.05,
"step": 9725,
"train_speed(iter/s)": 0.627566
},
{
"epoch": 6.305897602073882,
"grad_norm": 15.5625,
"learning_rate": 3.289563619905771e-05,
"loss": 0.0291015625,
"memory(GiB)": 43.05,
"step": 9730,
"train_speed(iter/s)": 0.627585
},
{
"epoch": 6.309138042773817,
"grad_norm": 14.6875,
"learning_rate": 3.2845297164779446e-05,
"loss": 0.0181640625,
"memory(GiB)": 43.05,
"step": 9735,
"train_speed(iter/s)": 0.627593
},
{
"epoch": 6.312378483473752,
"grad_norm": 11.5625,
"learning_rate": 3.2794977830830085e-05,
"loss": 0.0267578125,
"memory(GiB)": 43.05,
"step": 9740,
"train_speed(iter/s)": 0.627519
},
{
"epoch": 6.3156189241736875,
"grad_norm": 2.28125,
"learning_rate": 3.2744678254995974e-05,
"loss": 0.0220703125,
"memory(GiB)": 43.05,
"step": 9745,
"train_speed(iter/s)": 0.627617
},
{
"epoch": 6.318859364873623,
"grad_norm": 1.109375,
"learning_rate": 3.269439849504075e-05,
"loss": 0.033984375,
"memory(GiB)": 43.05,
"step": 9750,
"train_speed(iter/s)": 0.627748
},
{
"epoch": 6.322099805573558,
"grad_norm": 9.875,
"learning_rate": 3.264413860870535e-05,
"loss": 0.010546875,
"memory(GiB)": 43.05,
"step": 9755,
"train_speed(iter/s)": 0.627816
},
{
"epoch": 6.325340246273493,
"grad_norm": 12.3125,
"learning_rate": 3.2593898653707775e-05,
"loss": 0.0203125,
"memory(GiB)": 43.05,
"step": 9760,
"train_speed(iter/s)": 0.62786
},
{
"epoch": 6.328580686973428,
"grad_norm": 4.59375,
"learning_rate": 3.254367868774322e-05,
"loss": 0.0087890625,
"memory(GiB)": 43.05,
"step": 9765,
"train_speed(iter/s)": 0.627947
},
{
"epoch": 6.331821127673363,
"grad_norm": 2.828125,
"learning_rate": 3.249347876848395e-05,
"loss": 0.025390625,
"memory(GiB)": 43.05,
"step": 9770,
"train_speed(iter/s)": 0.627957
},
{
"epoch": 6.335061568373299,
"grad_norm": 2.078125,
"learning_rate": 3.244329895357912e-05,
"loss": 0.0302734375,
"memory(GiB)": 43.05,
"step": 9775,
"train_speed(iter/s)": 0.628084
},
{
"epoch": 6.338302009073234,
"grad_norm": 12.4375,
"learning_rate": 3.239313930065484e-05,
"loss": 0.03828125,
"memory(GiB)": 43.05,
"step": 9780,
"train_speed(iter/s)": 0.628116
},
{
"epoch": 6.341542449773169,
"grad_norm": 10.1875,
"learning_rate": 3.234299986731412e-05,
"loss": 0.0392578125,
"memory(GiB)": 43.05,
"step": 9785,
"train_speed(iter/s)": 0.628147
},
{
"epoch": 6.3447828904731045,
"grad_norm": 12.3125,
"learning_rate": 3.2292880711136644e-05,
"loss": 0.0236328125,
"memory(GiB)": 43.05,
"step": 9790,
"train_speed(iter/s)": 0.628274
},
{
"epoch": 6.34802333117304,
"grad_norm": 8.625,
"learning_rate": 3.22427818896789e-05,
"loss": 0.0396484375,
"memory(GiB)": 43.05,
"step": 9795,
"train_speed(iter/s)": 0.62824
},
{
"epoch": 6.351263771872975,
"grad_norm": 3.59375,
"learning_rate": 3.2192703460473994e-05,
"loss": 0.0259765625,
"memory(GiB)": 43.05,
"step": 9800,
"train_speed(iter/s)": 0.628324
},
{
"epoch": 6.35450421257291,
"grad_norm": 1.359375,
"learning_rate": 3.214264548103158e-05,
"loss": 0.0265625,
"memory(GiB)": 43.05,
"step": 9805,
"train_speed(iter/s)": 0.628438
},
{
"epoch": 6.357744653272845,
"grad_norm": 15.5,
"learning_rate": 3.2092608008837874e-05,
"loss": 0.041015625,
"memory(GiB)": 43.05,
"step": 9810,
"train_speed(iter/s)": 0.628488
},
{
"epoch": 6.36098509397278,
"grad_norm": 12.25,
"learning_rate": 3.204259110135553e-05,
"loss": 0.0349609375,
"memory(GiB)": 43.05,
"step": 9815,
"train_speed(iter/s)": 0.628557
},
{
"epoch": 6.364225534672715,
"grad_norm": 3.28125,
"learning_rate": 3.1992594816023565e-05,
"loss": 0.025390625,
"memory(GiB)": 43.05,
"step": 9820,
"train_speed(iter/s)": 0.628668
},
{
"epoch": 6.36746597537265,
"grad_norm": 10.4375,
"learning_rate": 3.194261921025734e-05,
"loss": 0.0185546875,
"memory(GiB)": 43.05,
"step": 9825,
"train_speed(iter/s)": 0.628799
},
{
"epoch": 6.3707064160725855,
"grad_norm": 11.6875,
"learning_rate": 3.189266434144847e-05,
"loss": 0.015625,
"memory(GiB)": 43.05,
"step": 9830,
"train_speed(iter/s)": 0.628841
},
{
"epoch": 6.3739468567725215,
"grad_norm": 7.84375,
"learning_rate": 3.18427302669647e-05,
"loss": 0.01875,
"memory(GiB)": 43.05,
"step": 9835,
"train_speed(iter/s)": 0.62893
},
{
"epoch": 6.377187297472457,
"grad_norm": 3.65625,
"learning_rate": 3.179281704414998e-05,
"loss": 0.01875,
"memory(GiB)": 43.05,
"step": 9840,
"train_speed(iter/s)": 0.629004
},
{
"epoch": 6.380427738172392,
"grad_norm": 4.78125,
"learning_rate": 3.174292473032426e-05,
"loss": 0.0193359375,
"memory(GiB)": 43.05,
"step": 9845,
"train_speed(iter/s)": 0.629108
},
{
"epoch": 6.383668178872327,
"grad_norm": 6.25,
"learning_rate": 3.1693053382783474e-05,
"loss": 0.0271484375,
"memory(GiB)": 43.05,
"step": 9850,
"train_speed(iter/s)": 0.629189
},
{
"epoch": 6.386908619572262,
"grad_norm": 9.0,
"learning_rate": 3.16432030587995e-05,
"loss": 0.0326171875,
"memory(GiB)": 43.05,
"step": 9855,
"train_speed(iter/s)": 0.629275
},
{
"epoch": 6.390149060272197,
"grad_norm": 2.703125,
"learning_rate": 3.1593373815620094e-05,
"loss": 0.0330078125,
"memory(GiB)": 43.05,
"step": 9860,
"train_speed(iter/s)": 0.629324
},
{
"epoch": 6.393389500972132,
"grad_norm": 11.9375,
"learning_rate": 3.1543565710468744e-05,
"loss": 0.02265625,
"memory(GiB)": 43.05,
"step": 9865,
"train_speed(iter/s)": 0.629472
},
{
"epoch": 6.396629941672067,
"grad_norm": 1.828125,
"learning_rate": 3.1493778800544696e-05,
"loss": 0.0205078125,
"memory(GiB)": 43.05,
"step": 9870,
"train_speed(iter/s)": 0.629602
},
{
"epoch": 6.3998703823720025,
"grad_norm": 1.46875,
"learning_rate": 3.144401314302287e-05,
"loss": 0.052734375,
"memory(GiB)": 43.05,
"step": 9875,
"train_speed(iter/s)": 0.62967
},
{
"epoch": 6.403110823071938,
"grad_norm": 1.71875,
"learning_rate": 3.139426879505377e-05,
"loss": 0.019921875,
"memory(GiB)": 43.05,
"step": 9880,
"train_speed(iter/s)": 0.629779
},
{
"epoch": 6.406351263771873,
"grad_norm": 11.3125,
"learning_rate": 3.13445458137634e-05,
"loss": 0.0345703125,
"memory(GiB)": 43.05,
"step": 9885,
"train_speed(iter/s)": 0.629868
},
{
"epoch": 6.409591704471808,
"grad_norm": 1.515625,
"learning_rate": 3.129484425625326e-05,
"loss": 0.0259765625,
"memory(GiB)": 43.05,
"step": 9890,
"train_speed(iter/s)": 0.629956
},
{
"epoch": 6.412832145171743,
"grad_norm": 2.1875,
"learning_rate": 3.1245164179600264e-05,
"loss": 0.03984375,
"memory(GiB)": 43.05,
"step": 9895,
"train_speed(iter/s)": 0.630071
},
{
"epoch": 6.416072585871679,
"grad_norm": 5.90625,
"learning_rate": 3.119550564085658e-05,
"loss": 0.020703125,
"memory(GiB)": 43.05,
"step": 9900,
"train_speed(iter/s)": 0.630122
},
{
"epoch": 6.419313026571614,
"grad_norm": 8.0625,
"learning_rate": 3.114586869704972e-05,
"loss": 0.0275390625,
"memory(GiB)": 43.05,
"step": 9905,
"train_speed(iter/s)": 0.630186
},
{
"epoch": 6.422553467271549,
"grad_norm": 3.3125,
"learning_rate": 3.109625340518237e-05,
"loss": 0.0400390625,
"memory(GiB)": 43.05,
"step": 9910,
"train_speed(iter/s)": 0.630236
},
{
"epoch": 6.425793907971484,
"grad_norm": 4.21875,
"learning_rate": 3.104665982223234e-05,
"loss": 0.0357421875,
"memory(GiB)": 43.05,
"step": 9915,
"train_speed(iter/s)": 0.630251
},
{
"epoch": 6.429034348671419,
"grad_norm": 8.4375,
"learning_rate": 3.0997088005152524e-05,
"loss": 0.0353515625,
"memory(GiB)": 43.05,
"step": 9920,
"train_speed(iter/s)": 0.63032
},
{
"epoch": 6.4322747893713546,
"grad_norm": 12.5625,
"learning_rate": 3.094753801087083e-05,
"loss": 0.025,
"memory(GiB)": 43.05,
"step": 9925,
"train_speed(iter/s)": 0.630287
},
{
"epoch": 6.43551523007129,
"grad_norm": 2.4375,
"learning_rate": 3.0898009896290074e-05,
"loss": 0.04140625,
"memory(GiB)": 43.05,
"step": 9930,
"train_speed(iter/s)": 0.63037
},
{
"epoch": 6.438755670771225,
"grad_norm": 2.828125,
"learning_rate": 3.084850371828796e-05,
"loss": 0.02421875,
"memory(GiB)": 43.05,
"step": 9935,
"train_speed(iter/s)": 0.630399
},
{
"epoch": 6.44199611147116,
"grad_norm": 13.75,
"learning_rate": 3.0799019533717025e-05,
"loss": 0.0150390625,
"memory(GiB)": 43.05,
"step": 9940,
"train_speed(iter/s)": 0.630535
},
{
"epoch": 6.445236552171095,
"grad_norm": 10.5625,
"learning_rate": 3.074955739940449e-05,
"loss": 0.030078125,
"memory(GiB)": 43.05,
"step": 9945,
"train_speed(iter/s)": 0.630644
},
{
"epoch": 6.44847699287103,
"grad_norm": 12.0625,
"learning_rate": 3.0700117372152315e-05,
"loss": 0.0470703125,
"memory(GiB)": 43.05,
"step": 9950,
"train_speed(iter/s)": 0.630719
},
{
"epoch": 6.451717433570965,
"grad_norm": 6.53125,
"learning_rate": 3.0650699508737046e-05,
"loss": 0.0326171875,
"memory(GiB)": 43.05,
"step": 9955,
"train_speed(iter/s)": 0.630822
},
{
"epoch": 6.454957874270901,
"grad_norm": 6.34375,
"learning_rate": 3.060130386590977e-05,
"loss": 0.0185546875,
"memory(GiB)": 43.05,
"step": 9960,
"train_speed(iter/s)": 0.630799
},
{
"epoch": 6.458198314970836,
"grad_norm": 16.125,
"learning_rate": 3.055193050039607e-05,
"loss": 0.022265625,
"memory(GiB)": 43.05,
"step": 9965,
"train_speed(iter/s)": 0.630819
},
{
"epoch": 6.4614387556707715,
"grad_norm": 1.1953125,
"learning_rate": 3.0502579468895943e-05,
"loss": 0.026171875,
"memory(GiB)": 43.05,
"step": 9970,
"train_speed(iter/s)": 0.630866
},
{
"epoch": 6.464679196370707,
"grad_norm": 1.234375,
"learning_rate": 3.0453250828083718e-05,
"loss": 0.034765625,
"memory(GiB)": 43.05,
"step": 9975,
"train_speed(iter/s)": 0.63096
},
{
"epoch": 6.467919637070642,
"grad_norm": 4.40625,
"learning_rate": 3.0403944634608034e-05,
"loss": 0.0302734375,
"memory(GiB)": 43.05,
"step": 9980,
"train_speed(iter/s)": 0.631083
},
{
"epoch": 6.471160077770577,
"grad_norm": 6.65625,
"learning_rate": 3.0354660945091763e-05,
"loss": 0.02265625,
"memory(GiB)": 43.05,
"step": 9985,
"train_speed(iter/s)": 0.631167
},
{
"epoch": 6.474400518470512,
"grad_norm": 1.3125,
"learning_rate": 3.0305399816131884e-05,
"loss": 0.0193359375,
"memory(GiB)": 43.05,
"step": 9990,
"train_speed(iter/s)": 0.631197
},
{
"epoch": 6.477640959170447,
"grad_norm": 0.9375,
"learning_rate": 3.0256161304299514e-05,
"loss": 0.0197265625,
"memory(GiB)": 43.05,
"step": 9995,
"train_speed(iter/s)": 0.63122
},
{
"epoch": 6.480881399870382,
"grad_norm": 2.015625,
"learning_rate": 3.0206945466139812e-05,
"loss": 0.033984375,
"memory(GiB)": 43.05,
"step": 10000,
"train_speed(iter/s)": 0.631328
}
],
"logging_steps": 5,
"max_steps": 15430,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": true,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.478228814502298e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}