{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 15.0,
"eval_steps": 1,
"global_step": 48810,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03073140749846343,
"grad_norm": 5.49942684173584,
"learning_rate": 0.0001,
"loss": 11.0245,
"step": 100,
"train_loss_gtc": 4.04046875,
"train_loss_gtm": 0.6575,
"train_loss_lm": 6.331875
},
{
"epoch": 0.06146281499692686,
"grad_norm": 6.315459251403809,
"learning_rate": 9.999896007507038e-05,
"loss": 6.9106,
"step": 200,
"train_loss_gtc": 2.85453125,
"train_loss_gtm": 0.6456640625,
"train_loss_lm": 3.41375
},
{
"epoch": 0.09219422249539029,
"grad_norm": 4.463039398193359,
"learning_rate": 9.999584034353926e-05,
"loss": 5.6825,
"step": 300,
"train_loss_gtc": 2.053828125,
"train_loss_gtm": 0.625390625,
"train_loss_lm": 3.02734375
},
{
"epoch": 0.12292562999385372,
"grad_norm": 8.906160354614258,
"learning_rate": 9.999064093517811e-05,
"loss": 4.8225,
"step": 400,
"train_loss_gtc": 1.5528125,
"train_loss_gtm": 0.550078125,
"train_loss_lm": 2.7178125
},
{
"epoch": 0.15365703749231716,
"grad_norm": 5.962785720825195,
"learning_rate": 9.99833620662667e-05,
"loss": 4.3188,
"step": 500,
"train_loss_gtc": 1.3146875,
"train_loss_gtm": 0.4680859375,
"train_loss_lm": 2.55734375
},
{
"epoch": 0.18438844499078058,
"grad_norm": 15.813605308532715,
"learning_rate": 9.997400403958414e-05,
"loss": 3.9968,
"step": 600,
"train_loss_gtc": 1.151953125,
"train_loss_gtm": 0.404775390625,
"train_loss_lm": 2.43796875
},
{
"epoch": 0.215119852489244,
"grad_norm": 9.086814880371094,
"learning_rate": 9.99625672443962e-05,
"loss": 3.8064,
"step": 700,
"train_loss_gtc": 1.0492578125,
"train_loss_gtm": 0.37134765625,
"train_loss_lm": 2.38078125
},
{
"epoch": 0.24585125998770743,
"grad_norm": 10.451958656311035,
"learning_rate": 9.994905215643926e-05,
"loss": 3.6012,
"step": 800,
"train_loss_gtc": 0.9308203125,
"train_loss_gtm": 0.342978515625,
"train_loss_lm": 2.32328125
},
{
"epoch": 0.2765826674861709,
"grad_norm": 2.9532716274261475,
"learning_rate": 9.993345933790036e-05,
"loss": 3.4027,
"step": 900,
"train_loss_gtc": 0.80484375,
"train_loss_gtm": 0.313017578125,
"train_loss_lm": 2.28640625
},
{
"epoch": 0.3073140749846343,
"grad_norm": 5.513271331787109,
"learning_rate": 9.991578943739396e-05,
"loss": 3.2882,
"step": 1000,
"train_loss_gtc": 0.748984375,
"train_loss_gtm": 0.29857421875,
"train_loss_lm": 2.23609375
},
{
"epoch": 0.33804548248309774,
"grad_norm": 9.901054382324219,
"learning_rate": 9.989604318993484e-05,
"loss": 3.1962,
"step": 1100,
"train_loss_gtc": 0.694609375,
"train_loss_gtm": 0.293974609375,
"train_loss_lm": 2.22203125
},
{
"epoch": 0.36877688998156116,
"grad_norm": 4.640697956085205,
"learning_rate": 9.987422141690761e-05,
"loss": 3.0563,
"step": 1200,
"train_loss_gtc": 0.62521484375,
"train_loss_gtm": 0.266708984375,
"train_loss_lm": 2.163359375
},
{
"epoch": 0.3995082974800246,
"grad_norm": 6.046248435974121,
"learning_rate": 9.98503250260325e-05,
"loss": 2.9871,
"step": 1300,
"train_loss_gtc": 0.60119140625,
"train_loss_gtm": 0.250830078125,
"train_loss_lm": 2.1440625
},
{
"epoch": 0.430239704978488,
"grad_norm": 6.06462287902832,
"learning_rate": 9.982435501132761e-05,
"loss": 2.918,
"step": 1400,
"train_loss_gtc": 0.54515625,
"train_loss_gtm": 0.228916015625,
"train_loss_lm": 2.139453125
},
{
"epoch": 0.46097111247695144,
"grad_norm": 4.272490978240967,
"learning_rate": 9.979631245306756e-05,
"loss": 2.8624,
"step": 1500,
"train_loss_gtc": 0.5327734375,
"train_loss_gtm": 0.236171875,
"train_loss_lm": 2.10140625
},
{
"epoch": 0.49170251997541486,
"grad_norm": 2.8239645957946777,
"learning_rate": 9.976619851773859e-05,
"loss": 2.7952,
"step": 1600,
"train_loss_gtc": 0.49181640625,
"train_loss_gtm": 0.216806640625,
"train_loss_lm": 2.087890625
},
{
"epoch": 0.5224339274738783,
"grad_norm": 4.718524932861328,
"learning_rate": 9.973401445798997e-05,
"loss": 2.744,
"step": 1700,
"train_loss_gtc": 0.47638671875,
"train_loss_gtm": 0.2067578125,
"train_loss_lm": 2.06015625
},
{
"epoch": 0.5531653349723418,
"grad_norm": 7.191675662994385,
"learning_rate": 9.969976161258194e-05,
"loss": 2.6875,
"step": 1800,
"train_loss_gtc": 0.4446484375,
"train_loss_gtm": 0.1912353515625,
"train_loss_lm": 2.05390625
},
{
"epoch": 0.5838967424708051,
"grad_norm": 4.1135640144348145,
"learning_rate": 9.966344140633001e-05,
"loss": 2.6366,
"step": 1900,
"train_loss_gtc": 0.42078125,
"train_loss_gtm": 0.187529296875,
"train_loss_lm": 2.03140625
},
{
"epoch": 0.6146281499692686,
"grad_norm": 4.423370361328125,
"learning_rate": 9.962505535004571e-05,
"loss": 2.6245,
"step": 2000,
"train_loss_gtc": 0.40998046875,
"train_loss_gtm": 0.1915087890625,
"train_loss_lm": 2.026328125
},
{
"epoch": 0.645359557467732,
"grad_norm": 3.297764778137207,
"learning_rate": 9.958460504047372e-05,
"loss": 2.5585,
"step": 2100,
"train_loss_gtc": 0.38455078125,
"train_loss_gtm": 0.176318359375,
"train_loss_lm": 1.99703125
},
{
"epoch": 0.6760909649661955,
"grad_norm": 8.061768531799316,
"learning_rate": 9.954209216022543e-05,
"loss": 2.5188,
"step": 2200,
"train_loss_gtc": 0.36263671875,
"train_loss_gtm": 0.1635595703125,
"train_loss_lm": 1.987578125
},
{
"epoch": 0.7068223724646588,
"grad_norm": 4.367369174957275,
"learning_rate": 9.949751847770904e-05,
"loss": 2.5078,
"step": 2300,
"train_loss_gtc": 0.3640234375,
"train_loss_gtm": 0.174541015625,
"train_loss_lm": 1.973359375
},
{
"epoch": 0.7375537799631223,
"grad_norm": 1.8874632120132446,
"learning_rate": 9.945088584705584e-05,
"loss": 2.4485,
"step": 2400,
"train_loss_gtc": 0.33607421875,
"train_loss_gtm": 0.14620361328125,
"train_loss_lm": 1.95671875
},
{
"epoch": 0.7682851874615857,
"grad_norm": 2.437286376953125,
"learning_rate": 9.940219620804327e-05,
"loss": 2.4232,
"step": 2500,
"train_loss_gtc": 0.3200390625,
"train_loss_gtm": 0.149814453125,
"train_loss_lm": 1.95375
},
{
"epoch": 0.7990165949600492,
"grad_norm": 2.414090871810913,
"learning_rate": 9.935145158601411e-05,
"loss": 2.4102,
"step": 2600,
"train_loss_gtc": 0.317177734375,
"train_loss_gtm": 0.153583984375,
"train_loss_lm": 1.937109375
},
{
"epoch": 0.8297480024585125,
"grad_norm": 3.1979939937591553,
"learning_rate": 9.929865409179224e-05,
"loss": 2.3885,
"step": 2700,
"train_loss_gtc": 0.30353515625,
"train_loss_gtm": 0.135172119140625,
"train_loss_lm": 1.940078125
},
{
"epoch": 0.860479409956976,
"grad_norm": 5.63021993637085,
"learning_rate": 9.92438059215949e-05,
"loss": 2.3431,
"step": 2800,
"train_loss_gtc": 0.286796875,
"train_loss_gtm": 0.1349951171875,
"train_loss_lm": 1.92109375
},
{
"epoch": 0.8912108174554395,
"grad_norm": 2.3982977867126465,
"learning_rate": 9.918690935694126e-05,
"loss": 2.3297,
"step": 2900,
"train_loss_gtc": 0.28318359375,
"train_loss_gtm": 0.134306640625,
"train_loss_lm": 1.91375
},
{
"epoch": 0.9219422249539029,
"grad_norm": 3.208214282989502,
"learning_rate": 9.912796676455757e-05,
"loss": 2.3016,
"step": 3000,
"train_loss_gtc": 0.272578125,
"train_loss_gtm": 0.124122314453125,
"train_loss_lm": 1.913046875
},
{
"epoch": 0.9526736324523664,
"grad_norm": 2.7042226791381836,
"learning_rate": 9.906698059627866e-05,
"loss": 2.2748,
"step": 3100,
"train_loss_gtc": 0.25408203125,
"train_loss_gtm": 0.121632080078125,
"train_loss_lm": 1.896484375
},
{
"epoch": 0.9834050399508297,
"grad_norm": 3.007841110229492,
"learning_rate": 9.900395338894601e-05,
"loss": 2.2726,
"step": 3200,
"train_loss_gtc": 0.254970703125,
"train_loss_gtm": 0.1286865234375,
"train_loss_lm": 1.881171875
},
{
"epoch": 1.0,
"eval_loss": 2.549999952316284,
"eval_runtime": 4.223,
"eval_samples_per_second": 235.615,
"eval_steps_per_second": 2.605,
"step": 3254,
"train_loss_gtc": 0.26175491898148145,
"train_loss_gtm": 0.12819191261574073,
"train_loss_lm": 1.8826678240740742,
"val_loss_gtc": 0.39130859375,
"val_loss_gtm": 0.2362060546875,
"val_loss_lm": 1.94765625
},
{
"epoch": 1.014136447449293,
"grad_norm": 3.212726354598999,
"learning_rate": 9.89388877643022e-05,
"loss": 2.2582,
"step": 3300,
"train_loss_gtc": 0.24630604619565216,
"train_loss_gtm": 0.11464259935461957,
"train_loss_lm": 1.8685461956521738
},
{
"epoch": 1.0448678549477566,
"grad_norm": 2.5640087127685547,
"learning_rate": 9.887178642888182e-05,
"loss": 2.2174,
"step": 3400,
"train_loss_gtc": 0.23337890625,
"train_loss_gtm": 0.11256591796875,
"train_loss_lm": 1.86703125
},
{
"epoch": 1.07559926244622,
"grad_norm": 1.9021731615066528,
"learning_rate": 9.880265217389893e-05,
"loss": 2.2195,
"step": 3500,
"train_loss_gtc": 0.234921875,
"train_loss_gtm": 0.125850830078125,
"train_loss_lm": 1.86671875
},
{
"epoch": 1.1063306699446835,
"grad_norm": 2.093270778656006,
"learning_rate": 9.873148787513093e-05,
"loss": 2.2154,
"step": 3600,
"train_loss_gtc": 0.241884765625,
"train_loss_gtm": 0.12220458984375,
"train_loss_lm": 1.8584375
},
{
"epoch": 1.1370620774431468,
"grad_norm": 4.833855152130127,
"learning_rate": 9.865829649279898e-05,
"loss": 2.1983,
"step": 3700,
"train_loss_gtc": 0.2290625,
"train_loss_gtm": 0.108033447265625,
"train_loss_lm": 1.843671875
},
{
"epoch": 1.1677934849416103,
"grad_norm": 4.2011518478393555,
"learning_rate": 9.858308107144479e-05,
"loss": 2.1765,
"step": 3800,
"train_loss_gtc": 0.223466796875,
"train_loss_gtm": 0.109500732421875,
"train_loss_lm": 1.837578125
},
{
"epoch": 1.1985248924400738,
"grad_norm": 6.385040760040283,
"learning_rate": 9.850584473980405e-05,
"loss": 2.1558,
"step": 3900,
"train_loss_gtc": 0.21083984375,
"train_loss_gtm": 0.102752685546875,
"train_loss_lm": 1.843203125
},
{
"epoch": 1.2292562999385372,
"grad_norm": 2.9829607009887695,
"learning_rate": 9.84265907106762e-05,
"loss": 2.1519,
"step": 4000,
"train_loss_gtc": 0.208603515625,
"train_loss_gtm": 0.11494384765625,
"train_loss_lm": 1.833828125
},
{
"epoch": 1.2599877074370007,
"grad_norm": 1.9834630489349365,
"learning_rate": 9.834532228079088e-05,
"loss": 2.1325,
"step": 4100,
"train_loss_gtc": 0.201494140625,
"train_loss_gtm": 0.102637939453125,
"train_loss_lm": 1.829453125
},
{
"epoch": 1.290719114935464,
"grad_norm": 2.685356378555298,
"learning_rate": 9.826204283067073e-05,
"loss": 2.1218,
"step": 4200,
"train_loss_gtc": 0.19677734375,
"train_loss_gtm": 0.100631103515625,
"train_loss_lm": 1.82484375
},
{
"epoch": 1.3214505224339275,
"grad_norm": 1.5945948362350464,
"learning_rate": 9.817675582449082e-05,
"loss": 2.1261,
"step": 4300,
"train_loss_gtc": 0.19837890625,
"train_loss_gtm": 0.11297119140625,
"train_loss_lm": 1.828828125
},
{
"epoch": 1.352181929932391,
"grad_norm": 2.5570931434631348,
"learning_rate": 9.80894648099345e-05,
"loss": 2.1039,
"step": 4400,
"train_loss_gtc": 0.18681640625,
"train_loss_gtm": 0.09926513671875,
"train_loss_lm": 1.81125
},
{
"epoch": 1.3829133374308542,
"grad_norm": 1.3465452194213867,
"learning_rate": 9.800017341804584e-05,
"loss": 2.0879,
"step": 4500,
"train_loss_gtc": 0.1895263671875,
"train_loss_gtm": 0.101627197265625,
"train_loss_lm": 1.799453125
},
{
"epoch": 1.4136447449293177,
"grad_norm": 1.4922404289245605,
"learning_rate": 9.790888536307865e-05,
"loss": 2.0743,
"step": 4600,
"train_loss_gtc": 0.1802734375,
"train_loss_gtm": 0.0869610595703125,
"train_loss_lm": 1.797890625
},
{
"epoch": 1.4443761524277812,
"grad_norm": 3.5552456378936768,
"learning_rate": 9.781560444234187e-05,
"loss": 2.077,
"step": 4700,
"train_loss_gtc": 0.1821533203125,
"train_loss_gtm": 0.10408935546875,
"train_loss_lm": 1.79203125
},
{
"epoch": 1.4751075599262446,
"grad_norm": 4.11555290222168,
"learning_rate": 9.77203345360417e-05,
"loss": 2.0674,
"step": 4800,
"train_loss_gtc": 0.178369140625,
"train_loss_gtm": 0.09654052734375,
"train_loss_lm": 1.797109375
},
{
"epoch": 1.5058389674247081,
"grad_norm": 1.795981764793396,
"learning_rate": 9.762307960712018e-05,
"loss": 2.0636,
"step": 4900,
"train_loss_gtc": 0.179599609375,
"train_loss_gtm": 0.10112548828125,
"train_loss_lm": 1.788359375
},
{
"epoch": 1.5365703749231714,
"grad_norm": 1.9280930757522583,
"learning_rate": 9.75238437010903e-05,
"loss": 2.0431,
"step": 5000,
"train_loss_gtc": 0.170859375,
"train_loss_gtm": 0.0848968505859375,
"train_loss_lm": 1.785546875
},
{
"epoch": 1.5673017824216349,
"grad_norm": 1.1412756443023682,
"learning_rate": 9.742263094586775e-05,
"loss": 2.0316,
"step": 5100,
"train_loss_gtc": 0.171064453125,
"train_loss_gtm": 0.0872491455078125,
"train_loss_lm": 1.7815625
},
{
"epoch": 1.5980331899200984,
"grad_norm": 1.9907689094543457,
"learning_rate": 9.731944555159926e-05,
"loss": 2.0229,
"step": 5200,
"train_loss_gtc": 0.1637890625,
"train_loss_gtm": 0.080010986328125,
"train_loss_lm": 1.778359375
},
{
"epoch": 1.6287645974185616,
"grad_norm": 1.9761877059936523,
"learning_rate": 9.721429181048736e-05,
"loss": 2.0141,
"step": 5300,
"train_loss_gtc": 0.163154296875,
"train_loss_gtm": 0.08893798828125,
"train_loss_lm": 1.76640625
},
{
"epoch": 1.6594960049170253,
"grad_norm": 2.6358389854431152,
"learning_rate": 9.710717409661191e-05,
"loss": 2.0137,
"step": 5400,
"train_loss_gtc": 0.1599267578125,
"train_loss_gtm": 0.08393280029296875,
"train_loss_lm": 1.7646875
},
{
"epoch": 1.6902274124154886,
"grad_norm": 1.707661509513855,
"learning_rate": 9.699809686574819e-05,
"loss": 2.0079,
"step": 5500,
"train_loss_gtc": 0.157626953125,
"train_loss_gtm": 0.0876666259765625,
"train_loss_lm": 1.757421875
},
{
"epoch": 1.720958819913952,
"grad_norm": 2.0161757469177246,
"learning_rate": 9.688706465518145e-05,
"loss": 2.0002,
"step": 5600,
"train_loss_gtc": 0.1531884765625,
"train_loss_gtm": 0.0824560546875,
"train_loss_lm": 1.757421875
},
{
"epoch": 1.7516902274124155,
"grad_norm": 1.4757572412490845,
"learning_rate": 9.677408208351822e-05,
"loss": 1.9837,
"step": 5700,
"train_loss_gtc": 0.152705078125,
"train_loss_gtm": 0.080008544921875,
"train_loss_lm": 1.756640625
},
{
"epoch": 1.7824216349108788,
"grad_norm": 1.820297360420227,
"learning_rate": 9.665915385049424e-05,
"loss": 1.9852,
"step": 5800,
"train_loss_gtc": 0.1525732421875,
"train_loss_gtm": 0.0837579345703125,
"train_loss_lm": 1.7565625
},
{
"epoch": 1.8131530424093425,
"grad_norm": 2.550199031829834,
"learning_rate": 9.65422847367789e-05,
"loss": 1.9628,
"step": 5900,
"train_loss_gtc": 0.1431494140625,
"train_loss_gtm": 0.070325927734375,
"train_loss_lm": 1.748359375
},
{
"epoch": 1.8438844499078058,
"grad_norm": 3.284813642501831,
"learning_rate": 9.642347960377638e-05,
"loss": 1.9785,
"step": 6000,
"train_loss_gtc": 0.149755859375,
"train_loss_gtm": 0.085838623046875,
"train_loss_lm": 1.74515625
},
{
"epoch": 1.8746158574062692,
"grad_norm": 2.071235418319702,
"learning_rate": 9.630274339342344e-05,
"loss": 1.9699,
"step": 6100,
"train_loss_gtc": 0.1496533203125,
"train_loss_gtm": 0.08025115966796875,
"train_loss_lm": 1.75
},
{
"epoch": 1.9053472649047327,
"grad_norm": 1.5470776557922363,
"learning_rate": 9.618008112798393e-05,
"loss": 1.9727,
"step": 6200,
"train_loss_gtc": 0.1493896484375,
"train_loss_gtm": 0.080283203125,
"train_loss_lm": 1.739921875
},
{
"epoch": 1.936078672403196,
"grad_norm": 6.53104305267334,
"learning_rate": 9.605549790983973e-05,
"loss": 1.9612,
"step": 6300,
"train_loss_gtc": 0.1470556640625,
"train_loss_gtm": 0.0838421630859375,
"train_loss_lm": 1.736328125
},
{
"epoch": 1.9668100799016595,
"grad_norm": 1.8970906734466553,
"learning_rate": 9.592899892127863e-05,
"loss": 1.9457,
"step": 6400,
"train_loss_gtc": 0.1391943359375,
"train_loss_gtm": 0.0737689208984375,
"train_loss_lm": 1.730859375
},
{
"epoch": 1.997541487400123,
"grad_norm": 1.675215721130371,
"learning_rate": 9.580058942427867e-05,
"loss": 1.9364,
"step": 6500,
"train_loss_gtc": 0.1374560546875,
"train_loss_gtm": 0.0687762451171875,
"train_loss_lm": 1.72921875
},
{
"epoch": 2.0,
"eval_loss": 2.19921875,
"eval_runtime": 3.8844,
"eval_samples_per_second": 256.154,
"eval_steps_per_second": 2.832,
"step": 6508,
"train_loss_gtc": 0.137939453125,
"train_loss_gtm": 0.0904998779296875,
"train_loss_lm": 1.70703125,
"val_loss_gtc": 0.2357421875,
"val_loss_gtm": 0.210223388671875,
"val_loss_lm": 1.7640625
},
{
"epoch": 2.028272894898586,
"grad_norm": 1.4447669982910156,
"learning_rate": 9.567027476028937e-05,
"loss": 1.9201,
"step": 6600,
"train_loss_gtc": 0.13188901154891305,
"train_loss_gtm": 0.06937939187754756,
"train_loss_lm": 1.7201086956521738
},
{
"epoch": 2.05900430239705,
"grad_norm": 2.3873608112335205,
"learning_rate": 9.553806035000945e-05,
"loss": 1.9203,
"step": 6700,
"train_loss_gtc": 0.12853515625,
"train_loss_gtm": 0.0747589111328125,
"train_loss_lm": 1.7165625
},
{
"epoch": 2.089735709895513,
"grad_norm": 3.420974016189575,
"learning_rate": 9.540395169316132e-05,
"loss": 1.9248,
"step": 6800,
"train_loss_gtc": 0.1289501953125,
"train_loss_gtm": 0.0793853759765625,
"train_loss_lm": 1.7153125
},
{
"epoch": 2.120467117393977,
"grad_norm": 2.3101561069488525,
"learning_rate": 9.526795436826242e-05,
"loss": 1.9149,
"step": 6900,
"train_loss_gtc": 0.128876953125,
"train_loss_gtm": 0.071492919921875,
"train_loss_lm": 1.713671875
},
{
"epoch": 2.15119852489244,
"grad_norm": 1.0823053121566772,
"learning_rate": 9.513007403239311e-05,
"loss": 1.8968,
"step": 7000,
"train_loss_gtc": 0.1226025390625,
"train_loss_gtm": 0.0626873779296875,
"train_loss_lm": 1.707890625
},
{
"epoch": 2.1819299323909034,
"grad_norm": 2.2665770053863525,
"learning_rate": 9.49903164209613e-05,
"loss": 1.9036,
"step": 7100,
"train_loss_gtc": 0.128154296875,
"train_loss_gtm": 0.06544097900390625,
"train_loss_lm": 1.702578125
},
{
"epoch": 2.212661339889367,
"grad_norm": 0.9536680579185486,
"learning_rate": 9.484868734746399e-05,
"loss": 1.8943,
"step": 7200,
"train_loss_gtc": 0.119287109375,
"train_loss_gtm": 0.0679217529296875,
"train_loss_lm": 1.700703125
},
{
"epoch": 2.2433927473878303,
"grad_norm": 1.799402117729187,
"learning_rate": 9.470519270324532e-05,
"loss": 1.8917,
"step": 7300,
"train_loss_gtc": 0.121845703125,
"train_loss_gtm": 0.05860137939453125,
"train_loss_lm": 1.7115625
},
{
"epoch": 2.2741241548862936,
"grad_norm": 1.3167532682418823,
"learning_rate": 9.455983845725164e-05,
"loss": 1.8896,
"step": 7400,
"train_loss_gtc": 0.12458984375,
"train_loss_gtm": 0.06630035400390626,
"train_loss_lm": 1.698984375
},
{
"epoch": 2.3048555623847573,
"grad_norm": 3.1567189693450928,
"learning_rate": 9.441263065578308e-05,
"loss": 1.8849,
"step": 7500,
"train_loss_gtc": 0.120859375,
"train_loss_gtm": 0.063575439453125,
"train_loss_lm": 1.69515625
},
{
"epoch": 2.3355869698832206,
"grad_norm": 2.949071168899536,
"learning_rate": 9.426357542224215e-05,
"loss": 1.8767,
"step": 7600,
"train_loss_gtc": 0.1182275390625,
"train_loss_gtm": 0.067989501953125,
"train_loss_lm": 1.691875
},
{
"epoch": 2.3663183773816843,
"grad_norm": 2.110520362854004,
"learning_rate": 9.411267895687898e-05,
"loss": 1.8791,
"step": 7700,
"train_loss_gtc": 0.11953125,
"train_loss_gtm": 0.068671875,
"train_loss_lm": 1.693046875
},
{
"epoch": 2.3970497848801475,
"grad_norm": 1.1845890283584595,
"learning_rate": 9.395994753653343e-05,
"loss": 1.8692,
"step": 7800,
"train_loss_gtc": 0.1122509765625,
"train_loss_gtm": 0.06687744140625,
"train_loss_lm": 1.690078125
},
{
"epoch": 2.427781192378611,
"grad_norm": 1.572401762008667,
"learning_rate": 9.380538751437396e-05,
"loss": 1.869,
"step": 7900,
"train_loss_gtc": 0.1185498046875,
"train_loss_gtm": 0.05891082763671875,
"train_loss_lm": 1.69109375
},
{
"epoch": 2.4585125998770745,
"grad_norm": 1.395868182182312,
"learning_rate": 9.364900531963336e-05,
"loss": 1.8866,
"step": 8000,
"train_loss_gtc": 0.125126953125,
"train_loss_gtm": 0.0729669189453125,
"train_loss_lm": 1.689609375
},
{
"epoch": 2.4892440073755377,
"grad_norm": 1.1641755104064941,
"learning_rate": 9.349080745734135e-05,
"loss": 1.867,
"step": 8100,
"train_loss_gtc": 0.1189306640625,
"train_loss_gtm": 0.06694183349609376,
"train_loss_lm": 1.68921875
},
{
"epoch": 2.5199754148740015,
"grad_norm": 2.092716932296753,
"learning_rate": 9.333080050805396e-05,
"loss": 1.8538,
"step": 8200,
"train_loss_gtc": 0.114306640625,
"train_loss_gtm": 0.0646319580078125,
"train_loss_lm": 1.68078125
},
{
"epoch": 2.5507068223724647,
"grad_norm": 1.8535902500152588,
"learning_rate": 9.316899112757982e-05,
"loss": 1.8524,
"step": 8300,
"train_loss_gtc": 0.1098681640625,
"train_loss_gtm": 0.06549957275390625,
"train_loss_lm": 1.6834375
},
{
"epoch": 2.581438229870928,
"grad_norm": 1.1401584148406982,
"learning_rate": 9.300538604670325e-05,
"loss": 1.8498,
"step": 8400,
"train_loss_gtc": 0.109970703125,
"train_loss_gtm": 0.0634844970703125,
"train_loss_lm": 1.677734375
},
{
"epoch": 2.6121696373693917,
"grad_norm": 1.7290570735931396,
"learning_rate": 9.283999207090439e-05,
"loss": 1.8523,
"step": 8500,
"train_loss_gtc": 0.1066796875,
"train_loss_gtm": 0.06089630126953125,
"train_loss_lm": 1.683203125
},
{
"epoch": 2.642901044867855,
"grad_norm": 0.7238840460777283,
"learning_rate": 9.267281608007592e-05,
"loss": 1.8537,
"step": 8600,
"train_loss_gtc": 0.1138037109375,
"train_loss_gtm": 0.066612548828125,
"train_loss_lm": 1.6834375
},
{
"epoch": 2.673632452366318,
"grad_norm": 2.260568380355835,
"learning_rate": 9.250386502823712e-05,
"loss": 1.8303,
"step": 8700,
"train_loss_gtc": 0.102099609375,
"train_loss_gtm": 0.0620458984375,
"train_loss_lm": 1.6715625
},
{
"epoch": 2.704363859864782,
"grad_norm": 1.1292821168899536,
"learning_rate": 9.233314594324437e-05,
"loss": 1.8346,
"step": 8800,
"train_loss_gtc": 0.106123046875,
"train_loss_gtm": 0.06123687744140625,
"train_loss_lm": 1.66734375
},
{
"epoch": 2.735095267363245,
"grad_norm": 1.3726723194122314,
"learning_rate": 9.216066592649899e-05,
"loss": 1.835,
"step": 8900,
"train_loss_gtc": 0.1031640625,
"train_loss_gtm": 0.055390625,
"train_loss_lm": 1.670390625
},
{
"epoch": 2.7658266748617084,
"grad_norm": 0.7613235712051392,
"learning_rate": 9.198643215265175e-05,
"loss": 1.8289,
"step": 9000,
"train_loss_gtc": 0.1032861328125,
"train_loss_gtm": 0.05791168212890625,
"train_loss_lm": 1.670234375
},
{
"epoch": 2.796558082360172,
"grad_norm": 1.4104223251342773,
"learning_rate": 9.181045186930446e-05,
"loss": 1.8226,
"step": 9100,
"train_loss_gtc": 0.10169921875,
"train_loss_gtm": 0.05517242431640625,
"train_loss_lm": 1.665
},
{
"epoch": 2.8272894898586354,
"grad_norm": 2.643277406692505,
"learning_rate": 9.163273239670845e-05,
"loss": 1.8278,
"step": 9200,
"train_loss_gtc": 0.1060205078125,
"train_loss_gtm": 0.0587158203125,
"train_loss_lm": 1.66890625
},
{
"epoch": 2.858020897357099,
"grad_norm": 0.7709031105041504,
"learning_rate": 9.145328112746013e-05,
"loss": 1.8159,
"step": 9300,
"train_loss_gtc": 0.10208984375,
"train_loss_gtm": 0.05981475830078125,
"train_loss_lm": 1.654921875
},
{
"epoch": 2.8887523048555623,
"grad_norm": 1.2432808876037598,
"learning_rate": 9.127210552619346e-05,
"loss": 1.8186,
"step": 9400,
"train_loss_gtc": 0.104443359375,
"train_loss_gtm": 0.05574615478515625,
"train_loss_lm": 1.65515625
},
{
"epoch": 2.919483712354026,
"grad_norm": 0.7463958859443665,
"learning_rate": 9.108921312926937e-05,
"loss": 1.8259,
"step": 9500,
"train_loss_gtc": 0.1081005859375,
"train_loss_gtm": 0.06411102294921875,
"train_loss_lm": 1.660234375
},
{
"epoch": 2.9502151198524893,
"grad_norm": 1.5550811290740967,
"learning_rate": 9.090461154446243e-05,
"loss": 1.8085,
"step": 9600,
"train_loss_gtc": 0.0987890625,
"train_loss_gtm": 0.0575555419921875,
"train_loss_lm": 1.650625
},
{
"epoch": 2.9809465273509526,
"grad_norm": 1.3831332921981812,
"learning_rate": 9.071830845064421e-05,
"loss": 1.8021,
"step": 9700,
"train_loss_gtc": 0.093544921875,
"train_loss_gtm": 0.04638153076171875,
"train_loss_lm": 1.653125
},
{
"epoch": 3.0,
"eval_loss": 2.033203125,
"eval_runtime": 3.9269,
"eval_samples_per_second": 253.383,
"eval_steps_per_second": 2.801,
"step": 9762,
"train_loss_gtc": 0.09482500630040322,
"train_loss_gtm": 0.03868521413495464,
"train_loss_lm": 1.6529737903225807,
"val_loss_gtc": 0.195166015625,
"val_loss_gtm": 0.1783203125,
"val_loss_lm": 1.67734375
},
{
"epoch": 3.0116779348494163,
"grad_norm": 0.6389613747596741,
"learning_rate": 9.0530311597464e-05,
"loss": 1.7867,
"step": 9800,
"train_loss_gtc": 0.0934094880756579,
"train_loss_gtm": 0.05592105263157895,
"train_loss_lm": 1.6383634868421053
},
{
"epoch": 3.0424093423478795,
"grad_norm": 4.752490520477295,
"learning_rate": 9.034062880502636e-05,
"loss": 1.8109,
"step": 9900,
"train_loss_gtc": 0.106484375,
"train_loss_gtm": 0.065299072265625,
"train_loss_lm": 1.645078125
},
{
"epoch": 3.073140749846343,
"grad_norm": 0.8840903043746948,
"learning_rate": 9.014926796356588e-05,
"loss": 1.7886,
"step": 10000,
"train_loss_gtc": 0.0953857421875,
"train_loss_gtm": 0.05199127197265625,
"train_loss_lm": 1.638125
},
{
"epoch": 3.1038721573448065,
"grad_norm": 1.790175199508667,
"learning_rate": 8.995623703311894e-05,
"loss": 1.7777,
"step": 10100,
"train_loss_gtc": 0.09615478515625,
"train_loss_gtm": 0.04876113891601563,
"train_loss_lm": 1.631875
},
{
"epoch": 3.1346035648432697,
"grad_norm": 1.167677879333496,
"learning_rate": 8.976154404319261e-05,
"loss": 1.7916,
"step": 10200,
"train_loss_gtc": 0.0942919921875,
"train_loss_gtm": 0.0605712890625,
"train_loss_lm": 1.639140625
},
{
"epoch": 3.1653349723417334,
"grad_norm": 1.2573415040969849,
"learning_rate": 8.956519709243065e-05,
"loss": 1.7905,
"step": 10300,
"train_loss_gtc": 0.097001953125,
"train_loss_gtm": 0.06192230224609375,
"train_loss_lm": 1.638203125
},
{
"epoch": 3.1960663798401967,
"grad_norm": 0.9823325276374817,
"learning_rate": 8.93672043482766e-05,
"loss": 1.7674,
"step": 10400,
"train_loss_gtc": 0.09020751953125,
"train_loss_gtm": 0.0457440185546875,
"train_loss_lm": 1.631875
},
{
"epoch": 3.22679778733866,
"grad_norm": 0.8545394539833069,
"learning_rate": 8.91675740466341e-05,
"loss": 1.7727,
"step": 10500,
"train_loss_gtc": 0.09113525390625,
"train_loss_gtm": 0.04704864501953125,
"train_loss_lm": 1.63609375
},
{
"epoch": 3.2575291948371237,
"grad_norm": 0.8178197741508484,
"learning_rate": 8.896631449152425e-05,
"loss": 1.7856,
"step": 10600,
"train_loss_gtc": 0.09496826171875,
"train_loss_gtm": 0.058369140625,
"train_loss_lm": 1.636484375
},
{
"epoch": 3.288260602335587,
"grad_norm": 1.6585794687271118,
"learning_rate": 8.876343405474018e-05,
"loss": 1.7747,
"step": 10700,
"train_loss_gtc": 0.0904931640625,
"train_loss_gtm": 0.0434954833984375,
"train_loss_lm": 1.6378125
},
{
"epoch": 3.3189920098340506,
"grad_norm": 1.3470587730407715,
"learning_rate": 8.855894117549885e-05,
"loss": 1.7657,
"step": 10800,
"train_loss_gtc": 0.08614501953125,
"train_loss_gtm": 0.04624908447265625,
"train_loss_lm": 1.6303125
},
{
"epoch": 3.349723417332514,
"grad_norm": 0.6378850936889648,
"learning_rate": 8.835284436009e-05,
"loss": 1.7683,
"step": 10900,
"train_loss_gtc": 0.0873779296875,
"train_loss_gtm": 0.04869110107421875,
"train_loss_lm": 1.6296875
},
{
"epoch": 3.380454824830977,
"grad_norm": 1.9016733169555664,
"learning_rate": 8.814515218152226e-05,
"loss": 1.7686,
"step": 11000,
"train_loss_gtc": 0.08774169921875,
"train_loss_gtm": 0.04988037109375,
"train_loss_lm": 1.6275
},
{
"epoch": 3.411186232329441,
"grad_norm": 6.191075325012207,
"learning_rate": 8.793587327916661e-05,
"loss": 1.7661,
"step": 11100,
"train_loss_gtc": 0.08765625,
"train_loss_gtm": 0.04734375,
"train_loss_lm": 1.626875
},
{
"epoch": 3.441917639827904,
"grad_norm": 1.3823864459991455,
"learning_rate": 8.772501635839694e-05,
"loss": 1.7634,
"step": 11200,
"train_loss_gtc": 0.0852392578125,
"train_loss_gtm": 0.047673492431640624,
"train_loss_lm": 1.630546875
},
{
"epoch": 3.4726490473263674,
"grad_norm": 0.9048540592193604,
"learning_rate": 8.751259019022801e-05,
"loss": 1.763,
"step": 11300,
"train_loss_gtc": 0.0891015625,
"train_loss_gtm": 0.0477813720703125,
"train_loss_lm": 1.626171875
},
{
"epoch": 3.503380454824831,
"grad_norm": 1.5553096532821655,
"learning_rate": 8.729860361095056e-05,
"loss": 1.7607,
"step": 11400,
"train_loss_gtc": 0.086826171875,
"train_loss_gtm": 0.05027008056640625,
"train_loss_lm": 1.627890625
},
{
"epoch": 3.5341118623232943,
"grad_norm": 1.639862060546875,
"learning_rate": 8.708306552176368e-05,
"loss": 1.7502,
"step": 11500,
"train_loss_gtc": 0.0829248046875,
"train_loss_gtm": 0.046551055908203125,
"train_loss_lm": 1.6209375
},
{
"epoch": 3.5648432698217576,
"grad_norm": 0.8203203678131104,
"learning_rate": 8.68659848884047e-05,
"loss": 1.7439,
"step": 11600,
"train_loss_gtc": 0.0837255859375,
"train_loss_gtm": 0.04283203125,
"train_loss_lm": 1.6215625
},
{
"epoch": 3.5955746773202213,
"grad_norm": 0.7728437781333923,
"learning_rate": 8.664737074077606e-05,
"loss": 1.7501,
"step": 11700,
"train_loss_gtc": 0.08592041015625,
"train_loss_gtm": 0.0465325927734375,
"train_loss_lm": 1.62140625
},
{
"epoch": 3.6263060848186845,
"grad_norm": 2.764263391494751,
"learning_rate": 8.642723217256991e-05,
"loss": 1.748,
"step": 11800,
"train_loss_gtc": 0.08718017578125,
"train_loss_gtm": 0.05325942993164062,
"train_loss_lm": 1.614921875
},
{
"epoch": 3.6570374923171483,
"grad_norm": 1.381459355354309,
"learning_rate": 8.620557834088962e-05,
"loss": 1.7502,
"step": 11900,
"train_loss_gtc": 0.08507568359375,
"train_loss_gtm": 0.04700302124023437,
"train_loss_lm": 1.6196875
},
{
"epoch": 3.6877688998156115,
"grad_norm": 3.3425018787384033,
"learning_rate": 8.598241846586899e-05,
"loss": 1.7493,
"step": 12000,
"train_loss_gtc": 0.0854345703125,
"train_loss_gtm": 0.04688232421875,
"train_loss_lm": 1.61484375
},
{
"epoch": 3.718500307314075,
"grad_norm": 1.8849352598190308,
"learning_rate": 8.575776183028873e-05,
"loss": 1.7444,
"step": 12100,
"train_loss_gtc": 0.08189208984375,
"train_loss_gtm": 0.05115646362304688,
"train_loss_lm": 1.619765625
},
{
"epoch": 3.7492317148125385,
"grad_norm": 0.8946220278739929,
"learning_rate": 8.553161777919028e-05,
"loss": 1.7366,
"step": 12200,
"train_loss_gtc": 0.08007568359375,
"train_loss_gtm": 0.04531814575195312,
"train_loss_lm": 1.614140625
},
{
"epoch": 3.7799631223110017,
"grad_norm": 0.682151734828949,
"learning_rate": 8.530399571948708e-05,
"loss": 1.7215,
"step": 12300,
"train_loss_gtc": 0.0752392578125,
"train_loss_gtm": 0.034449920654296876,
"train_loss_lm": 1.606171875
},
{
"epoch": 3.8106945298094654,
"grad_norm": 0.6967170238494873,
"learning_rate": 8.507490511957333e-05,
"loss": 1.7367,
"step": 12400,
"train_loss_gtc": 0.0777197265625,
"train_loss_gtm": 0.045133056640625,
"train_loss_lm": 1.610703125
},
{
"epoch": 3.8414259373079287,
"grad_norm": 1.3819748163223267,
"learning_rate": 8.484435550893006e-05,
"loss": 1.7275,
"step": 12500,
"train_loss_gtc": 0.07723388671875,
"train_loss_gtm": 0.037061309814453124,
"train_loss_lm": 1.611875
},
{
"epoch": 3.872157344806392,
"grad_norm": 1.2844618558883667,
"learning_rate": 8.461235647772877e-05,
"loss": 1.7219,
"step": 12600,
"train_loss_gtc": 0.07914794921875,
"train_loss_gtm": 0.039013671875,
"train_loss_lm": 1.610703125
},
{
"epoch": 3.9028887523048557,
"grad_norm": 1.2111929655075073,
"learning_rate": 8.437891767643251e-05,
"loss": 1.7278,
"step": 12700,
"train_loss_gtc": 0.078544921875,
"train_loss_gtm": 0.038914642333984374,
"train_loss_lm": 1.610078125
},
{
"epoch": 3.933620159803319,
"grad_norm": 0.8348441123962402,
"learning_rate": 8.414404881539443e-05,
"loss": 1.7255,
"step": 12800,
"train_loss_gtc": 0.0780419921875,
"train_loss_gtm": 0.044105224609375,
"train_loss_lm": 1.602890625
},
{
"epoch": 3.964351567301782,
"grad_norm": 1.4092820882797241,
"learning_rate": 8.39077596644539e-05,
"loss": 1.7195,
"step": 12900,
"train_loss_gtc": 0.07587646484375,
"train_loss_gtm": 0.0384991455078125,
"train_loss_lm": 1.60671875
},
{
"epoch": 3.995082974800246,
"grad_norm": 3.6042683124542236,
"learning_rate": 8.367006005253006e-05,
"loss": 1.7295,
"step": 13000,
"train_loss_gtc": 0.08053466796875,
"train_loss_gtm": 0.04183273315429688,
"train_loss_lm": 1.60703125
},
{
"epoch": 4.0,
"eval_loss": 1.9796874523162842,
"eval_runtime": 3.8758,
"eval_samples_per_second": 256.721,
"eval_steps_per_second": 2.838,
"step": 13016,
"train_loss_gtc": 0.079498291015625,
"train_loss_gtm": 0.044208526611328125,
"train_loss_lm": 1.59814453125,
"val_loss_gtc": 0.159912109375,
"val_loss_gtm": 0.183050537109375,
"val_loss_lm": 1.61953125
},
{
"epoch": 4.02581438229871,
"grad_norm": 1.8121000528335571,
"learning_rate": 8.343095986721301e-05,
"loss": 1.7206,
"step": 13100,
"train_loss_gtc": 0.07986014229910714,
"train_loss_gtm": 0.055745079403831846,
"train_loss_lm": 1.5932849702380953
},
{
"epoch": 4.056545789797172,
"grad_norm": 1.3698956966400146,
"learning_rate": 8.319046905435246e-05,
"loss": 1.7096,
"step": 13200,
"train_loss_gtc": 0.0751611328125,
"train_loss_gtm": 0.042149658203125,
"train_loss_lm": 1.59296875
},
{
"epoch": 4.087277197295636,
"grad_norm": 1.6034717559814453,
"learning_rate": 8.294859761764408e-05,
"loss": 1.7046,
"step": 13300,
"train_loss_gtc": 0.072431640625,
"train_loss_gtm": 0.046780548095703124,
"train_loss_lm": 1.58609375
},
{
"epoch": 4.1180086047941,
"grad_norm": 1.3316949605941772,
"learning_rate": 8.270535561821336e-05,
"loss": 1.7095,
"step": 13400,
"train_loss_gtc": 0.077919921875,
"train_loss_gtm": 0.04380218505859375,
"train_loss_lm": 1.591953125
},
{
"epoch": 4.148740012292563,
"grad_norm": 0.6827447414398193,
"learning_rate": 8.246075317419706e-05,
"loss": 1.7173,
"step": 13500,
"train_loss_gtc": 0.07958740234375,
"train_loss_gtm": 0.0477728271484375,
"train_loss_lm": 1.598125
},
{
"epoch": 4.179471419791026,
"grad_norm": 1.5629603862762451,
"learning_rate": 8.221480046032233e-05,
"loss": 1.6964,
"step": 13600,
"train_loss_gtc": 0.07267578125,
"train_loss_gtm": 0.03870758056640625,
"train_loss_lm": 1.582734375
},
{
"epoch": 4.21020282728949,
"grad_norm": 1.0047539472579956,
"learning_rate": 8.196750770748355e-05,
"loss": 1.7064,
"step": 13700,
"train_loss_gtc": 0.0744775390625,
"train_loss_gtm": 0.0351849365234375,
"train_loss_lm": 1.5890625
},
{
"epoch": 4.240934234787954,
"grad_norm": 0.7914025187492371,
"learning_rate": 8.171888520231666e-05,
"loss": 1.7175,
"step": 13800,
"train_loss_gtc": 0.0821630859375,
"train_loss_gtm": 0.05238189697265625,
"train_loss_lm": 1.58953125
},
{
"epoch": 4.2716656422864165,
"grad_norm": 1.05272376537323,
"learning_rate": 8.146894328677128e-05,
"loss": 1.6928,
"step": 13900,
"train_loss_gtc": 0.0702734375,
"train_loss_gtm": 0.033878173828125,
"train_loss_lm": 1.58375
},
{
"epoch": 4.30239704978488,
"grad_norm": 1.6808894872665405,
"learning_rate": 8.12176923576806e-05,
"loss": 1.6968,
"step": 14000,
"train_loss_gtc": 0.07257080078125,
"train_loss_gtm": 0.034125747680664065,
"train_loss_lm": 1.585
},
{
"epoch": 4.333128457283344,
"grad_norm": 0.815800130367279,
"learning_rate": 8.096514286632879e-05,
"loss": 1.6977,
"step": 14100,
"train_loss_gtc": 0.070693359375,
"train_loss_gtm": 0.030710296630859377,
"train_loss_lm": 1.585390625
},
{
"epoch": 4.363859864781807,
"grad_norm": 0.7311274409294128,
"learning_rate": 8.071130531801635e-05,
"loss": 1.7137,
"step": 14200,
"train_loss_gtc": 0.079658203125,
"train_loss_gtm": 0.0499749755859375,
"train_loss_lm": 1.59171875
},
{
"epoch": 4.3945912722802705,
"grad_norm": 0.8525009155273438,
"learning_rate": 8.045619027162303e-05,
"loss": 1.6995,
"step": 14300,
"train_loss_gtc": 0.07261474609375,
"train_loss_gtm": 0.03937774658203125,
"train_loss_lm": 1.588359375
},
{
"epoch": 4.425322679778734,
"grad_norm": 1.277293086051941,
"learning_rate": 8.019980833916874e-05,
"loss": 1.6991,
"step": 14400,
"train_loss_gtc": 0.07212158203125,
"train_loss_gtm": 0.042982177734375,
"train_loss_lm": 1.585625
},
{
"epoch": 4.456054087277197,
"grad_norm": 0.7727832794189453,
"learning_rate": 7.994217018537195e-05,
"loss": 1.6925,
"step": 14500,
"train_loss_gtc": 0.07016845703125,
"train_loss_gtm": 0.041646270751953124,
"train_loss_lm": 1.580859375
},
{
"epoch": 4.486785494775661,
"grad_norm": 1.5050898790359497,
"learning_rate": 7.968328652720627e-05,
"loss": 1.6898,
"step": 14600,
"train_loss_gtc": 0.07381591796875,
"train_loss_gtm": 0.035602569580078125,
"train_loss_lm": 1.583203125
},
{
"epoch": 4.517516902274124,
"grad_norm": 0.8743451833724976,
"learning_rate": 7.942316813345447e-05,
"loss": 1.6976,
"step": 14700,
"train_loss_gtc": 0.07141357421875,
"train_loss_gtm": 0.03574203491210937,
"train_loss_lm": 1.58015625
},
{
"epoch": 4.548248309772587,
"grad_norm": 4.071852684020996,
"learning_rate": 7.916182582426064e-05,
"loss": 1.6793,
"step": 14800,
"train_loss_gtc": 0.065556640625,
"train_loss_gtm": 0.03599624633789063,
"train_loss_lm": 1.578359375
},
{
"epoch": 4.578979717271051,
"grad_norm": 1.2412759065628052,
"learning_rate": 7.88992704706801e-05,
"loss": 1.6891,
"step": 14900,
"train_loss_gtc": 0.07296875,
"train_loss_gtm": 0.04167098999023437,
"train_loss_lm": 1.58109375
},
{
"epoch": 4.609711124769515,
"grad_norm": 1.0076960325241089,
"learning_rate": 7.863551299422714e-05,
"loss": 1.6928,
"step": 15000,
"train_loss_gtc": 0.07355712890625,
"train_loss_gtm": 0.040593414306640624,
"train_loss_lm": 1.5796875
},
{
"epoch": 4.640442532267977,
"grad_norm": 1.8155709505081177,
"learning_rate": 7.837056436642077e-05,
"loss": 1.6972,
"step": 15100,
"train_loss_gtc": 0.07208251953125,
"train_loss_gtm": 0.03765533447265625,
"train_loss_lm": 1.5828125
},
{
"epoch": 4.671173939766441,
"grad_norm": 4.2761101722717285,
"learning_rate": 7.810443560832832e-05,
"loss": 1.6779,
"step": 15200,
"train_loss_gtc": 0.0666650390625,
"train_loss_gtm": 0.03232818603515625,
"train_loss_lm": 1.5771875
},
{
"epoch": 4.701905347264905,
"grad_norm": 1.0301436185836792,
"learning_rate": 7.783713779010697e-05,
"loss": 1.6814,
"step": 15300,
"train_loss_gtc": 0.0691845703125,
"train_loss_gtm": 0.03757865905761719,
"train_loss_lm": 1.57953125
},
{
"epoch": 4.7326367547633685,
"grad_norm": 3.180100679397583,
"learning_rate": 7.756868203054334e-05,
"loss": 1.6773,
"step": 15400,
"train_loss_gtc": 0.06718994140625,
"train_loss_gtm": 0.030146408081054687,
"train_loss_lm": 1.57796875
},
{
"epoch": 4.763368162261831,
"grad_norm": 0.845735490322113,
"learning_rate": 7.729907949659089e-05,
"loss": 1.6662,
"step": 15500,
"train_loss_gtc": 0.06385986328125,
"train_loss_gtm": 0.027723541259765627,
"train_loss_lm": 1.573125
},
{
"epoch": 4.794099569760295,
"grad_norm": 0.8206067681312561,
"learning_rate": 7.702834140290547e-05,
"loss": 1.6742,
"step": 15600,
"train_loss_gtc": 0.067158203125,
"train_loss_gtm": 0.035130157470703124,
"train_loss_lm": 1.571953125
},
{
"epoch": 4.824830977258759,
"grad_norm": 0.7254693508148193,
"learning_rate": 7.675647901137879e-05,
"loss": 1.6833,
"step": 15700,
"train_loss_gtc": 0.06796142578125,
"train_loss_gtm": 0.03723342895507813,
"train_loss_lm": 1.573984375
},
{
"epoch": 4.855562384757222,
"grad_norm": 1.2930517196655273,
"learning_rate": 7.648350363066998e-05,
"loss": 1.6783,
"step": 15800,
"train_loss_gtc": 0.0690478515625,
"train_loss_gtm": 0.03417861938476562,
"train_loss_lm": 1.574296875
},
{
"epoch": 4.886293792255685,
"grad_norm": 0.46316060423851013,
"learning_rate": 7.620942661573523e-05,
"loss": 1.6772,
"step": 15900,
"train_loss_gtc": 0.0691015625,
"train_loss_gtm": 0.03562210083007813,
"train_loss_lm": 1.568046875
},
{
"epoch": 4.917025199754149,
"grad_norm": 1.149546504020691,
"learning_rate": 7.59342593673553e-05,
"loss": 1.668,
"step": 16000,
"train_loss_gtc": 0.0667919921875,
"train_loss_gtm": 0.035623931884765626,
"train_loss_lm": 1.56515625
},
{
"epoch": 4.947756607252612,
"grad_norm": 0.4385952949523926,
"learning_rate": 7.56580133316615e-05,
"loss": 1.6674,
"step": 16100,
"train_loss_gtc": 0.06619873046875,
"train_loss_gtm": 0.034098358154296876,
"train_loss_lm": 1.56875
},
{
"epoch": 4.9784880147510755,
"grad_norm": 0.670734167098999,
"learning_rate": 7.538069999965934e-05,
"loss": 1.6746,
"step": 16200,
"train_loss_gtc": 0.067392578125,
"train_loss_gtm": 0.04040283203125,
"train_loss_lm": 1.568984375
},
{
"epoch": 5.0,
"eval_loss": 1.859765648841858,
"eval_runtime": 3.9207,
"eval_samples_per_second": 253.782,
"eval_steps_per_second": 2.806,
"step": 16270,
"train_loss_gtc": 0.06170131138392857,
"train_loss_gtm": 0.02779693603515625,
"train_loss_lm": 1.574330357142857,
"val_loss_gtc": 0.124658203125,
"val_loss_gtm": 0.1553466796875,
"val_loss_lm": 1.5875
},
{
"epoch": 5.009219422249539,
"grad_norm": 0.8819429278373718,
"learning_rate": 7.510233090675076e-05,
"loss": 1.6639,
"step": 16300,
"train_loss_gtc": 0.06167805989583333,
"train_loss_gtm": 0.03746388753255208,
"train_loss_lm": 1.55
},
{
"epoch": 5.039950829748003,
"grad_norm": 1.934515118598938,
"learning_rate": 7.482291763225411e-05,
"loss": 1.6471,
"step": 16400,
"train_loss_gtc": 0.0614111328125,
"train_loss_gtm": 0.0246038818359375,
"train_loss_lm": 1.560078125
},
{
"epoch": 5.070682237246466,
"grad_norm": 0.4644189476966858,
"learning_rate": 7.454247179892258e-05,
"loss": 1.6539,
"step": 16500,
"train_loss_gtc": 0.06216796875,
"train_loss_gtm": 0.029619293212890627,
"train_loss_lm": 1.55984375
},
{
"epoch": 5.101413644744929,
"grad_norm": 0.6986903548240662,
"learning_rate": 7.426100507246073e-05,
"loss": 1.654,
"step": 16600,
"train_loss_gtc": 0.06435791015625,
"train_loss_gtm": 0.029658050537109376,
"train_loss_lm": 1.554609375
},
{
"epoch": 5.132145052243393,
"grad_norm": 1.399057149887085,
"learning_rate": 7.397852916103918e-05,
"loss": 1.6514,
"step": 16700,
"train_loss_gtc": 0.06365234375,
"train_loss_gtm": 0.032920303344726565,
"train_loss_lm": 1.556171875
},
{
"epoch": 5.162876459741856,
"grad_norm": 1.2212918996810913,
"learning_rate": 7.369505581480761e-05,
"loss": 1.6591,
"step": 16800,
"train_loss_gtc": 0.06535888671875,
"train_loss_gtm": 0.03793792724609375,
"train_loss_lm": 1.55328125
},
{
"epoch": 5.19360786724032,
"grad_norm": 1.227950930595398,
"learning_rate": 7.341059682540601e-05,
"loss": 1.6542,
"step": 16900,
"train_loss_gtc": 0.06419921875,
"train_loss_gtm": 0.03715614318847656,
"train_loss_lm": 1.558828125
},
{
"epoch": 5.224339274738783,
"grad_norm": 0.7415390610694885,
"learning_rate": 7.312516402547418e-05,
"loss": 1.6535,
"step": 17000,
"train_loss_gtc": 0.06427001953125,
"train_loss_gtm": 0.038449249267578124,
"train_loss_lm": 1.5575
},
{
"epoch": 5.255070682237246,
"grad_norm": 0.4711204767227173,
"learning_rate": 7.283876928815944e-05,
"loss": 1.6536,
"step": 17100,
"train_loss_gtc": 0.062666015625,
"train_loss_gtm": 0.032582550048828124,
"train_loss_lm": 1.558359375
},
{
"epoch": 5.28580208973571,
"grad_norm": 0.8353385925292969,
"learning_rate": 7.255142452662295e-05,
"loss": 1.6433,
"step": 17200,
"train_loss_gtc": 0.0605859375,
"train_loss_gtm": 0.029074554443359376,
"train_loss_lm": 1.557421875
},
{
"epoch": 5.316533497234174,
"grad_norm": 0.8035210371017456,
"learning_rate": 7.226314169354391e-05,
"loss": 1.6511,
"step": 17300,
"train_loss_gtc": 0.0600830078125,
"train_loss_gtm": 0.029854888916015624,
"train_loss_lm": 1.558125
},
{
"epoch": 5.347264904732636,
"grad_norm": 1.1498240232467651,
"learning_rate": 7.197393278062251e-05,
"loss": 1.6475,
"step": 17400,
"train_loss_gtc": 0.0640478515625,
"train_loss_gtm": 0.039508514404296875,
"train_loss_lm": 1.549453125
},
{
"epoch": 5.3779963122311,
"grad_norm": 0.510197639465332,
"learning_rate": 7.168380981808108e-05,
"loss": 1.6438,
"step": 17500,
"train_loss_gtc": 0.06174072265625,
"train_loss_gtm": 0.0270050048828125,
"train_loss_lm": 1.551328125
},
{
"epoch": 5.408727719729564,
"grad_norm": 1.0153284072875977,
"learning_rate": 7.139278487416369e-05,
"loss": 1.6418,
"step": 17600,
"train_loss_gtc": 0.05983154296875,
"train_loss_gtm": 0.031028366088867186,
"train_loss_lm": 1.553203125
},
{
"epoch": 5.439459127228027,
"grad_norm": 3.0183732509613037,
"learning_rate": 7.110087005463413e-05,
"loss": 1.6466,
"step": 17700,
"train_loss_gtc": 0.0623046875,
"train_loss_gtm": 0.03655166625976562,
"train_loss_lm": 1.55625
},
{
"epoch": 5.47019053472649,
"grad_norm": 0.8955859541893005,
"learning_rate": 7.080807750227229e-05,
"loss": 1.6351,
"step": 17800,
"train_loss_gtc": 0.05905029296875,
"train_loss_gtm": 0.029703750610351562,
"train_loss_lm": 1.544765625
},
{
"epoch": 5.500921942224954,
"grad_norm": 0.4188254773616791,
"learning_rate": 7.051441939636915e-05,
"loss": 1.6359,
"step": 17900,
"train_loss_gtc": 0.05901123046875,
"train_loss_gtm": 0.02728248596191406,
"train_loss_lm": 1.54921875
},
{
"epoch": 5.531653349723418,
"grad_norm": 0.733801543712616,
"learning_rate": 7.021990795222015e-05,
"loss": 1.6387,
"step": 18000,
"train_loss_gtc": 0.0610791015625,
"train_loss_gtm": 0.033560562133789065,
"train_loss_lm": 1.550390625
},
{
"epoch": 5.5623847572218805,
"grad_norm": 1.336493968963623,
"learning_rate": 6.992455542061697e-05,
"loss": 1.6385,
"step": 18100,
"train_loss_gtc": 0.0579931640625,
"train_loss_gtm": 0.0305810546875,
"train_loss_lm": 1.5415625
},
{
"epoch": 5.593116164720344,
"grad_norm": 0.9108180999755859,
"learning_rate": 6.962837408733806e-05,
"loss": 1.6326,
"step": 18200,
"train_loss_gtc": 0.0611328125,
"train_loss_gtm": 0.027354583740234376,
"train_loss_lm": 1.54671875
},
{
"epoch": 5.623847572218808,
"grad_norm": 0.6176537871360779,
"learning_rate": 6.933137627263747e-05,
"loss": 1.6387,
"step": 18300,
"train_loss_gtc": 0.05935791015625,
"train_loss_gtm": 0.027886962890625,
"train_loss_lm": 1.550859375
},
{
"epoch": 5.654578979717271,
"grad_norm": 0.6086634993553162,
"learning_rate": 6.903357433073251e-05,
"loss": 1.6463,
"step": 18400,
"train_loss_gtc": 0.06116943359375,
"train_loss_gtm": 0.029478378295898437,
"train_loss_lm": 1.5525
},
{
"epoch": 5.6853103872157345,
"grad_norm": 0.6187928318977356,
"learning_rate": 6.873498064928969e-05,
"loss": 1.6362,
"step": 18500,
"train_loss_gtc": 0.059130859375,
"train_loss_gtm": 0.0319903564453125,
"train_loss_lm": 1.546171875
},
{
"epoch": 5.716041794714198,
"grad_norm": 0.7199295163154602,
"learning_rate": 6.843560764890953e-05,
"loss": 1.6304,
"step": 18600,
"train_loss_gtc": 0.0573388671875,
"train_loss_gtm": 0.026798248291015625,
"train_loss_lm": 1.54875
},
{
"epoch": 5.746773202212661,
"grad_norm": 0.48185673356056213,
"learning_rate": 6.81354677826099e-05,
"loss": 1.6356,
"step": 18700,
"train_loss_gtc": 0.05507568359375,
"train_loss_gtm": 0.023614425659179688,
"train_loss_lm": 1.552265625
},
{
"epoch": 5.777504609711125,
"grad_norm": 0.44653263688087463,
"learning_rate": 6.783457353530797e-05,
"loss": 1.629,
"step": 18800,
"train_loss_gtc": 0.057138671875,
"train_loss_gtm": 0.02418853759765625,
"train_loss_lm": 1.546953125
},
{
"epoch": 5.808236017209588,
"grad_norm": 0.6734046936035156,
"learning_rate": 6.75329374233009e-05,
"loss": 1.6311,
"step": 18900,
"train_loss_gtc": 0.05802734375,
"train_loss_gtm": 0.025522842407226562,
"train_loss_lm": 1.54484375
},
{
"epoch": 5.838967424708052,
"grad_norm": 1.3264803886413574,
"learning_rate": 6.723057199374518e-05,
"loss": 1.6371,
"step": 19000,
"train_loss_gtc": 0.057373046875,
"train_loss_gtm": 0.030865325927734374,
"train_loss_lm": 1.548671875
},
{
"epoch": 5.869698832206515,
"grad_norm": 1.0278254747390747,
"learning_rate": 6.692748982413474e-05,
"loss": 1.6338,
"step": 19100,
"train_loss_gtc": 0.05820068359375,
"train_loss_gtm": 0.030936508178710936,
"train_loss_lm": 1.550859375
},
{
"epoch": 5.900430239704979,
"grad_norm": 0.6339811086654663,
"learning_rate": 6.662370352177774e-05,
"loss": 1.6301,
"step": 19200,
"train_loss_gtc": 0.0586328125,
"train_loss_gtm": 0.02888214111328125,
"train_loss_lm": 1.5375
},
{
"epoch": 5.931161647203442,
"grad_norm": 0.892484724521637,
"learning_rate": 6.631922572327213e-05,
"loss": 1.6294,
"step": 19300,
"train_loss_gtc": 0.05766357421875,
"train_loss_gtm": 0.03270828247070313,
"train_loss_lm": 1.542890625
},
{
"epoch": 5.961893054701905,
"grad_norm": 0.8719256520271301,
"learning_rate": 6.601406909398007e-05,
"loss": 1.6334,
"step": 19400,
"train_loss_gtc": 0.05709716796875,
"train_loss_gtm": 0.034244384765625,
"train_loss_lm": 1.544453125
},
{
"epoch": 5.992624462200369,
"grad_norm": 0.5898253917694092,
"learning_rate": 6.570824632750099e-05,
"loss": 1.6308,
"step": 19500,
"train_loss_gtc": 0.05718017578125,
"train_loss_gtm": 0.028017425537109376,
"train_loss_lm": 1.5475
},
{
"epoch": 6.0,
"eval_loss": 1.8289062976837158,
"eval_runtime": 3.955,
"eval_samples_per_second": 251.581,
"eval_steps_per_second": 2.781,
"step": 19524,
"train_loss_gtc": 0.059417724609375,
"train_loss_gtm": 0.021376291910807293,
"train_loss_lm": 1.5485026041666667,
"val_loss_gtc": 0.1081298828125,
"val_loss_gtm": 0.14790267944335939,
"val_loss_lm": 1.559375
},
{
"epoch": 6.0233558696988325,
"grad_norm": 0.5566962361335754,
"learning_rate": 6.540177014514361e-05,
"loss": 1.6229,
"step": 19600,
"train_loss_gtc": 0.05809583162006579,
"train_loss_gtm": 0.027290545011821547,
"train_loss_lm": 1.5270353618421053
},
{
"epoch": 6.054087277197295,
"grad_norm": 0.7923322319984436,
"learning_rate": 6.509465329539689e-05,
"loss": 1.6096,
"step": 19700,
"train_loss_gtc": 0.0559228515625,
"train_loss_gtm": 0.02381988525390625,
"train_loss_lm": 1.527734375
},
{
"epoch": 6.084818684695759,
"grad_norm": 0.633963942527771,
"learning_rate": 6.478690855339953e-05,
"loss": 1.6261,
"step": 19800,
"train_loss_gtc": 0.0565234375,
"train_loss_gtm": 0.03267807006835938,
"train_loss_lm": 1.532734375
},
{
"epoch": 6.115550092194223,
"grad_norm": 0.9739740490913391,
"learning_rate": 6.44785487204087e-05,
"loss": 1.6157,
"step": 19900,
"train_loss_gtc": 0.05468017578125,
"train_loss_gtm": 0.029143524169921876,
"train_loss_lm": 1.53
},
{
"epoch": 6.146281499692686,
"grad_norm": 1.191219449043274,
"learning_rate": 6.416958662326749e-05,
"loss": 1.6127,
"step": 20000,
"train_loss_gtc": 0.05240966796875,
"train_loss_gtm": 0.02735198974609375,
"train_loss_lm": 1.531171875
},
{
"epoch": 6.177012907191149,
"grad_norm": 0.9735581278800964,
"learning_rate": 6.38600351138714e-05,
"loss": 1.6113,
"step": 20100,
"train_loss_gtc": 0.0530419921875,
"train_loss_gtm": 0.027030487060546875,
"train_loss_lm": 1.5346875
},
{
"epoch": 6.207744314689613,
"grad_norm": 1.2206913232803345,
"learning_rate": 6.35499070686337e-05,
"loss": 1.6212,
"step": 20200,
"train_loss_gtc": 0.055166015625,
"train_loss_gtm": 0.026912918090820314,
"train_loss_lm": 1.535625
},
{
"epoch": 6.238475722188076,
"grad_norm": 0.8422713279724121,
"learning_rate": 6.323921538794981e-05,
"loss": 1.6118,
"step": 20300,
"train_loss_gtc": 0.05383056640625,
"train_loss_gtm": 0.029865264892578125,
"train_loss_lm": 1.52765625
},
{
"epoch": 6.2692071296865395,
"grad_norm": 1.286847472190857,
"learning_rate": 6.292797299566072e-05,
"loss": 1.6112,
"step": 20400,
"train_loss_gtc": 0.055625,
"train_loss_gtm": 0.0314874267578125,
"train_loss_lm": 1.525234375
},
{
"epoch": 6.299938537185003,
"grad_norm": 0.5895647406578064,
"learning_rate": 6.261619283851527e-05,
"loss": 1.6021,
"step": 20500,
"train_loss_gtc": 0.050849609375,
"train_loss_gtm": 0.027188568115234374,
"train_loss_lm": 1.52734375
},
{
"epoch": 6.330669944683467,
"grad_norm": 0.6928810477256775,
"learning_rate": 6.230388788563187e-05,
"loss": 1.6008,
"step": 20600,
"train_loss_gtc": 0.05047119140625,
"train_loss_gtm": 0.02188018798828125,
"train_loss_lm": 1.530703125
},
{
"epoch": 6.36140135218193,
"grad_norm": 1.0124385356903076,
"learning_rate": 6.199107112795872e-05,
"loss": 1.6071,
"step": 20700,
"train_loss_gtc": 0.05262939453125,
"train_loss_gtm": 0.028003463745117186,
"train_loss_lm": 1.52765625
},
{
"epoch": 6.392132759680393,
"grad_norm": 1.7495094537734985,
"learning_rate": 6.167775557773363e-05,
"loss": 1.6069,
"step": 20800,
"train_loss_gtc": 0.0532470703125,
"train_loss_gtm": 0.027218780517578124,
"train_loss_lm": 1.525546875
},
{
"epoch": 6.422864167178857,
"grad_norm": 0.7303450703620911,
"learning_rate": 6.136395426794261e-05,
"loss": 1.5961,
"step": 20900,
"train_loss_gtc": 0.04982177734375,
"train_loss_gtm": 0.019435043334960937,
"train_loss_lm": 1.521875
},
{
"epoch": 6.45359557467732,
"grad_norm": 0.797379732131958,
"learning_rate": 6.104968025177791e-05,
"loss": 1.607,
"step": 21000,
"train_loss_gtc": 0.0555908203125,
"train_loss_gtm": 0.024838104248046874,
"train_loss_lm": 1.529375
},
{
"epoch": 6.484326982175784,
"grad_norm": 0.5462325811386108,
"learning_rate": 6.073494660209491e-05,
"loss": 1.6088,
"step": 21100,
"train_loss_gtc": 0.0543115234375,
"train_loss_gtm": 0.03119972229003906,
"train_loss_lm": 1.52734375
},
{
"epoch": 6.515058389674247,
"grad_norm": 0.46476686000823975,
"learning_rate": 6.0419766410868294e-05,
"loss": 1.6075,
"step": 21200,
"train_loss_gtc": 0.05191650390625,
"train_loss_gtm": 0.027312164306640626,
"train_loss_lm": 1.5278125
},
{
"epoch": 6.54578979717271,
"grad_norm": 0.6704521179199219,
"learning_rate": 6.010415278864762e-05,
"loss": 1.6081,
"step": 21300,
"train_loss_gtc": 0.05267822265625,
"train_loss_gtm": 0.025740814208984376,
"train_loss_lm": 1.522734375
},
{
"epoch": 6.576521204671174,
"grad_norm": 0.513566792011261,
"learning_rate": 5.978811886401183e-05,
"loss": 1.6077,
"step": 21400,
"train_loss_gtc": 0.05446533203125,
"train_loss_gtm": 0.03446975708007813,
"train_loss_lm": 1.523359375
},
{
"epoch": 6.6072526121696376,
"grad_norm": 1.2353570461273193,
"learning_rate": 5.947167778302323e-05,
"loss": 1.5954,
"step": 21500,
"train_loss_gtc": 0.04914306640625,
"train_loss_gtm": 0.019849777221679688,
"train_loss_lm": 1.52546875
},
{
"epoch": 6.637984019668101,
"grad_norm": 2.1153972148895264,
"learning_rate": 5.9154842708680544e-05,
"loss": 1.6048,
"step": 21600,
"train_loss_gtc": 0.052568359375,
"train_loss_gtm": 0.028261795043945312,
"train_loss_lm": 1.52453125
},
{
"epoch": 6.668715427166564,
"grad_norm": 1.2410842180252075,
"learning_rate": 5.8837626820371486e-05,
"loss": 1.6103,
"step": 21700,
"train_loss_gtc": 0.0537890625,
"train_loss_gtm": 0.027198944091796875,
"train_loss_lm": 1.52640625
},
{
"epoch": 6.699446834665028,
"grad_norm": 0.39238986372947693,
"learning_rate": 5.852004331332443e-05,
"loss": 1.6068,
"step": 21800,
"train_loss_gtc": 0.05417724609375,
"train_loss_gtm": 0.025730323791503907,
"train_loss_lm": 1.5265625
},
{
"epoch": 6.7301782421634915,
"grad_norm": 0.8881044983863831,
"learning_rate": 5.820210539805968e-05,
"loss": 1.5946,
"step": 21900,
"train_loss_gtc": 0.0499072265625,
"train_loss_gtm": 0.019407730102539062,
"train_loss_lm": 1.521875
},
{
"epoch": 6.760909649661954,
"grad_norm": 0.5124359130859375,
"learning_rate": 5.788382629983977e-05,
"loss": 1.612,
"step": 22000,
"train_loss_gtc": 0.0574853515625,
"train_loss_gtm": 0.031860885620117185,
"train_loss_lm": 1.523984375
},
{
"epoch": 6.791641057160418,
"grad_norm": 0.6098849773406982,
"learning_rate": 5.7565219258119455e-05,
"loss": 1.5961,
"step": 22100,
"train_loss_gtc": 0.05323974609375,
"train_loss_gtm": 0.02882041931152344,
"train_loss_lm": 1.521953125
},
{
"epoch": 6.822372464658882,
"grad_norm": 1.027600884437561,
"learning_rate": 5.724629752599495e-05,
"loss": 1.5928,
"step": 22200,
"train_loss_gtc": 0.0508203125,
"train_loss_gtm": 0.02281818389892578,
"train_loss_lm": 1.52421875
},
{
"epoch": 6.8531038721573445,
"grad_norm": 1.004398226737976,
"learning_rate": 5.692707436965267e-05,
"loss": 1.5929,
"step": 22300,
"train_loss_gtc": 0.04905517578125,
"train_loss_gtm": 0.025001983642578125,
"train_loss_lm": 1.521328125
},
{
"epoch": 6.883835279655808,
"grad_norm": 0.8874416351318359,
"learning_rate": 5.660756306781733e-05,
"loss": 1.5983,
"step": 22400,
"train_loss_gtc": 0.04990234375,
"train_loss_gtm": 0.025154190063476564,
"train_loss_lm": 1.52375
},
{
"epoch": 6.914566687154272,
"grad_norm": 0.5866090059280396,
"learning_rate": 5.628777691119965e-05,
"loss": 1.5958,
"step": 22500,
"train_loss_gtc": 0.0502880859375,
"train_loss_gtm": 0.024204254150390625,
"train_loss_lm": 1.521328125
},
{
"epoch": 6.945298094652735,
"grad_norm": 0.48130372166633606,
"learning_rate": 5.59677292019435e-05,
"loss": 1.594,
"step": 22600,
"train_loss_gtc": 0.05079833984375,
"train_loss_gtm": 0.02796661376953125,
"train_loss_lm": 1.51875
},
{
"epoch": 6.976029502151198,
"grad_norm": 0.6554698944091797,
"learning_rate": 5.564743325307254e-05,
"loss": 1.5964,
"step": 22700,
"train_loss_gtc": 0.0513427734375,
"train_loss_gtm": 0.025988922119140626,
"train_loss_lm": 1.521171875
},
{
"epoch": 7.0,
"eval_loss": 1.8093750476837158,
"eval_runtime": 3.9611,
"eval_samples_per_second": 251.194,
"eval_steps_per_second": 2.777,
"step": 22778,
"train_loss_gtc": 0.051851712740384616,
"train_loss_gtm": 0.024179898775540866,
"train_loss_lm": 1.5157251602564104,
"val_loss_gtc": 0.11328125,
"val_loss_gtm": 0.15882568359375,
"val_loss_lm": 1.53984375
},
{
"epoch": 7.006760909649662,
"grad_norm": 1.2214024066925049,
"learning_rate": 5.5326902387936454e-05,
"loss": 1.5932,
"step": 22800,
"train_loss_gtc": 0.04629794034090909,
"train_loss_gtm": 0.011040774258700285,
"train_loss_lm": 1.5095880681818181
},
{
"epoch": 7.037492317148125,
"grad_norm": 0.78125,
"learning_rate": 5.500614993965673e-05,
"loss": 1.5774,
"step": 22900,
"train_loss_gtc": 0.048642578125,
"train_loss_gtm": 0.028121871948242186,
"train_loss_lm": 1.504296875
},
{
"epoch": 7.068223724646589,
"grad_norm": 0.5814157724380493,
"learning_rate": 5.468518925057203e-05,
"loss": 1.5826,
"step": 23000,
"train_loss_gtc": 0.049710693359375,
"train_loss_gtm": 0.02605010986328125,
"train_loss_lm": 1.508125
},
{
"epoch": 7.098955132145052,
"grad_norm": 0.7798097133636475,
"learning_rate": 5.4364033671683304e-05,
"loss": 1.5849,
"step": 23100,
"train_loss_gtc": 0.049805908203125,
"train_loss_gtm": 0.024519424438476562,
"train_loss_lm": 1.512890625
},
{
"epoch": 7.129686539643516,
"grad_norm": 0.8778783679008484,
"learning_rate": 5.404269656209819e-05,
"loss": 1.5775,
"step": 23200,
"train_loss_gtc": 0.04724853515625,
"train_loss_gtm": 0.021280136108398438,
"train_loss_lm": 1.509140625
},
{
"epoch": 7.160417947141979,
"grad_norm": 0.8768311142921448,
"learning_rate": 5.3721191288475595e-05,
"loss": 1.5768,
"step": 23300,
"train_loss_gtc": 0.0488720703125,
"train_loss_gtm": 0.020770683288574218,
"train_loss_lm": 1.50484375
},
{
"epoch": 7.191149354640443,
"grad_norm": 1.3236780166625977,
"learning_rate": 5.3399531224469424e-05,
"loss": 1.5761,
"step": 23400,
"train_loss_gtc": 0.047967529296875,
"train_loss_gtm": 0.016504249572753905,
"train_loss_lm": 1.507578125
},
{
"epoch": 7.221880762138906,
"grad_norm": 0.4845696985721588,
"learning_rate": 5.307772975017249e-05,
"loss": 1.58,
"step": 23500,
"train_loss_gtc": 0.04843017578125,
"train_loss_gtm": 0.021038818359375,
"train_loss_lm": 1.51203125
},
{
"epoch": 7.252612169637369,
"grad_norm": 0.6816074848175049,
"learning_rate": 5.2755800251559794e-05,
"loss": 1.5807,
"step": 23600,
"train_loss_gtc": 0.0488525390625,
"train_loss_gtm": 0.025988388061523437,
"train_loss_lm": 1.50859375
},
{
"epoch": 7.283343577135833,
"grad_norm": 0.8071028590202332,
"learning_rate": 5.24337561199318e-05,
"loss": 1.5757,
"step": 23700,
"train_loss_gtc": 0.0470068359375,
"train_loss_gtm": 0.02268218994140625,
"train_loss_lm": 1.510703125
},
{
"epoch": 7.3140749846342965,
"grad_norm": 1.132927656173706,
"learning_rate": 5.211161075135733e-05,
"loss": 1.5746,
"step": 23800,
"train_loss_gtc": 0.04585205078125,
"train_loss_gtm": 0.020586471557617187,
"train_loss_lm": 1.508203125
},
{
"epoch": 7.344806392132759,
"grad_norm": 0.6981713771820068,
"learning_rate": 5.178937754611637e-05,
"loss": 1.5759,
"step": 23900,
"train_loss_gtc": 0.045491943359375,
"train_loss_gtm": 0.0174371337890625,
"train_loss_lm": 1.508671875
},
{
"epoch": 7.375537799631223,
"grad_norm": 0.689810574054718,
"learning_rate": 5.1467069908142684e-05,
"loss": 1.5719,
"step": 24000,
"train_loss_gtc": 0.046361083984375,
"train_loss_gtm": 0.02007720947265625,
"train_loss_lm": 1.50734375
},
{
"epoch": 7.406269207129687,
"grad_norm": 0.5761317610740662,
"learning_rate": 5.1144701244466144e-05,
"loss": 1.5774,
"step": 24100,
"train_loss_gtc": 0.047037353515625,
"train_loss_gtm": 0.025342483520507813,
"train_loss_lm": 1.505390625
},
{
"epoch": 7.43700061462815,
"grad_norm": 0.9547802805900574,
"learning_rate": 5.082228496465517e-05,
"loss": 1.5723,
"step": 24200,
"train_loss_gtc": 0.046898193359375,
"train_loss_gtm": 0.019998626708984377,
"train_loss_lm": 1.5040625
},
{
"epoch": 7.467732022126613,
"grad_norm": 1.58182954788208,
"learning_rate": 5.049983448025881e-05,
"loss": 1.5752,
"step": 24300,
"train_loss_gtc": 0.047181396484375,
"train_loss_gtm": 0.019326019287109374,
"train_loss_lm": 1.5034375
},
{
"epoch": 7.498463429625077,
"grad_norm": 1.1392496824264526,
"learning_rate": 5.0177363204249016e-05,
"loss": 1.567,
"step": 24400,
"train_loss_gtc": 0.0444873046875,
"train_loss_gtm": 0.02104278564453125,
"train_loss_lm": 1.503828125
},
{
"epoch": 7.529194837123541,
"grad_norm": 0.9969751238822937,
"learning_rate": 4.985488455046249e-05,
"loss": 1.5918,
"step": 24500,
"train_loss_gtc": 0.05201904296875,
"train_loss_gtm": 0.026438446044921876,
"train_loss_lm": 1.50671875
},
{
"epoch": 7.5599262446220035,
"grad_norm": 0.6485080122947693,
"learning_rate": 4.953241193304291e-05,
"loss": 1.5678,
"step": 24600,
"train_loss_gtc": 0.04556884765625,
"train_loss_gtm": 0.01871406555175781,
"train_loss_lm": 1.50484375
},
{
"epoch": 7.590657652120467,
"grad_norm": 0.5488921403884888,
"learning_rate": 4.920995876588286e-05,
"loss": 1.5709,
"step": 24700,
"train_loss_gtc": 0.045516357421875,
"train_loss_gtm": 0.017727508544921874,
"train_loss_lm": 1.507890625
},
{
"epoch": 7.621389059618931,
"grad_norm": 1.2782403230667114,
"learning_rate": 4.888753846206578e-05,
"loss": 1.5708,
"step": 24800,
"train_loss_gtc": 0.045699462890625,
"train_loss_gtm": 0.019001045227050782,
"train_loss_lm": 1.5021875
},
{
"epoch": 7.652120467117394,
"grad_norm": 1.2111992835998535,
"learning_rate": 4.856516443330818e-05,
"loss": 1.5671,
"step": 24900,
"train_loss_gtc": 0.04524169921875,
"train_loss_gtm": 0.015474700927734375,
"train_loss_lm": 1.50671875
},
{
"epoch": 7.682851874615857,
"grad_norm": 0.9381042122840881,
"learning_rate": 4.824285008940159e-05,
"loss": 1.5682,
"step": 25000,
"train_loss_gtc": 0.04477783203125,
"train_loss_gtm": 0.016591415405273438,
"train_loss_lm": 1.50328125
},
{
"epoch": 7.713583282114321,
"grad_norm": 0.41880643367767334,
"learning_rate": 4.79206088376549e-05,
"loss": 1.5699,
"step": 25100,
"train_loss_gtc": 0.04564697265625,
"train_loss_gtm": 0.022302398681640623,
"train_loss_lm": 1.50265625
},
{
"epoch": 7.744314689612784,
"grad_norm": 0.41994112730026245,
"learning_rate": 4.7598454082336525e-05,
"loss": 1.5593,
"step": 25200,
"train_loss_gtc": 0.0431494140625,
"train_loss_gtm": 0.01353099822998047,
"train_loss_lm": 1.501328125
},
{
"epoch": 7.775046097111248,
"grad_norm": 0.41959813237190247,
"learning_rate": 4.727639922411693e-05,
"loss": 1.5675,
"step": 25300,
"train_loss_gtc": 0.045030517578125,
"train_loss_gtm": 0.018340682983398436,
"train_loss_lm": 1.498359375
},
{
"epoch": 7.805777504609711,
"grad_norm": 1.3286911249160767,
"learning_rate": 4.695445765951113e-05,
"loss": 1.5671,
"step": 25400,
"train_loss_gtc": 0.044442138671875,
"train_loss_gtm": 0.017482261657714843,
"train_loss_lm": 1.50640625
},
{
"epoch": 7.836508912108174,
"grad_norm": 0.5046520233154297,
"learning_rate": 4.6632642780321506e-05,
"loss": 1.5625,
"step": 25500,
"train_loss_gtc": 0.04425048828125,
"train_loss_gtm": 0.01410266876220703,
"train_loss_lm": 1.501953125
},
{
"epoch": 7.867240319606638,
"grad_norm": 0.7728056907653809,
"learning_rate": 4.631096797308068e-05,
"loss": 1.5739,
"step": 25600,
"train_loss_gtc": 0.048016357421875,
"train_loss_gtm": 0.026591949462890625,
"train_loss_lm": 1.502890625
},
{
"epoch": 7.8979717271051015,
"grad_norm": 0.549649178981781,
"learning_rate": 4.598944661849467e-05,
"loss": 1.5654,
"step": 25700,
"train_loss_gtc": 0.045203857421875,
"train_loss_gtm": 0.019275131225585936,
"train_loss_lm": 1.500703125
},
{
"epoch": 7.928703134603564,
"grad_norm": 0.4454677999019623,
"learning_rate": 4.566809209088641e-05,
"loss": 1.5661,
"step": 25800,
"train_loss_gtc": 0.044942626953125,
"train_loss_gtm": 0.01573017120361328,
"train_loss_lm": 1.50234375
},
{
"epoch": 7.959434542102028,
"grad_norm": 0.5023268461227417,
"learning_rate": 4.534691775763923e-05,
"loss": 1.5643,
"step": 25900,
"train_loss_gtc": 0.045194091796875,
"train_loss_gtm": 0.020731773376464844,
"train_loss_lm": 1.498359375
},
{
"epoch": 7.990165949600492,
"grad_norm": 0.4675215780735016,
"learning_rate": 4.5025936978640993e-05,
"loss": 1.5646,
"step": 26000,
"train_loss_gtc": 0.04420166015625,
"train_loss_gtm": 0.0233331298828125,
"train_loss_lm": 1.50140625
},
{
"epoch": 8.0,
"eval_loss": 1.7390625476837158,
"eval_runtime": 3.9419,
"eval_samples_per_second": 252.418,
"eval_steps_per_second": 2.791,
"step": 26032,
"train_loss_gtc": 0.043849945068359375,
"train_loss_gtm": 0.01909458637237549,
"train_loss_lm": 1.50341796875,
"val_loss_gtc": 0.085546875,
"val_loss_gtm": 0.1235870361328125,
"val_loss_lm": 1.51875
},
{
"epoch": 8.020897357098955,
"grad_norm": 0.9887075424194336,
"learning_rate": 4.470516310572825e-05,
"loss": 1.5523,
"step": 26100,
"train_loss_gtc": 0.04299388212316176,
"train_loss_gtm": 0.015683286330279184,
"train_loss_lm": 1.4872472426470589
},
{
"epoch": 8.05162876459742,
"grad_norm": 0.7514944076538086,
"learning_rate": 4.43846094821309e-05,
"loss": 1.5613,
"step": 26200,
"train_loss_gtc": 0.04583251953125,
"train_loss_gtm": 0.026337127685546875,
"train_loss_lm": 1.490859375
},
{
"epoch": 8.082360172095882,
"grad_norm": 1.092617154121399,
"learning_rate": 4.406428944191709e-05,
"loss": 1.5533,
"step": 26300,
"train_loss_gtc": 0.04384765625,
"train_loss_gtm": 0.016444091796875,
"train_loss_lm": 1.488046875
},
{
"epoch": 8.113091579594345,
"grad_norm": 1.1750010251998901,
"learning_rate": 4.374421630943868e-05,
"loss": 1.5543,
"step": 26400,
"train_loss_gtc": 0.043507080078125,
"train_loss_gtm": 0.018485107421875,
"train_loss_lm": 1.493203125
},
{
"epoch": 8.14382298709281,
"grad_norm": 0.5995994806289673,
"learning_rate": 4.3424403398776835e-05,
"loss": 1.5558,
"step": 26500,
"train_loss_gtc": 0.045775146484375,
"train_loss_gtm": 0.0213360595703125,
"train_loss_lm": 1.486953125
},
{
"epoch": 8.174554394591272,
"grad_norm": 0.40138596296310425,
"learning_rate": 4.310486401318829e-05,
"loss": 1.5414,
"step": 26600,
"train_loss_gtc": 0.04089599609375,
"train_loss_gtm": 0.011089859008789062,
"train_loss_lm": 1.488828125
},
{
"epoch": 8.205285802089735,
"grad_norm": 0.4291875958442688,
"learning_rate": 4.278561144455199e-05,
"loss": 1.5511,
"step": 26700,
"train_loss_gtc": 0.0429052734375,
"train_loss_gtm": 0.014610671997070312,
"train_loss_lm": 1.4884375
},
{
"epoch": 8.2360172095882,
"grad_norm": 0.5274336934089661,
"learning_rate": 4.246665897281612e-05,
"loss": 1.5493,
"step": 26800,
"train_loss_gtc": 0.04279296875,
"train_loss_gtm": 0.015193328857421876,
"train_loss_lm": 1.49359375
},
{
"epoch": 8.266748617086662,
"grad_norm": 0.7654374837875366,
"learning_rate": 4.214801986544575e-05,
"loss": 1.5566,
"step": 26900,
"train_loss_gtc": 0.042926025390625,
"train_loss_gtm": 0.018515548706054687,
"train_loss_lm": 1.49296875
},
{
"epoch": 8.297480024585125,
"grad_norm": 0.9065292477607727,
"learning_rate": 4.182970737687093e-05,
"loss": 1.5538,
"step": 27000,
"train_loss_gtc": 0.04357177734375,
"train_loss_gtm": 0.016671829223632813,
"train_loss_lm": 1.491875
},
{
"epoch": 8.32821143208359,
"grad_norm": 1.0985864400863647,
"learning_rate": 4.151173474793534e-05,
"loss": 1.5566,
"step": 27100,
"train_loss_gtc": 0.045074462890625,
"train_loss_gtm": 0.02417022705078125,
"train_loss_lm": 1.488515625
},
{
"epoch": 8.358942839582053,
"grad_norm": 0.43155065178871155,
"learning_rate": 4.1194115205345574e-05,
"loss": 1.5593,
"step": 27200,
"train_loss_gtc": 0.04392822265625,
"train_loss_gtm": 0.024323196411132814,
"train_loss_lm": 1.490078125
},
{
"epoch": 8.389674247080515,
"grad_norm": 0.6603362560272217,
"learning_rate": 4.0876861961120806e-05,
"loss": 1.5456,
"step": 27300,
"train_loss_gtc": 0.043385009765625,
"train_loss_gtm": 0.011190872192382812,
"train_loss_lm": 1.486015625
},
{
"epoch": 8.42040565457898,
"grad_norm": 0.5204278826713562,
"learning_rate": 4.055998821204337e-05,
"loss": 1.5511,
"step": 27400,
"train_loss_gtc": 0.04381103515625,
"train_loss_gtm": 0.017749443054199218,
"train_loss_lm": 1.491953125
},
{
"epoch": 8.451137062077443,
"grad_norm": 0.7329652309417725,
"learning_rate": 4.024350713910969e-05,
"loss": 1.5452,
"step": 27500,
"train_loss_gtc": 0.041251220703125,
"train_loss_gtm": 0.012794952392578125,
"train_loss_lm": 1.48953125
},
{
"epoch": 8.481868469575907,
"grad_norm": 1.1227164268493652,
"learning_rate": 3.9927431906982095e-05,
"loss": 1.5508,
"step": 27600,
"train_loss_gtc": 0.04261962890625,
"train_loss_gtm": 0.01765655517578125,
"train_loss_lm": 1.48875
},
{
"epoch": 8.51259987707437,
"grad_norm": 0.6496936678886414,
"learning_rate": 3.9611775663441094e-05,
"loss": 1.5491,
"step": 27700,
"train_loss_gtc": 0.04417724609375,
"train_loss_gtm": 0.023344078063964845,
"train_loss_lm": 1.48734375
},
{
"epoch": 8.543331284572833,
"grad_norm": 0.4676097333431244,
"learning_rate": 3.92965515388386e-05,
"loss": 1.5494,
"step": 27800,
"train_loss_gtc": 0.0420458984375,
"train_loss_gtm": 0.020555419921875,
"train_loss_lm": 1.48703125
},
{
"epoch": 8.574062692071298,
"grad_norm": 1.0823791027069092,
"learning_rate": 3.8981772645551595e-05,
"loss": 1.5512,
"step": 27900,
"train_loss_gtc": 0.042501220703125,
"train_loss_gtm": 0.022169036865234373,
"train_loss_lm": 1.4890625
},
{
"epoch": 8.60479409956976,
"grad_norm": 0.40729042887687683,
"learning_rate": 3.866745207743683e-05,
"loss": 1.543,
"step": 28000,
"train_loss_gtc": 0.03969482421875,
"train_loss_gtm": 0.009343986511230468,
"train_loss_lm": 1.487421875
},
{
"epoch": 8.635525507068223,
"grad_norm": 1.4600690603256226,
"learning_rate": 3.835360290928612e-05,
"loss": 1.549,
"step": 28100,
"train_loss_gtc": 0.04197265625,
"train_loss_gtm": 0.016862869262695312,
"train_loss_lm": 1.484921875
},
{
"epoch": 8.666256914566688,
"grad_norm": 0.43790164589881897,
"learning_rate": 3.8040238196282395e-05,
"loss": 1.5401,
"step": 28200,
"train_loss_gtc": 0.03960205078125,
"train_loss_gtm": 0.01627326965332031,
"train_loss_lm": 1.482890625
},
{
"epoch": 8.69698832206515,
"grad_norm": 0.4079265892505646,
"learning_rate": 3.772737097345676e-05,
"loss": 1.5519,
"step": 28300,
"train_loss_gtc": 0.04443603515625,
"train_loss_gtm": 0.01917346954345703,
"train_loss_lm": 1.486328125
},
{
"epoch": 8.727719729563614,
"grad_norm": 2.1502716541290283,
"learning_rate": 3.741501425514618e-05,
"loss": 1.5453,
"step": 28400,
"train_loss_gtc": 0.04140380859375,
"train_loss_gtm": 0.016539077758789062,
"train_loss_lm": 1.489453125
},
{
"epoch": 8.758451137062078,
"grad_norm": 2.0536539554595947,
"learning_rate": 3.710318103445223e-05,
"loss": 1.5478,
"step": 28500,
"train_loss_gtc": 0.04205078125,
"train_loss_gtm": 0.019853744506835937,
"train_loss_lm": 1.48765625
},
{
"epoch": 8.789182544560541,
"grad_norm": 0.8067043423652649,
"learning_rate": 3.6791884282700464e-05,
"loss": 1.5401,
"step": 28600,
"train_loss_gtc": 0.042589111328125,
"train_loss_gtm": 0.01223979949951172,
"train_loss_lm": 1.487265625
},
{
"epoch": 8.819913952059004,
"grad_norm": 1.0549793243408203,
"learning_rate": 3.6481136948901016e-05,
"loss": 1.5449,
"step": 28700,
"train_loss_gtc": 0.039984130859375,
"train_loss_gtm": 0.013403701782226562,
"train_loss_lm": 1.4865625
},
{
"epoch": 8.850645359557468,
"grad_norm": 0.3913937211036682,
"learning_rate": 3.617095195920983e-05,
"loss": 1.5392,
"step": 28800,
"train_loss_gtc": 0.038916015625,
"train_loss_gtm": 0.014700355529785157,
"train_loss_lm": 1.48515625
},
{
"epoch": 8.881376767055931,
"grad_norm": 0.6485953330993652,
"learning_rate": 3.5861342216391083e-05,
"loss": 1.5398,
"step": 28900,
"train_loss_gtc": 0.0403515625,
"train_loss_gtm": 0.008778877258300781,
"train_loss_lm": 1.48671875
},
{
"epoch": 8.912108174554394,
"grad_norm": 0.42979031801223755,
"learning_rate": 3.555232059928037e-05,
"loss": 1.5443,
"step": 29000,
"train_loss_gtc": 0.040491943359375,
"train_loss_gtm": 0.020406494140625,
"train_loss_lm": 1.487421875
},
{
"epoch": 8.942839582052859,
"grad_norm": 0.4814371168613434,
"learning_rate": 3.524389996224899e-05,
"loss": 1.5404,
"step": 29100,
"train_loss_gtc": 0.038388671875,
"train_loss_gtm": 0.01521839141845703,
"train_loss_lm": 1.486171875
},
{
"epoch": 8.973570989551321,
"grad_norm": 1.2739533185958862,
"learning_rate": 3.4936093134669375e-05,
"loss": 1.5411,
"step": 29200,
"train_loss_gtc": 0.04125,
"train_loss_gtm": 0.01296173095703125,
"train_loss_lm": 1.482421875
},
{
"epoch": 9.0,
"eval_loss": 1.7078125476837158,
"eval_runtime": 3.925,
"eval_samples_per_second": 253.505,
"eval_steps_per_second": 2.803,
"step": 29286,
"train_loss_gtc": 0.03917747320130814,
"train_loss_gtm": 0.016063379686932232,
"train_loss_lm": 1.4876453488372092,
"val_loss_gtc": 0.0817138671875,
"val_loss_gtm": 0.13273239135742188,
"val_loss_lm": 1.5046875
},
{
"epoch": 9.004302397049784,
"grad_norm": 0.49189862608909607,
"learning_rate": 3.4628912920381206e-05,
"loss": 1.5363,
"step": 29300,
"train_loss_gtc": 0.03465053013392857,
"train_loss_gtm": 0.006429399762834821,
"train_loss_lm": 1.4709821428571428
},
{
"epoch": 9.035033804548249,
"grad_norm": 0.4804949164390564,
"learning_rate": 3.432237209715904e-05,
"loss": 1.5311,
"step": 29400,
"train_loss_gtc": 0.040050048828125,
"train_loss_gtm": 0.015148849487304687,
"train_loss_lm": 1.471484375
},
{
"epoch": 9.065765212046712,
"grad_norm": 2.241997241973877,
"learning_rate": 3.40164834161806e-05,
"loss": 1.5311,
"step": 29500,
"train_loss_gtc": 0.039757080078125,
"train_loss_gtm": 0.017725067138671877,
"train_loss_lm": 1.476484375
},
{
"epoch": 9.096496619545174,
"grad_norm": 0.48794323205947876,
"learning_rate": 3.371125960149651e-05,
"loss": 1.5284,
"step": 29600,
"train_loss_gtc": 0.040618896484375,
"train_loss_gtm": 0.014891014099121094,
"train_loss_lm": 1.47359375
},
{
"epoch": 9.127228027043639,
"grad_norm": 0.9154407978057861,
"learning_rate": 3.340671334950091e-05,
"loss": 1.5308,
"step": 29700,
"train_loss_gtc": 0.04009521484375,
"train_loss_gtm": 0.01613304138183594,
"train_loss_lm": 1.476953125
},
{
"epoch": 9.157959434542102,
"grad_norm": 0.3826013505458832,
"learning_rate": 3.31028573284034e-05,
"loss": 1.5269,
"step": 29800,
"train_loss_gtc": 0.03857177734375,
"train_loss_gtm": 0.012269973754882812,
"train_loss_lm": 1.47546875
},
{
"epoch": 9.188690842040565,
"grad_norm": 0.4480116665363312,
"learning_rate": 3.279970417770206e-05,
"loss": 1.5314,
"step": 29900,
"train_loss_gtc": 0.0409716796875,
"train_loss_gtm": 0.01894462585449219,
"train_loss_lm": 1.477578125
},
{
"epoch": 9.21942224953903,
"grad_norm": 0.5610605478286743,
"learning_rate": 3.24972665076576e-05,
"loss": 1.5302,
"step": 30000,
"train_loss_gtc": 0.037857666015625,
"train_loss_gtm": 0.015255851745605469,
"train_loss_lm": 1.47703125
},
{
"epoch": 9.250153657037492,
"grad_norm": 0.4201144576072693,
"learning_rate": 3.219555689876896e-05,
"loss": 1.5277,
"step": 30100,
"train_loss_gtc": 0.03964599609375,
"train_loss_gtm": 0.012595443725585938,
"train_loss_lm": 1.47546875
},
{
"epoch": 9.280885064535955,
"grad_norm": 1.114909291267395,
"learning_rate": 3.1894587901249875e-05,
"loss": 1.5265,
"step": 30200,
"train_loss_gtc": 0.036279296875,
"train_loss_gtm": 0.009384765625,
"train_loss_lm": 1.475390625
},
{
"epoch": 9.31161647203442,
"grad_norm": 0.41764217615127563,
"learning_rate": 3.159437203450691e-05,
"loss": 1.5256,
"step": 30300,
"train_loss_gtc": 0.037474365234375,
"train_loss_gtm": 0.010728912353515625,
"train_loss_lm": 1.4775
},
{
"epoch": 9.342347879532882,
"grad_norm": 1.1266087293624878,
"learning_rate": 3.1294921786618595e-05,
"loss": 1.522,
"step": 30400,
"train_loss_gtc": 0.036798095703125,
"train_loss_gtm": 0.007729339599609375,
"train_loss_lm": 1.4746875
},
{
"epoch": 9.373079287031347,
"grad_norm": 0.4223707616329193,
"learning_rate": 3.099624961381606e-05,
"loss": 1.5262,
"step": 30500,
"train_loss_gtc": 0.039088134765625,
"train_loss_gtm": 0.013626289367675782,
"train_loss_lm": 1.471875
},
{
"epoch": 9.40381069452981,
"grad_norm": 0.4733109176158905,
"learning_rate": 3.069836793996486e-05,
"loss": 1.5268,
"step": 30600,
"train_loss_gtc": 0.038968505859375,
"train_loss_gtm": 0.015171966552734374,
"train_loss_lm": 1.4746875
},
{
"epoch": 9.434542102028272,
"grad_norm": 0.8515746593475342,
"learning_rate": 3.0401289156048117e-05,
"loss": 1.524,
"step": 30700,
"train_loss_gtc": 0.038099365234375,
"train_loss_gtm": 0.011698036193847657,
"train_loss_lm": 1.475234375
},
{
"epoch": 9.465273509526737,
"grad_norm": 0.3740207850933075,
"learning_rate": 3.0105025619651193e-05,
"loss": 1.5272,
"step": 30800,
"train_loss_gtc": 0.038729248046875,
"train_loss_gtm": 0.012548446655273438,
"train_loss_lm": 1.474765625
},
{
"epoch": 9.4960049170252,
"grad_norm": 0.42126893997192383,
"learning_rate": 2.9809589654447555e-05,
"loss": 1.5232,
"step": 30900,
"train_loss_gtc": 0.0394775390625,
"train_loss_gtm": 0.012857398986816405,
"train_loss_lm": 1.47125
},
{
"epoch": 9.526736324523663,
"grad_norm": 0.4131476581096649,
"learning_rate": 2.951499354968623e-05,
"loss": 1.5289,
"step": 31000,
"train_loss_gtc": 0.03717041015625,
"train_loss_gtm": 0.010989189147949219,
"train_loss_lm": 1.476328125
},
{
"epoch": 9.557467732022127,
"grad_norm": 1.3864574432373047,
"learning_rate": 2.922124955968054e-05,
"loss": 1.5302,
"step": 31100,
"train_loss_gtc": 0.040264892578125,
"train_loss_gtm": 0.014952011108398437,
"train_loss_lm": 1.4771875
},
{
"epoch": 9.58819913952059,
"grad_norm": 0.6983849406242371,
"learning_rate": 2.892836990329844e-05,
"loss": 1.5228,
"step": 31200,
"train_loss_gtc": 0.037857666015625,
"train_loss_gtm": 0.014338626861572265,
"train_loss_lm": 1.475390625
},
{
"epoch": 9.618930547019053,
"grad_norm": 0.9399222731590271,
"learning_rate": 2.8636366763454153e-05,
"loss": 1.5205,
"step": 31300,
"train_loss_gtc": 0.03775146484375,
"train_loss_gtm": 0.011002845764160156,
"train_loss_lm": 1.4725
},
{
"epoch": 9.649661954517518,
"grad_norm": 0.7803316712379456,
"learning_rate": 2.8345252286601448e-05,
"loss": 1.5214,
"step": 31400,
"train_loss_gtc": 0.03853271484375,
"train_loss_gtm": 0.014136924743652343,
"train_loss_lm": 1.473671875
},
{
"epoch": 9.68039336201598,
"grad_norm": 1.0166672468185425,
"learning_rate": 2.805503858222842e-05,
"loss": 1.525,
"step": 31500,
"train_loss_gtc": 0.03795654296875,
"train_loss_gtm": 0.013683624267578125,
"train_loss_lm": 1.472109375
},
{
"epoch": 9.711124769514443,
"grad_norm": 1.386081576347351,
"learning_rate": 2.7765737722353725e-05,
"loss": 1.5211,
"step": 31600,
"train_loss_gtc": 0.037562255859375,
"train_loss_gtm": 0.0162908935546875,
"train_loss_lm": 1.471484375
},
{
"epoch": 9.741856177012908,
"grad_norm": 1.487998366355896,
"learning_rate": 2.747736174102441e-05,
"loss": 1.5211,
"step": 31700,
"train_loss_gtc": 0.037666015625,
"train_loss_gtm": 0.009608421325683594,
"train_loss_lm": 1.4721875
},
{
"epoch": 9.77258758451137,
"grad_norm": 0.4993577301502228,
"learning_rate": 2.7189922633815346e-05,
"loss": 1.5286,
"step": 31800,
"train_loss_gtc": 0.04015380859375,
"train_loss_gtm": 0.015623245239257812,
"train_loss_lm": 1.476171875
},
{
"epoch": 9.803318992009833,
"grad_norm": 2.035013437271118,
"learning_rate": 2.690343235733026e-05,
"loss": 1.5297,
"step": 31900,
"train_loss_gtc": 0.03919921875,
"train_loss_gtm": 0.01642772674560547,
"train_loss_lm": 1.4703125
},
{
"epoch": 9.834050399508298,
"grad_norm": 0.44986504316329956,
"learning_rate": 2.66179028287044e-05,
"loss": 1.5191,
"step": 32000,
"train_loss_gtc": 0.0372119140625,
"train_loss_gtm": 0.01147369384765625,
"train_loss_lm": 1.474140625
},
{
"epoch": 9.86478180700676,
"grad_norm": 0.44800782203674316,
"learning_rate": 2.633334592510876e-05,
"loss": 1.5229,
"step": 32100,
"train_loss_gtc": 0.037117919921875,
"train_loss_gtm": 0.020374336242675782,
"train_loss_lm": 1.472421875
},
{
"epoch": 9.895513214505224,
"grad_norm": 0.4471757113933563,
"learning_rate": 2.6049773483256046e-05,
"loss": 1.5197,
"step": 32200,
"train_loss_gtc": 0.03855224609375,
"train_loss_gtm": 0.012574348449707031,
"train_loss_lm": 1.4709375
},
{
"epoch": 9.926244622003688,
"grad_norm": 1.0153461694717407,
"learning_rate": 2.5767197298908296e-05,
"loss": 1.522,
"step": 32300,
"train_loss_gtc": 0.0387353515625,
"train_loss_gtm": 0.013848609924316406,
"train_loss_lm": 1.470703125
},
{
"epoch": 9.956976029502151,
"grad_norm": 0.35531821846961975,
"learning_rate": 2.5485629126386323e-05,
"loss": 1.5207,
"step": 32400,
"train_loss_gtc": 0.0349658203125,
"train_loss_gtm": 0.00917278289794922,
"train_loss_lm": 1.471484375
},
{
"epoch": 9.987707437000614,
"grad_norm": 0.4289498031139374,
"learning_rate": 2.5205080678080573e-05,
"loss": 1.5159,
"step": 32500,
"train_loss_gtc": 0.03526123046875,
"train_loss_gtm": 0.006147556304931641,
"train_loss_lm": 1.469765625
},
{
"epoch": 10.0,
"eval_loss": 1.663671851158142,
"eval_runtime": 3.934,
"eval_samples_per_second": 252.92,
"eval_steps_per_second": 2.796,
"step": 32540,
"train_loss_gtc": 0.0366180419921875,
"train_loss_gtm": 0.016598081588745116,
"train_loss_lm": 1.4673828125,
"val_loss_gtc": 0.075244140625,
"val_loss_gtm": 0.091632080078125,
"val_loss_lm": 1.49296875
},
{
"epoch": 10.018438844499078,
"grad_norm": 0.6605350971221924,
"learning_rate": 2.4925563623964055e-05,
"loss": 1.5146,
"step": 32600,
"train_loss_gtc": 0.037335205078125,
"train_loss_gtm": 0.014607747395833334,
"train_loss_lm": 1.459765625
},
{
"epoch": 10.049170251997541,
"grad_norm": 0.5100732445716858,
"learning_rate": 2.4647089591106885e-05,
"loss": 1.5074,
"step": 32700,
"train_loss_gtc": 0.035672607421875,
"train_loss_gtm": 0.010171089172363281,
"train_loss_lm": 1.461875
},
{
"epoch": 10.079901659496006,
"grad_norm": 1.4332607984542847,
"learning_rate": 2.4369670163192603e-05,
"loss": 1.5097,
"step": 32800,
"train_loss_gtc": 0.038460693359375,
"train_loss_gtm": 0.012800846099853515,
"train_loss_lm": 1.461484375
},
{
"epoch": 10.110633066994469,
"grad_norm": 0.3127327263355255,
"learning_rate": 2.409331688003642e-05,
"loss": 1.5074,
"step": 32900,
"train_loss_gtc": 0.037158203125,
"train_loss_gtm": 0.007948532104492187,
"train_loss_lm": 1.46296875
},
{
"epoch": 10.141364474492931,
"grad_norm": 0.49944329261779785,
"learning_rate": 2.3818041237105047e-05,
"loss": 1.5138,
"step": 33000,
"train_loss_gtc": 0.036298828125,
"train_loss_gtm": 0.01034515380859375,
"train_loss_lm": 1.46390625
},
{
"epoch": 10.172095881991396,
"grad_norm": 0.38113901019096375,
"learning_rate": 2.3543854685038612e-05,
"loss": 1.5096,
"step": 33100,
"train_loss_gtc": 0.035716552734375,
"train_loss_gtm": 0.010895004272460937,
"train_loss_lm": 1.46328125
},
{
"epoch": 10.202827289489859,
"grad_norm": 0.8738096952438354,
"learning_rate": 2.3270768629174366e-05,
"loss": 1.5107,
"step": 33200,
"train_loss_gtc": 0.03684326171875,
"train_loss_gtm": 0.01412738800048828,
"train_loss_lm": 1.4628125
},
{
"epoch": 10.233558696988322,
"grad_norm": 0.7059551477432251,
"learning_rate": 2.2998794429072228e-05,
"loss": 1.511,
"step": 33300,
"train_loss_gtc": 0.035848388671875,
"train_loss_gtm": 0.010251865386962891,
"train_loss_lm": 1.46515625
},
{
"epoch": 10.264290104486786,
"grad_norm": 0.49285122752189636,
"learning_rate": 2.2727943398042223e-05,
"loss": 1.5166,
"step": 33400,
"train_loss_gtc": 0.03899169921875,
"train_loss_gtm": 0.01776031494140625,
"train_loss_lm": 1.46421875
},
{
"epoch": 10.295021511985249,
"grad_norm": 0.3343373239040375,
"learning_rate": 2.245822680267391e-05,
"loss": 1.5063,
"step": 33500,
"train_loss_gtc": 0.034970703125,
"train_loss_gtm": 0.00969287872314453,
"train_loss_lm": 1.46109375
},
{
"epoch": 10.325752919483712,
"grad_norm": 0.6031121611595154,
"learning_rate": 2.2189655862367736e-05,
"loss": 1.5091,
"step": 33600,
"train_loss_gtc": 0.036680908203125,
"train_loss_gtm": 0.013059463500976563,
"train_loss_lm": 1.46125
},
{
"epoch": 10.356484326982176,
"grad_norm": 0.35346755385398865,
"learning_rate": 2.1922241748868395e-05,
"loss": 1.5055,
"step": 33700,
"train_loss_gtc": 0.033951416015625,
"train_loss_gtm": 0.005552330017089844,
"train_loss_lm": 1.4603125
},
{
"epoch": 10.38721573448064,
"grad_norm": 1.6642231941223145,
"learning_rate": 2.1655995585799977e-05,
"loss": 1.51,
"step": 33800,
"train_loss_gtc": 0.036239013671875,
"train_loss_gtm": 0.012279739379882812,
"train_loss_lm": 1.460546875
},
{
"epoch": 10.417947141979102,
"grad_norm": 1.5294814109802246,
"learning_rate": 2.1390928448203397e-05,
"loss": 1.5046,
"step": 33900,
"train_loss_gtc": 0.03334716796875,
"train_loss_gtm": 0.00482635498046875,
"train_loss_lm": 1.46125
},
{
"epoch": 10.448678549477567,
"grad_norm": 0.9640972018241882,
"learning_rate": 2.1127051362075596e-05,
"loss": 1.5085,
"step": 34000,
"train_loss_gtc": 0.03734619140625,
"train_loss_gtm": 0.012679977416992188,
"train_loss_lm": 1.4615625
},
{
"epoch": 10.47940995697603,
"grad_norm": 2.8935489654541016,
"learning_rate": 2.086437530391101e-05,
"loss": 1.5037,
"step": 34100,
"train_loss_gtc": 0.034757080078125,
"train_loss_gtm": 0.006779251098632813,
"train_loss_lm": 1.46296875
},
{
"epoch": 10.510141364474492,
"grad_norm": 0.4859734773635864,
"learning_rate": 2.0602911200244907e-05,
"loss": 1.5141,
"step": 34200,
"train_loss_gtc": 0.037239990234375,
"train_loss_gtm": 0.014754142761230469,
"train_loss_lm": 1.462109375
},
{
"epoch": 10.540872771972957,
"grad_norm": 0.4255363643169403,
"learning_rate": 2.034266992719886e-05,
"loss": 1.5048,
"step": 34300,
"train_loss_gtc": 0.0356005859375,
"train_loss_gtm": 0.009561195373535156,
"train_loss_lm": 1.459375
},
{
"epoch": 10.57160417947142,
"grad_norm": 0.41498520970344543,
"learning_rate": 2.008366231002836e-05,
"loss": 1.5094,
"step": 34400,
"train_loss_gtc": 0.0361181640625,
"train_loss_gtm": 0.013895111083984375,
"train_loss_lm": 1.458828125
},
{
"epoch": 10.602335586969883,
"grad_norm": 0.4691818058490753,
"learning_rate": 1.9825899122672516e-05,
"loss": 1.5088,
"step": 34500,
"train_loss_gtc": 0.036781005859375,
"train_loss_gtm": 0.016254196166992186,
"train_loss_lm": 1.4590625
},
{
"epoch": 10.633066994468347,
"grad_norm": 0.3247811496257782,
"learning_rate": 1.9569391087305944e-05,
"loss": 1.5095,
"step": 34600,
"train_loss_gtc": 0.036104736328125,
"train_loss_gtm": 0.011984748840332031,
"train_loss_lm": 1.458671875
},
{
"epoch": 10.66379840196681,
"grad_norm": 0.48939141631126404,
"learning_rate": 1.931414887389265e-05,
"loss": 1.5032,
"step": 34700,
"train_loss_gtc": 0.035,
"train_loss_gtm": 0.00996623992919922,
"train_loss_lm": 1.45953125
},
{
"epoch": 10.694529809465273,
"grad_norm": 0.5067106485366821,
"learning_rate": 1.906018309974225e-05,
"loss": 1.5118,
"step": 34800,
"train_loss_gtc": 0.036153564453125,
"train_loss_gtm": 0.017694778442382812,
"train_loss_lm": 1.46171875
},
{
"epoch": 10.725261216963737,
"grad_norm": 0.4321945607662201,
"learning_rate": 1.8807504329068377e-05,
"loss": 1.5052,
"step": 34900,
"train_loss_gtc": 0.0354345703125,
"train_loss_gtm": 0.012692756652832031,
"train_loss_lm": 1.461015625
},
{
"epoch": 10.7559926244622,
"grad_norm": 0.39166566729545593,
"learning_rate": 1.8556123072549097e-05,
"loss": 1.5078,
"step": 35000,
"train_loss_gtc": 0.037042236328125,
"train_loss_gtm": 0.011860542297363282,
"train_loss_lm": 1.4615625
},
{
"epoch": 10.786724031960663,
"grad_norm": 0.4959773123264313,
"learning_rate": 1.8306049786889872e-05,
"loss": 1.5037,
"step": 35100,
"train_loss_gtc": 0.036055908203125,
"train_loss_gtm": 0.007551231384277344,
"train_loss_lm": 1.46109375
},
{
"epoch": 10.817455439459128,
"grad_norm": 0.5386573076248169,
"learning_rate": 1.8057294874388443e-05,
"loss": 1.5052,
"step": 35200,
"train_loss_gtc": 0.034755859375,
"train_loss_gtm": 0.011582107543945312,
"train_loss_lm": 1.461328125
},
{
"epoch": 10.84818684695759,
"grad_norm": 0.38217893242836,
"learning_rate": 1.78098686825022e-05,
"loss": 1.502,
"step": 35300,
"train_loss_gtc": 0.034422607421875,
"train_loss_gtm": 0.0069885444641113285,
"train_loss_lm": 1.4628125
},
{
"epoch": 10.878918254456053,
"grad_norm": 0.3977510929107666,
"learning_rate": 1.7563781503417743e-05,
"loss": 1.5027,
"step": 35400,
"train_loss_gtc": 0.034517822265625,
"train_loss_gtm": 0.012902565002441406,
"train_loss_lm": 1.461015625
},
{
"epoch": 10.909649661954518,
"grad_norm": 1.0005662441253662,
"learning_rate": 1.7319043573622796e-05,
"loss": 1.5068,
"step": 35500,
"train_loss_gtc": 0.034649658203125,
"train_loss_gtm": 0.00762664794921875,
"train_loss_lm": 1.46140625
},
{
"epoch": 10.94038106945298,
"grad_norm": 0.8638070225715637,
"learning_rate": 1.707566507348032e-05,
"loss": 1.5069,
"step": 35600,
"train_loss_gtc": 0.03516845703125,
"train_loss_gtm": 0.013119163513183594,
"train_loss_lm": 1.46453125
},
{
"epoch": 10.971112476951445,
"grad_norm": 0.7276130318641663,
"learning_rate": 1.6833656126805075e-05,
"loss": 1.5038,
"step": 35700,
"train_loss_gtc": 0.034442138671875,
"train_loss_gtm": 0.008318862915039062,
"train_loss_lm": 1.4596875
},
{
"epoch": 11.0,
"eval_loss": 1.6515624523162842,
"eval_runtime": 3.9297,
"eval_samples_per_second": 253.199,
"eval_steps_per_second": 2.799,
"step": 35794,
"train_loss_gtc": 0.03423682679521277,
"train_loss_gtm": 0.011414101783265459,
"train_loss_lm": 1.4602726063829787,
"val_loss_gtc": 0.0715576171875,
"val_loss_gtm": 0.08521461486816406,
"val_loss_lm": 1.484375
},
{
"epoch": 11.001843884449908,
"grad_norm": 0.355080246925354,
"learning_rate": 1.6593026800442584e-05,
"loss": 1.5059,
"step": 35800,
"train_loss_gtc": 0.038492838541666664,
"train_loss_gtm": 0.0019823710123697915,
"train_loss_lm": 1.4466145833333333
},
{
"epoch": 11.03257529194837,
"grad_norm": 0.3804630935192108,
"learning_rate": 1.6353787103850214e-05,
"loss": 1.4999,
"step": 35900,
"train_loss_gtc": 0.034288330078125,
"train_loss_gtm": 0.013097267150878906,
"train_loss_lm": 1.454375
},
{
"epoch": 11.063306699446835,
"grad_norm": 0.43270865082740784,
"learning_rate": 1.611594698868099e-05,
"loss": 1.4984,
"step": 36000,
"train_loss_gtc": 0.034847412109375,
"train_loss_gtm": 0.010229988098144531,
"train_loss_lm": 1.45125
},
{
"epoch": 11.094038106945298,
"grad_norm": 0.35577720403671265,
"learning_rate": 1.587951634836949e-05,
"loss": 1.4972,
"step": 36100,
"train_loss_gtc": 0.03463623046875,
"train_loss_gtm": 0.006039161682128907,
"train_loss_lm": 1.45390625
},
{
"epoch": 11.124769514443761,
"grad_norm": 0.3876980245113373,
"learning_rate": 1.5644505017720396e-05,
"loss": 1.4942,
"step": 36200,
"train_loss_gtc": 0.032666015625,
"train_loss_gtm": 0.00611663818359375,
"train_loss_lm": 1.451171875
},
{
"epoch": 11.155500921942226,
"grad_norm": 0.675238847732544,
"learning_rate": 1.5410922772499352e-05,
"loss": 1.503,
"step": 36300,
"train_loss_gtc": 0.035501708984375,
"train_loss_gtm": 0.013241043090820312,
"train_loss_lm": 1.45578125
},
{
"epoch": 11.186232329440688,
"grad_norm": 0.4091513752937317,
"learning_rate": 1.5178779329026393e-05,
"loss": 1.5001,
"step": 36400,
"train_loss_gtc": 0.03492431640625,
"train_loss_gtm": 0.013411216735839844,
"train_loss_lm": 1.4534375
},
{
"epoch": 11.216963736939151,
"grad_norm": 0.4007122814655304,
"learning_rate": 1.494808434377164e-05,
"loss": 1.4959,
"step": 36500,
"train_loss_gtc": 0.0340380859375,
"train_loss_gtm": 0.010790367126464844,
"train_loss_lm": 1.45296875
},
{
"epoch": 11.247695144437616,
"grad_norm": 0.3332425057888031,
"learning_rate": 1.4718847412953784e-05,
"loss": 1.4964,
"step": 36600,
"train_loss_gtc": 0.035784912109375,
"train_loss_gtm": 0.013795166015625,
"train_loss_lm": 1.4509375
},
{
"epoch": 11.278426551936079,
"grad_norm": 0.42536449432373047,
"learning_rate": 1.4491078072140779e-05,
"loss": 1.4959,
"step": 36700,
"train_loss_gtc": 0.035238037109375,
"train_loss_gtm": 0.008274612426757812,
"train_loss_lm": 1.453203125
},
{
"epoch": 11.309157959434541,
"grad_norm": 0.4789024889469147,
"learning_rate": 1.4264785795853231e-05,
"loss": 1.4947,
"step": 36800,
"train_loss_gtc": 0.0340283203125,
"train_loss_gtm": 0.007297935485839843,
"train_loss_lm": 1.4525
},
{
"epoch": 11.339889366933006,
"grad_norm": 0.436238557100296,
"learning_rate": 1.4039979997170349e-05,
"loss": 1.4954,
"step": 36900,
"train_loss_gtc": 0.035128173828125,
"train_loss_gtm": 0.010289707183837891,
"train_loss_lm": 1.45390625
},
{
"epoch": 11.370620774431469,
"grad_norm": 0.37121227383613586,
"learning_rate": 1.3816670027338297e-05,
"loss": 1.4961,
"step": 37000,
"train_loss_gtc": 0.0336767578125,
"train_loss_gtm": 0.011312313079833984,
"train_loss_lm": 1.451328125
},
{
"epoch": 11.401352181929932,
"grad_norm": 0.3737700581550598,
"learning_rate": 1.3594865175381267e-05,
"loss": 1.4941,
"step": 37100,
"train_loss_gtc": 0.034173583984375,
"train_loss_gtm": 0.011153717041015625,
"train_loss_lm": 1.453828125
},
{
"epoch": 11.432083589428396,
"grad_norm": 0.40509167313575745,
"learning_rate": 1.3374574667715033e-05,
"loss": 1.4974,
"step": 37200,
"train_loss_gtc": 0.034654541015625,
"train_loss_gtm": 0.013001708984375,
"train_loss_lm": 1.452421875
},
{
"epoch": 11.46281499692686,
"grad_norm": 0.38259902596473694,
"learning_rate": 1.3155807667763265e-05,
"loss": 1.4975,
"step": 37300,
"train_loss_gtc": 0.03426025390625,
"train_loss_gtm": 0.011098213195800781,
"train_loss_lm": 1.45296875
},
{
"epoch": 11.493546404425322,
"grad_norm": 2.280012369155884,
"learning_rate": 1.2938573275576204e-05,
"loss": 1.4933,
"step": 37400,
"train_loss_gtc": 0.034439697265625,
"train_loss_gtm": 0.009605464935302734,
"train_loss_lm": 1.451640625
},
{
"epoch": 11.524277811923787,
"grad_norm": 0.8614688515663147,
"learning_rate": 1.2722880527452285e-05,
"loss": 1.4916,
"step": 37500,
"train_loss_gtc": 0.032637939453125,
"train_loss_gtm": 0.0070468330383300784,
"train_loss_lm": 1.454375
},
{
"epoch": 11.55500921942225,
"grad_norm": 0.40161266922950745,
"learning_rate": 1.250873839556213e-05,
"loss": 1.4943,
"step": 37600,
"train_loss_gtc": 0.033919677734375,
"train_loss_gtm": 0.005923271179199219,
"train_loss_lm": 1.45078125
},
{
"epoch": 11.585740626920712,
"grad_norm": 0.4867040514945984,
"learning_rate": 1.2296155787575386e-05,
"loss": 1.4963,
"step": 37700,
"train_loss_gtc": 0.03362060546875,
"train_loss_gtm": 0.01107696533203125,
"train_loss_lm": 1.453515625
},
{
"epoch": 11.616472034419177,
"grad_norm": 0.40651935338974,
"learning_rate": 1.208514154629022e-05,
"loss": 1.4943,
"step": 37800,
"train_loss_gtc": 0.034439697265625,
"train_loss_gtm": 0.00758575439453125,
"train_loss_lm": 1.454609375
},
{
"epoch": 11.64720344191764,
"grad_norm": 0.43702617287635803,
"learning_rate": 1.1875704449265423e-05,
"loss": 1.4957,
"step": 37900,
"train_loss_gtc": 0.034952392578125,
"train_loss_gtm": 0.010952072143554687,
"train_loss_lm": 1.454921875
},
{
"epoch": 11.677934849416104,
"grad_norm": 0.3727381229400635,
"learning_rate": 1.1667853208455325e-05,
"loss": 1.4978,
"step": 38000,
"train_loss_gtc": 0.03486572265625,
"train_loss_gtm": 0.015162067413330078,
"train_loss_lm": 1.450859375
},
{
"epoch": 11.708666256914567,
"grad_norm": 0.3844757080078125,
"learning_rate": 1.1461596469847402e-05,
"loss": 1.4953,
"step": 38100,
"train_loss_gtc": 0.035777587890625,
"train_loss_gtm": 0.011620597839355469,
"train_loss_lm": 1.450546875
},
{
"epoch": 11.73939766441303,
"grad_norm": 0.40840184688568115,
"learning_rate": 1.1256942813102634e-05,
"loss": 1.4928,
"step": 38200,
"train_loss_gtc": 0.031209716796875,
"train_loss_gtm": 0.00724945068359375,
"train_loss_lm": 1.45421875
},
{
"epoch": 11.770129071911494,
"grad_norm": 0.6461498141288757,
"learning_rate": 1.1053900751198614e-05,
"loss": 1.4896,
"step": 38300,
"train_loss_gtc": 0.033707275390625,
"train_loss_gtm": 0.007514209747314453,
"train_loss_lm": 1.45015625
},
{
"epoch": 11.800860479409957,
"grad_norm": 0.46932530403137207,
"learning_rate": 1.0852478730075422e-05,
"loss": 1.4971,
"step": 38400,
"train_loss_gtc": 0.0347412109375,
"train_loss_gtm": 0.014281749725341797,
"train_loss_lm": 1.45265625
},
{
"epoch": 11.83159188690842,
"grad_norm": 0.417879194021225,
"learning_rate": 1.0652685128284285e-05,
"loss": 1.493,
"step": 38500,
"train_loss_gtc": 0.034190673828125,
"train_loss_gtm": 0.007110633850097656,
"train_loss_lm": 1.451796875
},
{
"epoch": 11.862323294406885,
"grad_norm": 0.38669833540916443,
"learning_rate": 1.0454528256639095e-05,
"loss": 1.4928,
"step": 38600,
"train_loss_gtc": 0.032156982421875,
"train_loss_gtm": 0.008788909912109375,
"train_loss_lm": 1.45203125
},
{
"epoch": 11.893054701905347,
"grad_norm": 1.0371503829956055,
"learning_rate": 1.0258016357870703e-05,
"loss": 1.4918,
"step": 38700,
"train_loss_gtc": 0.03337646484375,
"train_loss_gtm": 0.007540702819824219,
"train_loss_lm": 1.450390625
},
{
"epoch": 11.92378610940381,
"grad_norm": 0.7227888703346252,
"learning_rate": 1.0063157606284001e-05,
"loss": 1.4903,
"step": 38800,
"train_loss_gtc": 0.032996826171875,
"train_loss_gtm": 0.005477218627929687,
"train_loss_lm": 1.452578125
},
{
"epoch": 11.954517516902275,
"grad_norm": 0.44045162200927734,
"learning_rate": 9.869960107417924e-06,
"loss": 1.4931,
"step": 38900,
"train_loss_gtc": 0.034642333984375,
"train_loss_gtm": 0.009967632293701172,
"train_loss_lm": 1.4534375
},
{
"epoch": 11.985248924400738,
"grad_norm": 0.36739978194236755,
"learning_rate": 9.678431897708279e-06,
"loss": 1.4923,
"step": 39000,
"train_loss_gtc": 0.03304931640625,
"train_loss_gtm": 0.007914905548095702,
"train_loss_lm": 1.45109375
},
{
"epoch": 12.0,
"eval_loss": 1.6339843273162842,
"eval_runtime": 3.8887,
"eval_samples_per_second": 255.872,
"eval_steps_per_second": 2.829,
"step": 39048,
"train_loss_gtc": 0.032511393229166664,
"train_loss_gtm": 0.011383334795633951,
"train_loss_lm": 1.4518229166666667,
"val_loss_gtc": 0.067724609375,
"val_loss_gtm": 0.07337799072265624,
"val_loss_lm": 1.47890625
},
{
"epoch": 12.0159803318992,
"grad_norm": 0.5276215672492981,
"learning_rate": 9.48858094415348e-06,
"loss": 1.4867,
"step": 39100,
"train_loss_gtc": 0.031123234675480768,
"train_loss_gtm": 0.007110412304217999,
"train_loss_lm": 1.4439603365384615
},
{
"epoch": 12.046711739397665,
"grad_norm": 1.3187389373779297,
"learning_rate": 9.300415143983122e-06,
"loss": 1.4823,
"step": 39200,
"train_loss_gtc": 0.03217041015625,
"train_loss_gtm": 0.007877159118652343,
"train_loss_lm": 1.44421875
},
{
"epoch": 12.077443146896128,
"grad_norm": 0.37951648235321045,
"learning_rate": 9.113942324329445e-06,
"loss": 1.4868,
"step": 39300,
"train_loss_gtc": 0.032154541015625,
"train_loss_gtm": 0.006891098022460938,
"train_loss_lm": 1.446171875
},
{
"epoch": 12.10817455439459,
"grad_norm": 0.6352601051330566,
"learning_rate": 8.929170241901807e-06,
"loss": 1.4818,
"step": 39400,
"train_loss_gtc": 0.032747802734375,
"train_loss_gtm": 0.007182502746582031,
"train_loss_lm": 1.445390625
},
{
"epoch": 12.138905961893055,
"grad_norm": 0.46073710918426514,
"learning_rate": 8.746106582663994e-06,
"loss": 1.4839,
"step": 39500,
"train_loss_gtc": 0.03167236328125,
"train_loss_gtm": 0.009096622467041016,
"train_loss_lm": 1.447734375
},
{
"epoch": 12.169637369391518,
"grad_norm": 0.3877211809158325,
"learning_rate": 8.56475896151454e-06,
"loss": 1.4845,
"step": 39600,
"train_loss_gtc": 0.032230224609375,
"train_loss_gtm": 0.005583648681640625,
"train_loss_lm": 1.445
},
{
"epoch": 12.200368776889981,
"grad_norm": 0.5160537362098694,
"learning_rate": 8.385134921969923e-06,
"loss": 1.4865,
"step": 39700,
"train_loss_gtc": 0.032567138671875,
"train_loss_gtm": 0.012664890289306641,
"train_loss_lm": 1.44546875
},
{
"epoch": 12.231100184388445,
"grad_norm": 0.34780940413475037,
"learning_rate": 8.207241935850812e-06,
"loss": 1.4859,
"step": 39800,
"train_loss_gtc": 0.031810302734375,
"train_loss_gtm": 0.00482290267944336,
"train_loss_lm": 1.4471875
},
{
"epoch": 12.261831591886908,
"grad_norm": 0.39777079224586487,
"learning_rate": 8.031087402971232e-06,
"loss": 1.488,
"step": 39900,
"train_loss_gtc": 0.0323828125,
"train_loss_gtm": 0.015415172576904296,
"train_loss_lm": 1.444921875
},
{
"epoch": 12.292562999385371,
"grad_norm": 0.5161352753639221,
"learning_rate": 7.856678650830806e-06,
"loss": 1.4832,
"step": 40000,
"train_loss_gtc": 0.03137939453125,
"train_loss_gtm": 0.0043726348876953125,
"train_loss_lm": 1.44625
},
{
"epoch": 12.323294406883836,
"grad_norm": 0.3717089295387268,
"learning_rate": 7.684022934309926e-06,
"loss": 1.4859,
"step": 40100,
"train_loss_gtc": 0.032230224609375,
"train_loss_gtm": 0.008196029663085937,
"train_loss_lm": 1.44453125
},
{
"epoch": 12.354025814382299,
"grad_norm": 0.4823426902294159,
"learning_rate": 7.513127435367923e-06,
"loss": 1.4862,
"step": 40200,
"train_loss_gtc": 0.032799072265625,
"train_loss_gtm": 0.008565444946289063,
"train_loss_lm": 1.446953125
},
{
"epoch": 12.384757221880761,
"grad_norm": 0.3817342221736908,
"learning_rate": 7.343999262744389e-06,
"loss": 1.4889,
"step": 40300,
"train_loss_gtc": 0.033624267578125,
"train_loss_gtm": 0.00685495376586914,
"train_loss_lm": 1.445234375
},
{
"epoch": 12.415488629379226,
"grad_norm": 0.38065531849861145,
"learning_rate": 7.176645451663433e-06,
"loss": 1.4908,
"step": 40400,
"train_loss_gtc": 0.034915771484375,
"train_loss_gtm": 0.011385536193847657,
"train_loss_lm": 1.443984375
},
{
"epoch": 12.446220036877689,
"grad_norm": 0.39833277463912964,
"learning_rate": 7.011072963541088e-06,
"loss": 1.4832,
"step": 40500,
"train_loss_gtc": 0.031995849609375,
"train_loss_gtm": 0.006886463165283203,
"train_loss_lm": 1.44546875
},
{
"epoch": 12.476951444376152,
"grad_norm": 0.3548543453216553,
"learning_rate": 6.847288685695663e-06,
"loss": 1.4845,
"step": 40600,
"train_loss_gtc": 0.031795654296875,
"train_loss_gtm": 0.010219860076904296,
"train_loss_lm": 1.446171875
},
{
"epoch": 12.507682851874616,
"grad_norm": 0.46865567564964294,
"learning_rate": 6.6852994310613035e-06,
"loss": 1.4804,
"step": 40700,
"train_loss_gtc": 0.03116455078125,
"train_loss_gtm": 0.0027751541137695313,
"train_loss_lm": 1.444921875
},
{
"epoch": 12.538414259373079,
"grad_norm": 0.3493591547012329,
"learning_rate": 6.525111937904565e-06,
"loss": 1.4867,
"step": 40800,
"train_loss_gtc": 0.03113525390625,
"train_loss_gtm": 0.006133708953857422,
"train_loss_lm": 1.446484375
},
{
"epoch": 12.569145666871542,
"grad_norm": 0.3806462287902832,
"learning_rate": 6.366732869544167e-06,
"loss": 1.4847,
"step": 40900,
"train_loss_gtc": 0.032784423828125,
"train_loss_gtm": 0.009026336669921874,
"train_loss_lm": 1.444609375
},
{
"epoch": 12.599877074370006,
"grad_norm": 0.3359711170196533,
"learning_rate": 6.210168814073775e-06,
"loss": 1.4844,
"step": 41000,
"train_loss_gtc": 0.033193359375,
"train_loss_gtm": 0.013145980834960937,
"train_loss_lm": 1.4425
},
{
"epoch": 12.63060848186847,
"grad_norm": 0.3647012412548065,
"learning_rate": 6.0554262840879505e-06,
"loss": 1.4819,
"step": 41100,
"train_loss_gtc": 0.03174072265625,
"train_loss_gtm": 0.0059863471984863284,
"train_loss_lm": 1.445078125
},
{
"epoch": 12.661339889366934,
"grad_norm": 0.3800066411495209,
"learning_rate": 5.902511716411286e-06,
"loss": 1.4832,
"step": 41200,
"train_loss_gtc": 0.03176025390625,
"train_loss_gtm": 0.004956302642822266,
"train_loss_lm": 1.445703125
},
{
"epoch": 12.692071296865397,
"grad_norm": 6.271182060241699,
"learning_rate": 5.75143147183061e-06,
"loss": 1.4843,
"step": 41300,
"train_loss_gtc": 0.032977294921875,
"train_loss_gtm": 0.008317089080810547,
"train_loss_lm": 1.445390625
},
{
"epoch": 12.72280270436386,
"grad_norm": 1.3521143198013306,
"learning_rate": 5.602191834830445e-06,
"loss": 1.4785,
"step": 41400,
"train_loss_gtc": 0.030087890625,
"train_loss_gtm": 0.0036014556884765626,
"train_loss_lm": 1.4446875
},
{
"epoch": 12.753534111862324,
"grad_norm": 0.38900676369667053,
"learning_rate": 5.454799013331546e-06,
"loss": 1.4838,
"step": 41500,
"train_loss_gtc": 0.031859130859375,
"train_loss_gtm": 0.003786640167236328,
"train_loss_lm": 1.444453125
},
{
"epoch": 12.784265519360787,
"grad_norm": 0.36797913908958435,
"learning_rate": 5.309259138432693e-06,
"loss": 1.4843,
"step": 41600,
"train_loss_gtc": 0.031395263671875,
"train_loss_gtm": 0.005061054229736328,
"train_loss_lm": 1.444609375
},
{
"epoch": 12.81499692685925,
"grad_norm": 2.0185465812683105,
"learning_rate": 5.165578264155646e-06,
"loss": 1.4854,
"step": 41700,
"train_loss_gtc": 0.03158447265625,
"train_loss_gtm": 0.007223720550537109,
"train_loss_lm": 1.444609375
},
{
"epoch": 12.845728334357714,
"grad_norm": 0.37788382172584534,
"learning_rate": 5.023762367193336e-06,
"loss": 1.4802,
"step": 41800,
"train_loss_gtc": 0.031046142578125,
"train_loss_gtm": 0.0037957191467285155,
"train_loss_lm": 1.4475
},
{
"epoch": 12.876459741856177,
"grad_norm": 0.31019526720046997,
"learning_rate": 4.883817346661234e-06,
"loss": 1.4895,
"step": 41900,
"train_loss_gtc": 0.033118896484375,
"train_loss_gtm": 0.00923778533935547,
"train_loss_lm": 1.445546875
},
{
"epoch": 12.90719114935464,
"grad_norm": 0.3986211121082306,
"learning_rate": 4.745749023851964e-06,
"loss": 1.483,
"step": 42000,
"train_loss_gtc": 0.03188232421875,
"train_loss_gtm": 0.008430919647216796,
"train_loss_lm": 1.44296875
},
{
"epoch": 12.937922556853104,
"grad_norm": 0.3529811501502991,
"learning_rate": 4.609563141993156e-06,
"loss": 1.4812,
"step": 42100,
"train_loss_gtc": 0.030782470703125,
"train_loss_gtm": 0.0027103614807128906,
"train_loss_lm": 1.442265625
},
{
"epoch": 12.968653964351567,
"grad_norm": 0.3418220579624176,
"learning_rate": 4.475265366008547e-06,
"loss": 1.4829,
"step": 42200,
"train_loss_gtc": 0.03141357421875,
"train_loss_gtm": 0.007238006591796875,
"train_loss_lm": 1.44453125
},
{
"epoch": 12.99938537185003,
"grad_norm": 0.385499507188797,
"learning_rate": 4.342861282282362e-06,
"loss": 1.4841,
"step": 42300,
"train_loss_gtc": 0.032645263671875,
"train_loss_gtm": 0.0034380340576171875,
"train_loss_lm": 1.444921875
},
{
"epoch": 13.0,
"eval_loss": 1.618749976158142,
"eval_runtime": 3.9049,
"eval_samples_per_second": 254.805,
"eval_steps_per_second": 2.817,
"step": 42302,
"train_loss_gtc": 0.043212890625,
"train_loss_gtm": 0.05727386474609375,
"train_loss_lm": 1.453125,
"val_loss_gtc": 0.06478271484375,
"val_loss_gtm": 0.07197847366333007,
"val_loss_lm": 1.47578125
},
{
"epoch": 13.030116779348495,
"grad_norm": 0.4604727327823639,
"learning_rate": 4.212356398426892e-06,
"loss": 1.481,
"step": 42400,
"train_loss_gtc": 0.03175447425063776,
"train_loss_gtm": 0.006234383096500319,
"train_loss_lm": 1.4418845663265305
},
{
"epoch": 13.060848186846957,
"grad_norm": 0.41722559928894043,
"learning_rate": 4.0837561430534135e-06,
"loss": 1.4805,
"step": 42500,
"train_loss_gtc": 0.03138427734375,
"train_loss_gtm": 0.006003303527832031,
"train_loss_lm": 1.4434375
},
{
"epoch": 13.09157959434542,
"grad_norm": 0.3385833501815796,
"learning_rate": 3.957065865546406e-06,
"loss": 1.4773,
"step": 42600,
"train_loss_gtc": 0.032398681640625,
"train_loss_gtm": 0.005317020416259766,
"train_loss_lm": 1.4409375
},
{
"epoch": 13.122311001843885,
"grad_norm": 0.35626187920570374,
"learning_rate": 3.832290835840974e-06,
"loss": 1.4767,
"step": 42700,
"train_loss_gtc": 0.03093505859375,
"train_loss_gtm": 0.0037181663513183596,
"train_loss_lm": 1.440625
},
{
"epoch": 13.153042409342348,
"grad_norm": 0.3810971975326538,
"learning_rate": 3.7094362442036845e-06,
"loss": 1.4776,
"step": 42800,
"train_loss_gtc": 0.03213134765625,
"train_loss_gtm": 0.004517803192138672,
"train_loss_lm": 1.441328125
},
{
"epoch": 13.18377381684081,
"grad_norm": 0.36459001898765564,
"learning_rate": 3.588507201016633e-06,
"loss": 1.4797,
"step": 42900,
"train_loss_gtc": 0.031527099609375,
"train_loss_gtm": 0.009501018524169923,
"train_loss_lm": 1.440546875
},
{
"epoch": 13.214505224339275,
"grad_norm": 0.505306601524353,
"learning_rate": 3.469508736564897e-06,
"loss": 1.4807,
"step": 43000,
"train_loss_gtc": 0.0320068359375,
"train_loss_gtm": 0.006435070037841797,
"train_loss_lm": 1.4425
},
{
"epoch": 13.245236631837738,
"grad_norm": 1.4232553243637085,
"learning_rate": 3.3524458008272475e-06,
"loss": 1.4775,
"step": 43100,
"train_loss_gtc": 0.03030517578125,
"train_loss_gtm": 0.004405345916748047,
"train_loss_lm": 1.44171875
},
{
"epoch": 13.275968039336203,
"grad_norm": 0.37980103492736816,
"learning_rate": 3.2373232632703197e-06,
"loss": 1.4816,
"step": 43200,
"train_loss_gtc": 0.0322021484375,
"train_loss_gtm": 0.0026582717895507813,
"train_loss_lm": 1.4415625
},
{
"epoch": 13.306699446834665,
"grad_norm": 2.2766940593719482,
"learning_rate": 3.1241459126459706e-06,
"loss": 1.4808,
"step": 43300,
"train_loss_gtc": 0.03172119140625,
"train_loss_gtm": 0.008477497100830077,
"train_loss_lm": 1.44171875
},
{
"epoch": 13.337430854333128,
"grad_norm": 0.427611768245697,
"learning_rate": 3.01291845679213e-06,
"loss": 1.4783,
"step": 43400,
"train_loss_gtc": 0.03244873046875,
"train_loss_gtm": 0.0040134239196777345,
"train_loss_lm": 1.440390625
},
{
"epoch": 13.368162261831593,
"grad_norm": 0.30551981925964355,
"learning_rate": 2.9036455224369765e-06,
"loss": 1.4762,
"step": 43500,
"train_loss_gtc": 0.032469482421875,
"train_loss_gtm": 0.003238506317138672,
"train_loss_lm": 1.439453125
},
{
"epoch": 13.398893669330056,
"grad_norm": 0.42366525530815125,
"learning_rate": 2.7963316550064455e-06,
"loss": 1.4821,
"step": 43600,
"train_loss_gtc": 0.033074951171875,
"train_loss_gtm": 0.010958194732666016,
"train_loss_lm": 1.439765625
},
{
"epoch": 13.429625076828518,
"grad_norm": 0.33650246262550354,
"learning_rate": 2.6909813184351873e-06,
"loss": 1.4795,
"step": 43700,
"train_loss_gtc": 0.032664794921875,
"train_loss_gtm": 0.0049641036987304685,
"train_loss_lm": 1.4409375
},
{
"epoch": 13.460356484326983,
"grad_norm": 0.34106993675231934,
"learning_rate": 2.5875988949808472e-06,
"loss": 1.4846,
"step": 43800,
"train_loss_gtc": 0.03381591796875,
"train_loss_gtm": 0.013602008819580078,
"train_loss_lm": 1.441796875
},
{
"epoch": 13.491087891825446,
"grad_norm": 0.3917493224143982,
"learning_rate": 2.486188685041807e-06,
"loss": 1.4821,
"step": 43900,
"train_loss_gtc": 0.031900634765625,
"train_loss_gtm": 0.008092212677001952,
"train_loss_lm": 1.440859375
},
{
"epoch": 13.521819299323909,
"grad_norm": 0.3950476050376892,
"learning_rate": 2.386754906978278e-06,
"loss": 1.4819,
"step": 44000,
"train_loss_gtc": 0.03089111328125,
"train_loss_gtm": 0.008052177429199218,
"train_loss_lm": 1.4409375
},
{
"epoch": 13.552550706822373,
"grad_norm": 1.4824786186218262,
"learning_rate": 2.2893016969368575e-06,
"loss": 1.4889,
"step": 44100,
"train_loss_gtc": 0.03397705078125,
"train_loss_gtm": 0.019720077514648438,
"train_loss_lm": 1.441640625
},
{
"epoch": 13.583282114320836,
"grad_norm": 0.4232785999774933,
"learning_rate": 2.1938331086784335e-06,
"loss": 1.4796,
"step": 44200,
"train_loss_gtc": 0.030982666015625,
"train_loss_gtm": 0.0070129776000976566,
"train_loss_lm": 1.44140625
},
{
"epoch": 13.614013521819299,
"grad_norm": 0.3128654956817627,
"learning_rate": 2.1003531134096255e-06,
"loss": 1.4759,
"step": 44300,
"train_loss_gtc": 0.03089111328125,
"train_loss_gtm": 0.005992927551269531,
"train_loss_lm": 1.43984375
},
{
"epoch": 13.644744929317763,
"grad_norm": 0.3076239824295044,
"learning_rate": 2.0088655996175097e-06,
"loss": 1.4805,
"step": 44400,
"train_loss_gtc": 0.031224365234375,
"train_loss_gtm": 0.006224002838134766,
"train_loss_lm": 1.4428125
},
{
"epoch": 13.675476336816226,
"grad_norm": 0.3875581622123718,
"learning_rate": 1.9193743729079507e-06,
"loss": 1.4787,
"step": 44500,
"train_loss_gtc": 0.030994873046875,
"train_loss_gtm": 0.005257759094238281,
"train_loss_lm": 1.44125
},
{
"epoch": 13.706207744314689,
"grad_norm": 0.3164869546890259,
"learning_rate": 1.8318831558472582e-06,
"loss": 1.4788,
"step": 44600,
"train_loss_gtc": 0.032825927734375,
"train_loss_gtm": 0.008005275726318359,
"train_loss_lm": 1.4421875
},
{
"epoch": 13.736939151813154,
"grad_norm": 0.40070638060569763,
"learning_rate": 1.7463955878073424e-06,
"loss": 1.4785,
"step": 44700,
"train_loss_gtc": 0.031497802734375,
"train_loss_gtm": 0.0062798881530761715,
"train_loss_lm": 1.44125
},
{
"epoch": 13.767670559311616,
"grad_norm": 0.39651504158973694,
"learning_rate": 1.662915224814321e-06,
"loss": 1.4769,
"step": 44800,
"train_loss_gtc": 0.031278076171875,
"train_loss_gtm": 0.00752462387084961,
"train_loss_lm": 1.44140625
},
{
"epoch": 13.79840196681008,
"grad_norm": 0.3495580554008484,
"learning_rate": 1.5814455394006167e-06,
"loss": 1.4801,
"step": 44900,
"train_loss_gtc": 0.03222900390625,
"train_loss_gtm": 0.008823738098144532,
"train_loss_lm": 1.4409375
},
{
"epoch": 13.829133374308544,
"grad_norm": 0.33641329407691956,
"learning_rate": 1.501989920460517e-06,
"loss": 1.4793,
"step": 45000,
"train_loss_gtc": 0.03152587890625,
"train_loss_gtm": 0.005212993621826172,
"train_loss_lm": 1.441484375
},
{
"epoch": 13.859864781807007,
"grad_norm": 0.4456912577152252,
"learning_rate": 1.4245516731091646e-06,
"loss": 1.4772,
"step": 45100,
"train_loss_gtc": 0.03094482421875,
"train_loss_gtm": 0.002681427001953125,
"train_loss_lm": 1.43921875
},
{
"epoch": 13.89059618930547,
"grad_norm": 0.34826260805130005,
"learning_rate": 1.349134018545134e-06,
"loss": 1.4777,
"step": 45200,
"train_loss_gtc": 0.032086181640625,
"train_loss_gtm": 0.006848697662353516,
"train_loss_lm": 1.43953125
},
{
"epoch": 13.921327596803934,
"grad_norm": 0.3667930066585541,
"learning_rate": 1.2757400939163833e-06,
"loss": 1.4779,
"step": 45300,
"train_loss_gtc": 0.032056884765625,
"train_loss_gtm": 0.005229644775390625,
"train_loss_lm": 1.44171875
},
{
"epoch": 13.952059004302397,
"grad_norm": 0.2943558096885681,
"learning_rate": 1.2043729521897752e-06,
"loss": 1.4775,
"step": 45400,
"train_loss_gtc": 0.03116455078125,
"train_loss_gtm": 0.005698661804199218,
"train_loss_lm": 1.440625
},
{
"epoch": 13.98279041180086,
"grad_norm": 0.3945413827896118,
"learning_rate": 1.1350355620241226e-06,
"loss": 1.4789,
"step": 45500,
"train_loss_gtc": 0.03146484375,
"train_loss_gtm": 0.005878944396972657,
"train_loss_lm": 1.442421875
},
{
"epoch": 14.0,
"eval_loss": 1.60546875,
"eval_runtime": 3.9133,
"eval_samples_per_second": 254.264,
"eval_steps_per_second": 2.811,
"step": 45556,
"train_loss_gtc": 0.030979701450892856,
"train_loss_gtm": 0.002463647297450474,
"train_loss_lm": 1.4439174107142858,
"val_loss_gtc": 0.06396484375,
"val_loss_gtm": 0.06325559616088867,
"val_loss_lm": 1.47578125
},
{
"epoch": 14.013521819299324,
"grad_norm": 0.37518543004989624,
"learning_rate": 1.0677308076466385e-06,
"loss": 1.478,
"step": 45600,
"train_loss_gtc": 0.03331687233664773,
"train_loss_gtm": 0.010442083532159979,
"train_loss_lm": 1.4401633522727273
},
{
"epoch": 14.044253226797787,
"grad_norm": 0.3532905876636505,
"learning_rate": 1.002461488733003e-06,
"loss": 1.4728,
"step": 45700,
"train_loss_gtc": 0.0307568359375,
"train_loss_gtm": 0.0067650794982910155,
"train_loss_lm": 1.436328125
},
{
"epoch": 14.07498463429625,
"grad_norm": 0.40793976187705994,
"learning_rate": 9.392303202908848e-07,
"loss": 1.473,
"step": 45800,
"train_loss_gtc": 0.02984375,
"train_loss_gtm": 0.002398052215576172,
"train_loss_lm": 1.439375
},
{
"epoch": 14.105716041794714,
"grad_norm": 0.3508872985839844,
"learning_rate": 8.780399325470313e-07,
"loss": 1.4732,
"step": 45900,
"train_loss_gtc": 0.031231689453125,
"train_loss_gtm": 0.004491233825683593,
"train_loss_lm": 1.43890625
},
{
"epoch": 14.136447449293177,
"grad_norm": 1.9087783098220825,
"learning_rate": 8.188928708378229e-07,
"loss": 1.4757,
"step": 46000,
"train_loss_gtc": 0.030797119140625,
"train_loss_gtm": 0.0036650848388671874,
"train_loss_lm": 1.437265625
},
{
"epoch": 14.167178856791642,
"grad_norm": 0.5606856942176819,
"learning_rate": 7.61791595503425e-07,
"loss": 1.4788,
"step": 46100,
"train_loss_gtc": 0.03298828125,
"train_loss_gtm": 0.008520011901855468,
"train_loss_lm": 1.440625
},
{
"epoch": 14.197910264290105,
"grad_norm": 0.5114173293113708,
"learning_rate": 7.067384817854184e-07,
"loss": 1.4751,
"step": 46200,
"train_loss_gtc": 0.031270751953125,
"train_loss_gtm": 0.006043624877929687,
"train_loss_lm": 1.438828125
},
{
"epoch": 14.228641671788568,
"grad_norm": 0.3046382963657379,
"learning_rate": 6.537358197280241e-07,
"loss": 1.4759,
"step": 46300,
"train_loss_gtc": 0.029881591796875,
"train_loss_gtm": 0.00199981689453125,
"train_loss_lm": 1.44015625
},
{
"epoch": 14.259373079287032,
"grad_norm": 0.3361985683441162,
"learning_rate": 6.027858140828235e-07,
"loss": 1.48,
"step": 46400,
"train_loss_gtc": 0.032630615234375,
"train_loss_gtm": 0.00764068603515625,
"train_loss_lm": 1.438125
},
{
"epoch": 14.290104486785495,
"grad_norm": 0.332077294588089,
"learning_rate": 5.538905842170649e-07,
"loss": 1.4752,
"step": 46500,
"train_loss_gtc": 0.03033203125,
"train_loss_gtm": 0.008144855499267578,
"train_loss_lm": 1.4384375
},
{
"epoch": 14.320835894283958,
"grad_norm": 0.3548417389392853,
"learning_rate": 5.070521640254788e-07,
"loss": 1.4765,
"step": 46600,
"train_loss_gtc": 0.031827392578125,
"train_loss_gtm": 0.005391826629638672,
"train_loss_lm": 1.438671875
},
{
"epoch": 14.351567301782422,
"grad_norm": 0.35536205768585205,
"learning_rate": 4.622725018457008e-07,
"loss": 1.4791,
"step": 46700,
"train_loss_gtc": 0.031151123046875,
"train_loss_gtm": 0.005690097808837891,
"train_loss_lm": 1.439765625
},
{
"epoch": 14.382298709280885,
"grad_norm": 0.36706480383872986,
"learning_rate": 4.1955346037721445e-07,
"loss": 1.4791,
"step": 46800,
"train_loss_gtc": 0.032288818359375,
"train_loss_gtm": 0.011700859069824219,
"train_loss_lm": 1.438515625
},
{
"epoch": 14.413030116779348,
"grad_norm": 0.3751659393310547,
"learning_rate": 3.7889681660386866e-07,
"loss": 1.4776,
"step": 46900,
"train_loss_gtc": 0.03252685546875,
"train_loss_gtm": 0.009815158843994141,
"train_loss_lm": 1.438125
},
{
"epoch": 14.443761524277813,
"grad_norm": 0.32809144258499146,
"learning_rate": 3.403042617199592e-07,
"loss": 1.4792,
"step": 47000,
"train_loss_gtc": 0.032452392578125,
"train_loss_gtm": 0.0022745895385742187,
"train_loss_lm": 1.440546875
},
{
"epoch": 14.474492931776275,
"grad_norm": 0.3384574055671692,
"learning_rate": 3.037774010598793e-07,
"loss": 1.4798,
"step": 47100,
"train_loss_gtc": 0.03126708984375,
"train_loss_gtm": 0.00357696533203125,
"train_loss_lm": 1.44046875
},
{
"epoch": 14.505224339274738,
"grad_norm": 0.3188628554344177,
"learning_rate": 2.6931775403135074e-07,
"loss": 1.4742,
"step": 47200,
"train_loss_gtc": 0.030872802734375,
"train_loss_gtm": 0.0019446945190429688,
"train_loss_lm": 1.439609375
},
{
"epoch": 14.535955746773203,
"grad_norm": 0.6094369292259216,
"learning_rate": 2.369267540522191e-07,
"loss": 1.4732,
"step": 47300,
"train_loss_gtc": 0.031270751953125,
"train_loss_gtm": 0.0019117927551269532,
"train_loss_lm": 1.44015625
},
{
"epoch": 14.566687154271666,
"grad_norm": 0.3574078381061554,
"learning_rate": 2.0660574849081237e-07,
"loss": 1.477,
"step": 47400,
"train_loss_gtc": 0.030413818359375,
"train_loss_gtm": 0.0030179214477539062,
"train_loss_lm": 1.44078125
},
{
"epoch": 14.597418561770128,
"grad_norm": 0.34775152802467346,
"learning_rate": 1.783559986099137e-07,
"loss": 1.4796,
"step": 47500,
"train_loss_gtc": 0.031693115234375,
"train_loss_gtm": 0.0033152008056640623,
"train_loss_lm": 1.4409375
},
{
"epoch": 14.628149969268593,
"grad_norm": 0.3424850106239319,
"learning_rate": 1.521786795142921e-07,
"loss": 1.4761,
"step": 47600,
"train_loss_gtc": 0.029925537109375,
"train_loss_gtm": 0.0017087364196777344,
"train_loss_lm": 1.43875
},
{
"epoch": 14.658881376767056,
"grad_norm": 0.31570205092430115,
"learning_rate": 1.2807488010181945e-07,
"loss": 1.4798,
"step": 47700,
"train_loss_gtc": 0.032105712890625,
"train_loss_gtm": 0.0031629753112792968,
"train_loss_lm": 1.441171875
},
{
"epoch": 14.689612784265519,
"grad_norm": 0.3260713517665863,
"learning_rate": 1.0604560301816224e-07,
"loss": 1.4788,
"step": 47800,
"train_loss_gtc": 0.031483154296875,
"train_loss_gtm": 0.006717433929443359,
"train_loss_lm": 1.439140625
},
{
"epoch": 14.720344191763983,
"grad_norm": 0.3307570219039917,
"learning_rate": 8.609176461510938e-08,
"loss": 1.4739,
"step": 47900,
"train_loss_gtc": 0.03117919921875,
"train_loss_gtm": 0.005090217590332031,
"train_loss_lm": 1.438203125
},
{
"epoch": 14.751075599262446,
"grad_norm": 0.29519036412239075,
"learning_rate": 6.821419491241376e-08,
"loss": 1.472,
"step": 48000,
"train_loss_gtc": 0.030118408203125,
"train_loss_gtm": 0.002861900329589844,
"train_loss_lm": 1.440078125
},
{
"epoch": 14.781807006760909,
"grad_norm": 0.38826602697372437,
"learning_rate": 5.2413637563292205e-08,
"loss": 1.4736,
"step": 48100,
"train_loss_gtc": 0.030203857421875,
"train_loss_gtm": 0.004720573425292969,
"train_loss_lm": 1.43859375
},
{
"epoch": 14.812538414259373,
"grad_norm": 0.4058314263820648,
"learning_rate": 3.8690749823488967e-08,
"loss": 1.4767,
"step": 48200,
"train_loss_gtc": 0.03099365234375,
"train_loss_gtm": 0.005015640258789062,
"train_loss_lm": 1.4378125
},
{
"epoch": 14.843269821757836,
"grad_norm": 0.38652893900871277,
"learning_rate": 2.7046102523919927e-08,
"loss": 1.471,
"step": 48300,
"train_loss_gtc": 0.030880126953125,
"train_loss_gtm": 0.0031046104431152344,
"train_loss_lm": 1.439609375
},
{
"epoch": 14.8740012292563,
"grad_norm": 0.3828943967819214,
"learning_rate": 1.748018004694707e-08,
"loss": 1.4734,
"step": 48400,
"train_loss_gtc": 0.029947509765625,
"train_loss_gtm": 0.0023802757263183595,
"train_loss_lm": 1.43984375
},
{
"epoch": 14.904732636754764,
"grad_norm": 0.3241247832775116,
"learning_rate": 9.993380306222432e-09,
"loss": 1.4751,
"step": 48500,
"train_loss_gtc": 0.0314208984375,
"train_loss_gtm": 0.005055904388427734,
"train_loss_lm": 1.440234375
},
{
"epoch": 14.935464044253226,
"grad_norm": 0.3689234256744385,
"learning_rate": 4.586014730140198e-09,
"loss": 1.477,
"step": 48600,
"train_loss_gtc": 0.032603759765625,
"train_loss_gtm": 0.004544639587402343,
"train_loss_lm": 1.43734375
},
{
"epoch": 14.966195451751691,
"grad_norm": 0.376676082611084,
"learning_rate": 1.2583082488581976e-09,
"loss": 1.4825,
"step": 48700,
"train_loss_gtc": 0.03310546875,
"train_loss_gtm": 0.009465179443359374,
"train_loss_lm": 1.439765625
},
{
"epoch": 14.996926859250154,
"grad_norm": 0.3576776385307312,
"learning_rate": 1.0399284983142465e-11,
"loss": 1.4777,
"step": 48800,
"train_loss_gtc": 0.0317724609375,
"train_loss_gtm": 0.005117168426513672,
"train_loss_lm": 1.439921875
},
{
"epoch": 15.0,
"eval_loss": 1.6144530773162842,
"eval_runtime": 3.9176,
"eval_samples_per_second": 253.984,
"eval_steps_per_second": 2.808,
"step": 48810,
"train_loss_gtc": 0.03424072265625,
"train_loss_gtm": 0.010280990600585937,
"train_loss_lm": 1.44140625,
"val_loss_gtc": 0.0637451171875,
"val_loss_gtm": 0.06403846740722656,
"val_loss_lm": 1.475
}
],
"logging_steps": 100,
"max_steps": 48810,
"num_input_tokens_seen": 0,
"num_train_epochs": 15,
"save_steps": 3,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 48,
"trial_name": null,
"trial_params": null
}