{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1875,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0016,
"grad_norm": 3.443783900713759,
"learning_rate": 0.0,
"loss": 0.9037,
"step": 1
},
{
"epoch": 0.0032,
"grad_norm": 3.6390305366298263,
"learning_rate": 1.0660980810234543e-08,
"loss": 0.8625,
"step": 2
},
{
"epoch": 0.0048,
"grad_norm": 3.659207091105885,
"learning_rate": 2.1321961620469085e-08,
"loss": 0.9171,
"step": 3
},
{
"epoch": 0.0064,
"grad_norm": 3.7880542872635505,
"learning_rate": 3.1982942430703625e-08,
"loss": 0.9464,
"step": 4
},
{
"epoch": 0.008,
"grad_norm": 3.548747527929191,
"learning_rate": 4.264392324093817e-08,
"loss": 0.895,
"step": 5
},
{
"epoch": 0.0096,
"grad_norm": 3.759771231107573,
"learning_rate": 5.330490405117271e-08,
"loss": 0.941,
"step": 6
},
{
"epoch": 0.0112,
"grad_norm": 3.6420819920454246,
"learning_rate": 6.396588486140725e-08,
"loss": 0.8877,
"step": 7
},
{
"epoch": 0.0128,
"grad_norm": 3.6190454711556015,
"learning_rate": 7.462686567164179e-08,
"loss": 0.8941,
"step": 8
},
{
"epoch": 0.0144,
"grad_norm": 3.496287565902599,
"learning_rate": 8.528784648187634e-08,
"loss": 0.8912,
"step": 9
},
{
"epoch": 0.016,
"grad_norm": 3.4493514506690404,
"learning_rate": 9.59488272921109e-08,
"loss": 0.9044,
"step": 10
},
{
"epoch": 0.0176,
"grad_norm": 3.5793331150788594,
"learning_rate": 1.0660980810234542e-07,
"loss": 0.8867,
"step": 11
},
{
"epoch": 0.0192,
"grad_norm": 3.575767404651874,
"learning_rate": 1.1727078891257997e-07,
"loss": 0.8774,
"step": 12
},
{
"epoch": 0.0208,
"grad_norm": 3.359144887048707,
"learning_rate": 1.279317697228145e-07,
"loss": 0.8802,
"step": 13
},
{
"epoch": 0.0224,
"grad_norm": 3.5053119807841733,
"learning_rate": 1.3859275053304905e-07,
"loss": 0.9283,
"step": 14
},
{
"epoch": 0.024,
"grad_norm": 3.359709007748787,
"learning_rate": 1.4925373134328358e-07,
"loss": 0.924,
"step": 15
},
{
"epoch": 0.0256,
"grad_norm": 3.477654638120491,
"learning_rate": 1.5991471215351813e-07,
"loss": 0.9346,
"step": 16
},
{
"epoch": 0.0272,
"grad_norm": 3.406446613844849,
"learning_rate": 1.7057569296375268e-07,
"loss": 0.9043,
"step": 17
},
{
"epoch": 0.0288,
"grad_norm": 3.313197232515461,
"learning_rate": 1.812366737739872e-07,
"loss": 0.8843,
"step": 18
},
{
"epoch": 0.0304,
"grad_norm": 3.4020111065038345,
"learning_rate": 1.918976545842218e-07,
"loss": 0.8978,
"step": 19
},
{
"epoch": 0.032,
"grad_norm": 3.4825014260059666,
"learning_rate": 2.0255863539445632e-07,
"loss": 0.934,
"step": 20
},
{
"epoch": 0.0336,
"grad_norm": 3.0230420303734395,
"learning_rate": 2.1321961620469084e-07,
"loss": 0.8744,
"step": 21
},
{
"epoch": 0.0352,
"grad_norm": 3.024255215285248,
"learning_rate": 2.2388059701492537e-07,
"loss": 0.9069,
"step": 22
},
{
"epoch": 0.0368,
"grad_norm": 2.879070204798837,
"learning_rate": 2.3454157782515995e-07,
"loss": 0.9035,
"step": 23
},
{
"epoch": 0.0384,
"grad_norm": 2.790682421381564,
"learning_rate": 2.4520255863539447e-07,
"loss": 0.8457,
"step": 24
},
{
"epoch": 0.04,
"grad_norm": 2.8724037770448234,
"learning_rate": 2.55863539445629e-07,
"loss": 0.8928,
"step": 25
},
{
"epoch": 0.0416,
"grad_norm": 2.844540912682716,
"learning_rate": 2.665245202558635e-07,
"loss": 0.864,
"step": 26
},
{
"epoch": 0.0432,
"grad_norm": 2.89503566391836,
"learning_rate": 2.771855010660981e-07,
"loss": 0.9328,
"step": 27
},
{
"epoch": 0.0448,
"grad_norm": 2.759272854755458,
"learning_rate": 2.8784648187633263e-07,
"loss": 0.8844,
"step": 28
},
{
"epoch": 0.0464,
"grad_norm": 2.2083388745590598,
"learning_rate": 2.9850746268656716e-07,
"loss": 0.8057,
"step": 29
},
{
"epoch": 0.048,
"grad_norm": 2.1699446591907217,
"learning_rate": 3.0916844349680174e-07,
"loss": 0.8607,
"step": 30
},
{
"epoch": 0.0496,
"grad_norm": 2.05296694589419,
"learning_rate": 3.1982942430703626e-07,
"loss": 0.838,
"step": 31
},
{
"epoch": 0.0512,
"grad_norm": 2.121805383333904,
"learning_rate": 3.3049040511727084e-07,
"loss": 0.8602,
"step": 32
},
{
"epoch": 0.0528,
"grad_norm": 2.011739875446632,
"learning_rate": 3.4115138592750537e-07,
"loss": 0.8457,
"step": 33
},
{
"epoch": 0.0544,
"grad_norm": 1.9523453367834913,
"learning_rate": 3.518123667377399e-07,
"loss": 0.8293,
"step": 34
},
{
"epoch": 0.056,
"grad_norm": 1.981107101239095,
"learning_rate": 3.624733475479744e-07,
"loss": 0.8453,
"step": 35
},
{
"epoch": 0.0576,
"grad_norm": 1.9118185905806608,
"learning_rate": 3.7313432835820895e-07,
"loss": 0.8702,
"step": 36
},
{
"epoch": 0.0592,
"grad_norm": 1.9059114664746757,
"learning_rate": 3.837953091684436e-07,
"loss": 0.8539,
"step": 37
},
{
"epoch": 0.0608,
"grad_norm": 1.945989919146672,
"learning_rate": 3.944562899786781e-07,
"loss": 0.8725,
"step": 38
},
{
"epoch": 0.0624,
"grad_norm": 2.5159843027781186,
"learning_rate": 4.0511727078891263e-07,
"loss": 0.8626,
"step": 39
},
{
"epoch": 0.064,
"grad_norm": 2.5829022218410174,
"learning_rate": 4.1577825159914716e-07,
"loss": 0.8457,
"step": 40
},
{
"epoch": 0.0656,
"grad_norm": 1.8750845002422516,
"learning_rate": 4.264392324093817e-07,
"loss": 0.7791,
"step": 41
},
{
"epoch": 0.0672,
"grad_norm": 1.5188407462797207,
"learning_rate": 4.371002132196162e-07,
"loss": 0.8197,
"step": 42
},
{
"epoch": 0.0688,
"grad_norm": 1.6521693591262758,
"learning_rate": 4.4776119402985074e-07,
"loss": 0.7645,
"step": 43
},
{
"epoch": 0.0704,
"grad_norm": 1.815538999694159,
"learning_rate": 4.5842217484008537e-07,
"loss": 0.7951,
"step": 44
},
{
"epoch": 0.072,
"grad_norm": 2.018098877914276,
"learning_rate": 4.690831556503199e-07,
"loss": 0.8396,
"step": 45
},
{
"epoch": 0.0736,
"grad_norm": 1.8812363250114577,
"learning_rate": 4.797441364605544e-07,
"loss": 0.778,
"step": 46
},
{
"epoch": 0.0752,
"grad_norm": 1.6856485907071797,
"learning_rate": 4.904051172707889e-07,
"loss": 0.8552,
"step": 47
},
{
"epoch": 0.0768,
"grad_norm": 1.6105066317850283,
"learning_rate": 5.010660980810235e-07,
"loss": 0.7828,
"step": 48
},
{
"epoch": 0.0784,
"grad_norm": 1.3835950677015068,
"learning_rate": 5.11727078891258e-07,
"loss": 0.7733,
"step": 49
},
{
"epoch": 0.08,
"grad_norm": 1.211651669134244,
"learning_rate": 5.223880597014925e-07,
"loss": 0.7857,
"step": 50
},
{
"epoch": 0.0816,
"grad_norm": 1.1514550282299225,
"learning_rate": 5.33049040511727e-07,
"loss": 0.8268,
"step": 51
},
{
"epoch": 0.0832,
"grad_norm": 1.2595752472877826,
"learning_rate": 5.437100213219617e-07,
"loss": 0.8165,
"step": 52
},
{
"epoch": 0.0848,
"grad_norm": 1.3240720993253337,
"learning_rate": 5.543710021321962e-07,
"loss": 0.791,
"step": 53
},
{
"epoch": 0.0864,
"grad_norm": 1.4966442377104252,
"learning_rate": 5.650319829424307e-07,
"loss": 0.7705,
"step": 54
},
{
"epoch": 0.088,
"grad_norm": 1.5713862721148337,
"learning_rate": 5.756929637526653e-07,
"loss": 0.7663,
"step": 55
},
{
"epoch": 0.0896,
"grad_norm": 1.495777580142115,
"learning_rate": 5.863539445628998e-07,
"loss": 0.8122,
"step": 56
},
{
"epoch": 0.0912,
"grad_norm": 1.2257308102132334,
"learning_rate": 5.970149253731343e-07,
"loss": 0.7651,
"step": 57
},
{
"epoch": 0.0928,
"grad_norm": 1.0472266320437198,
"learning_rate": 6.076759061833689e-07,
"loss": 0.731,
"step": 58
},
{
"epoch": 0.0944,
"grad_norm": 1.1407667161540418,
"learning_rate": 6.183368869936035e-07,
"loss": 0.7969,
"step": 59
},
{
"epoch": 0.096,
"grad_norm": 1.3216886366508096,
"learning_rate": 6.28997867803838e-07,
"loss": 0.7604,
"step": 60
},
{
"epoch": 0.0976,
"grad_norm": 1.3049071488059805,
"learning_rate": 6.396588486140725e-07,
"loss": 0.744,
"step": 61
},
{
"epoch": 0.0992,
"grad_norm": 1.2834199728358815,
"learning_rate": 6.50319829424307e-07,
"loss": 0.7262,
"step": 62
},
{
"epoch": 0.1008,
"grad_norm": 1.1410102831594338,
"learning_rate": 6.609808102345417e-07,
"loss": 0.7415,
"step": 63
},
{
"epoch": 0.1024,
"grad_norm": 0.9677160586738733,
"learning_rate": 6.716417910447762e-07,
"loss": 0.7602,
"step": 64
},
{
"epoch": 0.104,
"grad_norm": 0.9587451857542828,
"learning_rate": 6.823027718550107e-07,
"loss": 0.7119,
"step": 65
},
{
"epoch": 0.1056,
"grad_norm": 0.9763862874537796,
"learning_rate": 6.929637526652453e-07,
"loss": 0.7234,
"step": 66
},
{
"epoch": 0.1072,
"grad_norm": 1.077711976878752,
"learning_rate": 7.036247334754798e-07,
"loss": 0.7635,
"step": 67
},
{
"epoch": 0.1088,
"grad_norm": 1.0324752460112436,
"learning_rate": 7.142857142857143e-07,
"loss": 0.69,
"step": 68
},
{
"epoch": 0.1104,
"grad_norm": 0.9354890541545591,
"learning_rate": 7.249466950959488e-07,
"loss": 0.7202,
"step": 69
},
{
"epoch": 0.112,
"grad_norm": 0.7898520659815029,
"learning_rate": 7.356076759061834e-07,
"loss": 0.6578,
"step": 70
},
{
"epoch": 0.1136,
"grad_norm": 0.8073968816858748,
"learning_rate": 7.462686567164179e-07,
"loss": 0.7249,
"step": 71
},
{
"epoch": 0.1152,
"grad_norm": 0.7814000776187614,
"learning_rate": 7.569296375266526e-07,
"loss": 0.6911,
"step": 72
},
{
"epoch": 0.1168,
"grad_norm": 0.8545484790533285,
"learning_rate": 7.675906183368872e-07,
"loss": 0.7123,
"step": 73
},
{
"epoch": 0.1184,
"grad_norm": 0.8382679376651664,
"learning_rate": 7.782515991471217e-07,
"loss": 0.6986,
"step": 74
},
{
"epoch": 0.12,
"grad_norm": 0.8634452208398014,
"learning_rate": 7.889125799573562e-07,
"loss": 0.7119,
"step": 75
},
{
"epoch": 0.1216,
"grad_norm": 0.811486309668032,
"learning_rate": 7.995735607675907e-07,
"loss": 0.6783,
"step": 76
},
{
"epoch": 0.1232,
"grad_norm": 0.7304908349636826,
"learning_rate": 8.102345415778253e-07,
"loss": 0.6484,
"step": 77
},
{
"epoch": 0.1248,
"grad_norm": 0.7949603313914639,
"learning_rate": 8.208955223880598e-07,
"loss": 0.6771,
"step": 78
},
{
"epoch": 0.1264,
"grad_norm": 0.7688327380249824,
"learning_rate": 8.315565031982943e-07,
"loss": 0.6832,
"step": 79
},
{
"epoch": 0.128,
"grad_norm": 0.8764394953703323,
"learning_rate": 8.422174840085288e-07,
"loss": 0.7174,
"step": 80
},
{
"epoch": 0.1296,
"grad_norm": 0.8220509077328104,
"learning_rate": 8.528784648187634e-07,
"loss": 0.6862,
"step": 81
},
{
"epoch": 0.1312,
"grad_norm": 0.7947854466997331,
"learning_rate": 8.635394456289979e-07,
"loss": 0.6696,
"step": 82
},
{
"epoch": 0.1328,
"grad_norm": 0.7349807053318366,
"learning_rate": 8.742004264392324e-07,
"loss": 0.6782,
"step": 83
},
{
"epoch": 0.1344,
"grad_norm": 0.7171194189725714,
"learning_rate": 8.848614072494669e-07,
"loss": 0.6538,
"step": 84
},
{
"epoch": 0.136,
"grad_norm": 0.7262304453867153,
"learning_rate": 8.955223880597015e-07,
"loss": 0.6716,
"step": 85
},
{
"epoch": 0.1376,
"grad_norm": 0.7313472659286344,
"learning_rate": 9.06183368869936e-07,
"loss": 0.6937,
"step": 86
},
{
"epoch": 0.1392,
"grad_norm": 0.770645562826172,
"learning_rate": 9.168443496801707e-07,
"loss": 0.6659,
"step": 87
},
{
"epoch": 0.1408,
"grad_norm": 0.783346316746928,
"learning_rate": 9.275053304904053e-07,
"loss": 0.7053,
"step": 88
},
{
"epoch": 0.1424,
"grad_norm": 0.6991667565530584,
"learning_rate": 9.381663113006398e-07,
"loss": 0.6163,
"step": 89
},
{
"epoch": 0.144,
"grad_norm": 0.6930907823765393,
"learning_rate": 9.488272921108743e-07,
"loss": 0.6844,
"step": 90
},
{
"epoch": 0.1456,
"grad_norm": 0.7620232016264051,
"learning_rate": 9.594882729211088e-07,
"loss": 0.6752,
"step": 91
},
{
"epoch": 0.1472,
"grad_norm": 0.743140636169742,
"learning_rate": 9.701492537313434e-07,
"loss": 0.6798,
"step": 92
},
{
"epoch": 0.1488,
"grad_norm": 0.7042055777911003,
"learning_rate": 9.808102345415779e-07,
"loss": 0.6542,
"step": 93
},
{
"epoch": 0.1504,
"grad_norm": 0.7050683591332227,
"learning_rate": 9.914712153518124e-07,
"loss": 0.6467,
"step": 94
},
{
"epoch": 0.152,
"grad_norm": 0.7004797614822288,
"learning_rate": 1.002132196162047e-06,
"loss": 0.6561,
"step": 95
},
{
"epoch": 0.1536,
"grad_norm": 0.7244574630433713,
"learning_rate": 1.0127931769722815e-06,
"loss": 0.6752,
"step": 96
},
{
"epoch": 0.1552,
"grad_norm": 0.6961978517264288,
"learning_rate": 1.023454157782516e-06,
"loss": 0.6521,
"step": 97
},
{
"epoch": 0.1568,
"grad_norm": 0.7180269667132341,
"learning_rate": 1.0341151385927505e-06,
"loss": 0.6491,
"step": 98
},
{
"epoch": 0.1584,
"grad_norm": 0.7227268220641379,
"learning_rate": 1.044776119402985e-06,
"loss": 0.6808,
"step": 99
},
{
"epoch": 0.16,
"grad_norm": 0.7181870774111148,
"learning_rate": 1.0554371002132196e-06,
"loss": 0.6731,
"step": 100
},
{
"epoch": 0.1616,
"grad_norm": 0.728313466676549,
"learning_rate": 1.066098081023454e-06,
"loss": 0.6668,
"step": 101
},
{
"epoch": 0.1632,
"grad_norm": 0.7371923851925003,
"learning_rate": 1.0767590618336886e-06,
"loss": 0.6443,
"step": 102
},
{
"epoch": 0.1648,
"grad_norm": 0.7449273360955099,
"learning_rate": 1.0874200426439234e-06,
"loss": 0.6697,
"step": 103
},
{
"epoch": 0.1664,
"grad_norm": 0.7087640871697763,
"learning_rate": 1.0980810234541579e-06,
"loss": 0.6868,
"step": 104
},
{
"epoch": 0.168,
"grad_norm": 0.7336730798309535,
"learning_rate": 1.1087420042643924e-06,
"loss": 0.6464,
"step": 105
},
{
"epoch": 0.1696,
"grad_norm": 0.7291761151685803,
"learning_rate": 1.119402985074627e-06,
"loss": 0.6577,
"step": 106
},
{
"epoch": 0.1712,
"grad_norm": 0.7305168985226156,
"learning_rate": 1.1300639658848615e-06,
"loss": 0.7146,
"step": 107
},
{
"epoch": 0.1728,
"grad_norm": 0.7296315597098482,
"learning_rate": 1.140724946695096e-06,
"loss": 0.6683,
"step": 108
},
{
"epoch": 0.1744,
"grad_norm": 0.7097744903484696,
"learning_rate": 1.1513859275053305e-06,
"loss": 0.6398,
"step": 109
},
{
"epoch": 0.176,
"grad_norm": 0.7269961360067623,
"learning_rate": 1.162046908315565e-06,
"loss": 0.674,
"step": 110
},
{
"epoch": 0.1776,
"grad_norm": 0.7170312350089441,
"learning_rate": 1.1727078891257996e-06,
"loss": 0.6785,
"step": 111
},
{
"epoch": 0.1792,
"grad_norm": 0.7487760970400611,
"learning_rate": 1.183368869936034e-06,
"loss": 0.7089,
"step": 112
},
{
"epoch": 0.1808,
"grad_norm": 0.7305797004043646,
"learning_rate": 1.1940298507462686e-06,
"loss": 0.6718,
"step": 113
},
{
"epoch": 0.1824,
"grad_norm": 0.7069812667121284,
"learning_rate": 1.2046908315565034e-06,
"loss": 0.6402,
"step": 114
},
{
"epoch": 0.184,
"grad_norm": 0.7504988512166512,
"learning_rate": 1.2153518123667379e-06,
"loss": 0.6572,
"step": 115
},
{
"epoch": 0.1856,
"grad_norm": 0.7106950161112279,
"learning_rate": 1.2260127931769724e-06,
"loss": 0.6519,
"step": 116
},
{
"epoch": 0.1872,
"grad_norm": 0.7708646405486708,
"learning_rate": 1.236673773987207e-06,
"loss": 0.6763,
"step": 117
},
{
"epoch": 0.1888,
"grad_norm": 0.7035298180344508,
"learning_rate": 1.2473347547974415e-06,
"loss": 0.671,
"step": 118
},
{
"epoch": 0.1904,
"grad_norm": 0.7354734585683664,
"learning_rate": 1.257995735607676e-06,
"loss": 0.6427,
"step": 119
},
{
"epoch": 0.192,
"grad_norm": 0.7100024068696028,
"learning_rate": 1.2686567164179105e-06,
"loss": 0.6256,
"step": 120
},
{
"epoch": 0.1936,
"grad_norm": 0.7579478672252553,
"learning_rate": 1.279317697228145e-06,
"loss": 0.6447,
"step": 121
},
{
"epoch": 0.1952,
"grad_norm": 0.7392544814192312,
"learning_rate": 1.2899786780383796e-06,
"loss": 0.6377,
"step": 122
},
{
"epoch": 0.1968,
"grad_norm": 0.729164847634116,
"learning_rate": 1.300639658848614e-06,
"loss": 0.6494,
"step": 123
},
{
"epoch": 0.1984,
"grad_norm": 0.690759653524007,
"learning_rate": 1.3113006396588488e-06,
"loss": 0.6443,
"step": 124
},
{
"epoch": 0.2,
"grad_norm": 0.7160920208659657,
"learning_rate": 1.3219616204690834e-06,
"loss": 0.6411,
"step": 125
},
{
"epoch": 0.2016,
"grad_norm": 0.720471593378039,
"learning_rate": 1.3326226012793179e-06,
"loss": 0.6209,
"step": 126
},
{
"epoch": 0.2032,
"grad_norm": 0.7223708699809386,
"learning_rate": 1.3432835820895524e-06,
"loss": 0.6365,
"step": 127
},
{
"epoch": 0.2048,
"grad_norm": 0.7153278093036772,
"learning_rate": 1.353944562899787e-06,
"loss": 0.6345,
"step": 128
},
{
"epoch": 0.2064,
"grad_norm": 0.7019982722136778,
"learning_rate": 1.3646055437100215e-06,
"loss": 0.6134,
"step": 129
},
{
"epoch": 0.208,
"grad_norm": 0.7075092004932616,
"learning_rate": 1.375266524520256e-06,
"loss": 0.6388,
"step": 130
},
{
"epoch": 0.2096,
"grad_norm": 0.7077101361641048,
"learning_rate": 1.3859275053304905e-06,
"loss": 0.6159,
"step": 131
},
{
"epoch": 0.2112,
"grad_norm": 0.7059765399735562,
"learning_rate": 1.396588486140725e-06,
"loss": 0.606,
"step": 132
},
{
"epoch": 0.2128,
"grad_norm": 0.7103532905249046,
"learning_rate": 1.4072494669509596e-06,
"loss": 0.6356,
"step": 133
},
{
"epoch": 0.2144,
"grad_norm": 0.7318321126082343,
"learning_rate": 1.417910447761194e-06,
"loss": 0.6263,
"step": 134
},
{
"epoch": 0.216,
"grad_norm": 0.6844147627067427,
"learning_rate": 1.4285714285714286e-06,
"loss": 0.6287,
"step": 135
},
{
"epoch": 0.2176,
"grad_norm": 0.7235841968233678,
"learning_rate": 1.4392324093816632e-06,
"loss": 0.6186,
"step": 136
},
{
"epoch": 0.2192,
"grad_norm": 0.7319045516178274,
"learning_rate": 1.4498933901918977e-06,
"loss": 0.6425,
"step": 137
},
{
"epoch": 0.2208,
"grad_norm": 0.7375286593228014,
"learning_rate": 1.4605543710021322e-06,
"loss": 0.6119,
"step": 138
},
{
"epoch": 0.2224,
"grad_norm": 0.7275264913039928,
"learning_rate": 1.4712153518123667e-06,
"loss": 0.6107,
"step": 139
},
{
"epoch": 0.224,
"grad_norm": 0.7577612784375718,
"learning_rate": 1.4818763326226013e-06,
"loss": 0.6244,
"step": 140
},
{
"epoch": 0.2256,
"grad_norm": 0.705941256323019,
"learning_rate": 1.4925373134328358e-06,
"loss": 0.6167,
"step": 141
},
{
"epoch": 0.2272,
"grad_norm": 0.7186107088852769,
"learning_rate": 1.5031982942430705e-06,
"loss": 0.6305,
"step": 142
},
{
"epoch": 0.2288,
"grad_norm": 0.7184996826843989,
"learning_rate": 1.5138592750533053e-06,
"loss": 0.6295,
"step": 143
},
{
"epoch": 0.2304,
"grad_norm": 0.7099819162560698,
"learning_rate": 1.5245202558635398e-06,
"loss": 0.6287,
"step": 144
},
{
"epoch": 0.232,
"grad_norm": 0.7389568794462862,
"learning_rate": 1.5351812366737743e-06,
"loss": 0.6396,
"step": 145
},
{
"epoch": 0.2336,
"grad_norm": 0.7541550155510219,
"learning_rate": 1.5458422174840088e-06,
"loss": 0.6177,
"step": 146
},
{
"epoch": 0.2352,
"grad_norm": 0.7651733268047308,
"learning_rate": 1.5565031982942434e-06,
"loss": 0.6444,
"step": 147
},
{
"epoch": 0.2368,
"grad_norm": 0.7615011387468057,
"learning_rate": 1.5671641791044779e-06,
"loss": 0.6195,
"step": 148
},
{
"epoch": 0.2384,
"grad_norm": 0.7414999526883134,
"learning_rate": 1.5778251599147124e-06,
"loss": 0.6369,
"step": 149
},
{
"epoch": 0.24,
"grad_norm": 0.7301319037721387,
"learning_rate": 1.588486140724947e-06,
"loss": 0.6438,
"step": 150
},
{
"epoch": 0.2416,
"grad_norm": 0.7818881841606198,
"learning_rate": 1.5991471215351815e-06,
"loss": 0.623,
"step": 151
},
{
"epoch": 0.2432,
"grad_norm": 0.7737881500179932,
"learning_rate": 1.609808102345416e-06,
"loss": 0.5829,
"step": 152
},
{
"epoch": 0.2448,
"grad_norm": 0.7205381796324217,
"learning_rate": 1.6204690831556505e-06,
"loss": 0.6099,
"step": 153
},
{
"epoch": 0.2464,
"grad_norm": 0.7318576981930963,
"learning_rate": 1.631130063965885e-06,
"loss": 0.6336,
"step": 154
},
{
"epoch": 0.248,
"grad_norm": 0.8198291644853148,
"learning_rate": 1.6417910447761196e-06,
"loss": 0.6125,
"step": 155
},
{
"epoch": 0.2496,
"grad_norm": 0.7485929239838601,
"learning_rate": 1.652452025586354e-06,
"loss": 0.6107,
"step": 156
},
{
"epoch": 0.2512,
"grad_norm": 0.734476272045767,
"learning_rate": 1.6631130063965886e-06,
"loss": 0.5674,
"step": 157
},
{
"epoch": 0.2528,
"grad_norm": 0.6993584392139643,
"learning_rate": 1.6737739872068232e-06,
"loss": 0.5984,
"step": 158
},
{
"epoch": 0.2544,
"grad_norm": 0.7599971478300804,
"learning_rate": 1.6844349680170577e-06,
"loss": 0.6051,
"step": 159
},
{
"epoch": 0.256,
"grad_norm": 0.744260685014072,
"learning_rate": 1.6950959488272922e-06,
"loss": 0.6133,
"step": 160
},
{
"epoch": 0.2576,
"grad_norm": 0.7192519279626429,
"learning_rate": 1.7057569296375267e-06,
"loss": 0.5707,
"step": 161
},
{
"epoch": 0.2592,
"grad_norm": 0.7872870172965513,
"learning_rate": 1.7164179104477613e-06,
"loss": 0.6267,
"step": 162
},
{
"epoch": 0.2608,
"grad_norm": 0.7707728775648776,
"learning_rate": 1.7270788912579958e-06,
"loss": 0.6311,
"step": 163
},
{
"epoch": 0.2624,
"grad_norm": 0.8356371962280772,
"learning_rate": 1.7377398720682303e-06,
"loss": 0.6,
"step": 164
},
{
"epoch": 0.264,
"grad_norm": 0.7530958899947432,
"learning_rate": 1.7484008528784648e-06,
"loss": 0.6228,
"step": 165
},
{
"epoch": 0.2656,
"grad_norm": 0.7629164202614659,
"learning_rate": 1.7590618336886994e-06,
"loss": 0.6173,
"step": 166
},
{
"epoch": 0.2672,
"grad_norm": 0.827098028261084,
"learning_rate": 1.7697228144989339e-06,
"loss": 0.6007,
"step": 167
},
{
"epoch": 0.2688,
"grad_norm": 0.7589188696810613,
"learning_rate": 1.7803837953091684e-06,
"loss": 0.6229,
"step": 168
},
{
"epoch": 0.2704,
"grad_norm": 0.7589601883183021,
"learning_rate": 1.791044776119403e-06,
"loss": 0.6205,
"step": 169
},
{
"epoch": 0.272,
"grad_norm": 0.8288996421143208,
"learning_rate": 1.8017057569296375e-06,
"loss": 0.6181,
"step": 170
},
{
"epoch": 0.2736,
"grad_norm": 0.7406268388466966,
"learning_rate": 1.812366737739872e-06,
"loss": 0.6008,
"step": 171
},
{
"epoch": 0.2752,
"grad_norm": 0.7167010511482258,
"learning_rate": 1.8230277185501067e-06,
"loss": 0.6138,
"step": 172
},
{
"epoch": 0.2768,
"grad_norm": 0.7977129098669627,
"learning_rate": 1.8336886993603415e-06,
"loss": 0.6398,
"step": 173
},
{
"epoch": 0.2784,
"grad_norm": 0.8566534837663508,
"learning_rate": 1.844349680170576e-06,
"loss": 0.6397,
"step": 174
},
{
"epoch": 0.28,
"grad_norm": 0.7579341388238693,
"learning_rate": 1.8550106609808105e-06,
"loss": 0.6258,
"step": 175
},
{
"epoch": 0.2816,
"grad_norm": 0.833538686471727,
"learning_rate": 1.865671641791045e-06,
"loss": 0.6178,
"step": 176
},
{
"epoch": 0.2832,
"grad_norm": 0.776175666432481,
"learning_rate": 1.8763326226012796e-06,
"loss": 0.5811,
"step": 177
},
{
"epoch": 0.2848,
"grad_norm": 0.7346297393699228,
"learning_rate": 1.886993603411514e-06,
"loss": 0.6211,
"step": 178
},
{
"epoch": 0.2864,
"grad_norm": 0.7684302974057745,
"learning_rate": 1.8976545842217486e-06,
"loss": 0.6055,
"step": 179
},
{
"epoch": 0.288,
"grad_norm": 0.7411137167271561,
"learning_rate": 1.908315565031983e-06,
"loss": 0.6144,
"step": 180
},
{
"epoch": 0.2896,
"grad_norm": 0.7354676749120596,
"learning_rate": 1.9189765458422177e-06,
"loss": 0.5896,
"step": 181
},
{
"epoch": 0.2912,
"grad_norm": 0.7782977909380668,
"learning_rate": 1.929637526652452e-06,
"loss": 0.6155,
"step": 182
},
{
"epoch": 0.2928,
"grad_norm": 0.7356159662149813,
"learning_rate": 1.9402985074626867e-06,
"loss": 0.6058,
"step": 183
},
{
"epoch": 0.2944,
"grad_norm": 0.7453299967245929,
"learning_rate": 1.9509594882729213e-06,
"loss": 0.5688,
"step": 184
},
{
"epoch": 0.296,
"grad_norm": 0.7415112555793427,
"learning_rate": 1.9616204690831558e-06,
"loss": 0.5822,
"step": 185
},
{
"epoch": 0.2976,
"grad_norm": 0.7099834115712084,
"learning_rate": 1.9722814498933903e-06,
"loss": 0.584,
"step": 186
},
{
"epoch": 0.2992,
"grad_norm": 0.7741998117558853,
"learning_rate": 1.982942430703625e-06,
"loss": 0.6286,
"step": 187
},
{
"epoch": 0.3008,
"grad_norm": 0.7703941304690587,
"learning_rate": 1.9936034115138594e-06,
"loss": 0.5885,
"step": 188
},
{
"epoch": 0.3024,
"grad_norm": 0.6971250203143535,
"learning_rate": 2.004264392324094e-06,
"loss": 0.5636,
"step": 189
},
{
"epoch": 0.304,
"grad_norm": 0.7805552614079694,
"learning_rate": 2.0149253731343284e-06,
"loss": 0.6077,
"step": 190
},
{
"epoch": 0.3056,
"grad_norm": 0.8106414675734367,
"learning_rate": 2.025586353944563e-06,
"loss": 0.6196,
"step": 191
},
{
"epoch": 0.3072,
"grad_norm": 0.7898048465687718,
"learning_rate": 2.0362473347547975e-06,
"loss": 0.6203,
"step": 192
},
{
"epoch": 0.3088,
"grad_norm": 0.795622798580363,
"learning_rate": 2.046908315565032e-06,
"loss": 0.6128,
"step": 193
},
{
"epoch": 0.3104,
"grad_norm": 0.739615365221959,
"learning_rate": 2.0575692963752665e-06,
"loss": 0.5656,
"step": 194
},
{
"epoch": 0.312,
"grad_norm": 0.7451740864151697,
"learning_rate": 2.068230277185501e-06,
"loss": 0.6456,
"step": 195
},
{
"epoch": 0.3136,
"grad_norm": 0.7620174503836828,
"learning_rate": 2.0788912579957356e-06,
"loss": 0.6231,
"step": 196
},
{
"epoch": 0.3152,
"grad_norm": 0.7854516907193481,
"learning_rate": 2.08955223880597e-06,
"loss": 0.6043,
"step": 197
},
{
"epoch": 0.3168,
"grad_norm": 0.6584891538592907,
"learning_rate": 2.1002132196162046e-06,
"loss": 0.5464,
"step": 198
},
{
"epoch": 0.3184,
"grad_norm": 0.776844868408368,
"learning_rate": 2.110874200426439e-06,
"loss": 0.6057,
"step": 199
},
{
"epoch": 0.32,
"grad_norm": 0.7850990380214087,
"learning_rate": 2.1215351812366737e-06,
"loss": 0.5612,
"step": 200
},
{
"epoch": 0.3216,
"grad_norm": 0.6885916851336799,
"learning_rate": 2.132196162046908e-06,
"loss": 0.6019,
"step": 201
},
{
"epoch": 0.3232,
"grad_norm": 0.841176153584393,
"learning_rate": 2.1428571428571427e-06,
"loss": 0.5731,
"step": 202
},
{
"epoch": 0.3248,
"grad_norm": 0.7846478827660619,
"learning_rate": 2.1535181236673773e-06,
"loss": 0.5828,
"step": 203
},
{
"epoch": 0.3264,
"grad_norm": 0.7428540418442645,
"learning_rate": 2.1641791044776118e-06,
"loss": 0.5661,
"step": 204
},
{
"epoch": 0.328,
"grad_norm": 0.870995701553606,
"learning_rate": 2.1748400852878467e-06,
"loss": 0.6067,
"step": 205
},
{
"epoch": 0.3296,
"grad_norm": 0.7778388771110853,
"learning_rate": 2.1855010660980813e-06,
"loss": 0.5713,
"step": 206
},
{
"epoch": 0.3312,
"grad_norm": 0.7808137509105751,
"learning_rate": 2.1961620469083158e-06,
"loss": 0.565,
"step": 207
},
{
"epoch": 0.3328,
"grad_norm": 0.853167109393607,
"learning_rate": 2.2068230277185503e-06,
"loss": 0.5857,
"step": 208
},
{
"epoch": 0.3344,
"grad_norm": 0.734590731677972,
"learning_rate": 2.217484008528785e-06,
"loss": 0.5687,
"step": 209
},
{
"epoch": 0.336,
"grad_norm": 0.802493654740643,
"learning_rate": 2.2281449893390194e-06,
"loss": 0.5973,
"step": 210
},
{
"epoch": 0.3376,
"grad_norm": 0.763288832765317,
"learning_rate": 2.238805970149254e-06,
"loss": 0.5878,
"step": 211
},
{
"epoch": 0.3392,
"grad_norm": 0.8031347437576318,
"learning_rate": 2.2494669509594884e-06,
"loss": 0.6072,
"step": 212
},
{
"epoch": 0.3408,
"grad_norm": 0.8434571464754188,
"learning_rate": 2.260127931769723e-06,
"loss": 0.5953,
"step": 213
},
{
"epoch": 0.3424,
"grad_norm": 0.8355518526289846,
"learning_rate": 2.2707889125799575e-06,
"loss": 0.6361,
"step": 214
},
{
"epoch": 0.344,
"grad_norm": 0.7706620507762623,
"learning_rate": 2.281449893390192e-06,
"loss": 0.5814,
"step": 215
},
{
"epoch": 0.3456,
"grad_norm": 0.7638464771267637,
"learning_rate": 2.2921108742004265e-06,
"loss": 0.5506,
"step": 216
},
{
"epoch": 0.3472,
"grad_norm": 0.7814296599045286,
"learning_rate": 2.302771855010661e-06,
"loss": 0.5826,
"step": 217
},
{
"epoch": 0.3488,
"grad_norm": 0.7845339762934284,
"learning_rate": 2.3134328358208956e-06,
"loss": 0.6091,
"step": 218
},
{
"epoch": 0.3504,
"grad_norm": 0.7560660254775938,
"learning_rate": 2.32409381663113e-06,
"loss": 0.5967,
"step": 219
},
{
"epoch": 0.352,
"grad_norm": 0.7508649264529814,
"learning_rate": 2.3347547974413646e-06,
"loss": 0.6053,
"step": 220
},
{
"epoch": 0.3536,
"grad_norm": 0.7210458508613927,
"learning_rate": 2.345415778251599e-06,
"loss": 0.5642,
"step": 221
},
{
"epoch": 0.3552,
"grad_norm": 0.736381434405332,
"learning_rate": 2.3560767590618337e-06,
"loss": 0.5673,
"step": 222
},
{
"epoch": 0.3568,
"grad_norm": 0.7601497997101813,
"learning_rate": 2.366737739872068e-06,
"loss": 0.5991,
"step": 223
},
{
"epoch": 0.3584,
"grad_norm": 0.7084448686385765,
"learning_rate": 2.3773987206823027e-06,
"loss": 0.5662,
"step": 224
},
{
"epoch": 0.36,
"grad_norm": 0.7961580690261183,
"learning_rate": 2.3880597014925373e-06,
"loss": 0.6292,
"step": 225
},
{
"epoch": 0.3616,
"grad_norm": 0.74581691114757,
"learning_rate": 2.398720682302772e-06,
"loss": 0.545,
"step": 226
},
{
"epoch": 0.3632,
"grad_norm": 0.8058971329947406,
"learning_rate": 2.4093816631130067e-06,
"loss": 0.5996,
"step": 227
},
{
"epoch": 0.3648,
"grad_norm": 0.760064099858659,
"learning_rate": 2.4200426439232413e-06,
"loss": 0.603,
"step": 228
},
{
"epoch": 0.3664,
"grad_norm": 0.7549868329039784,
"learning_rate": 2.4307036247334758e-06,
"loss": 0.6114,
"step": 229
},
{
"epoch": 0.368,
"grad_norm": 0.7720717264723672,
"learning_rate": 2.4413646055437103e-06,
"loss": 0.5941,
"step": 230
},
{
"epoch": 0.3696,
"grad_norm": 0.7616410748286908,
"learning_rate": 2.452025586353945e-06,
"loss": 0.5533,
"step": 231
},
{
"epoch": 0.3712,
"grad_norm": 0.7504708721409487,
"learning_rate": 2.4626865671641794e-06,
"loss": 0.5766,
"step": 232
},
{
"epoch": 0.3728,
"grad_norm": 0.7762657496338241,
"learning_rate": 2.473347547974414e-06,
"loss": 0.5978,
"step": 233
},
{
"epoch": 0.3744,
"grad_norm": 0.7831198389239687,
"learning_rate": 2.4840085287846484e-06,
"loss": 0.6233,
"step": 234
},
{
"epoch": 0.376,
"grad_norm": 0.7681970903498869,
"learning_rate": 2.494669509594883e-06,
"loss": 0.6009,
"step": 235
},
{
"epoch": 0.3776,
"grad_norm": 0.7337967052937636,
"learning_rate": 2.5053304904051175e-06,
"loss": 0.5977,
"step": 236
},
{
"epoch": 0.3792,
"grad_norm": 0.7813473994552436,
"learning_rate": 2.515991471215352e-06,
"loss": 0.5749,
"step": 237
},
{
"epoch": 0.3808,
"grad_norm": 0.7161895053490572,
"learning_rate": 2.5266524520255865e-06,
"loss": 0.5761,
"step": 238
},
{
"epoch": 0.3824,
"grad_norm": 0.8038905440863002,
"learning_rate": 2.537313432835821e-06,
"loss": 0.6717,
"step": 239
},
{
"epoch": 0.384,
"grad_norm": 0.7852844599544667,
"learning_rate": 2.5479744136460556e-06,
"loss": 0.5716,
"step": 240
},
{
"epoch": 0.3856,
"grad_norm": 0.7950726788936678,
"learning_rate": 2.55863539445629e-06,
"loss": 0.5855,
"step": 241
},
{
"epoch": 0.3872,
"grad_norm": 0.8502336760749962,
"learning_rate": 2.5692963752665246e-06,
"loss": 0.5564,
"step": 242
},
{
"epoch": 0.3888,
"grad_norm": 0.7742437970001474,
"learning_rate": 2.579957356076759e-06,
"loss": 0.571,
"step": 243
},
{
"epoch": 0.3904,
"grad_norm": 0.8648393139239822,
"learning_rate": 2.5906183368869937e-06,
"loss": 0.5728,
"step": 244
},
{
"epoch": 0.392,
"grad_norm": 0.6979337264095263,
"learning_rate": 2.601279317697228e-06,
"loss": 0.5495,
"step": 245
},
{
"epoch": 0.3936,
"grad_norm": 0.7256603223068586,
"learning_rate": 2.6119402985074627e-06,
"loss": 0.5439,
"step": 246
},
{
"epoch": 0.3952,
"grad_norm": 0.7918254415993081,
"learning_rate": 2.6226012793176977e-06,
"loss": 0.5709,
"step": 247
},
{
"epoch": 0.3968,
"grad_norm": 0.8347112020129184,
"learning_rate": 2.6332622601279318e-06,
"loss": 0.5987,
"step": 248
},
{
"epoch": 0.3984,
"grad_norm": 0.7603267236888341,
"learning_rate": 2.6439232409381667e-06,
"loss": 0.5615,
"step": 249
},
{
"epoch": 0.4,
"grad_norm": 0.81780496643962,
"learning_rate": 2.654584221748401e-06,
"loss": 0.6007,
"step": 250
},
{
"epoch": 0.4016,
"grad_norm": 0.7791704293627059,
"learning_rate": 2.6652452025586358e-06,
"loss": 0.5799,
"step": 251
},
{
"epoch": 0.4032,
"grad_norm": 0.7488384390242884,
"learning_rate": 2.6759061833688703e-06,
"loss": 0.56,
"step": 252
},
{
"epoch": 0.4048,
"grad_norm": 0.7597491390169727,
"learning_rate": 2.686567164179105e-06,
"loss": 0.5794,
"step": 253
},
{
"epoch": 0.4064,
"grad_norm": 0.7923958441674507,
"learning_rate": 2.6972281449893394e-06,
"loss": 0.5708,
"step": 254
},
{
"epoch": 0.408,
"grad_norm": 0.7729640423091064,
"learning_rate": 2.707889125799574e-06,
"loss": 0.6063,
"step": 255
},
{
"epoch": 0.4096,
"grad_norm": 0.8053586551349151,
"learning_rate": 2.7185501066098084e-06,
"loss": 0.6016,
"step": 256
},
{
"epoch": 0.4112,
"grad_norm": 0.8039885381135673,
"learning_rate": 2.729211087420043e-06,
"loss": 0.5848,
"step": 257
},
{
"epoch": 0.4128,
"grad_norm": 0.9513014083370441,
"learning_rate": 2.7398720682302775e-06,
"loss": 0.6345,
"step": 258
},
{
"epoch": 0.4144,
"grad_norm": 0.7742998578313859,
"learning_rate": 2.750533049040512e-06,
"loss": 0.5694,
"step": 259
},
{
"epoch": 0.416,
"grad_norm": 0.7641716911555947,
"learning_rate": 2.7611940298507465e-06,
"loss": 0.587,
"step": 260
},
{
"epoch": 0.4176,
"grad_norm": 0.821831437603987,
"learning_rate": 2.771855010660981e-06,
"loss": 0.5875,
"step": 261
},
{
"epoch": 0.4192,
"grad_norm": 0.7503591578304264,
"learning_rate": 2.7825159914712156e-06,
"loss": 0.6075,
"step": 262
},
{
"epoch": 0.4208,
"grad_norm": 0.719957241909583,
"learning_rate": 2.79317697228145e-06,
"loss": 0.5803,
"step": 263
},
{
"epoch": 0.4224,
"grad_norm": 0.7451469464502543,
"learning_rate": 2.8038379530916846e-06,
"loss": 0.5663,
"step": 264
},
{
"epoch": 0.424,
"grad_norm": 0.80423744402681,
"learning_rate": 2.814498933901919e-06,
"loss": 0.6023,
"step": 265
},
{
"epoch": 0.4256,
"grad_norm": 0.7971922540683725,
"learning_rate": 2.825159914712154e-06,
"loss": 0.5985,
"step": 266
},
{
"epoch": 0.4272,
"grad_norm": 0.769676645099999,
"learning_rate": 2.835820895522388e-06,
"loss": 0.566,
"step": 267
},
{
"epoch": 0.4288,
"grad_norm": 0.7646967688330923,
"learning_rate": 2.846481876332623e-06,
"loss": 0.5374,
"step": 268
},
{
"epoch": 0.4304,
"grad_norm": 0.7833466676671014,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.5523,
"step": 269
},
{
"epoch": 0.432,
"grad_norm": 0.7391354432900153,
"learning_rate": 2.867803837953092e-06,
"loss": 0.5619,
"step": 270
},
{
"epoch": 0.4336,
"grad_norm": 0.8465814174774607,
"learning_rate": 2.8784648187633263e-06,
"loss": 0.6245,
"step": 271
},
{
"epoch": 0.4352,
"grad_norm": 0.7805615042465259,
"learning_rate": 2.8891257995735613e-06,
"loss": 0.6124,
"step": 272
},
{
"epoch": 0.4368,
"grad_norm": 0.8174129655040298,
"learning_rate": 2.8997867803837954e-06,
"loss": 0.5967,
"step": 273
},
{
"epoch": 0.4384,
"grad_norm": 0.7523355194081328,
"learning_rate": 2.9104477611940303e-06,
"loss": 0.5763,
"step": 274
},
{
"epoch": 0.44,
"grad_norm": 0.7377514276234232,
"learning_rate": 2.9211087420042644e-06,
"loss": 0.5545,
"step": 275
},
{
"epoch": 0.4416,
"grad_norm": 0.8215970879872154,
"learning_rate": 2.9317697228144994e-06,
"loss": 0.5889,
"step": 276
},
{
"epoch": 0.4432,
"grad_norm": 0.8344937751406882,
"learning_rate": 2.9424307036247335e-06,
"loss": 0.6108,
"step": 277
},
{
"epoch": 0.4448,
"grad_norm": 0.7745579794081016,
"learning_rate": 2.9530916844349684e-06,
"loss": 0.5786,
"step": 278
},
{
"epoch": 0.4464,
"grad_norm": 0.804786843475639,
"learning_rate": 2.9637526652452025e-06,
"loss": 0.5707,
"step": 279
},
{
"epoch": 0.448,
"grad_norm": 0.8181561905182807,
"learning_rate": 2.9744136460554375e-06,
"loss": 0.6074,
"step": 280
},
{
"epoch": 0.4496,
"grad_norm": 0.8955915763396078,
"learning_rate": 2.9850746268656716e-06,
"loss": 0.5841,
"step": 281
},
{
"epoch": 0.4512,
"grad_norm": 0.7284050946807041,
"learning_rate": 2.9957356076759065e-06,
"loss": 0.5554,
"step": 282
},
{
"epoch": 0.4528,
"grad_norm": 0.8146518309559156,
"learning_rate": 3.006396588486141e-06,
"loss": 0.5881,
"step": 283
},
{
"epoch": 0.4544,
"grad_norm": 0.7768178946322084,
"learning_rate": 3.0170575692963756e-06,
"loss": 0.5533,
"step": 284
},
{
"epoch": 0.456,
"grad_norm": 0.7696150804059677,
"learning_rate": 3.0277185501066105e-06,
"loss": 0.573,
"step": 285
},
{
"epoch": 0.4576,
"grad_norm": 0.7691229758539965,
"learning_rate": 3.0383795309168446e-06,
"loss": 0.5523,
"step": 286
},
{
"epoch": 0.4592,
"grad_norm": 0.742131995359427,
"learning_rate": 3.0490405117270796e-06,
"loss": 0.5502,
"step": 287
},
{
"epoch": 0.4608,
"grad_norm": 0.7655172897040609,
"learning_rate": 3.0597014925373137e-06,
"loss": 0.5758,
"step": 288
},
{
"epoch": 0.4624,
"grad_norm": 0.7691849080534547,
"learning_rate": 3.0703624733475486e-06,
"loss": 0.5791,
"step": 289
},
{
"epoch": 0.464,
"grad_norm": 0.7818739523919556,
"learning_rate": 3.0810234541577827e-06,
"loss": 0.5623,
"step": 290
},
{
"epoch": 0.4656,
"grad_norm": 0.819560066863059,
"learning_rate": 3.0916844349680177e-06,
"loss": 0.5939,
"step": 291
},
{
"epoch": 0.4672,
"grad_norm": 0.8169731024775341,
"learning_rate": 3.1023454157782518e-06,
"loss": 0.5894,
"step": 292
},
{
"epoch": 0.4688,
"grad_norm": 0.7865620876740786,
"learning_rate": 3.1130063965884867e-06,
"loss": 0.5864,
"step": 293
},
{
"epoch": 0.4704,
"grad_norm": 0.8084169911506447,
"learning_rate": 3.123667377398721e-06,
"loss": 0.5836,
"step": 294
},
{
"epoch": 0.472,
"grad_norm": 0.8443142773932891,
"learning_rate": 3.1343283582089558e-06,
"loss": 0.587,
"step": 295
},
{
"epoch": 0.4736,
"grad_norm": 0.8987651915438104,
"learning_rate": 3.14498933901919e-06,
"loss": 0.5771,
"step": 296
},
{
"epoch": 0.4752,
"grad_norm": 0.860270543929545,
"learning_rate": 3.155650319829425e-06,
"loss": 0.5821,
"step": 297
},
{
"epoch": 0.4768,
"grad_norm": 0.824939899329991,
"learning_rate": 3.166311300639659e-06,
"loss": 0.5481,
"step": 298
},
{
"epoch": 0.4784,
"grad_norm": 0.8672862862240402,
"learning_rate": 3.176972281449894e-06,
"loss": 0.5575,
"step": 299
},
{
"epoch": 0.48,
"grad_norm": 0.8182300142439433,
"learning_rate": 3.187633262260128e-06,
"loss": 0.5815,
"step": 300
},
{
"epoch": 0.4816,
"grad_norm": 0.7380772504538786,
"learning_rate": 3.198294243070363e-06,
"loss": 0.5893,
"step": 301
},
{
"epoch": 0.4832,
"grad_norm": 0.7977879234410605,
"learning_rate": 3.208955223880597e-06,
"loss": 0.5423,
"step": 302
},
{
"epoch": 0.4848,
"grad_norm": 0.7369770503071659,
"learning_rate": 3.219616204690832e-06,
"loss": 0.5707,
"step": 303
},
{
"epoch": 0.4864,
"grad_norm": 0.8429735563676172,
"learning_rate": 3.230277185501066e-06,
"loss": 0.5371,
"step": 304
},
{
"epoch": 0.488,
"grad_norm": 0.7462981202694815,
"learning_rate": 3.240938166311301e-06,
"loss": 0.5569,
"step": 305
},
{
"epoch": 0.4896,
"grad_norm": 0.8635339096769287,
"learning_rate": 3.251599147121535e-06,
"loss": 0.5608,
"step": 306
},
{
"epoch": 0.4912,
"grad_norm": 0.7803060364394415,
"learning_rate": 3.26226012793177e-06,
"loss": 0.5667,
"step": 307
},
{
"epoch": 0.4928,
"grad_norm": 0.8494397388711882,
"learning_rate": 3.272921108742004e-06,
"loss": 0.5588,
"step": 308
},
{
"epoch": 0.4944,
"grad_norm": 0.7832352945833961,
"learning_rate": 3.283582089552239e-06,
"loss": 0.5299,
"step": 309
},
{
"epoch": 0.496,
"grad_norm": 0.8062104067281601,
"learning_rate": 3.2942430703624733e-06,
"loss": 0.5696,
"step": 310
},
{
"epoch": 0.4976,
"grad_norm": 0.8054657716765189,
"learning_rate": 3.304904051172708e-06,
"loss": 0.5328,
"step": 311
},
{
"epoch": 0.4992,
"grad_norm": 0.7876354196805521,
"learning_rate": 3.3155650319829423e-06,
"loss": 0.546,
"step": 312
},
{
"epoch": 0.5008,
"grad_norm": 0.8126381266985822,
"learning_rate": 3.3262260127931773e-06,
"loss": 0.6014,
"step": 313
},
{
"epoch": 0.5024,
"grad_norm": 0.8445398126335238,
"learning_rate": 3.336886993603412e-06,
"loss": 0.5658,
"step": 314
},
{
"epoch": 0.504,
"grad_norm": 0.8304476975533555,
"learning_rate": 3.3475479744136463e-06,
"loss": 0.5805,
"step": 315
},
{
"epoch": 0.5056,
"grad_norm": 0.7547981390881123,
"learning_rate": 3.3582089552238813e-06,
"loss": 0.5433,
"step": 316
},
{
"epoch": 0.5072,
"grad_norm": 0.8181361264873499,
"learning_rate": 3.3688699360341154e-06,
"loss": 0.5738,
"step": 317
},
{
"epoch": 0.5088,
"grad_norm": 0.8462943112139383,
"learning_rate": 3.3795309168443503e-06,
"loss": 0.5509,
"step": 318
},
{
"epoch": 0.5104,
"grad_norm": 0.7922383013268821,
"learning_rate": 3.3901918976545844e-06,
"loss": 0.5693,
"step": 319
},
{
"epoch": 0.512,
"grad_norm": 0.8477252050554914,
"learning_rate": 3.4008528784648194e-06,
"loss": 0.5764,
"step": 320
},
{
"epoch": 0.5136,
"grad_norm": 0.8094481547663196,
"learning_rate": 3.4115138592750535e-06,
"loss": 0.5353,
"step": 321
},
{
"epoch": 0.5152,
"grad_norm": 0.8864054471322013,
"learning_rate": 3.4221748400852884e-06,
"loss": 0.575,
"step": 322
},
{
"epoch": 0.5168,
"grad_norm": 0.7876415032718879,
"learning_rate": 3.4328358208955225e-06,
"loss": 0.5414,
"step": 323
},
{
"epoch": 0.5184,
"grad_norm": 0.8210715320082055,
"learning_rate": 3.4434968017057575e-06,
"loss": 0.5521,
"step": 324
},
{
"epoch": 0.52,
"grad_norm": 0.8371168961714202,
"learning_rate": 3.4541577825159916e-06,
"loss": 0.5827,
"step": 325
},
{
"epoch": 0.5216,
"grad_norm": 0.773292575986764,
"learning_rate": 3.4648187633262265e-06,
"loss": 0.5749,
"step": 326
},
{
"epoch": 0.5232,
"grad_norm": 0.850689297987853,
"learning_rate": 3.4754797441364606e-06,
"loss": 0.5691,
"step": 327
},
{
"epoch": 0.5248,
"grad_norm": 0.7507404127475026,
"learning_rate": 3.4861407249466956e-06,
"loss": 0.5269,
"step": 328
},
{
"epoch": 0.5264,
"grad_norm": 0.7845396303400523,
"learning_rate": 3.4968017057569297e-06,
"loss": 0.5565,
"step": 329
},
{
"epoch": 0.528,
"grad_norm": 0.7824098744828355,
"learning_rate": 3.5074626865671646e-06,
"loss": 0.5706,
"step": 330
},
{
"epoch": 0.5296,
"grad_norm": 0.7342948796645827,
"learning_rate": 3.5181236673773987e-06,
"loss": 0.5321,
"step": 331
},
{
"epoch": 0.5312,
"grad_norm": 0.8194834791920059,
"learning_rate": 3.5287846481876337e-06,
"loss": 0.5535,
"step": 332
},
{
"epoch": 0.5328,
"grad_norm": 0.7847452627526185,
"learning_rate": 3.5394456289978678e-06,
"loss": 0.5584,
"step": 333
},
{
"epoch": 0.5344,
"grad_norm": 0.8998666925556803,
"learning_rate": 3.5501066098081027e-06,
"loss": 0.5566,
"step": 334
},
{
"epoch": 0.536,
"grad_norm": 0.7941633997637204,
"learning_rate": 3.560767590618337e-06,
"loss": 0.5991,
"step": 335
},
{
"epoch": 0.5376,
"grad_norm": 0.881854884085909,
"learning_rate": 3.5714285714285718e-06,
"loss": 0.5689,
"step": 336
},
{
"epoch": 0.5392,
"grad_norm": 0.7882024113715358,
"learning_rate": 3.582089552238806e-06,
"loss": 0.5642,
"step": 337
},
{
"epoch": 0.5408,
"grad_norm": 0.8657625305194023,
"learning_rate": 3.592750533049041e-06,
"loss": 0.5461,
"step": 338
},
{
"epoch": 0.5424,
"grad_norm": 0.8581207132915274,
"learning_rate": 3.603411513859275e-06,
"loss": 0.5724,
"step": 339
},
{
"epoch": 0.544,
"grad_norm": 0.8859736113637721,
"learning_rate": 3.61407249466951e-06,
"loss": 0.5302,
"step": 340
},
{
"epoch": 0.5456,
"grad_norm": 0.8073037679646752,
"learning_rate": 3.624733475479744e-06,
"loss": 0.5682,
"step": 341
},
{
"epoch": 0.5472,
"grad_norm": 0.8692275414543921,
"learning_rate": 3.635394456289979e-06,
"loss": 0.5393,
"step": 342
},
{
"epoch": 0.5488,
"grad_norm": 0.8368478492119548,
"learning_rate": 3.6460554371002135e-06,
"loss": 0.5657,
"step": 343
},
{
"epoch": 0.5504,
"grad_norm": 0.7995247958601864,
"learning_rate": 3.656716417910448e-06,
"loss": 0.5556,
"step": 344
},
{
"epoch": 0.552,
"grad_norm": 0.8276090020791603,
"learning_rate": 3.667377398720683e-06,
"loss": 0.545,
"step": 345
},
{
"epoch": 0.5536,
"grad_norm": 0.7681514100554063,
"learning_rate": 3.678038379530917e-06,
"loss": 0.5442,
"step": 346
},
{
"epoch": 0.5552,
"grad_norm": 0.804481990476876,
"learning_rate": 3.688699360341152e-06,
"loss": 0.6083,
"step": 347
},
{
"epoch": 0.5568,
"grad_norm": 0.7405084001120108,
"learning_rate": 3.699360341151386e-06,
"loss": 0.5594,
"step": 348
},
{
"epoch": 0.5584,
"grad_norm": 0.7772627463789789,
"learning_rate": 3.710021321961621e-06,
"loss": 0.575,
"step": 349
},
{
"epoch": 0.56,
"grad_norm": 0.7789355204470892,
"learning_rate": 3.720682302771855e-06,
"loss": 0.5725,
"step": 350
},
{
"epoch": 0.5616,
"grad_norm": 0.8454290799867418,
"learning_rate": 3.73134328358209e-06,
"loss": 0.5542,
"step": 351
},
{
"epoch": 0.5632,
"grad_norm": 0.7990810798027119,
"learning_rate": 3.742004264392324e-06,
"loss": 0.5258,
"step": 352
},
{
"epoch": 0.5648,
"grad_norm": 0.8167499165126225,
"learning_rate": 3.752665245202559e-06,
"loss": 0.5774,
"step": 353
},
{
"epoch": 0.5664,
"grad_norm": 0.8063317607872015,
"learning_rate": 3.7633262260127933e-06,
"loss": 0.5362,
"step": 354
},
{
"epoch": 0.568,
"grad_norm": 0.7932749827016408,
"learning_rate": 3.773987206823028e-06,
"loss": 0.5334,
"step": 355
},
{
"epoch": 0.5696,
"grad_norm": 0.8031722120801229,
"learning_rate": 3.7846481876332623e-06,
"loss": 0.549,
"step": 356
},
{
"epoch": 0.5712,
"grad_norm": 0.805457309127876,
"learning_rate": 3.7953091684434973e-06,
"loss": 0.5457,
"step": 357
},
{
"epoch": 0.5728,
"grad_norm": 0.8063931320319081,
"learning_rate": 3.8059701492537314e-06,
"loss": 0.5713,
"step": 358
},
{
"epoch": 0.5744,
"grad_norm": 0.7733195413815381,
"learning_rate": 3.816631130063966e-06,
"loss": 0.5186,
"step": 359
},
{
"epoch": 0.576,
"grad_norm": 0.8113756279778045,
"learning_rate": 3.827292110874201e-06,
"loss": 0.5601,
"step": 360
},
{
"epoch": 0.5776,
"grad_norm": 0.9030002101601021,
"learning_rate": 3.837953091684435e-06,
"loss": 0.596,
"step": 361
},
{
"epoch": 0.5792,
"grad_norm": 0.7984289955625544,
"learning_rate": 3.84861407249467e-06,
"loss": 0.5447,
"step": 362
},
{
"epoch": 0.5808,
"grad_norm": 0.8131316696626657,
"learning_rate": 3.859275053304904e-06,
"loss": 0.5661,
"step": 363
},
{
"epoch": 0.5824,
"grad_norm": 0.796620771283217,
"learning_rate": 3.869936034115139e-06,
"loss": 0.5412,
"step": 364
},
{
"epoch": 0.584,
"grad_norm": 0.8315896370386743,
"learning_rate": 3.8805970149253735e-06,
"loss": 0.5421,
"step": 365
},
{
"epoch": 0.5856,
"grad_norm": 0.8010472482295572,
"learning_rate": 3.891257995735608e-06,
"loss": 0.5545,
"step": 366
},
{
"epoch": 0.5872,
"grad_norm": 0.85710233342375,
"learning_rate": 3.9019189765458425e-06,
"loss": 0.5292,
"step": 367
},
{
"epoch": 0.5888,
"grad_norm": 0.7800981161456916,
"learning_rate": 3.912579957356077e-06,
"loss": 0.5722,
"step": 368
},
{
"epoch": 0.5904,
"grad_norm": 0.8120785689836847,
"learning_rate": 3.9232409381663116e-06,
"loss": 0.5221,
"step": 369
},
{
"epoch": 0.592,
"grad_norm": 0.7501430227392313,
"learning_rate": 3.933901918976546e-06,
"loss": 0.5066,
"step": 370
},
{
"epoch": 0.5936,
"grad_norm": 0.7687524109570407,
"learning_rate": 3.944562899786781e-06,
"loss": 0.5429,
"step": 371
},
{
"epoch": 0.5952,
"grad_norm": 0.7508289935518603,
"learning_rate": 3.955223880597015e-06,
"loss": 0.5238,
"step": 372
},
{
"epoch": 0.5968,
"grad_norm": 0.7591693655529905,
"learning_rate": 3.96588486140725e-06,
"loss": 0.5444,
"step": 373
},
{
"epoch": 0.5984,
"grad_norm": 0.8451880038162318,
"learning_rate": 3.976545842217484e-06,
"loss": 0.5713,
"step": 374
},
{
"epoch": 0.6,
"grad_norm": 0.758974892187254,
"learning_rate": 3.987206823027719e-06,
"loss": 0.5465,
"step": 375
},
{
"epoch": 0.6016,
"grad_norm": 0.8953319848826586,
"learning_rate": 3.997867803837953e-06,
"loss": 0.5952,
"step": 376
},
{
"epoch": 0.6032,
"grad_norm": 0.831194906832879,
"learning_rate": 4.008528784648188e-06,
"loss": 0.5198,
"step": 377
},
{
"epoch": 0.6048,
"grad_norm": 0.9163151609769153,
"learning_rate": 4.019189765458423e-06,
"loss": 0.5415,
"step": 378
},
{
"epoch": 0.6064,
"grad_norm": 0.869200099149563,
"learning_rate": 4.029850746268657e-06,
"loss": 0.5626,
"step": 379
},
{
"epoch": 0.608,
"grad_norm": 0.8404925217554066,
"learning_rate": 4.040511727078892e-06,
"loss": 0.5302,
"step": 380
},
{
"epoch": 0.6096,
"grad_norm": 0.7826710011465347,
"learning_rate": 4.051172707889126e-06,
"loss": 0.5352,
"step": 381
},
{
"epoch": 0.6112,
"grad_norm": 0.8241550929780408,
"learning_rate": 4.061833688699361e-06,
"loss": 0.5374,
"step": 382
},
{
"epoch": 0.6128,
"grad_norm": 0.7510788009103369,
"learning_rate": 4.072494669509595e-06,
"loss": 0.5391,
"step": 383
},
{
"epoch": 0.6144,
"grad_norm": 0.8061130294153335,
"learning_rate": 4.08315565031983e-06,
"loss": 0.596,
"step": 384
},
{
"epoch": 0.616,
"grad_norm": 0.8080429382071841,
"learning_rate": 4.093816631130064e-06,
"loss": 0.5326,
"step": 385
},
{
"epoch": 0.6176,
"grad_norm": 0.8170177676336794,
"learning_rate": 4.104477611940299e-06,
"loss": 0.5571,
"step": 386
},
{
"epoch": 0.6192,
"grad_norm": 0.8160013454370157,
"learning_rate": 4.115138592750533e-06,
"loss": 0.5292,
"step": 387
},
{
"epoch": 0.6208,
"grad_norm": 0.7967263545203118,
"learning_rate": 4.125799573560768e-06,
"loss": 0.5626,
"step": 388
},
{
"epoch": 0.6224,
"grad_norm": 0.8568044544627053,
"learning_rate": 4.136460554371002e-06,
"loss": 0.5274,
"step": 389
},
{
"epoch": 0.624,
"grad_norm": 0.8066358962082286,
"learning_rate": 4.1471215351812375e-06,
"loss": 0.5488,
"step": 390
},
{
"epoch": 0.6256,
"grad_norm": 0.9667800237647179,
"learning_rate": 4.157782515991471e-06,
"loss": 0.5474,
"step": 391
},
{
"epoch": 0.6272,
"grad_norm": 0.8056755321027613,
"learning_rate": 4.1684434968017065e-06,
"loss": 0.5483,
"step": 392
},
{
"epoch": 0.6288,
"grad_norm": 1.000045905913509,
"learning_rate": 4.17910447761194e-06,
"loss": 0.5391,
"step": 393
},
{
"epoch": 0.6304,
"grad_norm": 0.8010942641365666,
"learning_rate": 4.1897654584221756e-06,
"loss": 0.5361,
"step": 394
},
{
"epoch": 0.632,
"grad_norm": 0.9451955462134625,
"learning_rate": 4.200426439232409e-06,
"loss": 0.5408,
"step": 395
},
{
"epoch": 0.6336,
"grad_norm": 0.7798088066161465,
"learning_rate": 4.211087420042645e-06,
"loss": 0.5189,
"step": 396
},
{
"epoch": 0.6352,
"grad_norm": 0.9464532007112202,
"learning_rate": 4.221748400852878e-06,
"loss": 0.5442,
"step": 397
},
{
"epoch": 0.6368,
"grad_norm": 0.9021258173152447,
"learning_rate": 4.232409381663114e-06,
"loss": 0.5776,
"step": 398
},
{
"epoch": 0.6384,
"grad_norm": 0.9646933747683049,
"learning_rate": 4.243070362473347e-06,
"loss": 0.5534,
"step": 399
},
{
"epoch": 0.64,
"grad_norm": 0.8688602817866495,
"learning_rate": 4.253731343283583e-06,
"loss": 0.5216,
"step": 400
},
{
"epoch": 0.6416,
"grad_norm": 0.9892060933344063,
"learning_rate": 4.264392324093816e-06,
"loss": 0.5332,
"step": 401
},
{
"epoch": 0.6432,
"grad_norm": 0.9385981933705216,
"learning_rate": 4.275053304904052e-06,
"loss": 0.5665,
"step": 402
},
{
"epoch": 0.6448,
"grad_norm": 1.0764580640875403,
"learning_rate": 4.2857142857142855e-06,
"loss": 0.5349,
"step": 403
},
{
"epoch": 0.6464,
"grad_norm": 1.0040496956050853,
"learning_rate": 4.296375266524521e-06,
"loss": 0.5404,
"step": 404
},
{
"epoch": 0.648,
"grad_norm": 0.8827309331973633,
"learning_rate": 4.3070362473347545e-06,
"loss": 0.5868,
"step": 405
},
{
"epoch": 0.6496,
"grad_norm": 0.9809937804440383,
"learning_rate": 4.31769722814499e-06,
"loss": 0.5546,
"step": 406
},
{
"epoch": 0.6512,
"grad_norm": 0.884351522075039,
"learning_rate": 4.3283582089552236e-06,
"loss": 0.5345,
"step": 407
},
{
"epoch": 0.6528,
"grad_norm": 0.9790501988552092,
"learning_rate": 4.339019189765459e-06,
"loss": 0.5599,
"step": 408
},
{
"epoch": 0.6544,
"grad_norm": 0.9071527284238047,
"learning_rate": 4.3496801705756935e-06,
"loss": 0.5054,
"step": 409
},
{
"epoch": 0.656,
"grad_norm": 0.8445713569492348,
"learning_rate": 4.360341151385928e-06,
"loss": 0.5219,
"step": 410
},
{
"epoch": 0.6576,
"grad_norm": 0.8737956626335754,
"learning_rate": 4.3710021321961625e-06,
"loss": 0.5413,
"step": 411
},
{
"epoch": 0.6592,
"grad_norm": 0.8261342813870577,
"learning_rate": 4.381663113006397e-06,
"loss": 0.5539,
"step": 412
},
{
"epoch": 0.6608,
"grad_norm": 0.8184073447460407,
"learning_rate": 4.3923240938166316e-06,
"loss": 0.5426,
"step": 413
},
{
"epoch": 0.6624,
"grad_norm": 0.8366177996309456,
"learning_rate": 4.402985074626866e-06,
"loss": 0.5554,
"step": 414
},
{
"epoch": 0.664,
"grad_norm": 0.8280357274609805,
"learning_rate": 4.413646055437101e-06,
"loss": 0.5308,
"step": 415
},
{
"epoch": 0.6656,
"grad_norm": 0.8377902715368897,
"learning_rate": 4.424307036247335e-06,
"loss": 0.5456,
"step": 416
},
{
"epoch": 0.6672,
"grad_norm": 0.8510022057043192,
"learning_rate": 4.43496801705757e-06,
"loss": 0.5325,
"step": 417
},
{
"epoch": 0.6688,
"grad_norm": 0.9329195540707577,
"learning_rate": 4.445628997867804e-06,
"loss": 0.542,
"step": 418
},
{
"epoch": 0.6704,
"grad_norm": 0.8224587637626823,
"learning_rate": 4.456289978678039e-06,
"loss": 0.5593,
"step": 419
},
{
"epoch": 0.672,
"grad_norm": 0.8512952980107195,
"learning_rate": 4.466950959488273e-06,
"loss": 0.5118,
"step": 420
},
{
"epoch": 0.6736,
"grad_norm": 0.8607386380556552,
"learning_rate": 4.477611940298508e-06,
"loss": 0.5473,
"step": 421
},
{
"epoch": 0.6752,
"grad_norm": 0.8411448840445225,
"learning_rate": 4.488272921108742e-06,
"loss": 0.5114,
"step": 422
},
{
"epoch": 0.6768,
"grad_norm": 0.827972617616303,
"learning_rate": 4.498933901918977e-06,
"loss": 0.5571,
"step": 423
},
{
"epoch": 0.6784,
"grad_norm": 0.8375199395465468,
"learning_rate": 4.509594882729211e-06,
"loss": 0.5976,
"step": 424
},
{
"epoch": 0.68,
"grad_norm": 0.8110166668311388,
"learning_rate": 4.520255863539446e-06,
"loss": 0.5348,
"step": 425
},
{
"epoch": 0.6816,
"grad_norm": 0.7420626199354639,
"learning_rate": 4.53091684434968e-06,
"loss": 0.532,
"step": 426
},
{
"epoch": 0.6832,
"grad_norm": 0.899693608967382,
"learning_rate": 4.541577825159915e-06,
"loss": 0.573,
"step": 427
},
{
"epoch": 0.6848,
"grad_norm": 0.7700059252057434,
"learning_rate": 4.5522388059701495e-06,
"loss": 0.5118,
"step": 428
},
{
"epoch": 0.6864,
"grad_norm": 0.9380211710233359,
"learning_rate": 4.562899786780384e-06,
"loss": 0.5977,
"step": 429
},
{
"epoch": 0.688,
"grad_norm": 0.8526769684369551,
"learning_rate": 4.5735607675906185e-06,
"loss": 0.5487,
"step": 430
},
{
"epoch": 0.6896,
"grad_norm": 0.8710529228457372,
"learning_rate": 4.584221748400853e-06,
"loss": 0.5353,
"step": 431
},
{
"epoch": 0.6912,
"grad_norm": 0.8141253356345576,
"learning_rate": 4.5948827292110876e-06,
"loss": 0.5141,
"step": 432
},
{
"epoch": 0.6928,
"grad_norm": 0.8368614135087445,
"learning_rate": 4.605543710021322e-06,
"loss": 0.5263,
"step": 433
},
{
"epoch": 0.6944,
"grad_norm": 0.8137340425586644,
"learning_rate": 4.616204690831557e-06,
"loss": 0.5548,
"step": 434
},
{
"epoch": 0.696,
"grad_norm": 0.8459034172420522,
"learning_rate": 4.626865671641791e-06,
"loss": 0.5343,
"step": 435
},
{
"epoch": 0.6976,
"grad_norm": 0.7649917545547742,
"learning_rate": 4.637526652452026e-06,
"loss": 0.5399,
"step": 436
},
{
"epoch": 0.6992,
"grad_norm": 0.7892001175990301,
"learning_rate": 4.64818763326226e-06,
"loss": 0.5149,
"step": 437
},
{
"epoch": 0.7008,
"grad_norm": 0.8002693345532315,
"learning_rate": 4.658848614072495e-06,
"loss": 0.5286,
"step": 438
},
{
"epoch": 0.7024,
"grad_norm": 0.7990955003417757,
"learning_rate": 4.669509594882729e-06,
"loss": 0.5366,
"step": 439
},
{
"epoch": 0.704,
"grad_norm": 0.7790282352454362,
"learning_rate": 4.680170575692965e-06,
"loss": 0.5396,
"step": 440
},
{
"epoch": 0.7056,
"grad_norm": 0.8545689926618768,
"learning_rate": 4.690831556503198e-06,
"loss": 0.5117,
"step": 441
},
{
"epoch": 0.7072,
"grad_norm": 0.8207273201254476,
"learning_rate": 4.701492537313434e-06,
"loss": 0.5905,
"step": 442
},
{
"epoch": 0.7088,
"grad_norm": 0.8399233633785136,
"learning_rate": 4.712153518123667e-06,
"loss": 0.516,
"step": 443
},
{
"epoch": 0.7104,
"grad_norm": 0.8508399508493361,
"learning_rate": 4.722814498933903e-06,
"loss": 0.551,
"step": 444
},
{
"epoch": 0.712,
"grad_norm": 0.8184981164850842,
"learning_rate": 4.733475479744136e-06,
"loss": 0.5851,
"step": 445
},
{
"epoch": 0.7136,
"grad_norm": 0.8030486987368878,
"learning_rate": 4.744136460554372e-06,
"loss": 0.5452,
"step": 446
},
{
"epoch": 0.7152,
"grad_norm": 0.8653743154740211,
"learning_rate": 4.7547974413646055e-06,
"loss": 0.5354,
"step": 447
},
{
"epoch": 0.7168,
"grad_norm": 0.7283574727939222,
"learning_rate": 4.765458422174841e-06,
"loss": 0.5335,
"step": 448
},
{
"epoch": 0.7184,
"grad_norm": 0.8315206489545724,
"learning_rate": 4.7761194029850745e-06,
"loss": 0.5433,
"step": 449
},
{
"epoch": 0.72,
"grad_norm": 0.8441786568165387,
"learning_rate": 4.78678038379531e-06,
"loss": 0.5642,
"step": 450
},
{
"epoch": 0.7216,
"grad_norm": 0.8332152020628741,
"learning_rate": 4.797441364605544e-06,
"loss": 0.5816,
"step": 451
},
{
"epoch": 0.7232,
"grad_norm": 0.9173593393357525,
"learning_rate": 4.808102345415779e-06,
"loss": 0.5252,
"step": 452
},
{
"epoch": 0.7248,
"grad_norm": 0.8648604534406458,
"learning_rate": 4.8187633262260135e-06,
"loss": 0.519,
"step": 453
},
{
"epoch": 0.7264,
"grad_norm": 0.8711058461886974,
"learning_rate": 4.829424307036248e-06,
"loss": 0.5751,
"step": 454
},
{
"epoch": 0.728,
"grad_norm": 0.8420986638862816,
"learning_rate": 4.8400852878464825e-06,
"loss": 0.5334,
"step": 455
},
{
"epoch": 0.7296,
"grad_norm": 0.7555701798663501,
"learning_rate": 4.850746268656717e-06,
"loss": 0.5149,
"step": 456
},
{
"epoch": 0.7312,
"grad_norm": 0.7711499442433453,
"learning_rate": 4.8614072494669516e-06,
"loss": 0.5443,
"step": 457
},
{
"epoch": 0.7328,
"grad_norm": 0.8613489542257805,
"learning_rate": 4.872068230277186e-06,
"loss": 0.5647,
"step": 458
},
{
"epoch": 0.7344,
"grad_norm": 0.9106949411264529,
"learning_rate": 4.882729211087421e-06,
"loss": 0.5425,
"step": 459
},
{
"epoch": 0.736,
"grad_norm": 0.8638599204665492,
"learning_rate": 4.893390191897655e-06,
"loss": 0.5275,
"step": 460
},
{
"epoch": 0.7376,
"grad_norm": 0.8785400163755627,
"learning_rate": 4.90405117270789e-06,
"loss": 0.5469,
"step": 461
},
{
"epoch": 0.7392,
"grad_norm": 0.7875425137925275,
"learning_rate": 4.914712153518124e-06,
"loss": 0.5475,
"step": 462
},
{
"epoch": 0.7408,
"grad_norm": 0.9410710482326542,
"learning_rate": 4.925373134328359e-06,
"loss": 0.5452,
"step": 463
},
{
"epoch": 0.7424,
"grad_norm": 0.8002566346058072,
"learning_rate": 4.936034115138593e-06,
"loss": 0.5263,
"step": 464
},
{
"epoch": 0.744,
"grad_norm": 0.8937768916403107,
"learning_rate": 4.946695095948828e-06,
"loss": 0.5818,
"step": 465
},
{
"epoch": 0.7456,
"grad_norm": 0.7637459435110706,
"learning_rate": 4.957356076759062e-06,
"loss": 0.5436,
"step": 466
},
{
"epoch": 0.7472,
"grad_norm": 0.8123190542135358,
"learning_rate": 4.968017057569297e-06,
"loss": 0.5709,
"step": 467
},
{
"epoch": 0.7488,
"grad_norm": 0.8285740674633666,
"learning_rate": 4.978678038379531e-06,
"loss": 0.5609,
"step": 468
},
{
"epoch": 0.7504,
"grad_norm": 0.8762082463372437,
"learning_rate": 4.989339019189766e-06,
"loss": 0.5885,
"step": 469
},
{
"epoch": 0.752,
"grad_norm": 0.8953998312516339,
"learning_rate": 5e-06,
"loss": 0.5488,
"step": 470
},
{
"epoch": 0.7536,
"grad_norm": 0.8613184172440235,
"learning_rate": 5.010660980810235e-06,
"loss": 0.5116,
"step": 471
},
{
"epoch": 0.7552,
"grad_norm": 0.8082855515563323,
"learning_rate": 5.02132196162047e-06,
"loss": 0.5328,
"step": 472
},
{
"epoch": 0.7568,
"grad_norm": 0.8580014764636912,
"learning_rate": 5.031982942430704e-06,
"loss": 0.5176,
"step": 473
},
{
"epoch": 0.7584,
"grad_norm": 0.8397257602418947,
"learning_rate": 5.0426439232409385e-06,
"loss": 0.5538,
"step": 474
},
{
"epoch": 0.76,
"grad_norm": 0.8147459038098553,
"learning_rate": 5.053304904051173e-06,
"loss": 0.5419,
"step": 475
},
{
"epoch": 0.7616,
"grad_norm": 0.8575872649069343,
"learning_rate": 5.063965884861408e-06,
"loss": 0.5379,
"step": 476
},
{
"epoch": 0.7632,
"grad_norm": 0.8075822295459566,
"learning_rate": 5.074626865671642e-06,
"loss": 0.5249,
"step": 477
},
{
"epoch": 0.7648,
"grad_norm": 0.8591596637488736,
"learning_rate": 5.085287846481877e-06,
"loss": 0.5503,
"step": 478
},
{
"epoch": 0.7664,
"grad_norm": 0.7829151047023781,
"learning_rate": 5.095948827292111e-06,
"loss": 0.543,
"step": 479
},
{
"epoch": 0.768,
"grad_norm": 0.9089122932444886,
"learning_rate": 5.1066098081023465e-06,
"loss": 0.5379,
"step": 480
},
{
"epoch": 0.7696,
"grad_norm": 0.8944528930331815,
"learning_rate": 5.11727078891258e-06,
"loss": 0.5501,
"step": 481
},
{
"epoch": 0.7712,
"grad_norm": 0.8811832962850422,
"learning_rate": 5.127931769722815e-06,
"loss": 0.5087,
"step": 482
},
{
"epoch": 0.7728,
"grad_norm": 0.9520455522966428,
"learning_rate": 5.138592750533049e-06,
"loss": 0.5754,
"step": 483
},
{
"epoch": 0.7744,
"grad_norm": 0.9481383621935475,
"learning_rate": 5.149253731343285e-06,
"loss": 0.5402,
"step": 484
},
{
"epoch": 0.776,
"grad_norm": 0.8412227940463143,
"learning_rate": 5.159914712153518e-06,
"loss": 0.5321,
"step": 485
},
{
"epoch": 0.7776,
"grad_norm": 0.9454760034452969,
"learning_rate": 5.170575692963753e-06,
"loss": 0.5378,
"step": 486
},
{
"epoch": 0.7792,
"grad_norm": 0.8533494620573859,
"learning_rate": 5.181236673773987e-06,
"loss": 0.5159,
"step": 487
},
{
"epoch": 0.7808,
"grad_norm": 0.9361516013200838,
"learning_rate": 5.191897654584223e-06,
"loss": 0.515,
"step": 488
},
{
"epoch": 0.7824,
"grad_norm": 0.8782329235928078,
"learning_rate": 5.202558635394456e-06,
"loss": 0.5107,
"step": 489
},
{
"epoch": 0.784,
"grad_norm": 0.9947369706499299,
"learning_rate": 5.213219616204691e-06,
"loss": 0.5237,
"step": 490
},
{
"epoch": 0.7856,
"grad_norm": 0.8189802535019133,
"learning_rate": 5.2238805970149255e-06,
"loss": 0.5588,
"step": 491
},
{
"epoch": 0.7872,
"grad_norm": 1.0555978132118156,
"learning_rate": 5.234541577825161e-06,
"loss": 0.5253,
"step": 492
},
{
"epoch": 0.7888,
"grad_norm": 0.8155783631263221,
"learning_rate": 5.245202558635395e-06,
"loss": 0.4902,
"step": 493
},
{
"epoch": 0.7904,
"grad_norm": 0.9774352650284707,
"learning_rate": 5.255863539445629e-06,
"loss": 0.5559,
"step": 494
},
{
"epoch": 0.792,
"grad_norm": 0.8976663445833297,
"learning_rate": 5.2665245202558636e-06,
"loss": 0.5431,
"step": 495
},
{
"epoch": 0.7936,
"grad_norm": 0.9808412685023696,
"learning_rate": 5.277185501066099e-06,
"loss": 0.5518,
"step": 496
},
{
"epoch": 0.7952,
"grad_norm": 0.8938141729801423,
"learning_rate": 5.2878464818763335e-06,
"loss": 0.5249,
"step": 497
},
{
"epoch": 0.7968,
"grad_norm": 0.9043328808303781,
"learning_rate": 5.298507462686567e-06,
"loss": 0.5401,
"step": 498
},
{
"epoch": 0.7984,
"grad_norm": 0.8623402415295424,
"learning_rate": 5.309168443496802e-06,
"loss": 0.5229,
"step": 499
},
{
"epoch": 0.8,
"grad_norm": 0.8750031520187871,
"learning_rate": 5.319829424307037e-06,
"loss": 0.5207,
"step": 500
},
{
"epoch": 0.8016,
"grad_norm": 0.763266475972754,
"learning_rate": 5.3304904051172716e-06,
"loss": 0.5056,
"step": 501
},
{
"epoch": 0.8032,
"grad_norm": 0.9198507138970855,
"learning_rate": 5.341151385927505e-06,
"loss": 0.538,
"step": 502
},
{
"epoch": 0.8048,
"grad_norm": 0.7977495583319136,
"learning_rate": 5.351812366737741e-06,
"loss": 0.544,
"step": 503
},
{
"epoch": 0.8064,
"grad_norm": 0.8338757340960701,
"learning_rate": 5.362473347547975e-06,
"loss": 0.5123,
"step": 504
},
{
"epoch": 0.808,
"grad_norm": 0.8791799631125965,
"learning_rate": 5.37313432835821e-06,
"loss": 0.5491,
"step": 505
},
{
"epoch": 0.8096,
"grad_norm": 0.8293019370822474,
"learning_rate": 5.383795309168443e-06,
"loss": 0.5212,
"step": 506
},
{
"epoch": 0.8112,
"grad_norm": 0.8554435528861921,
"learning_rate": 5.394456289978679e-06,
"loss": 0.5045,
"step": 507
},
{
"epoch": 0.8128,
"grad_norm": 0.8355974593461122,
"learning_rate": 5.405117270788913e-06,
"loss": 0.5413,
"step": 508
},
{
"epoch": 0.8144,
"grad_norm": 0.77389969820565,
"learning_rate": 5.415778251599148e-06,
"loss": 0.5002,
"step": 509
},
{
"epoch": 0.816,
"grad_norm": 0.8249599592546532,
"learning_rate": 5.4264392324093815e-06,
"loss": 0.5436,
"step": 510
},
{
"epoch": 0.8176,
"grad_norm": 0.8228797504357074,
"learning_rate": 5.437100213219617e-06,
"loss": 0.5419,
"step": 511
},
{
"epoch": 0.8192,
"grad_norm": 0.8300496904175892,
"learning_rate": 5.447761194029851e-06,
"loss": 0.5532,
"step": 512
},
{
"epoch": 0.8208,
"grad_norm": 0.7994881849240946,
"learning_rate": 5.458422174840086e-06,
"loss": 0.5315,
"step": 513
},
{
"epoch": 0.8224,
"grad_norm": 0.8694989380132128,
"learning_rate": 5.4690831556503196e-06,
"loss": 0.5375,
"step": 514
},
{
"epoch": 0.824,
"grad_norm": 0.7776010437173337,
"learning_rate": 5.479744136460555e-06,
"loss": 0.5435,
"step": 515
},
{
"epoch": 0.8256,
"grad_norm": 0.9503455442662696,
"learning_rate": 5.4904051172707895e-06,
"loss": 0.5034,
"step": 516
},
{
"epoch": 0.8272,
"grad_norm": 0.8213271153418595,
"learning_rate": 5.501066098081024e-06,
"loss": 0.5644,
"step": 517
},
{
"epoch": 0.8288,
"grad_norm": 0.8824829583017852,
"learning_rate": 5.511727078891258e-06,
"loss": 0.5265,
"step": 518
},
{
"epoch": 0.8304,
"grad_norm": 0.8451810272877978,
"learning_rate": 5.522388059701493e-06,
"loss": 0.5502,
"step": 519
},
{
"epoch": 0.832,
"grad_norm": 0.8746711744310373,
"learning_rate": 5.5330490405117276e-06,
"loss": 0.5338,
"step": 520
},
{
"epoch": 0.8336,
"grad_norm": 0.8679819152550929,
"learning_rate": 5.543710021321962e-06,
"loss": 0.5533,
"step": 521
},
{
"epoch": 0.8352,
"grad_norm": 0.8679233312343541,
"learning_rate": 5.554371002132196e-06,
"loss": 0.5309,
"step": 522
},
{
"epoch": 0.8368,
"grad_norm": 0.7910814695342003,
"learning_rate": 5.565031982942431e-06,
"loss": 0.5267,
"step": 523
},
{
"epoch": 0.8384,
"grad_norm": 0.8342811987095753,
"learning_rate": 5.575692963752666e-06,
"loss": 0.5372,
"step": 524
},
{
"epoch": 0.84,
"grad_norm": 0.7834131537642278,
"learning_rate": 5.5863539445629e-06,
"loss": 0.5229,
"step": 525
},
{
"epoch": 0.8416,
"grad_norm": 0.8614673719559317,
"learning_rate": 5.597014925373134e-06,
"loss": 0.5294,
"step": 526
},
{
"epoch": 0.8432,
"grad_norm": 0.7798874148485664,
"learning_rate": 5.607675906183369e-06,
"loss": 0.5445,
"step": 527
},
{
"epoch": 0.8448,
"grad_norm": 0.8596496489429021,
"learning_rate": 5.618336886993604e-06,
"loss": 0.5652,
"step": 528
},
{
"epoch": 0.8464,
"grad_norm": 0.793807162427068,
"learning_rate": 5.628997867803838e-06,
"loss": 0.5343,
"step": 529
},
{
"epoch": 0.848,
"grad_norm": 0.8449495883957229,
"learning_rate": 5.639658848614073e-06,
"loss": 0.5655,
"step": 530
},
{
"epoch": 0.8496,
"grad_norm": 0.8182016637240901,
"learning_rate": 5.650319829424308e-06,
"loss": 0.5856,
"step": 531
},
{
"epoch": 0.8512,
"grad_norm": 0.8109408391980678,
"learning_rate": 5.660980810234542e-06,
"loss": 0.5121,
"step": 532
},
{
"epoch": 0.8528,
"grad_norm": 0.8265194308547619,
"learning_rate": 5.671641791044776e-06,
"loss": 0.5358,
"step": 533
},
{
"epoch": 0.8544,
"grad_norm": 0.8204400281580362,
"learning_rate": 5.682302771855012e-06,
"loss": 0.559,
"step": 534
},
{
"epoch": 0.856,
"grad_norm": 0.8270305363765442,
"learning_rate": 5.692963752665246e-06,
"loss": 0.5239,
"step": 535
},
{
"epoch": 0.8576,
"grad_norm": 0.8714688502657366,
"learning_rate": 5.70362473347548e-06,
"loss": 0.5606,
"step": 536
},
{
"epoch": 0.8592,
"grad_norm": 0.8266359853649553,
"learning_rate": 5.7142857142857145e-06,
"loss": 0.5529,
"step": 537
},
{
"epoch": 0.8608,
"grad_norm": 0.7423922030019656,
"learning_rate": 5.72494669509595e-06,
"loss": 0.5439,
"step": 538
},
{
"epoch": 0.8624,
"grad_norm": 0.7948415966659854,
"learning_rate": 5.735607675906184e-06,
"loss": 0.5104,
"step": 539
},
{
"epoch": 0.864,
"grad_norm": 0.8184234776238962,
"learning_rate": 5.746268656716418e-06,
"loss": 0.5512,
"step": 540
},
{
"epoch": 0.8656,
"grad_norm": 0.8735831797222657,
"learning_rate": 5.756929637526653e-06,
"loss": 0.5489,
"step": 541
},
{
"epoch": 0.8672,
"grad_norm": 0.7696842650627593,
"learning_rate": 5.767590618336888e-06,
"loss": 0.5367,
"step": 542
},
{
"epoch": 0.8688,
"grad_norm": 0.8541064682006467,
"learning_rate": 5.7782515991471225e-06,
"loss": 0.5463,
"step": 543
},
{
"epoch": 0.8704,
"grad_norm": 0.893401210150427,
"learning_rate": 5.788912579957356e-06,
"loss": 0.4927,
"step": 544
},
{
"epoch": 0.872,
"grad_norm": 0.9094618080161737,
"learning_rate": 5.799573560767591e-06,
"loss": 0.5112,
"step": 545
},
{
"epoch": 0.8736,
"grad_norm": 0.9163620143155544,
"learning_rate": 5.810234541577826e-06,
"loss": 0.5242,
"step": 546
},
{
"epoch": 0.8752,
"grad_norm": 0.854627870380798,
"learning_rate": 5.820895522388061e-06,
"loss": 0.5097,
"step": 547
},
{
"epoch": 0.8768,
"grad_norm": 0.9049857621042802,
"learning_rate": 5.831556503198294e-06,
"loss": 0.4897,
"step": 548
},
{
"epoch": 0.8784,
"grad_norm": 0.841595814510566,
"learning_rate": 5.842217484008529e-06,
"loss": 0.5235,
"step": 549
},
{
"epoch": 0.88,
"grad_norm": 0.981541003468602,
"learning_rate": 5.852878464818764e-06,
"loss": 0.5183,
"step": 550
},
{
"epoch": 0.8816,
"grad_norm": 0.888025361245545,
"learning_rate": 5.863539445628999e-06,
"loss": 0.5338,
"step": 551
},
{
"epoch": 0.8832,
"grad_norm": 0.8532402860379611,
"learning_rate": 5.874200426439232e-06,
"loss": 0.5877,
"step": 552
},
{
"epoch": 0.8848,
"grad_norm": 0.827294528054697,
"learning_rate": 5.884861407249467e-06,
"loss": 0.4924,
"step": 553
},
{
"epoch": 0.8864,
"grad_norm": 0.9526621297802707,
"learning_rate": 5.895522388059702e-06,
"loss": 0.5455,
"step": 554
},
{
"epoch": 0.888,
"grad_norm": 0.9046496486378539,
"learning_rate": 5.906183368869937e-06,
"loss": 0.5171,
"step": 555
},
{
"epoch": 0.8896,
"grad_norm": 0.9025775673297872,
"learning_rate": 5.9168443496801705e-06,
"loss": 0.4893,
"step": 556
},
{
"epoch": 0.8912,
"grad_norm": 0.8574022155093447,
"learning_rate": 5.927505330490405e-06,
"loss": 0.5433,
"step": 557
},
{
"epoch": 0.8928,
"grad_norm": 0.8231304371582934,
"learning_rate": 5.93816631130064e-06,
"loss": 0.4815,
"step": 558
},
{
"epoch": 0.8944,
"grad_norm": 0.9173680501366946,
"learning_rate": 5.948827292110875e-06,
"loss": 0.5544,
"step": 559
},
{
"epoch": 0.896,
"grad_norm": 0.8785255714629714,
"learning_rate": 5.959488272921109e-06,
"loss": 0.5251,
"step": 560
},
{
"epoch": 0.8976,
"grad_norm": 0.8539243959014604,
"learning_rate": 5.970149253731343e-06,
"loss": 0.558,
"step": 561
},
{
"epoch": 0.8992,
"grad_norm": 0.7961416140257315,
"learning_rate": 5.9808102345415785e-06,
"loss": 0.5193,
"step": 562
},
{
"epoch": 0.9008,
"grad_norm": 0.9080999868996099,
"learning_rate": 5.991471215351813e-06,
"loss": 0.5312,
"step": 563
},
{
"epoch": 0.9024,
"grad_norm": 0.813439181463869,
"learning_rate": 6.002132196162047e-06,
"loss": 0.4965,
"step": 564
},
{
"epoch": 0.904,
"grad_norm": 0.9219168145673883,
"learning_rate": 6.012793176972282e-06,
"loss": 0.5396,
"step": 565
},
{
"epoch": 0.9056,
"grad_norm": 0.794480434630688,
"learning_rate": 6.023454157782517e-06,
"loss": 0.5393,
"step": 566
},
{
"epoch": 0.9072,
"grad_norm": 0.8548750027758196,
"learning_rate": 6.034115138592751e-06,
"loss": 0.5026,
"step": 567
},
{
"epoch": 0.9088,
"grad_norm": 0.8251007973212211,
"learning_rate": 6.044776119402986e-06,
"loss": 0.5099,
"step": 568
},
{
"epoch": 0.9104,
"grad_norm": 0.8345484986699218,
"learning_rate": 6.055437100213221e-06,
"loss": 0.5399,
"step": 569
},
{
"epoch": 0.912,
"grad_norm": 0.8483664276369052,
"learning_rate": 6.066098081023455e-06,
"loss": 0.5209,
"step": 570
},
{
"epoch": 0.9136,
"grad_norm": 0.8108353261117107,
"learning_rate": 6.076759061833689e-06,
"loss": 0.5167,
"step": 571
},
{
"epoch": 0.9152,
"grad_norm": 0.8063591795914883,
"learning_rate": 6.087420042643924e-06,
"loss": 0.4935,
"step": 572
},
{
"epoch": 0.9168,
"grad_norm": 0.828769688833686,
"learning_rate": 6.098081023454159e-06,
"loss": 0.5483,
"step": 573
},
{
"epoch": 0.9184,
"grad_norm": 0.8555238456757491,
"learning_rate": 6.108742004264393e-06,
"loss": 0.5534,
"step": 574
},
{
"epoch": 0.92,
"grad_norm": 0.8315415336747674,
"learning_rate": 6.119402985074627e-06,
"loss": 0.5221,
"step": 575
},
{
"epoch": 0.9216,
"grad_norm": 0.8565368931538652,
"learning_rate": 6.130063965884862e-06,
"loss": 0.46,
"step": 576
},
{
"epoch": 0.9232,
"grad_norm": 0.8259783667037305,
"learning_rate": 6.140724946695097e-06,
"loss": 0.5222,
"step": 577
},
{
"epoch": 0.9248,
"grad_norm": 0.8933835874543237,
"learning_rate": 6.151385927505331e-06,
"loss": 0.5229,
"step": 578
},
{
"epoch": 0.9264,
"grad_norm": 0.8299341146771261,
"learning_rate": 6.1620469083155655e-06,
"loss": 0.5162,
"step": 579
},
{
"epoch": 0.928,
"grad_norm": 0.851826227347469,
"learning_rate": 6.1727078891258e-06,
"loss": 0.5294,
"step": 580
},
{
"epoch": 0.9296,
"grad_norm": 0.9771804477958785,
"learning_rate": 6.183368869936035e-06,
"loss": 0.5493,
"step": 581
},
{
"epoch": 0.9312,
"grad_norm": 0.844552453653095,
"learning_rate": 6.194029850746269e-06,
"loss": 0.5122,
"step": 582
},
{
"epoch": 0.9328,
"grad_norm": 0.8985001503860374,
"learning_rate": 6.2046908315565036e-06,
"loss": 0.5209,
"step": 583
},
{
"epoch": 0.9344,
"grad_norm": 0.8405023016845977,
"learning_rate": 6.215351812366738e-06,
"loss": 0.5229,
"step": 584
},
{
"epoch": 0.936,
"grad_norm": 0.9290507990979149,
"learning_rate": 6.2260127931769735e-06,
"loss": 0.5177,
"step": 585
},
{
"epoch": 0.9376,
"grad_norm": 0.8051113618708801,
"learning_rate": 6.236673773987207e-06,
"loss": 0.5399,
"step": 586
},
{
"epoch": 0.9392,
"grad_norm": 0.8643207774567074,
"learning_rate": 6.247334754797442e-06,
"loss": 0.5239,
"step": 587
},
{
"epoch": 0.9408,
"grad_norm": 0.7556872262805366,
"learning_rate": 6.257995735607676e-06,
"loss": 0.5382,
"step": 588
},
{
"epoch": 0.9424,
"grad_norm": 0.9526333574647944,
"learning_rate": 6.2686567164179116e-06,
"loss": 0.5377,
"step": 589
},
{
"epoch": 0.944,
"grad_norm": 0.7958583963770268,
"learning_rate": 6.279317697228145e-06,
"loss": 0.4945,
"step": 590
},
{
"epoch": 0.9456,
"grad_norm": 0.9757428010352267,
"learning_rate": 6.28997867803838e-06,
"loss": 0.531,
"step": 591
},
{
"epoch": 0.9472,
"grad_norm": 0.9484280544374282,
"learning_rate": 6.300639658848614e-06,
"loss": 0.5344,
"step": 592
},
{
"epoch": 0.9488,
"grad_norm": 0.8953536000746694,
"learning_rate": 6.31130063965885e-06,
"loss": 0.4969,
"step": 593
},
{
"epoch": 0.9504,
"grad_norm": 1.0762465973095379,
"learning_rate": 6.321961620469083e-06,
"loss": 0.5615,
"step": 594
},
{
"epoch": 0.952,
"grad_norm": 0.8868282604227625,
"learning_rate": 6.332622601279318e-06,
"loss": 0.5015,
"step": 595
},
{
"epoch": 0.9536,
"grad_norm": 0.9387462058082829,
"learning_rate": 6.343283582089553e-06,
"loss": 0.5384,
"step": 596
},
{
"epoch": 0.9552,
"grad_norm": 1.038161905784565,
"learning_rate": 6.353944562899788e-06,
"loss": 0.5406,
"step": 597
},
{
"epoch": 0.9568,
"grad_norm": 0.8349063964387037,
"learning_rate": 6.3646055437100215e-06,
"loss": 0.5128,
"step": 598
},
{
"epoch": 0.9584,
"grad_norm": 0.891273791046686,
"learning_rate": 6.375266524520256e-06,
"loss": 0.5145,
"step": 599
},
{
"epoch": 0.96,
"grad_norm": 0.8487119519500591,
"learning_rate": 6.385927505330491e-06,
"loss": 0.5433,
"step": 600
},
{
"epoch": 0.9616,
"grad_norm": 0.8727449705126482,
"learning_rate": 6.396588486140726e-06,
"loss": 0.5415,
"step": 601
},
{
"epoch": 0.9632,
"grad_norm": 0.7867763723105303,
"learning_rate": 6.4072494669509596e-06,
"loss": 0.4936,
"step": 602
},
{
"epoch": 0.9648,
"grad_norm": 0.8287779412757438,
"learning_rate": 6.417910447761194e-06,
"loss": 0.5278,
"step": 603
},
{
"epoch": 0.9664,
"grad_norm": 0.7745857956565567,
"learning_rate": 6.4285714285714295e-06,
"loss": 0.495,
"step": 604
},
{
"epoch": 0.968,
"grad_norm": 0.8270024054653756,
"learning_rate": 6.439232409381664e-06,
"loss": 0.5222,
"step": 605
},
{
"epoch": 0.9696,
"grad_norm": 0.8290003392577493,
"learning_rate": 6.449893390191898e-06,
"loss": 0.4942,
"step": 606
},
{
"epoch": 0.9712,
"grad_norm": 0.753920920589916,
"learning_rate": 6.460554371002132e-06,
"loss": 0.5188,
"step": 607
},
{
"epoch": 0.9728,
"grad_norm": 0.8358847966610979,
"learning_rate": 6.4712153518123676e-06,
"loss": 0.5526,
"step": 608
},
{
"epoch": 0.9744,
"grad_norm": 0.8114863070028855,
"learning_rate": 6.481876332622602e-06,
"loss": 0.5377,
"step": 609
},
{
"epoch": 0.976,
"grad_norm": 0.8195628044086989,
"learning_rate": 6.492537313432837e-06,
"loss": 0.502,
"step": 610
},
{
"epoch": 0.9776,
"grad_norm": 0.8377558444331323,
"learning_rate": 6.50319829424307e-06,
"loss": 0.4701,
"step": 611
},
{
"epoch": 0.9792,
"grad_norm": 0.8016725399438086,
"learning_rate": 6.513859275053306e-06,
"loss": 0.5309,
"step": 612
},
{
"epoch": 0.9808,
"grad_norm": 0.7755966838681521,
"learning_rate": 6.52452025586354e-06,
"loss": 0.5171,
"step": 613
},
{
"epoch": 0.9824,
"grad_norm": 0.7654081600201667,
"learning_rate": 6.535181236673775e-06,
"loss": 0.5103,
"step": 614
},
{
"epoch": 0.984,
"grad_norm": 0.788504199793285,
"learning_rate": 6.545842217484008e-06,
"loss": 0.4946,
"step": 615
},
{
"epoch": 0.9856,
"grad_norm": 0.8245717363647899,
"learning_rate": 6.556503198294244e-06,
"loss": 0.5527,
"step": 616
},
{
"epoch": 0.9872,
"grad_norm": 0.8245960934834444,
"learning_rate": 6.567164179104478e-06,
"loss": 0.5363,
"step": 617
},
{
"epoch": 0.9888,
"grad_norm": 0.727073026640822,
"learning_rate": 6.577825159914713e-06,
"loss": 0.532,
"step": 618
},
{
"epoch": 0.9904,
"grad_norm": 0.8600642583259648,
"learning_rate": 6.5884861407249465e-06,
"loss": 0.5376,
"step": 619
},
{
"epoch": 0.992,
"grad_norm": 0.821299436298884,
"learning_rate": 6.599147121535182e-06,
"loss": 0.5306,
"step": 620
},
{
"epoch": 0.9936,
"grad_norm": 0.8493937107942993,
"learning_rate": 6.609808102345416e-06,
"loss": 0.5425,
"step": 621
},
{
"epoch": 0.9952,
"grad_norm": 0.8117156613742099,
"learning_rate": 6.620469083155651e-06,
"loss": 0.5148,
"step": 622
},
{
"epoch": 0.9968,
"grad_norm": 0.8106067040078264,
"learning_rate": 6.631130063965885e-06,
"loss": 0.5088,
"step": 623
},
{
"epoch": 0.9984,
"grad_norm": 0.9006626802499182,
"learning_rate": 6.64179104477612e-06,
"loss": 0.5348,
"step": 624
},
{
"epoch": 1.0,
"grad_norm": 0.7955489761135057,
"learning_rate": 6.6524520255863545e-06,
"loss": 0.526,
"step": 625
},
{
"epoch": 1.0016,
"grad_norm": 0.8939049889815265,
"learning_rate": 6.663113006396589e-06,
"loss": 0.4748,
"step": 626
},
{
"epoch": 1.0032,
"grad_norm": 0.9142102536005957,
"learning_rate": 6.673773987206824e-06,
"loss": 0.4791,
"step": 627
},
{
"epoch": 1.0048,
"grad_norm": 0.85000434786646,
"learning_rate": 6.684434968017058e-06,
"loss": 0.5234,
"step": 628
},
{
"epoch": 1.0064,
"grad_norm": 0.9585313430390451,
"learning_rate": 6.695095948827293e-06,
"loss": 0.4802,
"step": 629
},
{
"epoch": 1.008,
"grad_norm": 0.8750443177760309,
"learning_rate": 6.705756929637527e-06,
"loss": 0.5185,
"step": 630
},
{
"epoch": 1.0096,
"grad_norm": 0.9477247147352341,
"learning_rate": 6.7164179104477625e-06,
"loss": 0.5025,
"step": 631
},
{
"epoch": 1.0112,
"grad_norm": 0.904368055639127,
"learning_rate": 6.727078891257996e-06,
"loss": 0.4737,
"step": 632
},
{
"epoch": 1.0128,
"grad_norm": 0.929987383446468,
"learning_rate": 6.737739872068231e-06,
"loss": 0.521,
"step": 633
},
{
"epoch": 1.0144,
"grad_norm": 0.8091474393380501,
"learning_rate": 6.748400852878465e-06,
"loss": 0.4827,
"step": 634
},
{
"epoch": 1.016,
"grad_norm": 0.8707887258583898,
"learning_rate": 6.759061833688701e-06,
"loss": 0.4707,
"step": 635
},
{
"epoch": 1.0176,
"grad_norm": 0.9182994454586503,
"learning_rate": 6.769722814498934e-06,
"loss": 0.4966,
"step": 636
},
{
"epoch": 1.0192,
"grad_norm": 0.8821203820370487,
"learning_rate": 6.780383795309169e-06,
"loss": 0.4895,
"step": 637
},
{
"epoch": 1.0208,
"grad_norm": 0.8392554797468061,
"learning_rate": 6.791044776119403e-06,
"loss": 0.5143,
"step": 638
},
{
"epoch": 1.0224,
"grad_norm": 0.9493234827147355,
"learning_rate": 6.801705756929639e-06,
"loss": 0.4879,
"step": 639
},
{
"epoch": 1.024,
"grad_norm": 0.8932952086378858,
"learning_rate": 6.812366737739872e-06,
"loss": 0.488,
"step": 640
},
{
"epoch": 1.0256,
"grad_norm": 0.8970192831510824,
"learning_rate": 6.823027718550107e-06,
"loss": 0.4812,
"step": 641
},
{
"epoch": 1.0272,
"grad_norm": 0.8934554057803764,
"learning_rate": 6.8336886993603415e-06,
"loss": 0.4937,
"step": 642
},
{
"epoch": 1.0288,
"grad_norm": 0.8735754866413923,
"learning_rate": 6.844349680170577e-06,
"loss": 0.4763,
"step": 643
},
{
"epoch": 1.0304,
"grad_norm": 0.8742904882080139,
"learning_rate": 6.8550106609808105e-06,
"loss": 0.49,
"step": 644
},
{
"epoch": 1.032,
"grad_norm": 0.8328916277112867,
"learning_rate": 6.865671641791045e-06,
"loss": 0.4996,
"step": 645
},
{
"epoch": 1.0336,
"grad_norm": 0.9184053544404314,
"learning_rate": 6.8763326226012796e-06,
"loss": 0.5048,
"step": 646
},
{
"epoch": 1.0352,
"grad_norm": 0.8456606611788858,
"learning_rate": 6.886993603411515e-06,
"loss": 0.4898,
"step": 647
},
{
"epoch": 1.0368,
"grad_norm": 0.8284969357938996,
"learning_rate": 6.8976545842217495e-06,
"loss": 0.5172,
"step": 648
},
{
"epoch": 1.0384,
"grad_norm": 0.8782530091874459,
"learning_rate": 6.908315565031983e-06,
"loss": 0.4988,
"step": 649
},
{
"epoch": 1.04,
"grad_norm": 0.8545255868928607,
"learning_rate": 6.918976545842218e-06,
"loss": 0.4855,
"step": 650
},
{
"epoch": 1.0416,
"grad_norm": 0.9449070971234702,
"learning_rate": 6.929637526652453e-06,
"loss": 0.525,
"step": 651
},
{
"epoch": 1.0432,
"grad_norm": 0.8329960687657197,
"learning_rate": 6.9402985074626876e-06,
"loss": 0.4852,
"step": 652
},
{
"epoch": 1.0448,
"grad_norm": 0.858653034494543,
"learning_rate": 6.950959488272921e-06,
"loss": 0.5175,
"step": 653
},
{
"epoch": 1.0464,
"grad_norm": 0.7785586676865667,
"learning_rate": 6.961620469083156e-06,
"loss": 0.4627,
"step": 654
},
{
"epoch": 1.048,
"grad_norm": 0.8613855083022859,
"learning_rate": 6.972281449893391e-06,
"loss": 0.5209,
"step": 655
},
{
"epoch": 1.0496,
"grad_norm": 0.8286890577089111,
"learning_rate": 6.982942430703626e-06,
"loss": 0.5325,
"step": 656
},
{
"epoch": 1.0512,
"grad_norm": 0.8112752029689932,
"learning_rate": 6.993603411513859e-06,
"loss": 0.4905,
"step": 657
},
{
"epoch": 1.0528,
"grad_norm": 0.8565051990117192,
"learning_rate": 7.004264392324095e-06,
"loss": 0.4939,
"step": 658
},
{
"epoch": 1.0544,
"grad_norm": 0.7784473305249311,
"learning_rate": 7.014925373134329e-06,
"loss": 0.4703,
"step": 659
},
{
"epoch": 1.056,
"grad_norm": 0.8561933854878488,
"learning_rate": 7.025586353944564e-06,
"loss": 0.4892,
"step": 660
},
{
"epoch": 1.0576,
"grad_norm": 0.8522962801371073,
"learning_rate": 7.0362473347547975e-06,
"loss": 0.5085,
"step": 661
},
{
"epoch": 1.0592,
"grad_norm": 0.8604350686873483,
"learning_rate": 7.046908315565033e-06,
"loss": 0.473,
"step": 662
},
{
"epoch": 1.0608,
"grad_norm": 0.8675460649549699,
"learning_rate": 7.057569296375267e-06,
"loss": 0.4949,
"step": 663
},
{
"epoch": 1.0624,
"grad_norm": 0.8594099163899366,
"learning_rate": 7.068230277185502e-06,
"loss": 0.5117,
"step": 664
},
{
"epoch": 1.064,
"grad_norm": 0.8640881532732602,
"learning_rate": 7.0788912579957356e-06,
"loss": 0.4783,
"step": 665
},
{
"epoch": 1.0656,
"grad_norm": 0.8919589834064163,
"learning_rate": 7.089552238805971e-06,
"loss": 0.5246,
"step": 666
},
{
"epoch": 1.0672,
"grad_norm": 0.9750324339852304,
"learning_rate": 7.1002132196162055e-06,
"loss": 0.4973,
"step": 667
},
{
"epoch": 1.0688,
"grad_norm": 0.9957384605818356,
"learning_rate": 7.11087420042644e-06,
"loss": 0.4966,
"step": 668
},
{
"epoch": 1.0704,
"grad_norm": 0.7997520245450188,
"learning_rate": 7.121535181236674e-06,
"loss": 0.4865,
"step": 669
},
{
"epoch": 1.072,
"grad_norm": 0.9582197449325179,
"learning_rate": 7.132196162046909e-06,
"loss": 0.5023,
"step": 670
},
{
"epoch": 1.0735999999999999,
"grad_norm": 0.9272567743413159,
"learning_rate": 7.1428571428571436e-06,
"loss": 0.5332,
"step": 671
},
{
"epoch": 1.0752,
"grad_norm": 0.9017820260888095,
"learning_rate": 7.153518123667378e-06,
"loss": 0.4644,
"step": 672
},
{
"epoch": 1.0768,
"grad_norm": 0.8858975000261681,
"learning_rate": 7.164179104477612e-06,
"loss": 0.4837,
"step": 673
},
{
"epoch": 1.0784,
"grad_norm": 0.8930445495465718,
"learning_rate": 7.174840085287847e-06,
"loss": 0.5044,
"step": 674
},
{
"epoch": 1.08,
"grad_norm": 0.9323861547341619,
"learning_rate": 7.185501066098082e-06,
"loss": 0.4975,
"step": 675
},
{
"epoch": 1.0816,
"grad_norm": 0.8695592279737832,
"learning_rate": 7.196162046908316e-06,
"loss": 0.497,
"step": 676
},
{
"epoch": 1.0832,
"grad_norm": 0.8810401640135046,
"learning_rate": 7.20682302771855e-06,
"loss": 0.4884,
"step": 677
},
{
"epoch": 1.0848,
"grad_norm": 0.9117589561225861,
"learning_rate": 7.217484008528785e-06,
"loss": 0.4944,
"step": 678
},
{
"epoch": 1.0864,
"grad_norm": 0.8400188680420124,
"learning_rate": 7.22814498933902e-06,
"loss": 0.5221,
"step": 679
},
{
"epoch": 1.088,
"grad_norm": 0.8683054433248262,
"learning_rate": 7.238805970149254e-06,
"loss": 0.4992,
"step": 680
},
{
"epoch": 1.0896,
"grad_norm": 0.8532920431348728,
"learning_rate": 7.249466950959488e-06,
"loss": 0.4894,
"step": 681
},
{
"epoch": 1.0912,
"grad_norm": 0.8656995078586304,
"learning_rate": 7.260127931769723e-06,
"loss": 0.4946,
"step": 682
},
{
"epoch": 1.0928,
"grad_norm": 0.8472745906446407,
"learning_rate": 7.270788912579958e-06,
"loss": 0.4866,
"step": 683
},
{
"epoch": 1.0944,
"grad_norm": 0.8865029623698553,
"learning_rate": 7.281449893390192e-06,
"loss": 0.4859,
"step": 684
},
{
"epoch": 1.096,
"grad_norm": 0.8938595923880026,
"learning_rate": 7.292110874200427e-06,
"loss": 0.5124,
"step": 685
},
{
"epoch": 1.0976,
"grad_norm": 0.9001915895636022,
"learning_rate": 7.302771855010662e-06,
"loss": 0.505,
"step": 686
},
{
"epoch": 1.0992,
"grad_norm": 0.8648181883246533,
"learning_rate": 7.313432835820896e-06,
"loss": 0.4547,
"step": 687
},
{
"epoch": 1.1008,
"grad_norm": 0.9013716761336116,
"learning_rate": 7.3240938166311305e-06,
"loss": 0.5068,
"step": 688
},
{
"epoch": 1.1024,
"grad_norm": 0.8897699929901375,
"learning_rate": 7.334754797441366e-06,
"loss": 0.4699,
"step": 689
},
{
"epoch": 1.104,
"grad_norm": 0.9274052628271223,
"learning_rate": 7.3454157782516e-06,
"loss": 0.5205,
"step": 690
},
{
"epoch": 1.1056,
"grad_norm": 0.8904130581010617,
"learning_rate": 7.356076759061834e-06,
"loss": 0.4974,
"step": 691
},
{
"epoch": 1.1072,
"grad_norm": 0.9248330646869078,
"learning_rate": 7.366737739872069e-06,
"loss": 0.487,
"step": 692
},
{
"epoch": 1.1088,
"grad_norm": 0.8849242860814813,
"learning_rate": 7.377398720682304e-06,
"loss": 0.4862,
"step": 693
},
{
"epoch": 1.1104,
"grad_norm": 0.8404508145439668,
"learning_rate": 7.3880597014925385e-06,
"loss": 0.5115,
"step": 694
},
{
"epoch": 1.112,
"grad_norm": 0.9210103265284583,
"learning_rate": 7.398720682302772e-06,
"loss": 0.5137,
"step": 695
},
{
"epoch": 1.1136,
"grad_norm": 0.9028642559788765,
"learning_rate": 7.409381663113007e-06,
"loss": 0.4994,
"step": 696
},
{
"epoch": 1.1152,
"grad_norm": 0.8622481549136872,
"learning_rate": 7.420042643923242e-06,
"loss": 0.4894,
"step": 697
},
{
"epoch": 1.1168,
"grad_norm": 0.8580340963349197,
"learning_rate": 7.430703624733477e-06,
"loss": 0.4795,
"step": 698
},
{
"epoch": 1.1184,
"grad_norm": 0.8555460870305887,
"learning_rate": 7.44136460554371e-06,
"loss": 0.5014,
"step": 699
},
{
"epoch": 1.12,
"grad_norm": 0.923140841505284,
"learning_rate": 7.452025586353945e-06,
"loss": 0.4562,
"step": 700
},
{
"epoch": 1.1216,
"grad_norm": 0.8100751043736653,
"learning_rate": 7.46268656716418e-06,
"loss": 0.4733,
"step": 701
},
{
"epoch": 1.1232,
"grad_norm": 0.9147204854059461,
"learning_rate": 7.473347547974415e-06,
"loss": 0.4752,
"step": 702
},
{
"epoch": 1.1248,
"grad_norm": 0.8419294026515683,
"learning_rate": 7.484008528784648e-06,
"loss": 0.486,
"step": 703
},
{
"epoch": 1.1264,
"grad_norm": 0.8643702806570176,
"learning_rate": 7.494669509594883e-06,
"loss": 0.4613,
"step": 704
},
{
"epoch": 1.1280000000000001,
"grad_norm": 0.949302912936103,
"learning_rate": 7.505330490405118e-06,
"loss": 0.5118,
"step": 705
},
{
"epoch": 1.1296,
"grad_norm": 0.7783378740632438,
"learning_rate": 7.515991471215353e-06,
"loss": 0.4741,
"step": 706
},
{
"epoch": 1.1312,
"grad_norm": 0.9194591494916258,
"learning_rate": 7.5266524520255865e-06,
"loss": 0.4782,
"step": 707
},
{
"epoch": 1.1328,
"grad_norm": 0.8084257080509638,
"learning_rate": 7.537313432835821e-06,
"loss": 0.4789,
"step": 708
},
{
"epoch": 1.1344,
"grad_norm": 0.9370225113345622,
"learning_rate": 7.547974413646056e-06,
"loss": 0.5038,
"step": 709
},
{
"epoch": 1.1360000000000001,
"grad_norm": 0.8967784446409596,
"learning_rate": 7.558635394456291e-06,
"loss": 0.5123,
"step": 710
},
{
"epoch": 1.1376,
"grad_norm": 0.8353568264432971,
"learning_rate": 7.569296375266525e-06,
"loss": 0.4981,
"step": 711
},
{
"epoch": 1.1392,
"grad_norm": 1.0148502497573662,
"learning_rate": 7.579957356076759e-06,
"loss": 0.5154,
"step": 712
},
{
"epoch": 1.1408,
"grad_norm": 0.794448266298346,
"learning_rate": 7.5906183368869945e-06,
"loss": 0.4914,
"step": 713
},
{
"epoch": 1.1424,
"grad_norm": 0.931979450219089,
"learning_rate": 7.601279317697229e-06,
"loss": 0.4814,
"step": 714
},
{
"epoch": 1.144,
"grad_norm": 0.7837557760281342,
"learning_rate": 7.611940298507463e-06,
"loss": 0.4665,
"step": 715
},
{
"epoch": 1.1456,
"grad_norm": 0.8079659200165911,
"learning_rate": 7.622601279317697e-06,
"loss": 0.5002,
"step": 716
},
{
"epoch": 1.1472,
"grad_norm": 0.8433060870697712,
"learning_rate": 7.633262260127933e-06,
"loss": 0.4924,
"step": 717
},
{
"epoch": 1.1488,
"grad_norm": 0.8562511609970845,
"learning_rate": 7.643923240938167e-06,
"loss": 0.4909,
"step": 718
},
{
"epoch": 1.1504,
"grad_norm": 0.8170143423224167,
"learning_rate": 7.654584221748402e-06,
"loss": 0.493,
"step": 719
},
{
"epoch": 1.152,
"grad_norm": 0.8988104256313746,
"learning_rate": 7.665245202558636e-06,
"loss": 0.4858,
"step": 720
},
{
"epoch": 1.1536,
"grad_norm": 0.8926410072505501,
"learning_rate": 7.67590618336887e-06,
"loss": 0.5134,
"step": 721
},
{
"epoch": 1.1552,
"grad_norm": 0.8063108281175257,
"learning_rate": 7.686567164179105e-06,
"loss": 0.4504,
"step": 722
},
{
"epoch": 1.1568,
"grad_norm": 0.9031510431095552,
"learning_rate": 7.69722814498934e-06,
"loss": 0.4897,
"step": 723
},
{
"epoch": 1.1584,
"grad_norm": 0.8000465406306086,
"learning_rate": 7.707889125799574e-06,
"loss": 0.5103,
"step": 724
},
{
"epoch": 1.16,
"grad_norm": 0.8807137653031148,
"learning_rate": 7.718550106609809e-06,
"loss": 0.5188,
"step": 725
},
{
"epoch": 1.1616,
"grad_norm": 0.8529454057323239,
"learning_rate": 7.729211087420043e-06,
"loss": 0.4901,
"step": 726
},
{
"epoch": 1.1632,
"grad_norm": 0.8865930965204165,
"learning_rate": 7.739872068230278e-06,
"loss": 0.5172,
"step": 727
},
{
"epoch": 1.1648,
"grad_norm": 0.8489456406347604,
"learning_rate": 7.750533049040512e-06,
"loss": 0.5318,
"step": 728
},
{
"epoch": 1.1663999999999999,
"grad_norm": 0.8389695400553605,
"learning_rate": 7.761194029850747e-06,
"loss": 0.4847,
"step": 729
},
{
"epoch": 1.168,
"grad_norm": 0.9129636581931746,
"learning_rate": 7.771855010660981e-06,
"loss": 0.5036,
"step": 730
},
{
"epoch": 1.1696,
"grad_norm": 0.8689782710805041,
"learning_rate": 7.782515991471216e-06,
"loss": 0.4621,
"step": 731
},
{
"epoch": 1.1712,
"grad_norm": 0.8366002015417364,
"learning_rate": 7.79317697228145e-06,
"loss": 0.4919,
"step": 732
},
{
"epoch": 1.1728,
"grad_norm": 0.862046670020214,
"learning_rate": 7.803837953091685e-06,
"loss": 0.505,
"step": 733
},
{
"epoch": 1.1743999999999999,
"grad_norm": 0.894624237535269,
"learning_rate": 7.81449893390192e-06,
"loss": 0.4719,
"step": 734
},
{
"epoch": 1.176,
"grad_norm": 0.8185676529493747,
"learning_rate": 7.825159914712154e-06,
"loss": 0.5157,
"step": 735
},
{
"epoch": 1.1776,
"grad_norm": 0.9507916410459172,
"learning_rate": 7.835820895522389e-06,
"loss": 0.4884,
"step": 736
},
{
"epoch": 1.1792,
"grad_norm": 0.8167479057507443,
"learning_rate": 7.846481876332623e-06,
"loss": 0.4666,
"step": 737
},
{
"epoch": 1.1808,
"grad_norm": 0.8135227530782192,
"learning_rate": 7.857142857142858e-06,
"loss": 0.509,
"step": 738
},
{
"epoch": 1.1824,
"grad_norm": 0.9340892372683886,
"learning_rate": 7.867803837953092e-06,
"loss": 0.471,
"step": 739
},
{
"epoch": 1.184,
"grad_norm": 0.8646629530041654,
"learning_rate": 7.878464818763327e-06,
"loss": 0.4923,
"step": 740
},
{
"epoch": 1.1856,
"grad_norm": 0.997273455214431,
"learning_rate": 7.889125799573561e-06,
"loss": 0.4873,
"step": 741
},
{
"epoch": 1.1872,
"grad_norm": 0.9219572311600239,
"learning_rate": 7.899786780383796e-06,
"loss": 0.5069,
"step": 742
},
{
"epoch": 1.1888,
"grad_norm": 0.7934354225458706,
"learning_rate": 7.91044776119403e-06,
"loss": 0.4763,
"step": 743
},
{
"epoch": 1.1904,
"grad_norm": 0.8988143693580111,
"learning_rate": 7.921108742004265e-06,
"loss": 0.5059,
"step": 744
},
{
"epoch": 1.192,
"grad_norm": 0.8620595700348912,
"learning_rate": 7.9317697228145e-06,
"loss": 0.4875,
"step": 745
},
{
"epoch": 1.1936,
"grad_norm": 0.8926860784613844,
"learning_rate": 7.942430703624734e-06,
"loss": 0.4911,
"step": 746
},
{
"epoch": 1.1952,
"grad_norm": 0.9287528764007131,
"learning_rate": 7.953091684434968e-06,
"loss": 0.4801,
"step": 747
},
{
"epoch": 1.1968,
"grad_norm": 0.8261534615904715,
"learning_rate": 7.963752665245203e-06,
"loss": 0.4893,
"step": 748
},
{
"epoch": 1.1984,
"grad_norm": 0.8625619754008521,
"learning_rate": 7.974413646055437e-06,
"loss": 0.4817,
"step": 749
},
{
"epoch": 1.2,
"grad_norm": 0.8660545440351582,
"learning_rate": 7.985074626865672e-06,
"loss": 0.5194,
"step": 750
},
{
"epoch": 1.2016,
"grad_norm": 0.9225217211510944,
"learning_rate": 7.995735607675907e-06,
"loss": 0.5066,
"step": 751
},
{
"epoch": 1.2032,
"grad_norm": 0.91250700607779,
"learning_rate": 8.006396588486141e-06,
"loss": 0.4978,
"step": 752
},
{
"epoch": 1.2048,
"grad_norm": 0.8165175944409211,
"learning_rate": 8.017057569296376e-06,
"loss": 0.4945,
"step": 753
},
{
"epoch": 1.2064,
"grad_norm": 0.9576031761806364,
"learning_rate": 8.02771855010661e-06,
"loss": 0.5141,
"step": 754
},
{
"epoch": 1.208,
"grad_norm": 0.8270917187108142,
"learning_rate": 8.038379530916846e-06,
"loss": 0.4736,
"step": 755
},
{
"epoch": 1.2096,
"grad_norm": 0.8035593655771217,
"learning_rate": 8.049040511727079e-06,
"loss": 0.4772,
"step": 756
},
{
"epoch": 1.2112,
"grad_norm": 0.8255593987600729,
"learning_rate": 8.059701492537314e-06,
"loss": 0.4688,
"step": 757
},
{
"epoch": 1.2128,
"grad_norm": 0.8335731681930802,
"learning_rate": 8.070362473347548e-06,
"loss": 0.4711,
"step": 758
},
{
"epoch": 1.2144,
"grad_norm": 0.8194251984563062,
"learning_rate": 8.081023454157784e-06,
"loss": 0.4454,
"step": 759
},
{
"epoch": 1.216,
"grad_norm": 0.8702856100363334,
"learning_rate": 8.091684434968017e-06,
"loss": 0.5012,
"step": 760
},
{
"epoch": 1.2176,
"grad_norm": 0.9075441167906476,
"learning_rate": 8.102345415778252e-06,
"loss": 0.4892,
"step": 761
},
{
"epoch": 1.2192,
"grad_norm": 0.8005134308272335,
"learning_rate": 8.113006396588486e-06,
"loss": 0.5026,
"step": 762
},
{
"epoch": 1.2208,
"grad_norm": 0.7771323696086689,
"learning_rate": 8.123667377398723e-06,
"loss": 0.4886,
"step": 763
},
{
"epoch": 1.2224,
"grad_norm": 0.78717220004623,
"learning_rate": 8.134328358208955e-06,
"loss": 0.4892,
"step": 764
},
{
"epoch": 1.224,
"grad_norm": 0.7628253673408774,
"learning_rate": 8.14498933901919e-06,
"loss": 0.4567,
"step": 765
},
{
"epoch": 1.2256,
"grad_norm": 0.7703212238803351,
"learning_rate": 8.155650319829424e-06,
"loss": 0.4509,
"step": 766
},
{
"epoch": 1.2272,
"grad_norm": 0.9016403747699305,
"learning_rate": 8.16631130063966e-06,
"loss": 0.4967,
"step": 767
},
{
"epoch": 1.2288000000000001,
"grad_norm": 0.8393187669313528,
"learning_rate": 8.176972281449893e-06,
"loss": 0.4923,
"step": 768
},
{
"epoch": 1.2304,
"grad_norm": 0.8364970899148162,
"learning_rate": 8.187633262260128e-06,
"loss": 0.4933,
"step": 769
},
{
"epoch": 1.232,
"grad_norm": 1.036196569024722,
"learning_rate": 8.198294243070363e-06,
"loss": 0.5072,
"step": 770
},
{
"epoch": 1.2336,
"grad_norm": 0.784372659669558,
"learning_rate": 8.208955223880599e-06,
"loss": 0.4837,
"step": 771
},
{
"epoch": 1.2352,
"grad_norm": 0.8506324572550054,
"learning_rate": 8.219616204690832e-06,
"loss": 0.486,
"step": 772
},
{
"epoch": 1.2368000000000001,
"grad_norm": 0.8299386307354703,
"learning_rate": 8.230277185501066e-06,
"loss": 0.4742,
"step": 773
},
{
"epoch": 1.2384,
"grad_norm": 0.7949690986265776,
"learning_rate": 8.2409381663113e-06,
"loss": 0.5153,
"step": 774
},
{
"epoch": 1.24,
"grad_norm": 0.81322959927153,
"learning_rate": 8.251599147121537e-06,
"loss": 0.4916,
"step": 775
},
{
"epoch": 1.2416,
"grad_norm": 0.7506748375530281,
"learning_rate": 8.26226012793177e-06,
"loss": 0.4909,
"step": 776
},
{
"epoch": 1.2432,
"grad_norm": 0.869003804560298,
"learning_rate": 8.272921108742004e-06,
"loss": 0.4952,
"step": 777
},
{
"epoch": 1.2448,
"grad_norm": 0.7828796386986692,
"learning_rate": 8.283582089552239e-06,
"loss": 0.491,
"step": 778
},
{
"epoch": 1.2464,
"grad_norm": 0.943803876658911,
"learning_rate": 8.294243070362475e-06,
"loss": 0.521,
"step": 779
},
{
"epoch": 1.248,
"grad_norm": 0.9284743039559729,
"learning_rate": 8.304904051172708e-06,
"loss": 0.5005,
"step": 780
},
{
"epoch": 1.2496,
"grad_norm": 0.8563783132882432,
"learning_rate": 8.315565031982942e-06,
"loss": 0.4965,
"step": 781
},
{
"epoch": 1.2511999999999999,
"grad_norm": 0.9414147179698323,
"learning_rate": 8.326226012793177e-06,
"loss": 0.5101,
"step": 782
},
{
"epoch": 1.2528000000000001,
"grad_norm": 0.847176708925908,
"learning_rate": 8.336886993603413e-06,
"loss": 0.4997,
"step": 783
},
{
"epoch": 1.2544,
"grad_norm": 0.8922585077194567,
"learning_rate": 8.347547974413648e-06,
"loss": 0.4812,
"step": 784
},
{
"epoch": 1.256,
"grad_norm": 0.8492575450825625,
"learning_rate": 8.35820895522388e-06,
"loss": 0.4973,
"step": 785
},
{
"epoch": 1.2576,
"grad_norm": 0.769955130113286,
"learning_rate": 8.368869936034117e-06,
"loss": 0.4798,
"step": 786
},
{
"epoch": 1.2591999999999999,
"grad_norm": 0.8808043798910176,
"learning_rate": 8.379530916844351e-06,
"loss": 0.4881,
"step": 787
},
{
"epoch": 1.2608,
"grad_norm": 0.7911441145492251,
"learning_rate": 8.390191897654586e-06,
"loss": 0.4781,
"step": 788
},
{
"epoch": 1.2624,
"grad_norm": 0.8831191122354372,
"learning_rate": 8.400852878464819e-06,
"loss": 0.5249,
"step": 789
},
{
"epoch": 1.264,
"grad_norm": 0.8490944822273508,
"learning_rate": 8.411513859275055e-06,
"loss": 0.496,
"step": 790
},
{
"epoch": 1.2656,
"grad_norm": 0.9034453286543956,
"learning_rate": 8.42217484008529e-06,
"loss": 0.4998,
"step": 791
},
{
"epoch": 1.2671999999999999,
"grad_norm": 0.8186106542485699,
"learning_rate": 8.432835820895524e-06,
"loss": 0.4612,
"step": 792
},
{
"epoch": 1.2688,
"grad_norm": 0.8617138344306301,
"learning_rate": 8.443496801705757e-06,
"loss": 0.4476,
"step": 793
},
{
"epoch": 1.2704,
"grad_norm": 0.7710685418084188,
"learning_rate": 8.454157782515993e-06,
"loss": 0.5233,
"step": 794
},
{
"epoch": 1.272,
"grad_norm": 0.8538109064578701,
"learning_rate": 8.464818763326227e-06,
"loss": 0.4895,
"step": 795
},
{
"epoch": 1.2736,
"grad_norm": 0.807083428196643,
"learning_rate": 8.475479744136462e-06,
"loss": 0.5115,
"step": 796
},
{
"epoch": 1.2752,
"grad_norm": 0.7951784742912327,
"learning_rate": 8.486140724946695e-06,
"loss": 0.4875,
"step": 797
},
{
"epoch": 1.2768,
"grad_norm": 0.8166159349941166,
"learning_rate": 8.496801705756931e-06,
"loss": 0.4889,
"step": 798
},
{
"epoch": 1.2784,
"grad_norm": 0.7561811800967494,
"learning_rate": 8.507462686567165e-06,
"loss": 0.4948,
"step": 799
},
{
"epoch": 1.28,
"grad_norm": 0.8170354309076975,
"learning_rate": 8.5181236673774e-06,
"loss": 0.4801,
"step": 800
},
{
"epoch": 1.2816,
"grad_norm": 0.8325732227308348,
"learning_rate": 8.528784648187633e-06,
"loss": 0.4771,
"step": 801
},
{
"epoch": 1.2832,
"grad_norm": 0.8532852379008969,
"learning_rate": 8.539445628997869e-06,
"loss": 0.5032,
"step": 802
},
{
"epoch": 1.2848,
"grad_norm": 0.8364157169622278,
"learning_rate": 8.550106609808104e-06,
"loss": 0.4926,
"step": 803
},
{
"epoch": 1.2864,
"grad_norm": 0.8198265344617243,
"learning_rate": 8.560767590618338e-06,
"loss": 0.4953,
"step": 804
},
{
"epoch": 1.288,
"grad_norm": 0.7676253771661686,
"learning_rate": 8.571428571428571e-06,
"loss": 0.5121,
"step": 805
},
{
"epoch": 1.2896,
"grad_norm": 0.8178141149780725,
"learning_rate": 8.582089552238807e-06,
"loss": 0.4941,
"step": 806
},
{
"epoch": 1.2912,
"grad_norm": 0.727794607570312,
"learning_rate": 8.592750533049042e-06,
"loss": 0.4691,
"step": 807
},
{
"epoch": 1.2928,
"grad_norm": 0.8254744891795319,
"learning_rate": 8.603411513859276e-06,
"loss": 0.4754,
"step": 808
},
{
"epoch": 1.2944,
"grad_norm": 0.8217573350822549,
"learning_rate": 8.614072494669509e-06,
"loss": 0.5127,
"step": 809
},
{
"epoch": 1.296,
"grad_norm": 0.8681921933879128,
"learning_rate": 8.624733475479745e-06,
"loss": 0.4767,
"step": 810
},
{
"epoch": 1.2976,
"grad_norm": 0.9449479811028989,
"learning_rate": 8.63539445628998e-06,
"loss": 0.4863,
"step": 811
},
{
"epoch": 1.2992,
"grad_norm": 0.8262347272337431,
"learning_rate": 8.646055437100214e-06,
"loss": 0.4775,
"step": 812
},
{
"epoch": 1.3008,
"grad_norm": 0.8851702609211491,
"learning_rate": 8.656716417910447e-06,
"loss": 0.4906,
"step": 813
},
{
"epoch": 1.3024,
"grad_norm": 0.8049544924013474,
"learning_rate": 8.667377398720683e-06,
"loss": 0.4872,
"step": 814
},
{
"epoch": 1.304,
"grad_norm": 0.9884021117715628,
"learning_rate": 8.678038379530918e-06,
"loss": 0.5038,
"step": 815
},
{
"epoch": 1.3056,
"grad_norm": 0.7869006872785753,
"learning_rate": 8.688699360341152e-06,
"loss": 0.4882,
"step": 816
},
{
"epoch": 1.3072,
"grad_norm": 0.7954379611740962,
"learning_rate": 8.699360341151387e-06,
"loss": 0.5194,
"step": 817
},
{
"epoch": 1.3088,
"grad_norm": 0.8058906484306712,
"learning_rate": 8.710021321961621e-06,
"loss": 0.4767,
"step": 818
},
{
"epoch": 1.3104,
"grad_norm": 0.7928405824005407,
"learning_rate": 8.720682302771856e-06,
"loss": 0.4954,
"step": 819
},
{
"epoch": 1.312,
"grad_norm": 0.8268634860285111,
"learning_rate": 8.73134328358209e-06,
"loss": 0.5298,
"step": 820
},
{
"epoch": 1.3136,
"grad_norm": 0.7680714308955621,
"learning_rate": 8.742004264392325e-06,
"loss": 0.4745,
"step": 821
},
{
"epoch": 1.3152,
"grad_norm": 0.8226721900025242,
"learning_rate": 8.75266524520256e-06,
"loss": 0.4834,
"step": 822
},
{
"epoch": 1.3168,
"grad_norm": 0.8588142531758319,
"learning_rate": 8.763326226012794e-06,
"loss": 0.4995,
"step": 823
},
{
"epoch": 1.3184,
"grad_norm": 0.7973825718924447,
"learning_rate": 8.773987206823029e-06,
"loss": 0.471,
"step": 824
},
{
"epoch": 1.32,
"grad_norm": 0.8196008578787768,
"learning_rate": 8.784648187633263e-06,
"loss": 0.4434,
"step": 825
},
{
"epoch": 1.3216,
"grad_norm": 0.821898919951824,
"learning_rate": 8.795309168443498e-06,
"loss": 0.4932,
"step": 826
},
{
"epoch": 1.3232,
"grad_norm": 0.7429051792355805,
"learning_rate": 8.805970149253732e-06,
"loss": 0.487,
"step": 827
},
{
"epoch": 1.3248,
"grad_norm": 0.7808144550211977,
"learning_rate": 8.816631130063967e-06,
"loss": 0.4848,
"step": 828
},
{
"epoch": 1.3264,
"grad_norm": 0.75406464373619,
"learning_rate": 8.827292110874201e-06,
"loss": 0.4797,
"step": 829
},
{
"epoch": 1.328,
"grad_norm": 0.8474425214756218,
"learning_rate": 8.837953091684436e-06,
"loss": 0.4983,
"step": 830
},
{
"epoch": 1.3296000000000001,
"grad_norm": 0.8801019534613489,
"learning_rate": 8.84861407249467e-06,
"loss": 0.5018,
"step": 831
},
{
"epoch": 1.3312,
"grad_norm": 0.7435162835174438,
"learning_rate": 8.859275053304905e-06,
"loss": 0.4638,
"step": 832
},
{
"epoch": 1.3328,
"grad_norm": 1.0349062751171558,
"learning_rate": 8.86993603411514e-06,
"loss": 0.5137,
"step": 833
},
{
"epoch": 1.3344,
"grad_norm": 0.8071414440849783,
"learning_rate": 8.880597014925374e-06,
"loss": 0.5009,
"step": 834
},
{
"epoch": 1.336,
"grad_norm": 0.9477676903674344,
"learning_rate": 8.891257995735608e-06,
"loss": 0.4892,
"step": 835
},
{
"epoch": 1.3376000000000001,
"grad_norm": 0.8771818428184452,
"learning_rate": 8.901918976545843e-06,
"loss": 0.4853,
"step": 836
},
{
"epoch": 1.3392,
"grad_norm": 0.9525100546819315,
"learning_rate": 8.912579957356077e-06,
"loss": 0.5087,
"step": 837
},
{
"epoch": 1.3408,
"grad_norm": 1.0250030593240957,
"learning_rate": 8.923240938166312e-06,
"loss": 0.4976,
"step": 838
},
{
"epoch": 1.3424,
"grad_norm": 0.8329762799790746,
"learning_rate": 8.933901918976547e-06,
"loss": 0.4512,
"step": 839
},
{
"epoch": 1.3439999999999999,
"grad_norm": 0.9725783256416181,
"learning_rate": 8.944562899786781e-06,
"loss": 0.4723,
"step": 840
},
{
"epoch": 1.3456000000000001,
"grad_norm": 0.7926950757963382,
"learning_rate": 8.955223880597016e-06,
"loss": 0.4812,
"step": 841
},
{
"epoch": 1.3472,
"grad_norm": 0.9029107439242626,
"learning_rate": 8.96588486140725e-06,
"loss": 0.4734,
"step": 842
},
{
"epoch": 1.3488,
"grad_norm": 0.8609612617899417,
"learning_rate": 8.976545842217485e-06,
"loss": 0.4706,
"step": 843
},
{
"epoch": 1.3504,
"grad_norm": 0.8609639077025752,
"learning_rate": 8.987206823027719e-06,
"loss": 0.4698,
"step": 844
},
{
"epoch": 1.3519999999999999,
"grad_norm": 0.8751160804112281,
"learning_rate": 8.997867803837954e-06,
"loss": 0.4864,
"step": 845
},
{
"epoch": 1.3536000000000001,
"grad_norm": 0.9006318823556566,
"learning_rate": 9.008528784648188e-06,
"loss": 0.5023,
"step": 846
},
{
"epoch": 1.3552,
"grad_norm": 0.8527123853283762,
"learning_rate": 9.019189765458423e-06,
"loss": 0.4994,
"step": 847
},
{
"epoch": 1.3568,
"grad_norm": 0.8599406865687744,
"learning_rate": 9.029850746268657e-06,
"loss": 0.4874,
"step": 848
},
{
"epoch": 1.3584,
"grad_norm": 0.8722361781773674,
"learning_rate": 9.040511727078892e-06,
"loss": 0.5227,
"step": 849
},
{
"epoch": 1.3599999999999999,
"grad_norm": 0.8870612314576228,
"learning_rate": 9.051172707889126e-06,
"loss": 0.5151,
"step": 850
},
{
"epoch": 1.3616,
"grad_norm": 0.8134429554681971,
"learning_rate": 9.06183368869936e-06,
"loss": 0.4919,
"step": 851
},
{
"epoch": 1.3632,
"grad_norm": 0.7863555698546318,
"learning_rate": 9.072494669509595e-06,
"loss": 0.4766,
"step": 852
},
{
"epoch": 1.3648,
"grad_norm": 0.8186764343933153,
"learning_rate": 9.08315565031983e-06,
"loss": 0.5021,
"step": 853
},
{
"epoch": 1.3664,
"grad_norm": 0.8219695239391486,
"learning_rate": 9.093816631130064e-06,
"loss": 0.5007,
"step": 854
},
{
"epoch": 1.3679999999999999,
"grad_norm": 0.841769164632584,
"learning_rate": 9.104477611940299e-06,
"loss": 0.4914,
"step": 855
},
{
"epoch": 1.3696,
"grad_norm": 0.8330514656539206,
"learning_rate": 9.115138592750533e-06,
"loss": 0.4987,
"step": 856
},
{
"epoch": 1.3712,
"grad_norm": 0.811133989438271,
"learning_rate": 9.125799573560768e-06,
"loss": 0.462,
"step": 857
},
{
"epoch": 1.3728,
"grad_norm": 0.8633384492304504,
"learning_rate": 9.136460554371003e-06,
"loss": 0.4963,
"step": 858
},
{
"epoch": 1.3744,
"grad_norm": 0.8410989072416641,
"learning_rate": 9.147121535181237e-06,
"loss": 0.4977,
"step": 859
},
{
"epoch": 1.376,
"grad_norm": 0.806728983493438,
"learning_rate": 9.157782515991472e-06,
"loss": 0.4624,
"step": 860
},
{
"epoch": 1.3776,
"grad_norm": 0.806341154096036,
"learning_rate": 9.168443496801706e-06,
"loss": 0.4658,
"step": 861
},
{
"epoch": 1.3792,
"grad_norm": 0.8038719681175753,
"learning_rate": 9.17910447761194e-06,
"loss": 0.4697,
"step": 862
},
{
"epoch": 1.3808,
"grad_norm": 0.8714151874327415,
"learning_rate": 9.189765458422175e-06,
"loss": 0.5101,
"step": 863
},
{
"epoch": 1.3824,
"grad_norm": 0.8780911686986344,
"learning_rate": 9.200426439232411e-06,
"loss": 0.5252,
"step": 864
},
{
"epoch": 1.384,
"grad_norm": 0.8516702000502967,
"learning_rate": 9.211087420042644e-06,
"loss": 0.4838,
"step": 865
},
{
"epoch": 1.3856,
"grad_norm": 1.0025506106485897,
"learning_rate": 9.221748400852879e-06,
"loss": 0.5069,
"step": 866
},
{
"epoch": 1.3872,
"grad_norm": 0.7996027724204295,
"learning_rate": 9.232409381663113e-06,
"loss": 0.5178,
"step": 867
},
{
"epoch": 1.3888,
"grad_norm": 1.080490918542818,
"learning_rate": 9.24307036247335e-06,
"loss": 0.5032,
"step": 868
},
{
"epoch": 1.3904,
"grad_norm": 0.8444844807264513,
"learning_rate": 9.253731343283582e-06,
"loss": 0.4701,
"step": 869
},
{
"epoch": 1.392,
"grad_norm": 0.899927637543412,
"learning_rate": 9.264392324093817e-06,
"loss": 0.4904,
"step": 870
},
{
"epoch": 1.3936,
"grad_norm": 0.8718461575504228,
"learning_rate": 9.275053304904051e-06,
"loss": 0.4855,
"step": 871
},
{
"epoch": 1.3952,
"grad_norm": 0.9337795759969514,
"learning_rate": 9.285714285714288e-06,
"loss": 0.5186,
"step": 872
},
{
"epoch": 1.3968,
"grad_norm": 0.8199842573828481,
"learning_rate": 9.29637526652452e-06,
"loss": 0.4707,
"step": 873
},
{
"epoch": 1.3984,
"grad_norm": 0.8652313612196955,
"learning_rate": 9.307036247334755e-06,
"loss": 0.4997,
"step": 874
},
{
"epoch": 1.4,
"grad_norm": 0.8176745862255748,
"learning_rate": 9.31769722814499e-06,
"loss": 0.4759,
"step": 875
},
{
"epoch": 1.4016,
"grad_norm": 0.9753918720629351,
"learning_rate": 9.328358208955226e-06,
"loss": 0.4631,
"step": 876
},
{
"epoch": 1.4032,
"grad_norm": 0.8891976160972742,
"learning_rate": 9.339019189765458e-06,
"loss": 0.5,
"step": 877
},
{
"epoch": 1.4048,
"grad_norm": 0.8913833986097975,
"learning_rate": 9.349680170575693e-06,
"loss": 0.5187,
"step": 878
},
{
"epoch": 1.4064,
"grad_norm": 0.9274476926254841,
"learning_rate": 9.36034115138593e-06,
"loss": 0.4998,
"step": 879
},
{
"epoch": 1.408,
"grad_norm": 0.787290628319696,
"learning_rate": 9.371002132196164e-06,
"loss": 0.4704,
"step": 880
},
{
"epoch": 1.4096,
"grad_norm": 0.7814721109331324,
"learning_rate": 9.381663113006397e-06,
"loss": 0.4694,
"step": 881
},
{
"epoch": 1.4112,
"grad_norm": 0.7991738854610555,
"learning_rate": 9.392324093816631e-06,
"loss": 0.4449,
"step": 882
},
{
"epoch": 1.4128,
"grad_norm": 0.8389584825409092,
"learning_rate": 9.402985074626867e-06,
"loss": 0.4828,
"step": 883
},
{
"epoch": 1.4144,
"grad_norm": 0.8594585961944166,
"learning_rate": 9.413646055437102e-06,
"loss": 0.4592,
"step": 884
},
{
"epoch": 1.416,
"grad_norm": 0.8488703802616425,
"learning_rate": 9.424307036247335e-06,
"loss": 0.4867,
"step": 885
},
{
"epoch": 1.4176,
"grad_norm": 0.7909492063396593,
"learning_rate": 9.43496801705757e-06,
"loss": 0.4714,
"step": 886
},
{
"epoch": 1.4192,
"grad_norm": 0.8665486508087874,
"learning_rate": 9.445628997867805e-06,
"loss": 0.4703,
"step": 887
},
{
"epoch": 1.4208,
"grad_norm": 0.7916642549445045,
"learning_rate": 9.45628997867804e-06,
"loss": 0.4897,
"step": 888
},
{
"epoch": 1.4224,
"grad_norm": 0.8124562469349942,
"learning_rate": 9.466950959488273e-06,
"loss": 0.474,
"step": 889
},
{
"epoch": 1.424,
"grad_norm": 0.7619798562274529,
"learning_rate": 9.477611940298507e-06,
"loss": 0.4799,
"step": 890
},
{
"epoch": 1.4256,
"grad_norm": 0.785232437222393,
"learning_rate": 9.488272921108744e-06,
"loss": 0.5038,
"step": 891
},
{
"epoch": 1.4272,
"grad_norm": 0.8071136715131856,
"learning_rate": 9.498933901918978e-06,
"loss": 0.5083,
"step": 892
},
{
"epoch": 1.4288,
"grad_norm": 0.75611163321337,
"learning_rate": 9.509594882729211e-06,
"loss": 0.47,
"step": 893
},
{
"epoch": 1.4304000000000001,
"grad_norm": 0.8947031000453353,
"learning_rate": 9.520255863539445e-06,
"loss": 0.5253,
"step": 894
},
{
"epoch": 1.432,
"grad_norm": 0.7777694756552822,
"learning_rate": 9.530916844349682e-06,
"loss": 0.5,
"step": 895
},
{
"epoch": 1.4336,
"grad_norm": 0.8413775097766173,
"learning_rate": 9.541577825159916e-06,
"loss": 0.5227,
"step": 896
},
{
"epoch": 1.4352,
"grad_norm": 0.7595043259234036,
"learning_rate": 9.552238805970149e-06,
"loss": 0.4772,
"step": 897
},
{
"epoch": 1.4368,
"grad_norm": 0.8124697261241832,
"learning_rate": 9.562899786780384e-06,
"loss": 0.4981,
"step": 898
},
{
"epoch": 1.4384000000000001,
"grad_norm": 0.859236848802032,
"learning_rate": 9.57356076759062e-06,
"loss": 0.4693,
"step": 899
},
{
"epoch": 1.44,
"grad_norm": 0.8104200026032801,
"learning_rate": 9.584221748400854e-06,
"loss": 0.4771,
"step": 900
},
{
"epoch": 1.4416,
"grad_norm": 0.8584614348837653,
"learning_rate": 9.594882729211089e-06,
"loss": 0.5066,
"step": 901
},
{
"epoch": 1.4432,
"grad_norm": 0.8259724591743942,
"learning_rate": 9.605543710021322e-06,
"loss": 0.477,
"step": 902
},
{
"epoch": 1.4447999999999999,
"grad_norm": 0.817674349808855,
"learning_rate": 9.616204690831558e-06,
"loss": 0.4648,
"step": 903
},
{
"epoch": 1.4464000000000001,
"grad_norm": 0.8859816874161797,
"learning_rate": 9.626865671641792e-06,
"loss": 0.4618,
"step": 904
},
{
"epoch": 1.448,
"grad_norm": 0.8575821423889298,
"learning_rate": 9.637526652452027e-06,
"loss": 0.5056,
"step": 905
},
{
"epoch": 1.4496,
"grad_norm": 0.9607868269874424,
"learning_rate": 9.64818763326226e-06,
"loss": 0.4691,
"step": 906
},
{
"epoch": 1.4512,
"grad_norm": 0.8672657067709886,
"learning_rate": 9.658848614072496e-06,
"loss": 0.4962,
"step": 907
},
{
"epoch": 1.4527999999999999,
"grad_norm": 1.0445509426721418,
"learning_rate": 9.66950959488273e-06,
"loss": 0.476,
"step": 908
},
{
"epoch": 1.4544000000000001,
"grad_norm": 0.9684439200424949,
"learning_rate": 9.680170575692965e-06,
"loss": 0.4726,
"step": 909
},
{
"epoch": 1.456,
"grad_norm": 0.7742572368999419,
"learning_rate": 9.6908315565032e-06,
"loss": 0.4998,
"step": 910
},
{
"epoch": 1.4576,
"grad_norm": 0.8676632527642012,
"learning_rate": 9.701492537313434e-06,
"loss": 0.4948,
"step": 911
},
{
"epoch": 1.4592,
"grad_norm": 0.7629886760452764,
"learning_rate": 9.712153518123669e-06,
"loss": 0.4777,
"step": 912
},
{
"epoch": 1.4607999999999999,
"grad_norm": 0.9709918958387708,
"learning_rate": 9.722814498933903e-06,
"loss": 0.4813,
"step": 913
},
{
"epoch": 1.4624,
"grad_norm": 0.8319565078199239,
"learning_rate": 9.733475479744138e-06,
"loss": 0.483,
"step": 914
},
{
"epoch": 1.464,
"grad_norm": 0.9560500791255669,
"learning_rate": 9.744136460554372e-06,
"loss": 0.4597,
"step": 915
},
{
"epoch": 1.4656,
"grad_norm": 0.8551324757472956,
"learning_rate": 9.754797441364607e-06,
"loss": 0.483,
"step": 916
},
{
"epoch": 1.4672,
"grad_norm": 0.8819372002315914,
"learning_rate": 9.765458422174841e-06,
"loss": 0.5022,
"step": 917
},
{
"epoch": 1.4687999999999999,
"grad_norm": 0.8919454392102223,
"learning_rate": 9.776119402985076e-06,
"loss": 0.4811,
"step": 918
},
{
"epoch": 1.4704,
"grad_norm": 0.7956928740696931,
"learning_rate": 9.78678038379531e-06,
"loss": 0.5032,
"step": 919
},
{
"epoch": 1.472,
"grad_norm": 0.9460431224219428,
"learning_rate": 9.797441364605545e-06,
"loss": 0.5013,
"step": 920
},
{
"epoch": 1.4736,
"grad_norm": 0.7563920215009783,
"learning_rate": 9.80810234541578e-06,
"loss": 0.4912,
"step": 921
},
{
"epoch": 1.4752,
"grad_norm": 0.7958797181312411,
"learning_rate": 9.818763326226014e-06,
"loss": 0.4845,
"step": 922
},
{
"epoch": 1.4768,
"grad_norm": 0.8749244767024981,
"learning_rate": 9.829424307036248e-06,
"loss": 0.4977,
"step": 923
},
{
"epoch": 1.4784,
"grad_norm": 0.7973735882306109,
"learning_rate": 9.840085287846483e-06,
"loss": 0.4984,
"step": 924
},
{
"epoch": 1.48,
"grad_norm": 0.8868550866351855,
"learning_rate": 9.850746268656717e-06,
"loss": 0.4926,
"step": 925
},
{
"epoch": 1.4816,
"grad_norm": 0.9107461226553807,
"learning_rate": 9.861407249466952e-06,
"loss": 0.4739,
"step": 926
},
{
"epoch": 1.4832,
"grad_norm": 0.9588502326001234,
"learning_rate": 9.872068230277187e-06,
"loss": 0.4782,
"step": 927
},
{
"epoch": 1.4848,
"grad_norm": 0.8364354427376426,
"learning_rate": 9.882729211087421e-06,
"loss": 0.4477,
"step": 928
},
{
"epoch": 1.4864,
"grad_norm": 0.7874286019641639,
"learning_rate": 9.893390191897656e-06,
"loss": 0.4413,
"step": 929
},
{
"epoch": 1.488,
"grad_norm": 0.978506341009771,
"learning_rate": 9.90405117270789e-06,
"loss": 0.4612,
"step": 930
},
{
"epoch": 1.4896,
"grad_norm": 0.8490652879114352,
"learning_rate": 9.914712153518125e-06,
"loss": 0.4751,
"step": 931
},
{
"epoch": 1.4912,
"grad_norm": 1.0018664232526466,
"learning_rate": 9.925373134328359e-06,
"loss": 0.4671,
"step": 932
},
{
"epoch": 1.4928,
"grad_norm": 0.8614595050477257,
"learning_rate": 9.936034115138594e-06,
"loss": 0.4939,
"step": 933
},
{
"epoch": 1.4944,
"grad_norm": 0.8324806829999502,
"learning_rate": 9.946695095948828e-06,
"loss": 0.457,
"step": 934
},
{
"epoch": 1.496,
"grad_norm": 0.8503951859763951,
"learning_rate": 9.957356076759063e-06,
"loss": 0.4725,
"step": 935
},
{
"epoch": 1.4976,
"grad_norm": 0.8755011425954693,
"learning_rate": 9.968017057569297e-06,
"loss": 0.4851,
"step": 936
},
{
"epoch": 1.4992,
"grad_norm": 0.9263738559864301,
"learning_rate": 9.978678038379532e-06,
"loss": 0.4885,
"step": 937
},
{
"epoch": 1.5008,
"grad_norm": 0.9673864844853349,
"learning_rate": 9.989339019189766e-06,
"loss": 0.4881,
"step": 938
},
{
"epoch": 1.5024,
"grad_norm": 0.9290777102985807,
"learning_rate": 1e-05,
"loss": 0.4821,
"step": 939
},
{
"epoch": 1.504,
"grad_norm": 0.8458709982219547,
"learning_rate": 9.999971896515836e-06,
"loss": 0.4915,
"step": 940
},
{
"epoch": 1.5056,
"grad_norm": 1.0858511586378827,
"learning_rate": 9.999887586379264e-06,
"loss": 0.4756,
"step": 941
},
{
"epoch": 1.5072,
"grad_norm": 0.8354556740923118,
"learning_rate": 9.99974707053805e-06,
"loss": 0.4847,
"step": 942
},
{
"epoch": 1.5088,
"grad_norm": 0.9898179436259126,
"learning_rate": 9.999550350571785e-06,
"loss": 0.4801,
"step": 943
},
{
"epoch": 1.5104,
"grad_norm": 0.8793163891262439,
"learning_rate": 9.999297428691878e-06,
"loss": 0.4699,
"step": 944
},
{
"epoch": 1.512,
"grad_norm": 0.9930914467085877,
"learning_rate": 9.998988307741521e-06,
"loss": 0.5036,
"step": 945
},
{
"epoch": 1.5135999999999998,
"grad_norm": 0.8909325985734913,
"learning_rate": 9.998622991195668e-06,
"loss": 0.4843,
"step": 946
},
{
"epoch": 1.5152,
"grad_norm": 0.8906776966778176,
"learning_rate": 9.998201483160981e-06,
"loss": 0.4924,
"step": 947
},
{
"epoch": 1.5168,
"grad_norm": 1.0210001302818335,
"learning_rate": 9.997723788375803e-06,
"loss": 0.4908,
"step": 948
},
{
"epoch": 1.5184,
"grad_norm": 0.8343669235401521,
"learning_rate": 9.997189912210086e-06,
"loss": 0.4665,
"step": 949
},
{
"epoch": 1.52,
"grad_norm": 0.8811832399885232,
"learning_rate": 9.996599860665342e-06,
"loss": 0.4589,
"step": 950
},
{
"epoch": 1.5215999999999998,
"grad_norm": 0.8437250504573268,
"learning_rate": 9.995953640374574e-06,
"loss": 0.5018,
"step": 951
},
{
"epoch": 1.5232,
"grad_norm": 0.9152349301031418,
"learning_rate": 9.9952512586022e-06,
"loss": 0.488,
"step": 952
},
{
"epoch": 1.5248,
"grad_norm": 0.7795464645786981,
"learning_rate": 9.994492723243965e-06,
"loss": 0.4933,
"step": 953
},
{
"epoch": 1.5264,
"grad_norm": 0.763019952273118,
"learning_rate": 9.993678042826869e-06,
"loss": 0.4714,
"step": 954
},
{
"epoch": 1.528,
"grad_norm": 0.886034738901285,
"learning_rate": 9.99280722650905e-06,
"loss": 0.5121,
"step": 955
},
{
"epoch": 1.5295999999999998,
"grad_norm": 0.7455654076044084,
"learning_rate": 9.991880284079704e-06,
"loss": 0.4568,
"step": 956
},
{
"epoch": 1.5312000000000001,
"grad_norm": 0.8682016590039219,
"learning_rate": 9.99089722595895e-06,
"loss": 0.4554,
"step": 957
},
{
"epoch": 1.5328,
"grad_norm": 0.8015242545658919,
"learning_rate": 9.989858063197735e-06,
"loss": 0.4814,
"step": 958
},
{
"epoch": 1.5344,
"grad_norm": 0.8341406535742181,
"learning_rate": 9.988762807477694e-06,
"loss": 0.4821,
"step": 959
},
{
"epoch": 1.536,
"grad_norm": 0.8709933534074977,
"learning_rate": 9.987611471111027e-06,
"loss": 0.4877,
"step": 960
},
{
"epoch": 1.5375999999999999,
"grad_norm": 0.7987019576052976,
"learning_rate": 9.986404067040363e-06,
"loss": 0.4707,
"step": 961
},
{
"epoch": 1.5392000000000001,
"grad_norm": 0.8902420538131095,
"learning_rate": 9.985140608838604e-06,
"loss": 0.5248,
"step": 962
},
{
"epoch": 1.5408,
"grad_norm": 0.7170194508042936,
"learning_rate": 9.98382111070878e-06,
"loss": 0.424,
"step": 963
},
{
"epoch": 1.5424,
"grad_norm": 0.914961108671794,
"learning_rate": 9.982445587483893e-06,
"loss": 0.4805,
"step": 964
},
{
"epoch": 1.544,
"grad_norm": 0.9088053940764901,
"learning_rate": 9.981014054626737e-06,
"loss": 0.5042,
"step": 965
},
{
"epoch": 1.5455999999999999,
"grad_norm": 0.8489619285484816,
"learning_rate": 9.979526528229737e-06,
"loss": 0.4956,
"step": 966
},
{
"epoch": 1.5472000000000001,
"grad_norm": 0.8164706702959176,
"learning_rate": 9.977983025014765e-06,
"loss": 0.4689,
"step": 967
},
{
"epoch": 1.5488,
"grad_norm": 0.7566483029226344,
"learning_rate": 9.976383562332946e-06,
"loss": 0.5193,
"step": 968
},
{
"epoch": 1.5504,
"grad_norm": 0.880276594890666,
"learning_rate": 9.974728158164471e-06,
"loss": 0.4655,
"step": 969
},
{
"epoch": 1.552,
"grad_norm": 0.8081458173299156,
"learning_rate": 9.973016831118389e-06,
"loss": 0.5069,
"step": 970
},
{
"epoch": 1.5535999999999999,
"grad_norm": 0.8544537513815259,
"learning_rate": 9.971249600432403e-06,
"loss": 0.4798,
"step": 971
},
{
"epoch": 1.5552000000000001,
"grad_norm": 0.7504450860493904,
"learning_rate": 9.969426485972645e-06,
"loss": 0.4867,
"step": 972
},
{
"epoch": 1.5568,
"grad_norm": 0.8701881540311115,
"learning_rate": 9.967547508233466e-06,
"loss": 0.4684,
"step": 973
},
{
"epoch": 1.5584,
"grad_norm": 0.8852314246097367,
"learning_rate": 9.965612688337194e-06,
"loss": 0.4855,
"step": 974
},
{
"epoch": 1.56,
"grad_norm": 0.8242831459153037,
"learning_rate": 9.9636220480339e-06,
"loss": 0.5001,
"step": 975
},
{
"epoch": 1.5615999999999999,
"grad_norm": 1.005604417980349,
"learning_rate": 9.961575609701154e-06,
"loss": 0.4891,
"step": 976
},
{
"epoch": 1.5632000000000001,
"grad_norm": 0.8862228251379858,
"learning_rate": 9.959473396343777e-06,
"loss": 0.4831,
"step": 977
},
{
"epoch": 1.5648,
"grad_norm": 0.864096678336817,
"learning_rate": 9.957315431593578e-06,
"loss": 0.4843,
"step": 978
},
{
"epoch": 1.5664,
"grad_norm": 0.776906764480194,
"learning_rate": 9.955101739709085e-06,
"loss": 0.4616,
"step": 979
},
{
"epoch": 1.568,
"grad_norm": 0.8029486812052213,
"learning_rate": 9.952832345575283e-06,
"loss": 0.4757,
"step": 980
},
{
"epoch": 1.5695999999999999,
"grad_norm": 0.8036715371775551,
"learning_rate": 9.950507274703323e-06,
"loss": 0.5217,
"step": 981
},
{
"epoch": 1.5712000000000002,
"grad_norm": 0.7765331989278372,
"learning_rate": 9.948126553230242e-06,
"loss": 0.5046,
"step": 982
},
{
"epoch": 1.5728,
"grad_norm": 0.8150699203250963,
"learning_rate": 9.945690207918667e-06,
"loss": 0.4506,
"step": 983
},
{
"epoch": 1.5744,
"grad_norm": 0.7255927188827002,
"learning_rate": 9.943198266156517e-06,
"loss": 0.4603,
"step": 984
},
{
"epoch": 1.576,
"grad_norm": 0.8155544836444186,
"learning_rate": 9.940650755956686e-06,
"loss": 0.4495,
"step": 985
},
{
"epoch": 1.5776,
"grad_norm": 0.7461262093206759,
"learning_rate": 9.938047705956746e-06,
"loss": 0.4772,
"step": 986
},
{
"epoch": 1.5792000000000002,
"grad_norm": 0.8761054409345614,
"learning_rate": 9.935389145418599e-06,
"loss": 0.4763,
"step": 987
},
{
"epoch": 1.5808,
"grad_norm": 0.7085545139303683,
"learning_rate": 9.932675104228177e-06,
"loss": 0.4716,
"step": 988
},
{
"epoch": 1.5824,
"grad_norm": 0.8608971225616455,
"learning_rate": 9.929905612895082e-06,
"loss": 0.5045,
"step": 989
},
{
"epoch": 1.584,
"grad_norm": 0.8489045991866748,
"learning_rate": 9.927080702552256e-06,
"loss": 0.4897,
"step": 990
},
{
"epoch": 1.5856,
"grad_norm": 0.7519848393673628,
"learning_rate": 9.924200404955628e-06,
"loss": 0.5145,
"step": 991
},
{
"epoch": 1.5872000000000002,
"grad_norm": 0.7892016944926196,
"learning_rate": 9.921264752483761e-06,
"loss": 0.488,
"step": 992
},
{
"epoch": 1.5888,
"grad_norm": 0.7995854543733288,
"learning_rate": 9.918273778137477e-06,
"loss": 0.4903,
"step": 993
},
{
"epoch": 1.5904,
"grad_norm": 0.7423383199728011,
"learning_rate": 9.915227515539497e-06,
"loss": 0.4656,
"step": 994
},
{
"epoch": 1.592,
"grad_norm": 0.8102371092720678,
"learning_rate": 9.912125998934055e-06,
"loss": 0.5035,
"step": 995
},
{
"epoch": 1.5936,
"grad_norm": 0.8743193972829729,
"learning_rate": 9.908969263186525e-06,
"loss": 0.4819,
"step": 996
},
{
"epoch": 1.5952,
"grad_norm": 0.832621075746147,
"learning_rate": 9.905757343783014e-06,
"loss": 0.4652,
"step": 997
},
{
"epoch": 1.5968,
"grad_norm": 0.8268268770260372,
"learning_rate": 9.90249027682997e-06,
"loss": 0.4914,
"step": 998
},
{
"epoch": 1.5984,
"grad_norm": 0.8225814361360733,
"learning_rate": 9.899168099053784e-06,
"loss": 0.4919,
"step": 999
},
{
"epoch": 1.6,
"grad_norm": 0.8339861968131481,
"learning_rate": 9.895790847800361e-06,
"loss": 0.5045,
"step": 1000
},
{
"epoch": 1.6016,
"grad_norm": 0.7750417430083946,
"learning_rate": 9.892358561034713e-06,
"loss": 0.5089,
"step": 1001
},
{
"epoch": 1.6032,
"grad_norm": 0.8973432178284,
"learning_rate": 9.888871277340522e-06,
"loss": 0.4907,
"step": 1002
},
{
"epoch": 1.6048,
"grad_norm": 0.8396657295268389,
"learning_rate": 9.885329035919724e-06,
"loss": 0.4591,
"step": 1003
},
{
"epoch": 1.6064,
"grad_norm": 0.8393541454238976,
"learning_rate": 9.881731876592046e-06,
"loss": 0.4808,
"step": 1004
},
{
"epoch": 1.608,
"grad_norm": 0.9385582992222241,
"learning_rate": 9.878079839794572e-06,
"loss": 0.494,
"step": 1005
},
{
"epoch": 1.6096,
"grad_norm": 0.830952348808366,
"learning_rate": 9.874372966581285e-06,
"loss": 0.4515,
"step": 1006
},
{
"epoch": 1.6112,
"grad_norm": 0.9056178883886478,
"learning_rate": 9.870611298622606e-06,
"loss": 0.4644,
"step": 1007
},
{
"epoch": 1.6128,
"grad_norm": 0.8817177962821087,
"learning_rate": 9.866794878204926e-06,
"loss": 0.4801,
"step": 1008
},
{
"epoch": 1.6143999999999998,
"grad_norm": 0.8449838869033015,
"learning_rate": 9.862923748230128e-06,
"loss": 0.5034,
"step": 1009
},
{
"epoch": 1.616,
"grad_norm": 0.8231314388625361,
"learning_rate": 9.858997952215112e-06,
"loss": 0.4919,
"step": 1010
},
{
"epoch": 1.6176,
"grad_norm": 0.9054712168233684,
"learning_rate": 9.855017534291293e-06,
"loss": 0.4702,
"step": 1011
},
{
"epoch": 1.6192,
"grad_norm": 0.8603610256941876,
"learning_rate": 9.850982539204115e-06,
"loss": 0.494,
"step": 1012
},
{
"epoch": 1.6208,
"grad_norm": 0.7903765631524319,
"learning_rate": 9.846893012312549e-06,
"loss": 0.4865,
"step": 1013
},
{
"epoch": 1.6223999999999998,
"grad_norm": 0.8246003235011296,
"learning_rate": 9.842748999588575e-06,
"loss": 0.4781,
"step": 1014
},
{
"epoch": 1.624,
"grad_norm": 0.875643686096793,
"learning_rate": 9.838550547616671e-06,
"loss": 0.4808,
"step": 1015
},
{
"epoch": 1.6256,
"grad_norm": 0.881772767604039,
"learning_rate": 9.83429770359329e-06,
"loss": 0.5087,
"step": 1016
},
{
"epoch": 1.6272,
"grad_norm": 0.786175611896203,
"learning_rate": 9.829990515326324e-06,
"loss": 0.4822,
"step": 1017
},
{
"epoch": 1.6288,
"grad_norm": 0.832941696772607,
"learning_rate": 9.825629031234574e-06,
"loss": 0.4578,
"step": 1018
},
{
"epoch": 1.6303999999999998,
"grad_norm": 0.8394407049227415,
"learning_rate": 9.821213300347198e-06,
"loss": 0.4995,
"step": 1019
},
{
"epoch": 1.6320000000000001,
"grad_norm": 0.8861442201395248,
"learning_rate": 9.816743372303166e-06,
"loss": 0.4272,
"step": 1020
},
{
"epoch": 1.6336,
"grad_norm": 0.8465108245209845,
"learning_rate": 9.812219297350699e-06,
"loss": 0.5211,
"step": 1021
},
{
"epoch": 1.6352,
"grad_norm": 0.9345603525137967,
"learning_rate": 9.807641126346704e-06,
"loss": 0.4861,
"step": 1022
},
{
"epoch": 1.6368,
"grad_norm": 0.7714038775151155,
"learning_rate": 9.803008910756203e-06,
"loss": 0.5068,
"step": 1023
},
{
"epoch": 1.6383999999999999,
"grad_norm": 0.9111951235060974,
"learning_rate": 9.798322702651754e-06,
"loss": 0.4941,
"step": 1024
},
{
"epoch": 1.6400000000000001,
"grad_norm": 0.7219012597652522,
"learning_rate": 9.793582554712873e-06,
"loss": 0.4612,
"step": 1025
},
{
"epoch": 1.6416,
"grad_norm": 0.9229091704285989,
"learning_rate": 9.788788520225421e-06,
"loss": 0.4793,
"step": 1026
},
{
"epoch": 1.6432,
"grad_norm": 0.883303948978402,
"learning_rate": 9.783940653081033e-06,
"loss": 0.4766,
"step": 1027
},
{
"epoch": 1.6448,
"grad_norm": 0.8053642115782714,
"learning_rate": 9.779039007776487e-06,
"loss": 0.4932,
"step": 1028
},
{
"epoch": 1.6463999999999999,
"grad_norm": 0.904105581732504,
"learning_rate": 9.774083639413112e-06,
"loss": 0.4911,
"step": 1029
},
{
"epoch": 1.6480000000000001,
"grad_norm": 0.7208310136248466,
"learning_rate": 9.769074603696153e-06,
"loss": 0.4462,
"step": 1030
},
{
"epoch": 1.6496,
"grad_norm": 0.8215946897369514,
"learning_rate": 9.764011956934153e-06,
"loss": 0.4784,
"step": 1031
},
{
"epoch": 1.6512,
"grad_norm": 0.7917541871283812,
"learning_rate": 9.758895756038314e-06,
"loss": 0.4666,
"step": 1032
},
{
"epoch": 1.6528,
"grad_norm": 0.7759384663457598,
"learning_rate": 9.753726058521868e-06,
"loss": 0.4807,
"step": 1033
},
{
"epoch": 1.6543999999999999,
"grad_norm": 0.6925254734409707,
"learning_rate": 9.748502922499418e-06,
"loss": 0.4503,
"step": 1034
},
{
"epoch": 1.6560000000000001,
"grad_norm": 0.7794587615717855,
"learning_rate": 9.743226406686293e-06,
"loss": 0.4834,
"step": 1035
},
{
"epoch": 1.6576,
"grad_norm": 0.7395717362574595,
"learning_rate": 9.737896570397885e-06,
"loss": 0.4474,
"step": 1036
},
{
"epoch": 1.6592,
"grad_norm": 0.7921173834301597,
"learning_rate": 9.73251347354898e-06,
"loss": 0.4673,
"step": 1037
},
{
"epoch": 1.6608,
"grad_norm": 0.7735229936168149,
"learning_rate": 9.72707717665309e-06,
"loss": 0.4767,
"step": 1038
},
{
"epoch": 1.6623999999999999,
"grad_norm": 0.7323984798081137,
"learning_rate": 9.721587740821768e-06,
"loss": 0.4533,
"step": 1039
},
{
"epoch": 1.6640000000000001,
"grad_norm": 0.7996750771350366,
"learning_rate": 9.716045227763923e-06,
"loss": 0.5183,
"step": 1040
},
{
"epoch": 1.6656,
"grad_norm": 0.755207320032317,
"learning_rate": 9.71044969978513e-06,
"loss": 0.525,
"step": 1041
},
{
"epoch": 1.6672,
"grad_norm": 0.8031777726974793,
"learning_rate": 9.704801219786915e-06,
"loss": 0.5058,
"step": 1042
},
{
"epoch": 1.6688,
"grad_norm": 0.7202638009085313,
"learning_rate": 9.699099851266071e-06,
"loss": 0.4838,
"step": 1043
},
{
"epoch": 1.6703999999999999,
"grad_norm": 0.7882504924842982,
"learning_rate": 9.693345658313923e-06,
"loss": 0.4797,
"step": 1044
},
{
"epoch": 1.6720000000000002,
"grad_norm": 0.7207002076199619,
"learning_rate": 9.68753870561562e-06,
"loss": 0.4625,
"step": 1045
},
{
"epoch": 1.6736,
"grad_norm": 0.8360369650104762,
"learning_rate": 9.681679058449402e-06,
"loss": 0.4921,
"step": 1046
},
{
"epoch": 1.6752,
"grad_norm": 0.7392880407370019,
"learning_rate": 9.675766782685874e-06,
"loss": 0.4671,
"step": 1047
},
{
"epoch": 1.6768,
"grad_norm": 0.901397058863532,
"learning_rate": 9.669801944787249e-06,
"loss": 0.4909,
"step": 1048
},
{
"epoch": 1.6784,
"grad_norm": 0.7554033748327394,
"learning_rate": 9.663784611806624e-06,
"loss": 0.4583,
"step": 1049
},
{
"epoch": 1.6800000000000002,
"grad_norm": 0.8472780441657755,
"learning_rate": 9.657714851387204e-06,
"loss": 0.4892,
"step": 1050
},
{
"epoch": 1.6816,
"grad_norm": 0.7982580606887658,
"learning_rate": 9.651592731761554e-06,
"loss": 0.4884,
"step": 1051
},
{
"epoch": 1.6832,
"grad_norm": 0.9345405844469842,
"learning_rate": 9.645418321750835e-06,
"loss": 0.4774,
"step": 1052
},
{
"epoch": 1.6848,
"grad_norm": 0.8228296409803656,
"learning_rate": 9.639191690764018e-06,
"loss": 0.4693,
"step": 1053
},
{
"epoch": 1.6864,
"grad_norm": 0.9276080068135167,
"learning_rate": 9.632912908797116e-06,
"loss": 0.4657,
"step": 1054
},
{
"epoch": 1.688,
"grad_norm": 0.8426690179390168,
"learning_rate": 9.626582046432384e-06,
"loss": 0.4787,
"step": 1055
},
{
"epoch": 1.6896,
"grad_norm": 0.8977757427025327,
"learning_rate": 9.620199174837542e-06,
"loss": 0.4835,
"step": 1056
},
{
"epoch": 1.6912,
"grad_norm": 0.7919301046680037,
"learning_rate": 9.61376436576496e-06,
"loss": 0.4829,
"step": 1057
},
{
"epoch": 1.6928,
"grad_norm": 0.8129688641139422,
"learning_rate": 9.607277691550862e-06,
"loss": 0.4712,
"step": 1058
},
{
"epoch": 1.6944,
"grad_norm": 0.8450952852686898,
"learning_rate": 9.600739225114506e-06,
"loss": 0.4912,
"step": 1059
},
{
"epoch": 1.696,
"grad_norm": 0.7760494775184295,
"learning_rate": 9.594149039957366e-06,
"loss": 0.4566,
"step": 1060
},
{
"epoch": 1.6976,
"grad_norm": 0.8362616732216648,
"learning_rate": 9.587507210162307e-06,
"loss": 0.4674,
"step": 1061
},
{
"epoch": 1.6992,
"grad_norm": 0.7923720181156818,
"learning_rate": 9.580813810392755e-06,
"loss": 0.4911,
"step": 1062
},
{
"epoch": 1.7008,
"grad_norm": 0.76930025123472,
"learning_rate": 9.57406891589185e-06,
"loss": 0.4443,
"step": 1063
},
{
"epoch": 1.7024,
"grad_norm": 0.7388257976016034,
"learning_rate": 9.567272602481607e-06,
"loss": 0.4625,
"step": 1064
},
{
"epoch": 1.704,
"grad_norm": 0.7435395961998326,
"learning_rate": 9.56042494656206e-06,
"loss": 0.4629,
"step": 1065
},
{
"epoch": 1.7056,
"grad_norm": 0.7445853385287796,
"learning_rate": 9.553526025110404e-06,
"loss": 0.51,
"step": 1066
},
{
"epoch": 1.7072,
"grad_norm": 0.7566025320046942,
"learning_rate": 9.546575915680134e-06,
"loss": 0.5005,
"step": 1067
},
{
"epoch": 1.7088,
"grad_norm": 0.7759422448278186,
"learning_rate": 9.539574696400165e-06,
"loss": 0.5031,
"step": 1068
},
{
"epoch": 1.7104,
"grad_norm": 0.7543277771064258,
"learning_rate": 9.532522445973956e-06,
"loss": 0.4995,
"step": 1069
},
{
"epoch": 1.712,
"grad_norm": 0.7341510333436678,
"learning_rate": 9.525419243678633e-06,
"loss": 0.4624,
"step": 1070
},
{
"epoch": 1.7136,
"grad_norm": 0.7875756776882786,
"learning_rate": 9.51826516936409e-06,
"loss": 0.4972,
"step": 1071
},
{
"epoch": 1.7151999999999998,
"grad_norm": 0.7843503644841715,
"learning_rate": 9.51106030345209e-06,
"loss": 0.4795,
"step": 1072
},
{
"epoch": 1.7168,
"grad_norm": 0.7237225873861687,
"learning_rate": 9.503804726935369e-06,
"loss": 0.4818,
"step": 1073
},
{
"epoch": 1.7184,
"grad_norm": 0.7787633601269695,
"learning_rate": 9.496498521376718e-06,
"loss": 0.495,
"step": 1074
},
{
"epoch": 1.72,
"grad_norm": 0.6977403808538022,
"learning_rate": 9.48914176890807e-06,
"loss": 0.4472,
"step": 1075
},
{
"epoch": 1.7216,
"grad_norm": 0.7696335166426286,
"learning_rate": 9.481734552229578e-06,
"loss": 0.455,
"step": 1076
},
{
"epoch": 1.7231999999999998,
"grad_norm": 0.7112621901106786,
"learning_rate": 9.474276954608677e-06,
"loss": 0.5013,
"step": 1077
},
{
"epoch": 1.7248,
"grad_norm": 0.9176947750950379,
"learning_rate": 9.46676905987916e-06,
"loss": 0.5003,
"step": 1078
},
{
"epoch": 1.7264,
"grad_norm": 1.3248973520670817,
"learning_rate": 9.459210952440226e-06,
"loss": 0.4855,
"step": 1079
},
{
"epoch": 1.728,
"grad_norm": 0.8812839955457178,
"learning_rate": 9.451602717255536e-06,
"loss": 0.4756,
"step": 1080
},
{
"epoch": 1.7296,
"grad_norm": 0.7358226984295164,
"learning_rate": 9.44394443985226e-06,
"loss": 0.4564,
"step": 1081
},
{
"epoch": 1.7311999999999999,
"grad_norm": 0.7730761571200185,
"learning_rate": 9.436236206320104e-06,
"loss": 0.4766,
"step": 1082
},
{
"epoch": 1.7328000000000001,
"grad_norm": 0.7509702127464707,
"learning_rate": 9.428478103310358e-06,
"loss": 0.4867,
"step": 1083
},
{
"epoch": 1.7344,
"grad_norm": 0.8109776958271638,
"learning_rate": 9.420670218034913e-06,
"loss": 0.4955,
"step": 1084
},
{
"epoch": 1.736,
"grad_norm": 0.7674813988649232,
"learning_rate": 9.412812638265279e-06,
"loss": 0.4891,
"step": 1085
},
{
"epoch": 1.7376,
"grad_norm": 0.8525778346012978,
"learning_rate": 9.404905452331605e-06,
"loss": 0.4575,
"step": 1086
},
{
"epoch": 1.7391999999999999,
"grad_norm": 0.7941716219885411,
"learning_rate": 9.39694874912168e-06,
"loss": 0.4816,
"step": 1087
},
{
"epoch": 1.7408000000000001,
"grad_norm": 0.7524721878502354,
"learning_rate": 9.38894261807994e-06,
"loss": 0.4736,
"step": 1088
},
{
"epoch": 1.7424,
"grad_norm": 0.7673717141057368,
"learning_rate": 9.380887149206453e-06,
"loss": 0.4751,
"step": 1089
},
{
"epoch": 1.744,
"grad_norm": 0.8733631085887028,
"learning_rate": 9.372782433055915e-06,
"loss": 0.4983,
"step": 1090
},
{
"epoch": 1.7456,
"grad_norm": 0.7790443134514797,
"learning_rate": 9.364628560736631e-06,
"loss": 0.4634,
"step": 1091
},
{
"epoch": 1.7471999999999999,
"grad_norm": 0.7347132470320543,
"learning_rate": 9.356425623909493e-06,
"loss": 0.4538,
"step": 1092
},
{
"epoch": 1.7488000000000001,
"grad_norm": 0.7984639832990776,
"learning_rate": 9.34817371478694e-06,
"loss": 0.4576,
"step": 1093
},
{
"epoch": 1.7504,
"grad_norm": 0.7949795614582291,
"learning_rate": 9.33987292613193e-06,
"loss": 0.494,
"step": 1094
},
{
"epoch": 1.752,
"grad_norm": 0.8528298981207325,
"learning_rate": 9.331523351256898e-06,
"loss": 0.4719,
"step": 1095
},
{
"epoch": 1.7536,
"grad_norm": 0.8273161679245847,
"learning_rate": 9.323125084022701e-06,
"loss": 0.5114,
"step": 1096
},
{
"epoch": 1.7551999999999999,
"grad_norm": 0.8177732465705448,
"learning_rate": 9.31467821883757e-06,
"loss": 0.4693,
"step": 1097
},
{
"epoch": 1.7568000000000001,
"grad_norm": 0.7929842641850815,
"learning_rate": 9.306182850656037e-06,
"loss": 0.5143,
"step": 1098
},
{
"epoch": 1.7584,
"grad_norm": 0.7983649940388804,
"learning_rate": 9.297639074977885e-06,
"loss": 0.4477,
"step": 1099
},
{
"epoch": 1.76,
"grad_norm": 0.8030706601108953,
"learning_rate": 9.289046987847058e-06,
"loss": 0.4946,
"step": 1100
},
{
"epoch": 1.7616,
"grad_norm": 0.7414340357556809,
"learning_rate": 9.280406685850587e-06,
"loss": 0.4696,
"step": 1101
},
{
"epoch": 1.7631999999999999,
"grad_norm": 0.7824443709979357,
"learning_rate": 9.271718266117512e-06,
"loss": 0.4979,
"step": 1102
},
{
"epoch": 1.7648000000000001,
"grad_norm": 0.7926314063991798,
"learning_rate": 9.262981826317778e-06,
"loss": 0.4782,
"step": 1103
},
{
"epoch": 1.7664,
"grad_norm": 0.7604584164739576,
"learning_rate": 9.254197464661143e-06,
"loss": 0.4895,
"step": 1104
},
{
"epoch": 1.768,
"grad_norm": 0.761316814462793,
"learning_rate": 9.245365279896077e-06,
"loss": 0.4837,
"step": 1105
},
{
"epoch": 1.7696,
"grad_norm": 0.7209835263967452,
"learning_rate": 9.236485371308642e-06,
"loss": 0.4837,
"step": 1106
},
{
"epoch": 1.7711999999999999,
"grad_norm": 0.8213247627372725,
"learning_rate": 9.227557838721391e-06,
"loss": 0.4822,
"step": 1107
},
{
"epoch": 1.7728000000000002,
"grad_norm": 0.7460338028112166,
"learning_rate": 9.218582782492228e-06,
"loss": 0.495,
"step": 1108
},
{
"epoch": 1.7744,
"grad_norm": 0.7267567965373588,
"learning_rate": 9.209560303513296e-06,
"loss": 0.4368,
"step": 1109
},
{
"epoch": 1.776,
"grad_norm": 0.7211707144951695,
"learning_rate": 9.200490503209831e-06,
"loss": 0.4996,
"step": 1110
},
{
"epoch": 1.7776,
"grad_norm": 0.7257292962586889,
"learning_rate": 9.191373483539032e-06,
"loss": 0.4519,
"step": 1111
},
{
"epoch": 1.7792,
"grad_norm": 0.7181000434476451,
"learning_rate": 9.182209346988901e-06,
"loss": 0.4877,
"step": 1112
},
{
"epoch": 1.7808000000000002,
"grad_norm": 0.6998913729441977,
"learning_rate": 9.17299819657711e-06,
"loss": 0.4756,
"step": 1113
},
{
"epoch": 1.7824,
"grad_norm": 0.7585859336115078,
"learning_rate": 9.163740135849824e-06,
"loss": 0.4832,
"step": 1114
},
{
"epoch": 1.784,
"grad_norm": 0.7835474770178205,
"learning_rate": 9.154435268880547e-06,
"loss": 0.4748,
"step": 1115
},
{
"epoch": 1.7856,
"grad_norm": 0.7508571411491034,
"learning_rate": 9.145083700268955e-06,
"loss": 0.4649,
"step": 1116
},
{
"epoch": 1.7872,
"grad_norm": 0.8020723261386411,
"learning_rate": 9.135685535139709e-06,
"loss": 0.5001,
"step": 1117
},
{
"epoch": 1.7888,
"grad_norm": 0.6910426818371657,
"learning_rate": 9.126240879141286e-06,
"loss": 0.4466,
"step": 1118
},
{
"epoch": 1.7904,
"grad_norm": 0.8312177303482557,
"learning_rate": 9.116749838444778e-06,
"loss": 0.5066,
"step": 1119
},
{
"epoch": 1.792,
"grad_norm": 0.8798938353858576,
"learning_rate": 9.107212519742714e-06,
"loss": 0.4837,
"step": 1120
},
{
"epoch": 1.7936,
"grad_norm": 0.7398989429279293,
"learning_rate": 9.097629030247846e-06,
"loss": 0.5078,
"step": 1121
},
{
"epoch": 1.7952,
"grad_norm": 0.7872104783544963,
"learning_rate": 9.087999477691953e-06,
"loss": 0.4552,
"step": 1122
},
{
"epoch": 1.7968,
"grad_norm": 0.8673266664400423,
"learning_rate": 9.078323970324626e-06,
"loss": 0.4481,
"step": 1123
},
{
"epoch": 1.7984,
"grad_norm": 0.7759002350665339,
"learning_rate": 9.06860261691205e-06,
"loss": 0.4497,
"step": 1124
},
{
"epoch": 1.8,
"grad_norm": 0.8384770692839796,
"learning_rate": 9.058835526735788e-06,
"loss": 0.4474,
"step": 1125
},
{
"epoch": 1.8016,
"grad_norm": 0.7322304789714222,
"learning_rate": 9.049022809591546e-06,
"loss": 0.4449,
"step": 1126
},
{
"epoch": 1.8032,
"grad_norm": 0.7937302647096102,
"learning_rate": 9.039164575787937e-06,
"loss": 0.5042,
"step": 1127
},
{
"epoch": 1.8048,
"grad_norm": 0.8346903570431222,
"learning_rate": 9.029260936145252e-06,
"loss": 0.4603,
"step": 1128
},
{
"epoch": 1.8064,
"grad_norm": 0.7557433180719302,
"learning_rate": 9.019312001994203e-06,
"loss": 0.4576,
"step": 1129
},
{
"epoch": 1.808,
"grad_norm": 0.9072781260749982,
"learning_rate": 9.009317885174672e-06,
"loss": 0.4964,
"step": 1130
},
{
"epoch": 1.8096,
"grad_norm": 0.7673078175366279,
"learning_rate": 8.999278698034462e-06,
"loss": 0.4591,
"step": 1131
},
{
"epoch": 1.8112,
"grad_norm": 0.9578376130833234,
"learning_rate": 8.989194553428028e-06,
"loss": 0.4822,
"step": 1132
},
{
"epoch": 1.8128,
"grad_norm": 0.8435660382315425,
"learning_rate": 8.979065564715209e-06,
"loss": 0.4764,
"step": 1133
},
{
"epoch": 1.8144,
"grad_norm": 0.9384898259414467,
"learning_rate": 8.968891845759955e-06,
"loss": 0.4693,
"step": 1134
},
{
"epoch": 1.8159999999999998,
"grad_norm": 0.7740493192334655,
"learning_rate": 8.958673510929046e-06,
"loss": 0.4879,
"step": 1135
},
{
"epoch": 1.8176,
"grad_norm": 1.202185398672549,
"learning_rate": 8.948410675090807e-06,
"loss": 0.4947,
"step": 1136
},
{
"epoch": 1.8192,
"grad_norm": 0.792379067255798,
"learning_rate": 8.938103453613814e-06,
"loss": 0.4771,
"step": 1137
},
{
"epoch": 1.8208,
"grad_norm": 0.8306544602637841,
"learning_rate": 8.927751962365603e-06,
"loss": 0.4794,
"step": 1138
},
{
"epoch": 1.8224,
"grad_norm": 0.81541668232239,
"learning_rate": 8.917356317711359e-06,
"loss": 0.4741,
"step": 1139
},
{
"epoch": 1.8239999999999998,
"grad_norm": 0.7683318265954167,
"learning_rate": 8.906916636512618e-06,
"loss": 0.4822,
"step": 1140
},
{
"epoch": 1.8256000000000001,
"grad_norm": 0.8849502579326289,
"learning_rate": 8.89643303612595e-06,
"loss": 0.4685,
"step": 1141
},
{
"epoch": 1.8272,
"grad_norm": 0.7608968464316281,
"learning_rate": 8.885905634401629e-06,
"loss": 0.4666,
"step": 1142
},
{
"epoch": 1.8288,
"grad_norm": 0.8097542549953858,
"learning_rate": 8.875334549682322e-06,
"loss": 0.4783,
"step": 1143
},
{
"epoch": 1.8304,
"grad_norm": 0.8216152980420542,
"learning_rate": 8.864719900801755e-06,
"loss": 0.4584,
"step": 1144
},
{
"epoch": 1.8319999999999999,
"grad_norm": 0.8656307140884373,
"learning_rate": 8.854061807083376e-06,
"loss": 0.504,
"step": 1145
},
{
"epoch": 1.8336000000000001,
"grad_norm": 0.8224033400645684,
"learning_rate": 8.84336038833901e-06,
"loss": 0.4727,
"step": 1146
},
{
"epoch": 1.8352,
"grad_norm": 0.8363949626195645,
"learning_rate": 8.832615764867521e-06,
"loss": 0.4804,
"step": 1147
},
{
"epoch": 1.8368,
"grad_norm": 0.8119714327103984,
"learning_rate": 8.821828057453448e-06,
"loss": 0.5037,
"step": 1148
},
{
"epoch": 1.8384,
"grad_norm": 0.8058780927155984,
"learning_rate": 8.810997387365656e-06,
"loss": 0.4653,
"step": 1149
},
{
"epoch": 1.8399999999999999,
"grad_norm": 0.8968949671889885,
"learning_rate": 8.800123876355976e-06,
"loss": 0.4854,
"step": 1150
},
{
"epoch": 1.8416000000000001,
"grad_norm": 0.7386589358702967,
"learning_rate": 8.789207646657823e-06,
"loss": 0.4835,
"step": 1151
},
{
"epoch": 1.8432,
"grad_norm": 0.7837448106837076,
"learning_rate": 8.778248820984829e-06,
"loss": 0.4884,
"step": 1152
},
{
"epoch": 1.8448,
"grad_norm": 0.7522920292251475,
"learning_rate": 8.767247522529473e-06,
"loss": 0.485,
"step": 1153
},
{
"epoch": 1.8464,
"grad_norm": 0.8199450680001841,
"learning_rate": 8.75620387496168e-06,
"loss": 0.5207,
"step": 1154
},
{
"epoch": 1.8479999999999999,
"grad_norm": 0.7464684959146133,
"learning_rate": 8.74511800242744e-06,
"loss": 0.4942,
"step": 1155
},
{
"epoch": 1.8496000000000001,
"grad_norm": 0.8258051483853871,
"learning_rate": 8.733990029547408e-06,
"loss": 0.4882,
"step": 1156
},
{
"epoch": 1.8512,
"grad_norm": 0.6732953958913235,
"learning_rate": 8.72282008141551e-06,
"loss": 0.439,
"step": 1157
},
{
"epoch": 1.8528,
"grad_norm": 0.7402695579684151,
"learning_rate": 8.71160828359753e-06,
"loss": 0.4768,
"step": 1158
},
{
"epoch": 1.8544,
"grad_norm": 0.7040926189197414,
"learning_rate": 8.7003547621297e-06,
"loss": 0.4648,
"step": 1159
},
{
"epoch": 1.8559999999999999,
"grad_norm": 0.7250897200570943,
"learning_rate": 8.689059643517285e-06,
"loss": 0.4718,
"step": 1160
},
{
"epoch": 1.8576000000000001,
"grad_norm": 0.7012526603280671,
"learning_rate": 8.677723054733163e-06,
"loss": 0.449,
"step": 1161
},
{
"epoch": 1.8592,
"grad_norm": 0.8723369562580054,
"learning_rate": 8.666345123216387e-06,
"loss": 0.4954,
"step": 1162
},
{
"epoch": 1.8608,
"grad_norm": 0.7295611805392298,
"learning_rate": 8.654925976870766e-06,
"loss": 0.465,
"step": 1163
},
{
"epoch": 1.8624,
"grad_norm": 0.73472158103287,
"learning_rate": 8.64346574406342e-06,
"loss": 0.4837,
"step": 1164
},
{
"epoch": 1.8639999999999999,
"grad_norm": 0.7455549509974911,
"learning_rate": 8.631964553623336e-06,
"loss": 0.4818,
"step": 1165
},
{
"epoch": 1.8656000000000001,
"grad_norm": 0.7115678369682501,
"learning_rate": 8.620422534839925e-06,
"loss": 0.463,
"step": 1166
},
{
"epoch": 1.8672,
"grad_norm": 0.7857802546484215,
"learning_rate": 8.608839817461565e-06,
"loss": 0.4725,
"step": 1167
},
{
"epoch": 1.8688,
"grad_norm": 0.768644071166201,
"learning_rate": 8.597216531694136e-06,
"loss": 0.4873,
"step": 1168
},
{
"epoch": 1.8704,
"grad_norm": 0.7846588618058573,
"learning_rate": 8.585552808199577e-06,
"loss": 0.4738,
"step": 1169
},
{
"epoch": 1.8719999999999999,
"grad_norm": 0.7471086835732865,
"learning_rate": 8.57384877809439e-06,
"loss": 0.4537,
"step": 1170
},
{
"epoch": 1.8736000000000002,
"grad_norm": 0.8648658959127388,
"learning_rate": 8.562104572948185e-06,
"loss": 0.4698,
"step": 1171
},
{
"epoch": 1.8752,
"grad_norm": 0.7109035208423685,
"learning_rate": 8.550320324782198e-06,
"loss": 0.4553,
"step": 1172
},
{
"epoch": 1.8768,
"grad_norm": 0.8015327722654481,
"learning_rate": 8.538496166067798e-06,
"loss": 0.4774,
"step": 1173
},
{
"epoch": 1.8784,
"grad_norm": 0.7306504452949123,
"learning_rate": 8.526632229725012e-06,
"loss": 0.4827,
"step": 1174
},
{
"epoch": 1.88,
"grad_norm": 0.7004149184104759,
"learning_rate": 8.514728649121017e-06,
"loss": 0.4659,
"step": 1175
},
{
"epoch": 1.8816000000000002,
"grad_norm": 0.7913013945698448,
"learning_rate": 8.50278555806865e-06,
"loss": 0.4684,
"step": 1176
},
{
"epoch": 1.8832,
"grad_norm": 0.7801219375804972,
"learning_rate": 8.490803090824895e-06,
"loss": 0.4905,
"step": 1177
},
{
"epoch": 1.8848,
"grad_norm": 0.7351394597035321,
"learning_rate": 8.478781382089387e-06,
"loss": 0.469,
"step": 1178
},
{
"epoch": 1.8864,
"grad_norm": 0.7707855237564337,
"learning_rate": 8.466720567002887e-06,
"loss": 0.4831,
"step": 1179
},
{
"epoch": 1.888,
"grad_norm": 0.7208606345948122,
"learning_rate": 8.454620781145761e-06,
"loss": 0.4742,
"step": 1180
},
{
"epoch": 1.8896,
"grad_norm": 0.79848628475022,
"learning_rate": 8.442482160536469e-06,
"loss": 0.4863,
"step": 1181
},
{
"epoch": 1.8912,
"grad_norm": 0.6769590611616342,
"learning_rate": 8.430304841630024e-06,
"loss": 0.4649,
"step": 1182
},
{
"epoch": 1.8928,
"grad_norm": 0.7530969549620007,
"learning_rate": 8.418088961316459e-06,
"loss": 0.471,
"step": 1183
},
{
"epoch": 1.8944,
"grad_norm": 0.7114720544049936,
"learning_rate": 8.405834656919295e-06,
"loss": 0.4826,
"step": 1184
},
{
"epoch": 1.896,
"grad_norm": 0.7328135064170878,
"learning_rate": 8.393542066193994e-06,
"loss": 0.491,
"step": 1185
},
{
"epoch": 1.8976,
"grad_norm": 0.692361493124172,
"learning_rate": 8.381211327326403e-06,
"loss": 0.4407,
"step": 1186
},
{
"epoch": 1.8992,
"grad_norm": 0.7657820423628003,
"learning_rate": 8.368842578931214e-06,
"loss": 0.4808,
"step": 1187
},
{
"epoch": 1.9008,
"grad_norm": 0.7493563054639181,
"learning_rate": 8.356435960050398e-06,
"loss": 0.4588,
"step": 1188
},
{
"epoch": 1.9024,
"grad_norm": 0.7076961903732508,
"learning_rate": 8.34399161015164e-06,
"loss": 0.4603,
"step": 1189
},
{
"epoch": 1.904,
"grad_norm": 0.8155352599270661,
"learning_rate": 8.331509669126778e-06,
"loss": 0.4588,
"step": 1190
},
{
"epoch": 1.9056,
"grad_norm": 0.7246019082046947,
"learning_rate": 8.318990277290224e-06,
"loss": 0.462,
"step": 1191
},
{
"epoch": 1.9072,
"grad_norm": 0.8329063517847972,
"learning_rate": 8.306433575377388e-06,
"loss": 0.4558,
"step": 1192
},
{
"epoch": 1.9088,
"grad_norm": 0.6981008593587018,
"learning_rate": 8.293839704543103e-06,
"loss": 0.4834,
"step": 1193
},
{
"epoch": 1.9104,
"grad_norm": 0.8653080211979014,
"learning_rate": 8.281208806360028e-06,
"loss": 0.4507,
"step": 1194
},
{
"epoch": 1.912,
"grad_norm": 0.95604800095903,
"learning_rate": 8.268541022817058e-06,
"loss": 0.5203,
"step": 1195
},
{
"epoch": 1.9136,
"grad_norm": 0.7952132943771812,
"learning_rate": 8.255836496317739e-06,
"loss": 0.445,
"step": 1196
},
{
"epoch": 1.9152,
"grad_norm": 0.7661599790643271,
"learning_rate": 8.243095369678653e-06,
"loss": 0.45,
"step": 1197
},
{
"epoch": 1.9167999999999998,
"grad_norm": 0.736039388177421,
"learning_rate": 8.230317786127822e-06,
"loss": 0.4813,
"step": 1198
},
{
"epoch": 1.9184,
"grad_norm": 0.8051710006606744,
"learning_rate": 8.217503889303088e-06,
"loss": 0.4595,
"step": 1199
},
{
"epoch": 1.92,
"grad_norm": 0.7836426970956726,
"learning_rate": 8.204653823250516e-06,
"loss": 0.4831,
"step": 1200
},
{
"epoch": 1.9216,
"grad_norm": 0.7734510329192413,
"learning_rate": 8.191767732422754e-06,
"loss": 0.4809,
"step": 1201
},
{
"epoch": 1.9232,
"grad_norm": 0.7187700406077102,
"learning_rate": 8.17884576167742e-06,
"loss": 0.4409,
"step": 1202
},
{
"epoch": 1.9247999999999998,
"grad_norm": 0.745714087535346,
"learning_rate": 8.165888056275478e-06,
"loss": 0.443,
"step": 1203
},
{
"epoch": 1.9264000000000001,
"grad_norm": 0.7048072632404838,
"learning_rate": 8.152894761879593e-06,
"loss": 0.4838,
"step": 1204
},
{
"epoch": 1.928,
"grad_norm": 0.7264527631172872,
"learning_rate": 8.1398660245525e-06,
"loss": 0.4613,
"step": 1205
},
{
"epoch": 1.9296,
"grad_norm": 0.7195940478277059,
"learning_rate": 8.126801990755371e-06,
"loss": 0.4459,
"step": 1206
},
{
"epoch": 1.9312,
"grad_norm": 0.7553387112860547,
"learning_rate": 8.113702807346147e-06,
"loss": 0.4886,
"step": 1207
},
{
"epoch": 1.9327999999999999,
"grad_norm": 0.6954464450652126,
"learning_rate": 8.100568621577907e-06,
"loss": 0.4666,
"step": 1208
},
{
"epoch": 1.9344000000000001,
"grad_norm": 0.7304012484152707,
"learning_rate": 8.087399581097205e-06,
"loss": 0.4621,
"step": 1209
},
{
"epoch": 1.936,
"grad_norm": 1.6917118741128514,
"learning_rate": 8.074195833942405e-06,
"loss": 0.4705,
"step": 1210
},
{
"epoch": 1.9376,
"grad_norm": 0.7656279018780524,
"learning_rate": 8.060957528542032e-06,
"loss": 0.4889,
"step": 1211
},
{
"epoch": 1.9392,
"grad_norm": 0.6513452251174792,
"learning_rate": 8.047684813713086e-06,
"loss": 0.4582,
"step": 1212
},
{
"epoch": 1.9407999999999999,
"grad_norm": 1.0730911684214248,
"learning_rate": 8.03437783865938e-06,
"loss": 0.4389,
"step": 1213
},
{
"epoch": 1.9424000000000001,
"grad_norm": 0.7023367058796101,
"learning_rate": 8.021036752969859e-06,
"loss": 0.445,
"step": 1214
},
{
"epoch": 1.944,
"grad_norm": 0.6984689461295697,
"learning_rate": 8.007661706616919e-06,
"loss": 0.4888,
"step": 1215
},
{
"epoch": 1.9456,
"grad_norm": 0.7502708537039777,
"learning_rate": 7.99425284995472e-06,
"loss": 0.4468,
"step": 1216
},
{
"epoch": 1.9472,
"grad_norm": 0.7168320783099587,
"learning_rate": 7.980810333717499e-06,
"loss": 0.4701,
"step": 1217
},
{
"epoch": 1.9487999999999999,
"grad_norm": 0.8045767509305717,
"learning_rate": 7.967334309017876e-06,
"loss": 0.4439,
"step": 1218
},
{
"epoch": 1.9504000000000001,
"grad_norm": 0.8849219260640486,
"learning_rate": 7.953824927345146e-06,
"loss": 0.5201,
"step": 1219
},
{
"epoch": 1.952,
"grad_norm": 0.6787271238107349,
"learning_rate": 7.940282340563586e-06,
"loss": 0.466,
"step": 1220
},
{
"epoch": 1.9536,
"grad_norm": 0.7690530385600742,
"learning_rate": 7.92670670091075e-06,
"loss": 0.4524,
"step": 1221
},
{
"epoch": 1.9552,
"grad_norm": 0.6686012178909408,
"learning_rate": 7.913098160995742e-06,
"loss": 0.443,
"step": 1222
},
{
"epoch": 1.9567999999999999,
"grad_norm": 0.6996879963962194,
"learning_rate": 7.899456873797519e-06,
"loss": 0.4637,
"step": 1223
},
{
"epoch": 1.9584000000000001,
"grad_norm": 0.7150953281715766,
"learning_rate": 7.885782992663162e-06,
"loss": 0.4832,
"step": 1224
},
{
"epoch": 1.96,
"grad_norm": 0.6918952092029659,
"learning_rate": 7.87207667130615e-06,
"loss": 0.476,
"step": 1225
},
{
"epoch": 1.9616,
"grad_norm": 0.7115691290659868,
"learning_rate": 7.858338063804638e-06,
"loss": 0.499,
"step": 1226
},
{
"epoch": 1.9632,
"grad_norm": 0.6555011571772331,
"learning_rate": 7.84456732459972e-06,
"loss": 0.4843,
"step": 1227
},
{
"epoch": 1.9647999999999999,
"grad_norm": 0.715962295702684,
"learning_rate": 7.830764608493697e-06,
"loss": 0.4946,
"step": 1228
},
{
"epoch": 1.9664000000000001,
"grad_norm": 0.7541602992549761,
"learning_rate": 7.816930070648335e-06,
"loss": 0.4666,
"step": 1229
},
{
"epoch": 1.968,
"grad_norm": 0.720723686996417,
"learning_rate": 7.803063866583119e-06,
"loss": 0.4752,
"step": 1230
},
{
"epoch": 1.9696,
"grad_norm": 0.8413086585547764,
"learning_rate": 7.789166152173508e-06,
"loss": 0.4664,
"step": 1231
},
{
"epoch": 1.9712,
"grad_norm": 0.6792387613339629,
"learning_rate": 7.775237083649182e-06,
"loss": 0.4477,
"step": 1232
},
{
"epoch": 1.9727999999999999,
"grad_norm": 0.773696750431894,
"learning_rate": 7.761276817592283e-06,
"loss": 0.4861,
"step": 1233
},
{
"epoch": 1.9744000000000002,
"grad_norm": 0.696941964362057,
"learning_rate": 7.747285510935654e-06,
"loss": 0.4598,
"step": 1234
},
{
"epoch": 1.976,
"grad_norm": 0.7080059593102934,
"learning_rate": 7.733263320961087e-06,
"loss": 0.4656,
"step": 1235
},
{
"epoch": 1.9776,
"grad_norm": 0.7186736221997335,
"learning_rate": 7.719210405297537e-06,
"loss": 0.4651,
"step": 1236
},
{
"epoch": 1.9792,
"grad_norm": 0.7170560204985752,
"learning_rate": 7.705126921919358e-06,
"loss": 0.4561,
"step": 1237
},
{
"epoch": 1.9808,
"grad_norm": 0.7414750990700175,
"learning_rate": 7.691013029144535e-06,
"loss": 0.4574,
"step": 1238
},
{
"epoch": 1.9824000000000002,
"grad_norm": 0.7559504861881715,
"learning_rate": 7.676868885632893e-06,
"loss": 0.4478,
"step": 1239
},
{
"epoch": 1.984,
"grad_norm": 0.7159506812568182,
"learning_rate": 7.662694650384315e-06,
"loss": 0.4738,
"step": 1240
},
{
"epoch": 1.9856,
"grad_norm": 0.7148912940280229,
"learning_rate": 7.648490482736959e-06,
"loss": 0.4716,
"step": 1241
},
{
"epoch": 1.9872,
"grad_norm": 0.6786983608203037,
"learning_rate": 7.634256542365468e-06,
"loss": 0.4797,
"step": 1242
},
{
"epoch": 1.9888,
"grad_norm": 0.8016066893153321,
"learning_rate": 7.6199929892791666e-06,
"loss": 0.5211,
"step": 1243
},
{
"epoch": 1.9904,
"grad_norm": 0.6830168298566425,
"learning_rate": 7.60569998382027e-06,
"loss": 0.4985,
"step": 1244
},
{
"epoch": 1.992,
"grad_norm": 0.812026382596777,
"learning_rate": 7.591377686662081e-06,
"loss": 0.4719,
"step": 1245
},
{
"epoch": 1.9936,
"grad_norm": 0.7132848502425145,
"learning_rate": 7.577026258807181e-06,
"loss": 0.4802,
"step": 1246
},
{
"epoch": 1.9952,
"grad_norm": 0.7187112930479812,
"learning_rate": 7.562645861585615e-06,
"loss": 0.4658,
"step": 1247
},
{
"epoch": 1.9968,
"grad_norm": 0.7218289482909626,
"learning_rate": 7.548236656653095e-06,
"loss": 0.4692,
"step": 1248
},
{
"epoch": 1.9984,
"grad_norm": 0.6893057203147361,
"learning_rate": 7.533798805989165e-06,
"loss": 0.479,
"step": 1249
},
{
"epoch": 2.0,
"grad_norm": 0.6811147668238169,
"learning_rate": 7.519332471895384e-06,
"loss": 0.4228,
"step": 1250
},
{
"epoch": 2.0016,
"grad_norm": 0.942142555999305,
"learning_rate": 7.504837816993513e-06,
"loss": 0.4094,
"step": 1251
},
{
"epoch": 2.0032,
"grad_norm": 0.7740007104758746,
"learning_rate": 7.490315004223672e-06,
"loss": 0.4108,
"step": 1252
},
{
"epoch": 2.0048,
"grad_norm": 0.8169139096512836,
"learning_rate": 7.475764196842516e-06,
"loss": 0.4433,
"step": 1253
},
{
"epoch": 2.0064,
"grad_norm": 0.8209520669424164,
"learning_rate": 7.4611855584214e-06,
"loss": 0.3826,
"step": 1254
},
{
"epoch": 2.008,
"grad_norm": 0.8798367918362482,
"learning_rate": 7.446579252844536e-06,
"loss": 0.4224,
"step": 1255
},
{
"epoch": 2.0096,
"grad_norm": 0.8691405382550204,
"learning_rate": 7.431945444307157e-06,
"loss": 0.4289,
"step": 1256
},
{
"epoch": 2.0112,
"grad_norm": 0.800556536362489,
"learning_rate": 7.417284297313665e-06,
"loss": 0.4023,
"step": 1257
},
{
"epoch": 2.0128,
"grad_norm": 0.7728720252217788,
"learning_rate": 7.402595976675785e-06,
"loss": 0.4122,
"step": 1258
},
{
"epoch": 2.0144,
"grad_norm": 0.7536268800020502,
"learning_rate": 7.387880647510709e-06,
"loss": 0.395,
"step": 1259
},
{
"epoch": 2.016,
"grad_norm": 0.7495826111062508,
"learning_rate": 7.37313847523925e-06,
"loss": 0.4156,
"step": 1260
},
{
"epoch": 2.0176,
"grad_norm": 0.7977316124513112,
"learning_rate": 7.358369625583966e-06,
"loss": 0.4342,
"step": 1261
},
{
"epoch": 2.0192,
"grad_norm": 0.763736878663743,
"learning_rate": 7.343574264567311e-06,
"loss": 0.403,
"step": 1262
},
{
"epoch": 2.0208,
"grad_norm": 0.7318474686470899,
"learning_rate": 7.3287525585097615e-06,
"loss": 0.3909,
"step": 1263
},
{
"epoch": 2.0224,
"grad_norm": 0.7605413484257524,
"learning_rate": 7.313904674027954e-06,
"loss": 0.3885,
"step": 1264
},
{
"epoch": 2.024,
"grad_norm": 0.7279511815872336,
"learning_rate": 7.299030778032799e-06,
"loss": 0.4048,
"step": 1265
},
{
"epoch": 2.0256,
"grad_norm": 0.7523015788911891,
"learning_rate": 7.284131037727618e-06,
"loss": 0.4022,
"step": 1266
},
{
"epoch": 2.0272,
"grad_norm": 0.7599105856257794,
"learning_rate": 7.269205620606259e-06,
"loss": 0.3668,
"step": 1267
},
{
"epoch": 2.0288,
"grad_norm": 0.7438421120510148,
"learning_rate": 7.2542546944512106e-06,
"loss": 0.3808,
"step": 1268
},
{
"epoch": 2.0304,
"grad_norm": 0.727359932201372,
"learning_rate": 7.239278427331718e-06,
"loss": 0.3785,
"step": 1269
},
{
"epoch": 2.032,
"grad_norm": 0.6937857053944503,
"learning_rate": 7.224276987601895e-06,
"loss": 0.3757,
"step": 1270
},
{
"epoch": 2.0336,
"grad_norm": 0.780207663850111,
"learning_rate": 7.209250543898834e-06,
"loss": 0.4329,
"step": 1271
},
{
"epoch": 2.0352,
"grad_norm": 0.7309618417958235,
"learning_rate": 7.194199265140701e-06,
"loss": 0.4289,
"step": 1272
},
{
"epoch": 2.0368,
"grad_norm": 0.756730142412508,
"learning_rate": 7.179123320524848e-06,
"loss": 0.4011,
"step": 1273
},
{
"epoch": 2.0384,
"grad_norm": 0.7748502996836906,
"learning_rate": 7.1640228795259025e-06,
"loss": 0.4166,
"step": 1274
},
{
"epoch": 2.04,
"grad_norm": 0.7138213951471011,
"learning_rate": 7.148898111893867e-06,
"loss": 0.4211,
"step": 1275
},
{
"epoch": 2.0416,
"grad_norm": 0.7096395677980951,
"learning_rate": 7.133749187652208e-06,
"loss": 0.4351,
"step": 1276
},
{
"epoch": 2.0432,
"grad_norm": 0.7598339781728084,
"learning_rate": 7.118576277095944e-06,
"loss": 0.4173,
"step": 1277
},
{
"epoch": 2.0448,
"grad_norm": 0.7318720964428549,
"learning_rate": 7.103379550789741e-06,
"loss": 0.3985,
"step": 1278
},
{
"epoch": 2.0464,
"grad_norm": 0.7207495438632298,
"learning_rate": 7.088159179565978e-06,
"loss": 0.3681,
"step": 1279
},
{
"epoch": 2.048,
"grad_norm": 0.7495804691040051,
"learning_rate": 7.07291533452284e-06,
"loss": 0.4188,
"step": 1280
},
{
"epoch": 2.0496,
"grad_norm": 0.7464935959309549,
"learning_rate": 7.05764818702239e-06,
"loss": 0.387,
"step": 1281
},
{
"epoch": 2.0512,
"grad_norm": 0.7699151568223941,
"learning_rate": 7.042357908688646e-06,
"loss": 0.4167,
"step": 1282
},
{
"epoch": 2.0528,
"grad_norm": 0.7495209441991714,
"learning_rate": 7.027044671405643e-06,
"loss": 0.4072,
"step": 1283
},
{
"epoch": 2.0544,
"grad_norm": 0.6843159358600286,
"learning_rate": 7.0117086473155095e-06,
"loss": 0.3873,
"step": 1284
},
{
"epoch": 2.056,
"grad_norm": 0.7506916305788107,
"learning_rate": 6.996350008816532e-06,
"loss": 0.416,
"step": 1285
},
{
"epoch": 2.0576,
"grad_norm": 0.6939939433254977,
"learning_rate": 6.980968928561209e-06,
"loss": 0.3881,
"step": 1286
},
{
"epoch": 2.0592,
"grad_norm": 0.736883088527874,
"learning_rate": 6.965565579454322e-06,
"loss": 0.4142,
"step": 1287
},
{
"epoch": 2.0608,
"grad_norm": 0.7892335221877158,
"learning_rate": 6.9501401346509786e-06,
"loss": 0.4345,
"step": 1288
},
{
"epoch": 2.0624,
"grad_norm": 0.7166910275994461,
"learning_rate": 6.934692767554679e-06,
"loss": 0.409,
"step": 1289
},
{
"epoch": 2.064,
"grad_norm": 0.7612787510568383,
"learning_rate": 6.9192236518153566e-06,
"loss": 0.4234,
"step": 1290
},
{
"epoch": 2.0656,
"grad_norm": 0.7385227507104376,
"learning_rate": 6.903732961327432e-06,
"loss": 0.4231,
"step": 1291
},
{
"epoch": 2.0672,
"grad_norm": 0.7287381426108751,
"learning_rate": 6.888220870227853e-06,
"loss": 0.4269,
"step": 1292
},
{
"epoch": 2.0688,
"grad_norm": 0.7260025903032917,
"learning_rate": 6.872687552894145e-06,
"loss": 0.4167,
"step": 1293
},
{
"epoch": 2.0704,
"grad_norm": 0.6962157986575372,
"learning_rate": 6.857133183942442e-06,
"loss": 0.4125,
"step": 1294
},
{
"epoch": 2.072,
"grad_norm": 0.7451887904030556,
"learning_rate": 6.841557938225527e-06,
"loss": 0.4403,
"step": 1295
},
{
"epoch": 2.0736,
"grad_norm": 0.7219167742786095,
"learning_rate": 6.825961990830871e-06,
"loss": 0.4155,
"step": 1296
},
{
"epoch": 2.0752,
"grad_norm": 0.6884631866006362,
"learning_rate": 6.810345517078657e-06,
"loss": 0.413,
"step": 1297
},
{
"epoch": 2.0768,
"grad_norm": 0.7660646636597886,
"learning_rate": 6.794708692519815e-06,
"loss": 0.4048,
"step": 1298
},
{
"epoch": 2.0784,
"grad_norm": 0.7096954819778793,
"learning_rate": 6.779051692934043e-06,
"loss": 0.4095,
"step": 1299
},
{
"epoch": 2.08,
"grad_norm": 0.7507337262396936,
"learning_rate": 6.76337469432784e-06,
"loss": 0.4588,
"step": 1300
},
{
"epoch": 2.0816,
"grad_norm": 0.7695804036957934,
"learning_rate": 6.747677872932519e-06,
"loss": 0.4387,
"step": 1301
},
{
"epoch": 2.0832,
"grad_norm": 0.6821148233816933,
"learning_rate": 6.731961405202224e-06,
"loss": 0.4015,
"step": 1302
},
{
"epoch": 2.0848,
"grad_norm": 0.8274265466470279,
"learning_rate": 6.716225467811961e-06,
"loss": 0.4157,
"step": 1303
},
{
"epoch": 2.0864,
"grad_norm": 0.7695532034604734,
"learning_rate": 6.700470237655597e-06,
"loss": 0.424,
"step": 1304
},
{
"epoch": 2.088,
"grad_norm": 0.7574175438272212,
"learning_rate": 6.684695891843871e-06,
"loss": 0.4273,
"step": 1305
},
{
"epoch": 2.0896,
"grad_norm": 0.7106036941292224,
"learning_rate": 6.668902607702419e-06,
"loss": 0.4033,
"step": 1306
},
{
"epoch": 2.0912,
"grad_norm": 0.746696090756007,
"learning_rate": 6.653090562769764e-06,
"loss": 0.4172,
"step": 1307
},
{
"epoch": 2.0928,
"grad_norm": 0.7065553861543276,
"learning_rate": 6.637259934795328e-06,
"loss": 0.4036,
"step": 1308
},
{
"epoch": 2.0944,
"grad_norm": 0.7827478356395094,
"learning_rate": 6.6214109017374306e-06,
"loss": 0.4124,
"step": 1309
},
{
"epoch": 2.096,
"grad_norm": 0.7013377808905202,
"learning_rate": 6.605543641761293e-06,
"loss": 0.401,
"step": 1310
},
{
"epoch": 2.0976,
"grad_norm": 0.7311854764718958,
"learning_rate": 6.589658333237031e-06,
"loss": 0.3862,
"step": 1311
},
{
"epoch": 2.0992,
"grad_norm": 0.7372615193443035,
"learning_rate": 6.573755154737651e-06,
"loss": 0.4055,
"step": 1312
},
{
"epoch": 2.1008,
"grad_norm": 0.7291518334922953,
"learning_rate": 6.5578342850370415e-06,
"loss": 0.4261,
"step": 1313
},
{
"epoch": 2.1024,
"grad_norm": 0.8324400879041759,
"learning_rate": 6.54189590310797e-06,
"loss": 0.4466,
"step": 1314
},
{
"epoch": 2.104,
"grad_norm": 0.7392933627356066,
"learning_rate": 6.525940188120059e-06,
"loss": 0.4085,
"step": 1315
},
{
"epoch": 2.1056,
"grad_norm": 0.7991093699398838,
"learning_rate": 6.509967319437782e-06,
"loss": 0.3926,
"step": 1316
},
{
"epoch": 2.1072,
"grad_norm": 0.7382714582408669,
"learning_rate": 6.493977476618445e-06,
"loss": 0.3966,
"step": 1317
},
{
"epoch": 2.1088,
"grad_norm": 0.8306869188995181,
"learning_rate": 6.477970839410166e-06,
"loss": 0.3873,
"step": 1318
},
{
"epoch": 2.1104,
"grad_norm": 0.7363308972189302,
"learning_rate": 6.461947587749855e-06,
"loss": 0.4211,
"step": 1319
},
{
"epoch": 2.112,
"grad_norm": 0.7422749242705443,
"learning_rate": 6.445907901761189e-06,
"loss": 0.4142,
"step": 1320
},
{
"epoch": 2.1136,
"grad_norm": 0.822246172800683,
"learning_rate": 6.429851961752597e-06,
"loss": 0.4132,
"step": 1321
},
{
"epoch": 2.1152,
"grad_norm": 0.7378843525152939,
"learning_rate": 6.413779948215218e-06,
"loss": 0.4128,
"step": 1322
},
{
"epoch": 2.1168,
"grad_norm": 0.8531643497470118,
"learning_rate": 6.397692041820885e-06,
"loss": 0.4171,
"step": 1323
},
{
"epoch": 2.1184,
"grad_norm": 0.7333781651418984,
"learning_rate": 6.381588423420085e-06,
"loss": 0.4092,
"step": 1324
},
{
"epoch": 2.12,
"grad_norm": 0.7895400176236084,
"learning_rate": 6.365469274039936e-06,
"loss": 0.3994,
"step": 1325
},
{
"epoch": 2.1216,
"grad_norm": 0.7977366495086916,
"learning_rate": 6.349334774882136e-06,
"loss": 0.4004,
"step": 1326
},
{
"epoch": 2.1232,
"grad_norm": 0.7203694849580552,
"learning_rate": 6.333185107320945e-06,
"loss": 0.3949,
"step": 1327
},
{
"epoch": 2.1248,
"grad_norm": 0.7660899144917048,
"learning_rate": 6.317020452901134e-06,
"loss": 0.3907,
"step": 1328
},
{
"epoch": 2.1264,
"grad_norm": 0.7346196820608468,
"learning_rate": 6.300840993335945e-06,
"loss": 0.4109,
"step": 1329
},
{
"epoch": 2.128,
"grad_norm": 0.7605130816243495,
"learning_rate": 6.2846469105050545e-06,
"loss": 0.3845,
"step": 1330
},
{
"epoch": 2.1296,
"grad_norm": 0.763565798253721,
"learning_rate": 6.26843838645252e-06,
"loss": 0.4275,
"step": 1331
},
{
"epoch": 2.1312,
"grad_norm": 0.7635222394945375,
"learning_rate": 6.2522156033847435e-06,
"loss": 0.4019,
"step": 1332
},
{
"epoch": 2.1328,
"grad_norm": 0.7122514894717155,
"learning_rate": 6.235978743668415e-06,
"loss": 0.3992,
"step": 1333
},
{
"epoch": 2.1344,
"grad_norm": 0.7634523079399845,
"learning_rate": 6.219727989828466e-06,
"loss": 0.3962,
"step": 1334
},
{
"epoch": 2.136,
"grad_norm": 0.7411828984909158,
"learning_rate": 6.203463524546017e-06,
"loss": 0.4274,
"step": 1335
},
{
"epoch": 2.1376,
"grad_norm": 0.6811926949135216,
"learning_rate": 6.187185530656327e-06,
"loss": 0.3777,
"step": 1336
},
{
"epoch": 2.1391999999999998,
"grad_norm": 0.7346885484792756,
"learning_rate": 6.1708941911467335e-06,
"loss": 0.4176,
"step": 1337
},
{
"epoch": 2.1408,
"grad_norm": 0.700465605432178,
"learning_rate": 6.154589689154594e-06,
"loss": 0.3943,
"step": 1338
},
{
"epoch": 2.1424,
"grad_norm": 0.6885094603302031,
"learning_rate": 6.138272207965238e-06,
"loss": 0.4284,
"step": 1339
},
{
"epoch": 2.144,
"grad_norm": 0.7409839744147182,
"learning_rate": 6.121941931009894e-06,
"loss": 0.4261,
"step": 1340
},
{
"epoch": 2.1456,
"grad_norm": 0.7569295619787667,
"learning_rate": 6.105599041863631e-06,
"loss": 0.4386,
"step": 1341
},
{
"epoch": 2.1471999999999998,
"grad_norm": 0.7014645801815328,
"learning_rate": 6.089243724243303e-06,
"loss": 0.3973,
"step": 1342
},
{
"epoch": 2.1488,
"grad_norm": 0.6854172780386442,
"learning_rate": 6.072876162005474e-06,
"loss": 0.4007,
"step": 1343
},
{
"epoch": 2.1504,
"grad_norm": 0.69782675982927,
"learning_rate": 6.056496539144351e-06,
"loss": 0.3958,
"step": 1344
},
{
"epoch": 2.152,
"grad_norm": 0.665761808333572,
"learning_rate": 6.040105039789726e-06,
"loss": 0.4382,
"step": 1345
},
{
"epoch": 2.1536,
"grad_norm": 0.6715297399502295,
"learning_rate": 6.023701848204893e-06,
"loss": 0.3722,
"step": 1346
},
{
"epoch": 2.1552,
"grad_norm": 0.6703345368133065,
"learning_rate": 6.007287148784591e-06,
"loss": 0.3935,
"step": 1347
},
{
"epoch": 2.1568,
"grad_norm": 0.7079035209974975,
"learning_rate": 5.990861126052914e-06,
"loss": 0.3882,
"step": 1348
},
{
"epoch": 2.1584,
"grad_norm": 0.7252107816623263,
"learning_rate": 5.974423964661249e-06,
"loss": 0.421,
"step": 1349
},
{
"epoch": 2.16,
"grad_norm": 0.6859455329532241,
"learning_rate": 5.957975849386202e-06,
"loss": 0.3898,
"step": 1350
},
{
"epoch": 2.1616,
"grad_norm": 0.658909658030749,
"learning_rate": 5.941516965127509e-06,
"loss": 0.4145,
"step": 1351
},
{
"epoch": 2.1632,
"grad_norm": 0.6964614246833752,
"learning_rate": 5.925047496905968e-06,
"loss": 0.4198,
"step": 1352
},
{
"epoch": 2.1648,
"grad_norm": 0.7142994737245355,
"learning_rate": 5.908567629861354e-06,
"loss": 0.4222,
"step": 1353
},
{
"epoch": 2.1664,
"grad_norm": 0.6826892894964197,
"learning_rate": 5.892077549250341e-06,
"loss": 0.388,
"step": 1354
},
{
"epoch": 2.168,
"grad_norm": 0.6947181993249654,
"learning_rate": 5.8755774404444175e-06,
"loss": 0.4027,
"step": 1355
},
{
"epoch": 2.1696,
"grad_norm": 0.6623191040439221,
"learning_rate": 5.8590674889278e-06,
"loss": 0.3933,
"step": 1356
},
{
"epoch": 2.1712,
"grad_norm": 0.7076610375413062,
"learning_rate": 5.842547880295353e-06,
"loss": 0.4218,
"step": 1357
},
{
"epoch": 2.1728,
"grad_norm": 0.670755940027531,
"learning_rate": 5.8260188002505034e-06,
"loss": 0.4177,
"step": 1358
},
{
"epoch": 2.1744,
"grad_norm": 0.6831146691129467,
"learning_rate": 5.809480434603143e-06,
"loss": 0.3685,
"step": 1359
},
{
"epoch": 2.176,
"grad_norm": 0.726705280178454,
"learning_rate": 5.792932969267553e-06,
"loss": 0.4068,
"step": 1360
},
{
"epoch": 2.1776,
"grad_norm": 0.7488174281934294,
"learning_rate": 5.776376590260306e-06,
"loss": 0.4223,
"step": 1361
},
{
"epoch": 2.1792,
"grad_norm": 0.7355960902533142,
"learning_rate": 5.759811483698173e-06,
"loss": 0.398,
"step": 1362
},
{
"epoch": 2.1808,
"grad_norm": 0.687800573239867,
"learning_rate": 5.743237835796042e-06,
"loss": 0.3723,
"step": 1363
},
{
"epoch": 2.1824,
"grad_norm": 0.7153679038108298,
"learning_rate": 5.726655832864809e-06,
"loss": 0.403,
"step": 1364
},
{
"epoch": 2.184,
"grad_norm": 0.6856145780856769,
"learning_rate": 5.7100656613093005e-06,
"loss": 0.3883,
"step": 1365
},
{
"epoch": 2.1856,
"grad_norm": 0.699073434830909,
"learning_rate": 5.693467507626165e-06,
"loss": 0.4006,
"step": 1366
},
{
"epoch": 2.1872,
"grad_norm": 0.7413382057726158,
"learning_rate": 5.67686155840178e-06,
"loss": 0.3986,
"step": 1367
},
{
"epoch": 2.1888,
"grad_norm": 0.696181735941245,
"learning_rate": 5.660248000310162e-06,
"loss": 0.4102,
"step": 1368
},
{
"epoch": 2.1904,
"grad_norm": 0.6633405370718249,
"learning_rate": 5.643627020110855e-06,
"loss": 0.3799,
"step": 1369
},
{
"epoch": 2.192,
"grad_norm": 0.73029597277892,
"learning_rate": 5.626998804646842e-06,
"loss": 0.3803,
"step": 1370
},
{
"epoch": 2.1936,
"grad_norm": 0.6845929372793488,
"learning_rate": 5.610363540842435e-06,
"loss": 0.418,
"step": 1371
},
{
"epoch": 2.1952,
"grad_norm": 0.7142592582014309,
"learning_rate": 5.593721415701189e-06,
"loss": 0.4162,
"step": 1372
},
{
"epoch": 2.1968,
"grad_norm": 0.6786135519346064,
"learning_rate": 5.577072616303779e-06,
"loss": 0.4161,
"step": 1373
},
{
"epoch": 2.1984,
"grad_norm": 0.6783654100973778,
"learning_rate": 5.560417329805916e-06,
"loss": 0.4124,
"step": 1374
},
{
"epoch": 2.2,
"grad_norm": 0.6819976648386019,
"learning_rate": 5.543755743436231e-06,
"loss": 0.3737,
"step": 1375
},
{
"epoch": 2.2016,
"grad_norm": 0.6592168446016077,
"learning_rate": 5.527088044494176e-06,
"loss": 0.3682,
"step": 1376
},
{
"epoch": 2.2032,
"grad_norm": 0.6952606563442375,
"learning_rate": 5.510414420347918e-06,
"loss": 0.4014,
"step": 1377
},
{
"epoch": 2.2048,
"grad_norm": 0.7695855630431729,
"learning_rate": 5.493735058432227e-06,
"loss": 0.4068,
"step": 1378
},
{
"epoch": 2.2064,
"grad_norm": 0.7088645745459955,
"learning_rate": 5.477050146246379e-06,
"loss": 0.404,
"step": 1379
},
{
"epoch": 2.208,
"grad_norm": 0.7210274092132056,
"learning_rate": 5.4603598713520354e-06,
"loss": 0.4213,
"step": 1380
},
{
"epoch": 2.2096,
"grad_norm": 0.6867630862942176,
"learning_rate": 5.443664421371153e-06,
"loss": 0.4103,
"step": 1381
},
{
"epoch": 2.2112,
"grad_norm": 0.7375263347537202,
"learning_rate": 5.426963983983853e-06,
"loss": 0.3992,
"step": 1382
},
{
"epoch": 2.2128,
"grad_norm": 0.7295547316150184,
"learning_rate": 5.410258746926328e-06,
"loss": 0.4223,
"step": 1383
},
{
"epoch": 2.2144,
"grad_norm": 0.7223275500404251,
"learning_rate": 5.393548897988724e-06,
"loss": 0.4034,
"step": 1384
},
{
"epoch": 2.216,
"grad_norm": 0.7243637838300921,
"learning_rate": 5.376834625013031e-06,
"loss": 0.4177,
"step": 1385
},
{
"epoch": 2.2176,
"grad_norm": 0.7173220811903077,
"learning_rate": 5.360116115890972e-06,
"loss": 0.3634,
"step": 1386
},
{
"epoch": 2.2192,
"grad_norm": 0.7305499937559662,
"learning_rate": 5.343393558561888e-06,
"loss": 0.4021,
"step": 1387
},
{
"epoch": 2.2208,
"grad_norm": 0.7035111004271161,
"learning_rate": 5.3266671410106306e-06,
"loss": 0.3911,
"step": 1388
},
{
"epoch": 2.2224,
"grad_norm": 0.7142086611702474,
"learning_rate": 5.309937051265443e-06,
"loss": 0.4013,
"step": 1389
},
{
"epoch": 2.224,
"grad_norm": 0.6912644551993837,
"learning_rate": 5.293203477395851e-06,
"loss": 0.3718,
"step": 1390
},
{
"epoch": 2.2256,
"grad_norm": 0.7557855577623867,
"learning_rate": 5.276466607510544e-06,
"loss": 0.4226,
"step": 1391
},
{
"epoch": 2.2272,
"grad_norm": 0.6771738582039201,
"learning_rate": 5.259726629755268e-06,
"loss": 0.4066,
"step": 1392
},
{
"epoch": 2.2288,
"grad_norm": 0.7163773998662869,
"learning_rate": 5.2429837323107005e-06,
"loss": 0.4141,
"step": 1393
},
{
"epoch": 2.2304,
"grad_norm": 0.7453580760304513,
"learning_rate": 5.226238103390343e-06,
"loss": 0.3945,
"step": 1394
},
{
"epoch": 2.232,
"grad_norm": 0.6675446305575985,
"learning_rate": 5.209489931238405e-06,
"loss": 0.4206,
"step": 1395
},
{
"epoch": 2.2336,
"grad_norm": 0.7179812275637177,
"learning_rate": 5.192739404127679e-06,
"loss": 0.4039,
"step": 1396
},
{
"epoch": 2.2352,
"grad_norm": 0.6819937541847448,
"learning_rate": 5.175986710357439e-06,
"loss": 0.3953,
"step": 1397
},
{
"epoch": 2.2368,
"grad_norm": 0.708633520110563,
"learning_rate": 5.159232038251305e-06,
"loss": 0.4016,
"step": 1398
},
{
"epoch": 2.2384,
"grad_norm": 0.7089142714394822,
"learning_rate": 5.142475576155146e-06,
"loss": 0.4128,
"step": 1399
},
{
"epoch": 2.24,
"grad_norm": 0.682097796087665,
"learning_rate": 5.125717512434947e-06,
"loss": 0.3925,
"step": 1400
},
{
"epoch": 2.2416,
"grad_norm": 0.6873278404458673,
"learning_rate": 5.108958035474703e-06,
"loss": 0.3986,
"step": 1401
},
{
"epoch": 2.2432,
"grad_norm": 0.6833518258535074,
"learning_rate": 5.092197333674286e-06,
"loss": 0.3849,
"step": 1402
},
{
"epoch": 2.2448,
"grad_norm": 0.6899471206939742,
"learning_rate": 5.075435595447346e-06,
"loss": 0.395,
"step": 1403
},
{
"epoch": 2.2464,
"grad_norm": 0.7060680016467793,
"learning_rate": 5.0586730092191835e-06,
"loss": 0.4117,
"step": 1404
},
{
"epoch": 2.248,
"grad_norm": 0.6691116987846536,
"learning_rate": 5.041909763424625e-06,
"loss": 0.3869,
"step": 1405
},
{
"epoch": 2.2496,
"grad_norm": 0.6726090508188094,
"learning_rate": 5.0251460465059175e-06,
"loss": 0.3973,
"step": 1406
},
{
"epoch": 2.2512,
"grad_norm": 0.7261719342392827,
"learning_rate": 5.0083820469106016e-06,
"loss": 0.3885,
"step": 1407
},
{
"epoch": 2.2528,
"grad_norm": 0.7045454425538864,
"learning_rate": 4.991617953089399e-06,
"loss": 0.4085,
"step": 1408
},
{
"epoch": 2.2544,
"grad_norm": 0.7268522978922883,
"learning_rate": 4.9748539534940825e-06,
"loss": 0.3925,
"step": 1409
},
{
"epoch": 2.2560000000000002,
"grad_norm": 0.7293107359251259,
"learning_rate": 4.9580902365753765e-06,
"loss": 0.413,
"step": 1410
},
{
"epoch": 2.2576,
"grad_norm": 0.7306458660459159,
"learning_rate": 4.941326990780819e-06,
"loss": 0.3819,
"step": 1411
},
{
"epoch": 2.2592,
"grad_norm": 0.7453668668217899,
"learning_rate": 4.9245644045526546e-06,
"loss": 0.4128,
"step": 1412
},
{
"epoch": 2.2608,
"grad_norm": 0.7327780131800568,
"learning_rate": 4.907802666325716e-06,
"loss": 0.3984,
"step": 1413
},
{
"epoch": 2.2624,
"grad_norm": 0.6705661754023295,
"learning_rate": 4.891041964525301e-06,
"loss": 0.363,
"step": 1414
},
{
"epoch": 2.2640000000000002,
"grad_norm": 0.6705860102552303,
"learning_rate": 4.874282487565053e-06,
"loss": 0.4082,
"step": 1415
},
{
"epoch": 2.2656,
"grad_norm": 0.6754476261858788,
"learning_rate": 4.857524423844855e-06,
"loss": 0.4024,
"step": 1416
},
{
"epoch": 2.2672,
"grad_norm": 0.6943591916964985,
"learning_rate": 4.840767961748697e-06,
"loss": 0.4042,
"step": 1417
},
{
"epoch": 2.2688,
"grad_norm": 0.6479606601514696,
"learning_rate": 4.824013289642563e-06,
"loss": 0.4014,
"step": 1418
},
{
"epoch": 2.2704,
"grad_norm": 0.7011706070992588,
"learning_rate": 4.807260595872322e-06,
"loss": 0.4016,
"step": 1419
},
{
"epoch": 2.2720000000000002,
"grad_norm": 0.661445156193842,
"learning_rate": 4.790510068761596e-06,
"loss": 0.4132,
"step": 1420
},
{
"epoch": 2.2736,
"grad_norm": 0.6418778504051741,
"learning_rate": 4.773761896609658e-06,
"loss": 0.3758,
"step": 1421
},
{
"epoch": 2.2752,
"grad_norm": 0.693742601668366,
"learning_rate": 4.757016267689302e-06,
"loss": 0.3709,
"step": 1422
},
{
"epoch": 2.2768,
"grad_norm": 0.6749659667015893,
"learning_rate": 4.740273370244734e-06,
"loss": 0.3955,
"step": 1423
},
{
"epoch": 2.2784,
"grad_norm": 0.7249497133831025,
"learning_rate": 4.723533392489457e-06,
"loss": 0.3826,
"step": 1424
},
{
"epoch": 2.2800000000000002,
"grad_norm": 0.638182443419821,
"learning_rate": 4.706796522604152e-06,
"loss": 0.3764,
"step": 1425
},
{
"epoch": 2.2816,
"grad_norm": 0.7091242913232427,
"learning_rate": 4.690062948734558e-06,
"loss": 0.4075,
"step": 1426
},
{
"epoch": 2.2832,
"grad_norm": 0.7643436712622311,
"learning_rate": 4.673332858989371e-06,
"loss": 0.4033,
"step": 1427
},
{
"epoch": 2.2848,
"grad_norm": 0.6519541942146109,
"learning_rate": 4.656606441438114e-06,
"loss": 0.4077,
"step": 1428
},
{
"epoch": 2.2864,
"grad_norm": 0.6789895179066501,
"learning_rate": 4.639883884109029e-06,
"loss": 0.3892,
"step": 1429
},
{
"epoch": 2.288,
"grad_norm": 0.7086470439814316,
"learning_rate": 4.623165374986971e-06,
"loss": 0.3915,
"step": 1430
},
{
"epoch": 2.2896,
"grad_norm": 0.7074373363966382,
"learning_rate": 4.606451102011278e-06,
"loss": 0.4109,
"step": 1431
},
{
"epoch": 2.2912,
"grad_norm": 0.6975980679382322,
"learning_rate": 4.589741253073673e-06,
"loss": 0.4318,
"step": 1432
},
{
"epoch": 2.2928,
"grad_norm": 0.7023847612883237,
"learning_rate": 4.573036016016149e-06,
"loss": 0.3808,
"step": 1433
},
{
"epoch": 2.2944,
"grad_norm": 0.7036622385598531,
"learning_rate": 4.556335578628849e-06,
"loss": 0.4258,
"step": 1434
},
{
"epoch": 2.296,
"grad_norm": 0.673940239730178,
"learning_rate": 4.539640128647965e-06,
"loss": 0.4067,
"step": 1435
},
{
"epoch": 2.2976,
"grad_norm": 0.7483025545223577,
"learning_rate": 4.522949853753624e-06,
"loss": 0.411,
"step": 1436
},
{
"epoch": 2.2992,
"grad_norm": 0.7146685489644412,
"learning_rate": 4.506264941567774e-06,
"loss": 0.4396,
"step": 1437
},
{
"epoch": 2.3008,
"grad_norm": 0.668252809567489,
"learning_rate": 4.489585579652083e-06,
"loss": 0.3773,
"step": 1438
},
{
"epoch": 2.3024,
"grad_norm": 0.7823858754349284,
"learning_rate": 4.472911955505825e-06,
"loss": 0.4211,
"step": 1439
},
{
"epoch": 2.304,
"grad_norm": 0.6753315804492159,
"learning_rate": 4.456244256563769e-06,
"loss": 0.3964,
"step": 1440
},
{
"epoch": 2.3056,
"grad_norm": 0.6821633560700455,
"learning_rate": 4.439582670194086e-06,
"loss": 0.3763,
"step": 1441
},
{
"epoch": 2.3072,
"grad_norm": 0.7103390458787602,
"learning_rate": 4.422927383696224e-06,
"loss": 0.4258,
"step": 1442
},
{
"epoch": 2.3088,
"grad_norm": 0.6720513053935617,
"learning_rate": 4.406278584298813e-06,
"loss": 0.4202,
"step": 1443
},
{
"epoch": 2.3104,
"grad_norm": 0.6805679607043243,
"learning_rate": 4.389636459157567e-06,
"loss": 0.41,
"step": 1444
},
{
"epoch": 2.312,
"grad_norm": 0.6819644157134925,
"learning_rate": 4.373001195353159e-06,
"loss": 0.3685,
"step": 1445
},
{
"epoch": 2.3136,
"grad_norm": 0.6802122050642477,
"learning_rate": 4.356372979889146e-06,
"loss": 0.4091,
"step": 1446
},
{
"epoch": 2.3152,
"grad_norm": 0.6841376043715974,
"learning_rate": 4.339751999689839e-06,
"loss": 0.4062,
"step": 1447
},
{
"epoch": 2.3168,
"grad_norm": 0.7051365598389653,
"learning_rate": 4.323138441598219e-06,
"loss": 0.4131,
"step": 1448
},
{
"epoch": 2.3184,
"grad_norm": 0.6850129830520767,
"learning_rate": 4.306532492373836e-06,
"loss": 0.3851,
"step": 1449
},
{
"epoch": 2.32,
"grad_norm": 0.6813794267985254,
"learning_rate": 4.2899343386907e-06,
"loss": 0.4046,
"step": 1450
},
{
"epoch": 2.3216,
"grad_norm": 0.8127582644849977,
"learning_rate": 4.273344167135191e-06,
"loss": 0.4326,
"step": 1451
},
{
"epoch": 2.3232,
"grad_norm": 0.6877511095487422,
"learning_rate": 4.25676216420396e-06,
"loss": 0.3993,
"step": 1452
},
{
"epoch": 2.3247999999999998,
"grad_norm": 0.6755597194123087,
"learning_rate": 4.240188516301829e-06,
"loss": 0.3774,
"step": 1453
},
{
"epoch": 2.3264,
"grad_norm": 0.6712226844945036,
"learning_rate": 4.223623409739695e-06,
"loss": 0.4036,
"step": 1454
},
{
"epoch": 2.328,
"grad_norm": 0.6909899770747273,
"learning_rate": 4.207067030732449e-06,
"loss": 0.4003,
"step": 1455
},
{
"epoch": 2.3296,
"grad_norm": 0.6253345765747766,
"learning_rate": 4.190519565396859e-06,
"loss": 0.3804,
"step": 1456
},
{
"epoch": 2.3312,
"grad_norm": 0.6458198181452498,
"learning_rate": 4.173981199749498e-06,
"loss": 0.396,
"step": 1457
},
{
"epoch": 2.3327999999999998,
"grad_norm": 0.6704496718692096,
"learning_rate": 4.157452119704648e-06,
"loss": 0.3901,
"step": 1458
},
{
"epoch": 2.3344,
"grad_norm": 0.6493221379438446,
"learning_rate": 4.140932511072201e-06,
"loss": 0.3735,
"step": 1459
},
{
"epoch": 2.336,
"grad_norm": 0.6295406358467901,
"learning_rate": 4.124422559555584e-06,
"loss": 0.3564,
"step": 1460
},
{
"epoch": 2.3376,
"grad_norm": 0.7077651554025548,
"learning_rate": 4.10792245074966e-06,
"loss": 0.4208,
"step": 1461
},
{
"epoch": 2.3392,
"grad_norm": 0.6567217376957504,
"learning_rate": 4.091432370138646e-06,
"loss": 0.3922,
"step": 1462
},
{
"epoch": 2.3407999999999998,
"grad_norm": 0.6556453133870094,
"learning_rate": 4.0749525030940335e-06,
"loss": 0.4057,
"step": 1463
},
{
"epoch": 2.3424,
"grad_norm": 0.6724886679673604,
"learning_rate": 4.058483034872493e-06,
"loss": 0.3861,
"step": 1464
},
{
"epoch": 2.344,
"grad_norm": 0.6855294133077684,
"learning_rate": 4.042024150613798e-06,
"loss": 0.3774,
"step": 1465
},
{
"epoch": 2.3456,
"grad_norm": 0.6635423226767675,
"learning_rate": 4.025576035338752e-06,
"loss": 0.3946,
"step": 1466
},
{
"epoch": 2.3472,
"grad_norm": 0.7423848787310959,
"learning_rate": 4.009138873947089e-06,
"loss": 0.3914,
"step": 1467
},
{
"epoch": 2.3487999999999998,
"grad_norm": 0.6431595325825293,
"learning_rate": 3.992712851215411e-06,
"loss": 0.3826,
"step": 1468
},
{
"epoch": 2.3504,
"grad_norm": 0.6878948261458981,
"learning_rate": 3.976298151795107e-06,
"loss": 0.4079,
"step": 1469
},
{
"epoch": 2.352,
"grad_norm": 0.7080433729321897,
"learning_rate": 3.959894960210275e-06,
"loss": 0.4105,
"step": 1470
},
{
"epoch": 2.3536,
"grad_norm": 0.6683034505218146,
"learning_rate": 3.9435034608556505e-06,
"loss": 0.3808,
"step": 1471
},
{
"epoch": 2.3552,
"grad_norm": 0.6960317935323845,
"learning_rate": 3.9271238379945285e-06,
"loss": 0.4075,
"step": 1472
},
{
"epoch": 2.3568,
"grad_norm": 0.6424432168544976,
"learning_rate": 3.9107562757566975e-06,
"loss": 0.3995,
"step": 1473
},
{
"epoch": 2.3584,
"grad_norm": 0.6589890194361783,
"learning_rate": 3.8944009581363696e-06,
"loss": 0.3845,
"step": 1474
},
{
"epoch": 2.36,
"grad_norm": 0.6952275153409164,
"learning_rate": 3.87805806899011e-06,
"loss": 0.395,
"step": 1475
},
{
"epoch": 2.3616,
"grad_norm": 0.6648463342159043,
"learning_rate": 3.861727792034762e-06,
"loss": 0.3829,
"step": 1476
},
{
"epoch": 2.3632,
"grad_norm": 0.7094582369296529,
"learning_rate": 3.8454103108454075e-06,
"loss": 0.394,
"step": 1477
},
{
"epoch": 2.3648,
"grad_norm": 0.6262697064772254,
"learning_rate": 3.82910580885327e-06,
"loss": 0.4001,
"step": 1478
},
{
"epoch": 2.3664,
"grad_norm": 0.6503012682537402,
"learning_rate": 3.8128144693436743e-06,
"loss": 0.3826,
"step": 1479
},
{
"epoch": 2.368,
"grad_norm": 0.7263227516132823,
"learning_rate": 3.7965364754539845e-06,
"loss": 0.4194,
"step": 1480
},
{
"epoch": 2.3696,
"grad_norm": 0.6982469981323273,
"learning_rate": 3.7802720101715355e-06,
"loss": 0.4261,
"step": 1481
},
{
"epoch": 2.3712,
"grad_norm": 0.638292800352635,
"learning_rate": 3.764021256331587e-06,
"loss": 0.3899,
"step": 1482
},
{
"epoch": 2.3728,
"grad_norm": 0.6877066110692359,
"learning_rate": 3.747784396615258e-06,
"loss": 0.4094,
"step": 1483
},
{
"epoch": 2.3744,
"grad_norm": 0.6903168810990971,
"learning_rate": 3.731561613547481e-06,
"loss": 0.404,
"step": 1484
},
{
"epoch": 2.376,
"grad_norm": 0.7006555167018877,
"learning_rate": 3.7153530894949476e-06,
"loss": 0.3846,
"step": 1485
},
{
"epoch": 2.3776,
"grad_norm": 0.672790227815282,
"learning_rate": 3.699159006664056e-06,
"loss": 0.3967,
"step": 1486
},
{
"epoch": 2.3792,
"grad_norm": 0.6525571495063067,
"learning_rate": 3.682979547098867e-06,
"loss": 0.3889,
"step": 1487
},
{
"epoch": 2.3808,
"grad_norm": 0.6814511121574097,
"learning_rate": 3.6668148926790557e-06,
"loss": 0.4082,
"step": 1488
},
{
"epoch": 2.3824,
"grad_norm": 0.6920829456162518,
"learning_rate": 3.6506652251178663e-06,
"loss": 0.3912,
"step": 1489
},
{
"epoch": 2.384,
"grad_norm": 0.6793947033978027,
"learning_rate": 3.6345307259600657e-06,
"loss": 0.4029,
"step": 1490
},
{
"epoch": 2.3856,
"grad_norm": 0.6885773294894112,
"learning_rate": 3.618411576579916e-06,
"loss": 0.3924,
"step": 1491
},
{
"epoch": 2.3872,
"grad_norm": 0.6528553289067818,
"learning_rate": 3.602307958179117e-06,
"loss": 0.3756,
"step": 1492
},
{
"epoch": 2.3888,
"grad_norm": 0.6846371926456436,
"learning_rate": 3.586220051784783e-06,
"loss": 0.4027,
"step": 1493
},
{
"epoch": 2.3904,
"grad_norm": 0.658940194551754,
"learning_rate": 3.5701480382474047e-06,
"loss": 0.3877,
"step": 1494
},
{
"epoch": 2.392,
"grad_norm": 0.649993839146253,
"learning_rate": 3.554092098238811e-06,
"loss": 0.3749,
"step": 1495
},
{
"epoch": 2.3936,
"grad_norm": 0.6997098749775951,
"learning_rate": 3.538052412250147e-06,
"loss": 0.4402,
"step": 1496
},
{
"epoch": 2.3952,
"grad_norm": 0.6723568514834078,
"learning_rate": 3.5220291605898354e-06,
"loss": 0.3833,
"step": 1497
},
{
"epoch": 2.3968,
"grad_norm": 0.6446746094646758,
"learning_rate": 3.5060225233815554e-06,
"loss": 0.3913,
"step": 1498
},
{
"epoch": 2.3984,
"grad_norm": 0.6788233040081576,
"learning_rate": 3.4900326805622185e-06,
"loss": 0.4148,
"step": 1499
},
{
"epoch": 2.4,
"grad_norm": 0.6450106514990551,
"learning_rate": 3.474059811879944e-06,
"loss": 0.366,
"step": 1500
},
{
"epoch": 2.4016,
"grad_norm": 0.6634551584509225,
"learning_rate": 3.458104096892031e-06,
"loss": 0.3909,
"step": 1501
},
{
"epoch": 2.4032,
"grad_norm": 0.6619386717966989,
"learning_rate": 3.4421657149629593e-06,
"loss": 0.3943,
"step": 1502
},
{
"epoch": 2.4048,
"grad_norm": 0.6424827311751734,
"learning_rate": 3.4262448452623514e-06,
"loss": 0.3719,
"step": 1503
},
{
"epoch": 2.4064,
"grad_norm": 0.6658069057214353,
"learning_rate": 3.410341666762971e-06,
"loss": 0.3898,
"step": 1504
},
{
"epoch": 2.408,
"grad_norm": 0.6687417884193916,
"learning_rate": 3.3944563582387084e-06,
"loss": 0.3798,
"step": 1505
},
{
"epoch": 2.4096,
"grad_norm": 0.6681859209001135,
"learning_rate": 3.3785890982625702e-06,
"loss": 0.3919,
"step": 1506
},
{
"epoch": 2.4112,
"grad_norm": 0.6902976293558055,
"learning_rate": 3.3627400652046736e-06,
"loss": 0.4113,
"step": 1507
},
{
"epoch": 2.4128,
"grad_norm": 0.6567954118535304,
"learning_rate": 3.3469094372302374e-06,
"loss": 0.3871,
"step": 1508
},
{
"epoch": 2.4144,
"grad_norm": 0.6462346211089492,
"learning_rate": 3.331097392297582e-06,
"loss": 0.3721,
"step": 1509
},
{
"epoch": 2.416,
"grad_norm": 0.6980015096147874,
"learning_rate": 3.31530410815613e-06,
"loss": 0.3966,
"step": 1510
},
{
"epoch": 2.4176,
"grad_norm": 0.6468230714773202,
"learning_rate": 3.2995297623444067e-06,
"loss": 0.3988,
"step": 1511
},
{
"epoch": 2.4192,
"grad_norm": 0.6223331150007658,
"learning_rate": 3.283774532188039e-06,
"loss": 0.3783,
"step": 1512
},
{
"epoch": 2.4208,
"grad_norm": 0.6945157649484203,
"learning_rate": 3.268038594797777e-06,
"loss": 0.4102,
"step": 1513
},
{
"epoch": 2.4224,
"grad_norm": 0.6609439584259712,
"learning_rate": 3.2523221270674845e-06,
"loss": 0.4097,
"step": 1514
},
{
"epoch": 2.424,
"grad_norm": 0.6551730823477213,
"learning_rate": 3.2366253056721607e-06,
"loss": 0.3776,
"step": 1515
},
{
"epoch": 2.4256,
"grad_norm": 0.7236681230147213,
"learning_rate": 3.220948307065959e-06,
"loss": 0.4086,
"step": 1516
},
{
"epoch": 2.4272,
"grad_norm": 0.6480195720845133,
"learning_rate": 3.2052913074801876e-06,
"loss": 0.4135,
"step": 1517
},
{
"epoch": 2.4288,
"grad_norm": 0.6703274388094672,
"learning_rate": 3.1896544829213444e-06,
"loss": 0.3984,
"step": 1518
},
{
"epoch": 2.4304,
"grad_norm": 0.7483782518618497,
"learning_rate": 3.17403800916913e-06,
"loss": 0.4037,
"step": 1519
},
{
"epoch": 2.432,
"grad_norm": 0.66746703547942,
"learning_rate": 3.1584420617744737e-06,
"loss": 0.4035,
"step": 1520
},
{
"epoch": 2.4336,
"grad_norm": 0.6401406179681777,
"learning_rate": 3.142866816057559e-06,
"loss": 0.4157,
"step": 1521
},
{
"epoch": 2.4352,
"grad_norm": 0.6455361206817064,
"learning_rate": 3.1273124471058567e-06,
"loss": 0.3962,
"step": 1522
},
{
"epoch": 2.4368,
"grad_norm": 0.6384977031003991,
"learning_rate": 3.1117791297721468e-06,
"loss": 0.3905,
"step": 1523
},
{
"epoch": 2.4384,
"grad_norm": 0.6725488719932304,
"learning_rate": 3.09626703867257e-06,
"loss": 0.4252,
"step": 1524
},
{
"epoch": 2.44,
"grad_norm": 0.6605135568050072,
"learning_rate": 3.0807763481846455e-06,
"loss": 0.3826,
"step": 1525
},
{
"epoch": 2.4416,
"grad_norm": 0.6721958666254848,
"learning_rate": 3.0653072324453226e-06,
"loss": 0.3962,
"step": 1526
},
{
"epoch": 2.4432,
"grad_norm": 0.6780206249071795,
"learning_rate": 3.049859865349023e-06,
"loss": 0.3751,
"step": 1527
},
{
"epoch": 2.4448,
"grad_norm": 0.6238495618746309,
"learning_rate": 3.0344344205456807e-06,
"loss": 0.3903,
"step": 1528
},
{
"epoch": 2.4464,
"grad_norm": 0.6513304236677114,
"learning_rate": 3.0190310714387914e-06,
"loss": 0.3975,
"step": 1529
},
{
"epoch": 2.448,
"grad_norm": 0.7177034625230926,
"learning_rate": 3.00364999118347e-06,
"loss": 0.4159,
"step": 1530
},
{
"epoch": 2.4496,
"grad_norm": 0.6507837034685228,
"learning_rate": 2.988291352684491e-06,
"loss": 0.378,
"step": 1531
},
{
"epoch": 2.4512,
"grad_norm": 0.6118401749337214,
"learning_rate": 2.9729553285943587e-06,
"loss": 0.3795,
"step": 1532
},
{
"epoch": 2.4528,
"grad_norm": 0.6395089221435761,
"learning_rate": 2.9576420913113568e-06,
"loss": 0.4056,
"step": 1533
},
{
"epoch": 2.4544,
"grad_norm": 0.625923794552961,
"learning_rate": 2.9423518129776095e-06,
"loss": 0.37,
"step": 1534
},
{
"epoch": 2.456,
"grad_norm": 0.6827282105564476,
"learning_rate": 2.927084665477162e-06,
"loss": 0.4257,
"step": 1535
},
{
"epoch": 2.4576000000000002,
"grad_norm": 0.6605488868655838,
"learning_rate": 2.9118408204340244e-06,
"loss": 0.3719,
"step": 1536
},
{
"epoch": 2.4592,
"grad_norm": 0.6406328281031711,
"learning_rate": 2.8966204492102606e-06,
"loss": 0.3965,
"step": 1537
},
{
"epoch": 2.4608,
"grad_norm": 0.6326830223678251,
"learning_rate": 2.8814237229040556e-06,
"loss": 0.3783,
"step": 1538
},
{
"epoch": 2.4624,
"grad_norm": 0.6219392243662581,
"learning_rate": 2.866250812347795e-06,
"loss": 0.3932,
"step": 1539
},
{
"epoch": 2.464,
"grad_norm": 0.6251015621535402,
"learning_rate": 2.8511018881061347e-06,
"loss": 0.3815,
"step": 1540
},
{
"epoch": 2.4656000000000002,
"grad_norm": 0.6179759318368455,
"learning_rate": 2.8359771204741e-06,
"loss": 0.3849,
"step": 1541
},
{
"epoch": 2.4672,
"grad_norm": 0.652891973703576,
"learning_rate": 2.8208766794751518e-06,
"loss": 0.3752,
"step": 1542
},
{
"epoch": 2.4688,
"grad_norm": 0.6357740525025578,
"learning_rate": 2.8058007348593003e-06,
"loss": 0.401,
"step": 1543
},
{
"epoch": 2.4704,
"grad_norm": 0.6395930935434486,
"learning_rate": 2.7907494561011693e-06,
"loss": 0.381,
"step": 1544
},
{
"epoch": 2.472,
"grad_norm": 0.6512737399321729,
"learning_rate": 2.775723012398107e-06,
"loss": 0.3814,
"step": 1545
},
{
"epoch": 2.4736000000000002,
"grad_norm": 0.6583217552351495,
"learning_rate": 2.760721572668284e-06,
"loss": 0.4403,
"step": 1546
},
{
"epoch": 2.4752,
"grad_norm": 0.6372114454208074,
"learning_rate": 2.745745305548793e-06,
"loss": 0.4075,
"step": 1547
},
{
"epoch": 2.4768,
"grad_norm": 0.6407639143002334,
"learning_rate": 2.730794379393742e-06,
"loss": 0.394,
"step": 1548
},
{
"epoch": 2.4784,
"grad_norm": 0.6329406769881196,
"learning_rate": 2.7158689622723816e-06,
"loss": 0.3912,
"step": 1549
},
{
"epoch": 2.48,
"grad_norm": 0.6195750421768822,
"learning_rate": 2.7009692219672025e-06,
"loss": 0.373,
"step": 1550
},
{
"epoch": 2.4816,
"grad_norm": 0.6192216383642458,
"learning_rate": 2.6860953259720473e-06,
"loss": 0.3829,
"step": 1551
},
{
"epoch": 2.4832,
"grad_norm": 0.6850583448382874,
"learning_rate": 2.67124744149024e-06,
"loss": 0.4015,
"step": 1552
},
{
"epoch": 2.4848,
"grad_norm": 0.6562865958799802,
"learning_rate": 2.6564257354326915e-06,
"loss": 0.4099,
"step": 1553
},
{
"epoch": 2.4864,
"grad_norm": 0.6511507259040457,
"learning_rate": 2.641630374416036e-06,
"loss": 0.4062,
"step": 1554
},
{
"epoch": 2.488,
"grad_norm": 0.640268506310053,
"learning_rate": 2.6268615247607533e-06,
"loss": 0.405,
"step": 1555
},
{
"epoch": 2.4896,
"grad_norm": 0.6211246689771531,
"learning_rate": 2.612119352489292e-06,
"loss": 0.3971,
"step": 1556
},
{
"epoch": 2.4912,
"grad_norm": 0.6368488390713289,
"learning_rate": 2.597404023324217e-06,
"loss": 0.4017,
"step": 1557
},
{
"epoch": 2.4928,
"grad_norm": 0.7336452901332965,
"learning_rate": 2.582715702686337e-06,
"loss": 0.407,
"step": 1558
},
{
"epoch": 2.4944,
"grad_norm": 0.616540685351605,
"learning_rate": 2.5680545556928438e-06,
"loss": 0.3558,
"step": 1559
},
{
"epoch": 2.496,
"grad_norm": 0.6577552715026054,
"learning_rate": 2.5534207471554644e-06,
"loss": 0.3981,
"step": 1560
},
{
"epoch": 2.4976,
"grad_norm": 0.6549969308925954,
"learning_rate": 2.5388144415786026e-06,
"loss": 0.4143,
"step": 1561
},
{
"epoch": 2.4992,
"grad_norm": 0.6288196948164876,
"learning_rate": 2.5242358031574853e-06,
"loss": 0.3885,
"step": 1562
},
{
"epoch": 2.5008,
"grad_norm": 0.6126052917870453,
"learning_rate": 2.509684995776329e-06,
"loss": 0.3689,
"step": 1563
},
{
"epoch": 2.5023999999999997,
"grad_norm": 0.6876081050348912,
"learning_rate": 2.4951621830064887e-06,
"loss": 0.4122,
"step": 1564
},
{
"epoch": 2.504,
"grad_norm": 0.6426287104445904,
"learning_rate": 2.480667528104617e-06,
"loss": 0.3971,
"step": 1565
},
{
"epoch": 2.5056000000000003,
"grad_norm": 0.6187941851724073,
"learning_rate": 2.4662011940108383e-06,
"loss": 0.3863,
"step": 1566
},
{
"epoch": 2.5072,
"grad_norm": 0.6466697588066401,
"learning_rate": 2.4517633433469062e-06,
"loss": 0.388,
"step": 1567
},
{
"epoch": 2.5088,
"grad_norm": 0.6482939191272499,
"learning_rate": 2.437354138414385e-06,
"loss": 0.3903,
"step": 1568
},
{
"epoch": 2.5103999999999997,
"grad_norm": 0.6396499447362041,
"learning_rate": 2.4229737411928222e-06,
"loss": 0.4091,
"step": 1569
},
{
"epoch": 2.512,
"grad_norm": 0.6245065552164869,
"learning_rate": 2.40862231333792e-06,
"loss": 0.395,
"step": 1570
},
{
"epoch": 2.5136,
"grad_norm": 0.6399504852316908,
"learning_rate": 2.3943000161797304e-06,
"loss": 0.3986,
"step": 1571
},
{
"epoch": 2.5152,
"grad_norm": 0.6291806247948137,
"learning_rate": 2.3800070107208355e-06,
"loss": 0.3804,
"step": 1572
},
{
"epoch": 2.5168,
"grad_norm": 0.6423976811330442,
"learning_rate": 2.365743457634533e-06,
"loss": 0.3722,
"step": 1573
},
{
"epoch": 2.5183999999999997,
"grad_norm": 0.6631554110429378,
"learning_rate": 2.351509517263041e-06,
"loss": 0.3978,
"step": 1574
},
{
"epoch": 2.52,
"grad_norm": 0.6585162256686338,
"learning_rate": 2.3373053496156865e-06,
"loss": 0.4175,
"step": 1575
},
{
"epoch": 2.5216,
"grad_norm": 0.6774760409466307,
"learning_rate": 2.3231311143671077e-06,
"loss": 0.3994,
"step": 1576
},
{
"epoch": 2.5232,
"grad_norm": 0.6518443745193649,
"learning_rate": 2.308986970855466e-06,
"loss": 0.3857,
"step": 1577
},
{
"epoch": 2.5248,
"grad_norm": 0.6235079803424979,
"learning_rate": 2.2948730780806407e-06,
"loss": 0.3818,
"step": 1578
},
{
"epoch": 2.5263999999999998,
"grad_norm": 0.6965629135690532,
"learning_rate": 2.2807895947024643e-06,
"loss": 0.3796,
"step": 1579
},
{
"epoch": 2.528,
"grad_norm": 0.6217783322318544,
"learning_rate": 2.2667366790389152e-06,
"loss": 0.3862,
"step": 1580
},
{
"epoch": 2.5296,
"grad_norm": 0.618178271536513,
"learning_rate": 2.2527144890643465e-06,
"loss": 0.4029,
"step": 1581
},
{
"epoch": 2.5312,
"grad_norm": 0.64690974988133,
"learning_rate": 2.2387231824077188e-06,
"loss": 0.3964,
"step": 1582
},
{
"epoch": 2.5328,
"grad_norm": 0.6322672998920709,
"learning_rate": 2.2247629163508207e-06,
"loss": 0.385,
"step": 1583
},
{
"epoch": 2.5343999999999998,
"grad_norm": 0.6406596537736634,
"learning_rate": 2.2108338478264934e-06,
"loss": 0.3845,
"step": 1584
},
{
"epoch": 2.536,
"grad_norm": 0.6721282769087035,
"learning_rate": 2.196936133416882e-06,
"loss": 0.4234,
"step": 1585
},
{
"epoch": 2.5376,
"grad_norm": 0.6412509414173777,
"learning_rate": 2.1830699293516677e-06,
"loss": 0.4255,
"step": 1586
},
{
"epoch": 2.5392,
"grad_norm": 0.6913729833010835,
"learning_rate": 2.1692353915063047e-06,
"loss": 0.4186,
"step": 1587
},
{
"epoch": 2.5408,
"grad_norm": 0.6694757232889293,
"learning_rate": 2.155432675400283e-06,
"loss": 0.3556,
"step": 1588
},
{
"epoch": 2.5423999999999998,
"grad_norm": 0.6715304054251654,
"learning_rate": 2.141661936195364e-06,
"loss": 0.4319,
"step": 1589
},
{
"epoch": 2.544,
"grad_norm": 0.6653104930547877,
"learning_rate": 2.1279233286938503e-06,
"loss": 0.4249,
"step": 1590
},
{
"epoch": 2.5456,
"grad_norm": 0.6268563379389929,
"learning_rate": 2.1142170073368396e-06,
"loss": 0.3891,
"step": 1591
},
{
"epoch": 2.5472,
"grad_norm": 0.6643197899692231,
"learning_rate": 2.100543126202481e-06,
"loss": 0.4063,
"step": 1592
},
{
"epoch": 2.5488,
"grad_norm": 0.6342692376125808,
"learning_rate": 2.0869018390042588e-06,
"loss": 0.3992,
"step": 1593
},
{
"epoch": 2.5504,
"grad_norm": 0.5948066156047811,
"learning_rate": 2.0732932990892528e-06,
"loss": 0.3673,
"step": 1594
},
{
"epoch": 2.552,
"grad_norm": 0.6694485740217448,
"learning_rate": 2.059717659436415e-06,
"loss": 0.397,
"step": 1595
},
{
"epoch": 2.5536,
"grad_norm": 0.6308509391397539,
"learning_rate": 2.0461750726548558e-06,
"loss": 0.407,
"step": 1596
},
{
"epoch": 2.5552,
"grad_norm": 0.6261729443272391,
"learning_rate": 2.032665690982126e-06,
"loss": 0.3773,
"step": 1597
},
{
"epoch": 2.5568,
"grad_norm": 0.6346772474236525,
"learning_rate": 2.0191896662825012e-06,
"loss": 0.3941,
"step": 1598
},
{
"epoch": 2.5584,
"grad_norm": 0.6376245729227132,
"learning_rate": 2.0057471500452822e-06,
"loss": 0.4113,
"step": 1599
},
{
"epoch": 2.56,
"grad_norm": 0.641020908385709,
"learning_rate": 1.9923382933830836e-06,
"loss": 0.4098,
"step": 1600
},
{
"epoch": 2.5616,
"grad_norm": 0.6275775456371993,
"learning_rate": 1.9789632470301423e-06,
"loss": 0.3992,
"step": 1601
},
{
"epoch": 2.5632,
"grad_norm": 0.6453400022000114,
"learning_rate": 1.9656221613406217e-06,
"loss": 0.3961,
"step": 1602
},
{
"epoch": 2.5648,
"grad_norm": 0.6728800160345044,
"learning_rate": 1.952315186286915e-06,
"loss": 0.4069,
"step": 1603
},
{
"epoch": 2.5664,
"grad_norm": 0.5922663369940925,
"learning_rate": 1.9390424714579683e-06,
"loss": 0.3796,
"step": 1604
},
{
"epoch": 2.568,
"grad_norm": 0.6055756534765394,
"learning_rate": 1.925804166057596e-06,
"loss": 0.3734,
"step": 1605
},
{
"epoch": 2.5696,
"grad_norm": 0.6337357094885002,
"learning_rate": 1.9126004189027975e-06,
"loss": 0.382,
"step": 1606
},
{
"epoch": 2.5712,
"grad_norm": 0.6223091168842378,
"learning_rate": 1.8994313784220942e-06,
"loss": 0.3899,
"step": 1607
},
{
"epoch": 2.5728,
"grad_norm": 0.6304486561080737,
"learning_rate": 1.8862971926538553e-06,
"loss": 0.3935,
"step": 1608
},
{
"epoch": 2.5744,
"grad_norm": 0.6638930774983509,
"learning_rate": 1.8731980092446305e-06,
"loss": 0.4189,
"step": 1609
},
{
"epoch": 2.576,
"grad_norm": 0.6609598736574402,
"learning_rate": 1.8601339754475007e-06,
"loss": 0.4043,
"step": 1610
},
{
"epoch": 2.5776,
"grad_norm": 0.6460433585689176,
"learning_rate": 1.8471052381204091e-06,
"loss": 0.4289,
"step": 1611
},
{
"epoch": 2.5792,
"grad_norm": 0.6284239142431678,
"learning_rate": 1.8341119437245231e-06,
"loss": 0.4009,
"step": 1612
},
{
"epoch": 2.5808,
"grad_norm": 0.5788234045672292,
"learning_rate": 1.8211542383225811e-06,
"loss": 0.3798,
"step": 1613
},
{
"epoch": 2.5824,
"grad_norm": 0.6168466381836182,
"learning_rate": 1.8082322675772478e-06,
"loss": 0.3871,
"step": 1614
},
{
"epoch": 2.584,
"grad_norm": 0.5918253150989333,
"learning_rate": 1.795346176749484e-06,
"loss": 0.3745,
"step": 1615
},
{
"epoch": 2.5856,
"grad_norm": 0.6069246083161115,
"learning_rate": 1.7824961106969124e-06,
"loss": 0.403,
"step": 1616
},
{
"epoch": 2.5872,
"grad_norm": 0.5930680357250637,
"learning_rate": 1.7696822138721798e-06,
"loss": 0.4001,
"step": 1617
},
{
"epoch": 2.5888,
"grad_norm": 0.6129454651899682,
"learning_rate": 1.756904630321347e-06,
"loss": 0.3801,
"step": 1618
},
{
"epoch": 2.5904,
"grad_norm": 0.6189740075978353,
"learning_rate": 1.7441635036822624e-06,
"loss": 0.3943,
"step": 1619
},
{
"epoch": 2.592,
"grad_norm": 0.6274624006210813,
"learning_rate": 1.7314589771829426e-06,
"loss": 0.4132,
"step": 1620
},
{
"epoch": 2.5936,
"grad_norm": 0.9954803403258453,
"learning_rate": 1.718791193639973e-06,
"loss": 0.3763,
"step": 1621
},
{
"epoch": 2.5952,
"grad_norm": 0.619882141163173,
"learning_rate": 1.706160295456898e-06,
"loss": 0.4138,
"step": 1622
},
{
"epoch": 2.5968,
"grad_norm": 0.6228088213238236,
"learning_rate": 1.693566424622612e-06,
"loss": 0.3804,
"step": 1623
},
{
"epoch": 2.5984,
"grad_norm": 0.6265320543753518,
"learning_rate": 1.6810097227097782e-06,
"loss": 0.3911,
"step": 1624
},
{
"epoch": 2.6,
"grad_norm": 0.6203707060758011,
"learning_rate": 1.668490330873223e-06,
"loss": 0.39,
"step": 1625
},
{
"epoch": 2.6016,
"grad_norm": 0.621294828917456,
"learning_rate": 1.6560083898483598e-06,
"loss": 0.4142,
"step": 1626
},
{
"epoch": 2.6032,
"grad_norm": 0.6101531315821962,
"learning_rate": 1.6435640399496033e-06,
"loss": 0.3725,
"step": 1627
},
{
"epoch": 2.6048,
"grad_norm": 0.6863906992808965,
"learning_rate": 1.6311574210687865e-06,
"loss": 0.4023,
"step": 1628
},
{
"epoch": 2.6064,
"grad_norm": 0.6426995722369155,
"learning_rate": 1.618788672673598e-06,
"loss": 0.4103,
"step": 1629
},
{
"epoch": 2.608,
"grad_norm": 0.6533732877012939,
"learning_rate": 1.6064579338060088e-06,
"loss": 0.4059,
"step": 1630
},
{
"epoch": 2.6096,
"grad_norm": 0.6078243562020307,
"learning_rate": 1.5941653430807052e-06,
"loss": 0.3763,
"step": 1631
},
{
"epoch": 2.6112,
"grad_norm": 0.5944464717669109,
"learning_rate": 1.5819110386835413e-06,
"loss": 0.3768,
"step": 1632
},
{
"epoch": 2.6128,
"grad_norm": 0.623580671550532,
"learning_rate": 1.5696951583699776e-06,
"loss": 0.3875,
"step": 1633
},
{
"epoch": 2.6144,
"grad_norm": 0.5922332226181343,
"learning_rate": 1.5575178394635315e-06,
"loss": 0.4053,
"step": 1634
},
{
"epoch": 2.616,
"grad_norm": 0.5845659932749108,
"learning_rate": 1.545379218854241e-06,
"loss": 0.3871,
"step": 1635
},
{
"epoch": 2.6176,
"grad_norm": 0.6268357052713963,
"learning_rate": 1.5332794329971157e-06,
"loss": 0.4103,
"step": 1636
},
{
"epoch": 2.6192,
"grad_norm": 0.7478861047169991,
"learning_rate": 1.5212186179106142e-06,
"loss": 0.4175,
"step": 1637
},
{
"epoch": 2.6208,
"grad_norm": 0.6192499205634353,
"learning_rate": 1.5091969091751073e-06,
"loss": 0.3843,
"step": 1638
},
{
"epoch": 2.6224,
"grad_norm": 0.6166702089908779,
"learning_rate": 1.4972144419313528e-06,
"loss": 0.3806,
"step": 1639
},
{
"epoch": 2.624,
"grad_norm": 0.6349397684516485,
"learning_rate": 1.4852713508789835e-06,
"loss": 0.3863,
"step": 1640
},
{
"epoch": 2.6256,
"grad_norm": 0.5927888932635257,
"learning_rate": 1.4733677702749894e-06,
"loss": 0.3655,
"step": 1641
},
{
"epoch": 2.6272,
"grad_norm": 0.6026968586334261,
"learning_rate": 1.4615038339322025e-06,
"loss": 0.3685,
"step": 1642
},
{
"epoch": 2.6288,
"grad_norm": 0.6543361088043362,
"learning_rate": 1.4496796752178032e-06,
"loss": 0.3983,
"step": 1643
},
{
"epoch": 2.6304,
"grad_norm": 0.6458512084445657,
"learning_rate": 1.4378954270518169e-06,
"loss": 0.3945,
"step": 1644
},
{
"epoch": 2.632,
"grad_norm": 0.6374123540148537,
"learning_rate": 1.4261512219056118e-06,
"loss": 0.3941,
"step": 1645
},
{
"epoch": 2.6336,
"grad_norm": 0.6904183914760732,
"learning_rate": 1.4144471918004255e-06,
"loss": 0.4231,
"step": 1646
},
{
"epoch": 2.6352,
"grad_norm": 0.62514925414384,
"learning_rate": 1.402783468305864e-06,
"loss": 0.4042,
"step": 1647
},
{
"epoch": 2.6368,
"grad_norm": 0.6250879922886552,
"learning_rate": 1.391160182538437e-06,
"loss": 0.3839,
"step": 1648
},
{
"epoch": 2.6384,
"grad_norm": 0.6251249723911877,
"learning_rate": 1.3795774651600757e-06,
"loss": 0.4204,
"step": 1649
},
{
"epoch": 2.64,
"grad_norm": 0.6287593747093052,
"learning_rate": 1.3680354463766642e-06,
"loss": 0.4043,
"step": 1650
},
{
"epoch": 2.6416,
"grad_norm": 0.64865146086009,
"learning_rate": 1.3565342559365808e-06,
"loss": 0.4074,
"step": 1651
},
{
"epoch": 2.6432,
"grad_norm": 0.6267476637499041,
"learning_rate": 1.3450740231292354e-06,
"loss": 0.3842,
"step": 1652
},
{
"epoch": 2.6448,
"grad_norm": 0.6293389567269077,
"learning_rate": 1.3336548767836144e-06,
"loss": 0.3762,
"step": 1653
},
{
"epoch": 2.6464,
"grad_norm": 0.6058061494114316,
"learning_rate": 1.3222769452668382e-06,
"loss": 0.4058,
"step": 1654
},
{
"epoch": 2.648,
"grad_norm": 0.5997822993884304,
"learning_rate": 1.3109403564827155e-06,
"loss": 0.39,
"step": 1655
},
{
"epoch": 2.6496,
"grad_norm": 0.5798357022123661,
"learning_rate": 1.2996452378703013e-06,
"loss": 0.3567,
"step": 1656
},
{
"epoch": 2.6512000000000002,
"grad_norm": 0.592566773403498,
"learning_rate": 1.2883917164024722e-06,
"loss": 0.3867,
"step": 1657
},
{
"epoch": 2.6528,
"grad_norm": 0.5931018419806823,
"learning_rate": 1.2771799185844913e-06,
"loss": 0.4138,
"step": 1658
},
{
"epoch": 2.6544,
"grad_norm": 0.6085959754789083,
"learning_rate": 1.266009970452593e-06,
"loss": 0.3784,
"step": 1659
},
{
"epoch": 2.656,
"grad_norm": 0.6183384483894584,
"learning_rate": 1.2548819975725624e-06,
"loss": 0.3888,
"step": 1660
},
{
"epoch": 2.6576,
"grad_norm": 0.6214291059574565,
"learning_rate": 1.2437961250383207e-06,
"loss": 0.4044,
"step": 1661
},
{
"epoch": 2.6592000000000002,
"grad_norm": 0.6325046429616501,
"learning_rate": 1.2327524774705268e-06,
"loss": 0.3855,
"step": 1662
},
{
"epoch": 2.6608,
"grad_norm": 0.6197427462940891,
"learning_rate": 1.221751179015172e-06,
"loss": 0.379,
"step": 1663
},
{
"epoch": 2.6624,
"grad_norm": 0.6206703848814074,
"learning_rate": 1.2107923533421795e-06,
"loss": 0.3981,
"step": 1664
},
{
"epoch": 2.664,
"grad_norm": 0.5929949370443285,
"learning_rate": 1.1998761236440248e-06,
"loss": 0.4103,
"step": 1665
},
{
"epoch": 2.6656,
"grad_norm": 0.6376212703380394,
"learning_rate": 1.1890026126343446e-06,
"loss": 0.4125,
"step": 1666
},
{
"epoch": 2.6672000000000002,
"grad_norm": 0.6369696343567768,
"learning_rate": 1.1781719425465538e-06,
"loss": 0.4099,
"step": 1667
},
{
"epoch": 2.6688,
"grad_norm": 0.6405197317366532,
"learning_rate": 1.1673842351324816e-06,
"loss": 0.3802,
"step": 1668
},
{
"epoch": 2.6704,
"grad_norm": 0.6262504215886,
"learning_rate": 1.1566396116609907e-06,
"loss": 0.39,
"step": 1669
},
{
"epoch": 2.672,
"grad_norm": 0.6304118500726281,
"learning_rate": 1.1459381929166251e-06,
"loss": 0.3854,
"step": 1670
},
{
"epoch": 2.6736,
"grad_norm": 0.6147956319129659,
"learning_rate": 1.1352800991982467e-06,
"loss": 0.3829,
"step": 1671
},
{
"epoch": 2.6752000000000002,
"grad_norm": 0.6083959768111769,
"learning_rate": 1.1246654503176795e-06,
"loss": 0.3824,
"step": 1672
},
{
"epoch": 2.6768,
"grad_norm": 0.6474014155941403,
"learning_rate": 1.1140943655983727e-06,
"loss": 0.384,
"step": 1673
},
{
"epoch": 2.6784,
"grad_norm": 0.661173342284011,
"learning_rate": 1.103566963874052e-06,
"loss": 0.3671,
"step": 1674
},
{
"epoch": 2.68,
"grad_norm": 0.6671292261409086,
"learning_rate": 1.0930833634873811e-06,
"loss": 0.4108,
"step": 1675
},
{
"epoch": 2.6816,
"grad_norm": 0.5919654658407986,
"learning_rate": 1.082643682288641e-06,
"loss": 0.3861,
"step": 1676
},
{
"epoch": 2.6832000000000003,
"grad_norm": 0.606017708841482,
"learning_rate": 1.0722480376343997e-06,
"loss": 0.3838,
"step": 1677
},
{
"epoch": 2.6848,
"grad_norm": 0.6111833117856191,
"learning_rate": 1.0618965463861868e-06,
"loss": 0.3959,
"step": 1678
},
{
"epoch": 2.6864,
"grad_norm": 0.66229175827883,
"learning_rate": 1.0515893249091936e-06,
"loss": 0.3954,
"step": 1679
},
{
"epoch": 2.6879999999999997,
"grad_norm": 0.6225602473696187,
"learning_rate": 1.0413264890709546e-06,
"loss": 0.3915,
"step": 1680
},
{
"epoch": 2.6896,
"grad_norm": 0.6324295682096658,
"learning_rate": 1.0311081542400452e-06,
"loss": 0.4035,
"step": 1681
},
{
"epoch": 2.6912000000000003,
"grad_norm": 0.5782408195023804,
"learning_rate": 1.0209344352847923e-06,
"loss": 0.3814,
"step": 1682
},
{
"epoch": 2.6928,
"grad_norm": 0.5981292289007456,
"learning_rate": 1.0108054465719736e-06,
"loss": 0.3843,
"step": 1683
},
{
"epoch": 2.6944,
"grad_norm": 0.6386070296775564,
"learning_rate": 1.0007213019655393e-06,
"loss": 0.3925,
"step": 1684
},
{
"epoch": 2.6959999999999997,
"grad_norm": 0.5986251172966329,
"learning_rate": 9.906821148253303e-07,
"loss": 0.3624,
"step": 1685
},
{
"epoch": 2.6976,
"grad_norm": 0.6107708577953296,
"learning_rate": 9.806879980057993e-07,
"loss": 0.3783,
"step": 1686
},
{
"epoch": 2.6992000000000003,
"grad_norm": 0.6231399319138614,
"learning_rate": 9.707390638547482e-07,
"loss": 0.3763,
"step": 1687
},
{
"epoch": 2.7008,
"grad_norm": 0.6088089708455741,
"learning_rate": 9.608354242120637e-07,
"loss": 0.3989,
"step": 1688
},
{
"epoch": 2.7024,
"grad_norm": 0.6207549015074908,
"learning_rate": 9.509771904084558e-07,
"loss": 0.3808,
"step": 1689
},
{
"epoch": 2.7039999999999997,
"grad_norm": 0.6591205512689258,
"learning_rate": 9.411644732642122e-07,
"loss": 0.416,
"step": 1690
},
{
"epoch": 2.7056,
"grad_norm": 0.6045094661220406,
"learning_rate": 9.313973830879514e-07,
"loss": 0.3996,
"step": 1691
},
{
"epoch": 2.7072000000000003,
"grad_norm": 0.6017543127945517,
"learning_rate": 9.216760296753758e-07,
"loss": 0.3624,
"step": 1692
},
{
"epoch": 2.7088,
"grad_norm": 0.6051557230372354,
"learning_rate": 9.120005223080486e-07,
"loss": 0.3934,
"step": 1693
},
{
"epoch": 2.7104,
"grad_norm": 0.6055162741880877,
"learning_rate": 9.023709697521543e-07,
"loss": 0.3886,
"step": 1694
},
{
"epoch": 2.7119999999999997,
"grad_norm": 0.6102023101677977,
"learning_rate": 8.927874802572861e-07,
"loss": 0.4042,
"step": 1695
},
{
"epoch": 2.7136,
"grad_norm": 0.5776267937267195,
"learning_rate": 8.832501615552225e-07,
"loss": 0.3803,
"step": 1696
},
{
"epoch": 2.7152,
"grad_norm": 0.615711936633695,
"learning_rate": 8.737591208587159e-07,
"loss": 0.405,
"step": 1697
},
{
"epoch": 2.7168,
"grad_norm": 0.6455560899948569,
"learning_rate": 8.643144648602913e-07,
"loss": 0.4374,
"step": 1698
},
{
"epoch": 2.7184,
"grad_norm": 0.5905261653895563,
"learning_rate": 8.549162997310467e-07,
"loss": 0.3716,
"step": 1699
},
{
"epoch": 2.7199999999999998,
"grad_norm": 0.6112904373809765,
"learning_rate": 8.455647311194537e-07,
"loss": 0.386,
"step": 1700
},
{
"epoch": 2.7216,
"grad_norm": 0.6030522579380936,
"learning_rate": 8.362598641501774e-07,
"loss": 0.3966,
"step": 1701
},
{
"epoch": 2.7232,
"grad_norm": 0.5883945298360702,
"learning_rate": 8.270018034228916e-07,
"loss": 0.3901,
"step": 1702
},
{
"epoch": 2.7248,
"grad_norm": 0.5791821954738914,
"learning_rate": 8.177906530110996e-07,
"loss": 0.3498,
"step": 1703
},
{
"epoch": 2.7264,
"grad_norm": 0.5825217026653828,
"learning_rate": 8.086265164609708e-07,
"loss": 0.3869,
"step": 1704
},
{
"epoch": 2.7279999999999998,
"grad_norm": 0.5976749528633419,
"learning_rate": 7.995094967901701e-07,
"loss": 0.3849,
"step": 1705
},
{
"epoch": 2.7296,
"grad_norm": 0.6606592974479811,
"learning_rate": 7.90439696486705e-07,
"loss": 0.3983,
"step": 1706
},
{
"epoch": 2.7312,
"grad_norm": 0.5862915441222696,
"learning_rate": 7.814172175077738e-07,
"loss": 0.3682,
"step": 1707
},
{
"epoch": 2.7328,
"grad_norm": 0.6004467759052456,
"learning_rate": 7.724421612786109e-07,
"loss": 0.403,
"step": 1708
},
{
"epoch": 2.7344,
"grad_norm": 0.5893551587311924,
"learning_rate": 7.635146286913587e-07,
"loss": 0.3847,
"step": 1709
},
{
"epoch": 2.7359999999999998,
"grad_norm": 0.6081689478494531,
"learning_rate": 7.546347201039255e-07,
"loss": 0.3931,
"step": 1710
},
{
"epoch": 2.7376,
"grad_norm": 0.6174886614429648,
"learning_rate": 7.458025353388592e-07,
"loss": 0.3985,
"step": 1711
},
{
"epoch": 2.7392,
"grad_norm": 0.634124853662826,
"learning_rate": 7.37018173682223e-07,
"loss": 0.392,
"step": 1712
},
{
"epoch": 2.7408,
"grad_norm": 0.6588212500354214,
"learning_rate": 7.282817338824893e-07,
"loss": 0.4133,
"step": 1713
},
{
"epoch": 2.7424,
"grad_norm": 0.5955630737541837,
"learning_rate": 7.195933141494133e-07,
"loss": 0.3701,
"step": 1714
},
{
"epoch": 2.7439999999999998,
"grad_norm": 0.60471648702862,
"learning_rate": 7.109530121529439e-07,
"loss": 0.3921,
"step": 1715
},
{
"epoch": 2.7456,
"grad_norm": 0.6234697420391768,
"learning_rate": 7.023609250221153e-07,
"loss": 0.3805,
"step": 1716
},
{
"epoch": 2.7472,
"grad_norm": 0.5909780981357816,
"learning_rate": 6.938171493439622e-07,
"loss": 0.3822,
"step": 1717
},
{
"epoch": 2.7488,
"grad_norm": 0.5916412097862507,
"learning_rate": 6.853217811624313e-07,
"loss": 0.3896,
"step": 1718
},
{
"epoch": 2.7504,
"grad_norm": 0.5701027407029094,
"learning_rate": 6.768749159772992e-07,
"loss": 0.3841,
"step": 1719
},
{
"epoch": 2.752,
"grad_norm": 0.6224648857193659,
"learning_rate": 6.684766487431027e-07,
"loss": 0.411,
"step": 1720
},
{
"epoch": 2.7536,
"grad_norm": 0.6062600883850013,
"learning_rate": 6.601270738680721e-07,
"loss": 0.3896,
"step": 1721
},
{
"epoch": 2.7552,
"grad_norm": 0.5825798421411809,
"learning_rate": 6.518262852130625e-07,
"loss": 0.3469,
"step": 1722
},
{
"epoch": 2.7568,
"grad_norm": 0.6245531507828768,
"learning_rate": 6.435743760905083e-07,
"loss": 0.4002,
"step": 1723
},
{
"epoch": 2.7584,
"grad_norm": 0.6012872581602149,
"learning_rate": 6.353714392633698e-07,
"loss": 0.3946,
"step": 1724
},
{
"epoch": 2.76,
"grad_norm": 0.600518750459418,
"learning_rate": 6.272175669440861e-07,
"loss": 0.4028,
"step": 1725
},
{
"epoch": 2.7616,
"grad_norm": 0.6565378768747119,
"learning_rate": 6.191128507935479e-07,
"loss": 0.4181,
"step": 1726
},
{
"epoch": 2.7632,
"grad_norm": 0.5880124521353837,
"learning_rate": 6.110573819200605e-07,
"loss": 0.3771,
"step": 1727
},
{
"epoch": 2.7648,
"grad_norm": 0.6116040078923606,
"learning_rate": 6.030512508783187e-07,
"loss": 0.4195,
"step": 1728
},
{
"epoch": 2.7664,
"grad_norm": 0.6014679070546552,
"learning_rate": 5.950945476683955e-07,
"loss": 0.3974,
"step": 1729
},
{
"epoch": 2.768,
"grad_norm": 0.6009701852260378,
"learning_rate": 5.871873617347218e-07,
"loss": 0.3785,
"step": 1730
},
{
"epoch": 2.7696,
"grad_norm": 0.5859448462013398,
"learning_rate": 5.793297819650884e-07,
"loss": 0.3762,
"step": 1731
},
{
"epoch": 2.7712,
"grad_norm": 0.6291628272389389,
"learning_rate": 5.715218966896435e-07,
"loss": 0.3854,
"step": 1732
},
{
"epoch": 2.7728,
"grad_norm": 0.6197404357834241,
"learning_rate": 5.637637936798978e-07,
"loss": 0.3818,
"step": 1733
},
{
"epoch": 2.7744,
"grad_norm": 0.5905004221255729,
"learning_rate": 5.560555601477418e-07,
"loss": 0.3767,
"step": 1734
},
{
"epoch": 2.776,
"grad_norm": 0.5958395135943388,
"learning_rate": 5.483972827444645e-07,
"loss": 0.3937,
"step": 1735
},
{
"epoch": 2.7776,
"grad_norm": 0.5996780255952401,
"learning_rate": 5.407890475597761e-07,
"loss": 0.3896,
"step": 1736
},
{
"epoch": 2.7792,
"grad_norm": 0.5669617446118168,
"learning_rate": 5.332309401208407e-07,
"loss": 0.3726,
"step": 1737
},
{
"epoch": 2.7808,
"grad_norm": 0.6268307738196953,
"learning_rate": 5.257230453913237e-07,
"loss": 0.4233,
"step": 1738
},
{
"epoch": 2.7824,
"grad_norm": 0.5864553870059951,
"learning_rate": 5.182654477704229e-07,
"loss": 0.3846,
"step": 1739
},
{
"epoch": 2.784,
"grad_norm": 0.6036664703348141,
"learning_rate": 5.108582310919302e-07,
"loss": 0.3928,
"step": 1740
},
{
"epoch": 2.7856,
"grad_norm": 0.6031516892613834,
"learning_rate": 5.035014786232828e-07,
"loss": 0.3925,
"step": 1741
},
{
"epoch": 2.7872,
"grad_norm": 0.5866800018560955,
"learning_rate": 4.961952730646319e-07,
"loss": 0.3859,
"step": 1742
},
{
"epoch": 2.7888,
"grad_norm": 0.5877381777569246,
"learning_rate": 4.889396965479115e-07,
"loss": 0.3673,
"step": 1743
},
{
"epoch": 2.7904,
"grad_norm": 0.6209548847901005,
"learning_rate": 4.817348306359121e-07,
"loss": 0.4131,
"step": 1744
},
{
"epoch": 2.792,
"grad_norm": 0.6046029653791415,
"learning_rate": 4.745807563213678e-07,
"loss": 0.391,
"step": 1745
},
{
"epoch": 2.7936,
"grad_norm": 0.6135800654153217,
"learning_rate": 4.6747755402604565e-07,
"loss": 0.3988,
"step": 1746
},
{
"epoch": 2.7952,
"grad_norm": 0.5864985386522907,
"learning_rate": 4.6042530359983793e-07,
"loss": 0.3784,
"step": 1747
},
{
"epoch": 2.7968,
"grad_norm": 0.6125567210434135,
"learning_rate": 4.534240843198662e-07,
"loss": 0.4236,
"step": 1748
},
{
"epoch": 2.7984,
"grad_norm": 0.5972475717825011,
"learning_rate": 4.464739748895963e-07,
"loss": 0.4029,
"step": 1749
},
{
"epoch": 2.8,
"grad_norm": 0.6051136361996698,
"learning_rate": 4.3957505343794115e-07,
"loss": 0.3704,
"step": 1750
},
{
"epoch": 2.8016,
"grad_norm": 0.5813134121600891,
"learning_rate": 4.327273975183949e-07,
"loss": 0.3789,
"step": 1751
},
{
"epoch": 2.8032,
"grad_norm": 0.6234931456274084,
"learning_rate": 4.259310841081515e-07,
"loss": 0.3879,
"step": 1752
},
{
"epoch": 2.8048,
"grad_norm": 0.5915396255605055,
"learning_rate": 4.191861896072458e-07,
"loss": 0.3942,
"step": 1753
},
{
"epoch": 2.8064,
"grad_norm": 0.6236188832413802,
"learning_rate": 4.1249278983769405e-07,
"loss": 0.3993,
"step": 1754
},
{
"epoch": 2.808,
"grad_norm": 0.589419057067984,
"learning_rate": 4.058509600426358e-07,
"loss": 0.3699,
"step": 1755
},
{
"epoch": 2.8096,
"grad_norm": 0.5920683961204296,
"learning_rate": 3.9926077488549543e-07,
"loss": 0.3723,
"step": 1756
},
{
"epoch": 2.8112,
"grad_norm": 0.5927044128760407,
"learning_rate": 3.9272230844913884e-07,
"loss": 0.415,
"step": 1757
},
{
"epoch": 2.8128,
"grad_norm": 0.5867650416517225,
"learning_rate": 3.8623563423504094e-07,
"loss": 0.3812,
"step": 1758
},
{
"epoch": 2.8144,
"grad_norm": 0.6011267395145728,
"learning_rate": 3.798008251624585e-07,
"loss": 0.3897,
"step": 1759
},
{
"epoch": 2.816,
"grad_norm": 0.5903530577818467,
"learning_rate": 3.734179535676169e-07,
"loss": 0.3712,
"step": 1760
},
{
"epoch": 2.8176,
"grad_norm": 0.6233995273902431,
"learning_rate": 3.6708709120288564e-07,
"loss": 0.4169,
"step": 1761
},
{
"epoch": 2.8192,
"grad_norm": 0.5819666810024561,
"learning_rate": 3.6080830923598266e-07,
"loss": 0.4034,
"step": 1762
},
{
"epoch": 2.8208,
"grad_norm": 0.624654571648536,
"learning_rate": 3.545816782491657e-07,
"loss": 0.407,
"step": 1763
},
{
"epoch": 2.8224,
"grad_norm": 0.556916942200227,
"learning_rate": 3.484072682384465e-07,
"loss": 0.38,
"step": 1764
},
{
"epoch": 2.824,
"grad_norm": 0.5783913767205718,
"learning_rate": 3.422851486127987e-07,
"loss": 0.3636,
"step": 1765
},
{
"epoch": 2.8256,
"grad_norm": 0.5950530497276595,
"learning_rate": 3.3621538819337776e-07,
"loss": 0.3936,
"step": 1766
},
{
"epoch": 2.8272,
"grad_norm": 0.5729311169002653,
"learning_rate": 3.301980552127509e-07,
"loss": 0.3953,
"step": 1767
},
{
"epoch": 2.8288,
"grad_norm": 0.6104157298212889,
"learning_rate": 3.2423321731412774e-07,
"loss": 0.392,
"step": 1768
},
{
"epoch": 2.8304,
"grad_norm": 0.6151767806763212,
"learning_rate": 3.1832094155059776e-07,
"loss": 0.409,
"step": 1769
},
{
"epoch": 2.832,
"grad_norm": 0.5970876033807373,
"learning_rate": 3.1246129438438076e-07,
"loss": 0.3966,
"step": 1770
},
{
"epoch": 2.8336,
"grad_norm": 0.5906164353444058,
"learning_rate": 3.0665434168607846e-07,
"loss": 0.3647,
"step": 1771
},
{
"epoch": 2.8352,
"grad_norm": 0.6423459013932351,
"learning_rate": 3.009001487339308e-07,
"loss": 0.3946,
"step": 1772
},
{
"epoch": 2.8368,
"grad_norm": 0.618731392362854,
"learning_rate": 2.9519878021308624e-07,
"loss": 0.4115,
"step": 1773
},
{
"epoch": 2.8384,
"grad_norm": 0.5631497037610242,
"learning_rate": 2.8955030021487254e-07,
"loss": 0.376,
"step": 1774
},
{
"epoch": 2.84,
"grad_norm": 0.5686353998203753,
"learning_rate": 2.839547722360769e-07,
"loss": 0.4046,
"step": 1775
},
{
"epoch": 2.8416,
"grad_norm": 0.5868907407507975,
"learning_rate": 2.7841225917823347e-07,
"loss": 0.3994,
"step": 1776
},
{
"epoch": 2.8432,
"grad_norm": 0.6131155477454447,
"learning_rate": 2.7292282334691167e-07,
"loss": 0.407,
"step": 1777
},
{
"epoch": 2.8448,
"grad_norm": 0.5611882012761621,
"learning_rate": 2.674865264510218e-07,
"loss": 0.383,
"step": 1778
},
{
"epoch": 2.8464,
"grad_norm": 0.6360575911404578,
"learning_rate": 2.6210342960211744e-07,
"loss": 0.4001,
"step": 1779
},
{
"epoch": 2.848,
"grad_norm": 0.5910729354119789,
"learning_rate": 2.5677359331370834e-07,
"loss": 0.3948,
"step": 1780
},
{
"epoch": 2.8496,
"grad_norm": 0.6193866816475463,
"learning_rate": 2.5149707750058316e-07,
"loss": 0.4185,
"step": 1781
},
{
"epoch": 2.8512,
"grad_norm": 0.5952981628387326,
"learning_rate": 2.462739414781334e-07,
"loss": 0.384,
"step": 1782
},
{
"epoch": 2.8528000000000002,
"grad_norm": 0.5766562432375145,
"learning_rate": 2.411042439616873e-07,
"loss": 0.3718,
"step": 1783
},
{
"epoch": 2.8544,
"grad_norm": 0.6042455836579004,
"learning_rate": 2.3598804306584843e-07,
"loss": 0.3972,
"step": 1784
},
{
"epoch": 2.856,
"grad_norm": 0.5832791231533607,
"learning_rate": 2.309253963038477e-07,
"loss": 0.3884,
"step": 1785
},
{
"epoch": 2.8576,
"grad_norm": 0.57155021302307,
"learning_rate": 2.2591636058688804e-07,
"loss": 0.3726,
"step": 1786
},
{
"epoch": 2.8592,
"grad_norm": 0.584952063487649,
"learning_rate": 2.2096099222351343e-07,
"loss": 0.3761,
"step": 1787
},
{
"epoch": 2.8608000000000002,
"grad_norm": 0.6044143923654395,
"learning_rate": 2.1605934691896868e-07,
"loss": 0.422,
"step": 1788
},
{
"epoch": 2.8624,
"grad_norm": 0.5983876466668332,
"learning_rate": 2.1121147977457956e-07,
"loss": 0.3915,
"step": 1789
},
{
"epoch": 2.864,
"grad_norm": 0.578092623216434,
"learning_rate": 2.0641744528712925e-07,
"loss": 0.3715,
"step": 1790
},
{
"epoch": 2.8656,
"grad_norm": 0.6189519979180191,
"learning_rate": 2.0167729734824558e-07,
"loss": 0.3964,
"step": 1791
},
{
"epoch": 2.8672,
"grad_norm": 0.6329375967863802,
"learning_rate": 1.9699108924379818e-07,
"loss": 0.3965,
"step": 1792
},
{
"epoch": 2.8688000000000002,
"grad_norm": 0.5936327910809372,
"learning_rate": 1.9235887365329774e-07,
"loss": 0.3969,
"step": 1793
},
{
"epoch": 2.8704,
"grad_norm": 0.6062843542755041,
"learning_rate": 1.877807026493028e-07,
"loss": 0.374,
"step": 1794
},
{
"epoch": 2.872,
"grad_norm": 0.6064985451141988,
"learning_rate": 1.832566276968345e-07,
"loss": 0.4009,
"step": 1795
},
{
"epoch": 2.8736,
"grad_norm": 0.5791394670396515,
"learning_rate": 1.7878669965280315e-07,
"loss": 0.4081,
"step": 1796
},
{
"epoch": 2.8752,
"grad_norm": 0.5772849682819012,
"learning_rate": 1.7437096876542713e-07,
"loss": 0.3877,
"step": 1797
},
{
"epoch": 2.8768000000000002,
"grad_norm": 0.5755792836414835,
"learning_rate": 1.7000948467367718e-07,
"loss": 0.3717,
"step": 1798
},
{
"epoch": 2.8784,
"grad_norm": 0.5641764856486274,
"learning_rate": 1.657022964067112e-07,
"loss": 0.3744,
"step": 1799
},
{
"epoch": 2.88,
"grad_norm": 0.591931491296406,
"learning_rate": 1.6144945238332987e-07,
"loss": 0.3807,
"step": 1800
},
{
"epoch": 2.8816,
"grad_norm": 0.6021597426570837,
"learning_rate": 1.5725100041142694e-07,
"loss": 0.3961,
"step": 1801
},
{
"epoch": 2.8832,
"grad_norm": 0.5737465714509197,
"learning_rate": 1.5310698768745247e-07,
"loss": 0.3732,
"step": 1802
},
{
"epoch": 2.8848000000000003,
"grad_norm": 0.5892387520983471,
"learning_rate": 1.4901746079588552e-07,
"loss": 0.3896,
"step": 1803
},
{
"epoch": 2.8864,
"grad_norm": 0.5870420700599931,
"learning_rate": 1.4498246570870843e-07,
"loss": 0.3829,
"step": 1804
},
{
"epoch": 2.888,
"grad_norm": 0.578380767083382,
"learning_rate": 1.4100204778488947e-07,
"loss": 0.3827,
"step": 1805
},
{
"epoch": 2.8895999999999997,
"grad_norm": 0.561509751999802,
"learning_rate": 1.370762517698715e-07,
"loss": 0.3697,
"step": 1806
},
{
"epoch": 2.8912,
"grad_norm": 0.6183409039095087,
"learning_rate": 1.3320512179507528e-07,
"loss": 0.402,
"step": 1807
},
{
"epoch": 2.8928000000000003,
"grad_norm": 0.5875985516283856,
"learning_rate": 1.293887013773959e-07,
"loss": 0.3915,
"step": 1808
},
{
"epoch": 2.8944,
"grad_norm": 0.5828817132400705,
"learning_rate": 1.2562703341871708e-07,
"loss": 0.3962,
"step": 1809
},
{
"epoch": 2.896,
"grad_norm": 0.5709156992366151,
"learning_rate": 1.2192016020542986e-07,
"loss": 0.4018,
"step": 1810
},
{
"epoch": 2.8975999999999997,
"grad_norm": 0.5675316884964438,
"learning_rate": 1.1826812340795524e-07,
"loss": 0.3889,
"step": 1811
},
{
"epoch": 2.8992,
"grad_norm": 0.5660206983512364,
"learning_rate": 1.1467096408027678e-07,
"loss": 0.3797,
"step": 1812
},
{
"epoch": 2.9008000000000003,
"grad_norm": 0.5759501694299175,
"learning_rate": 1.1112872265947816e-07,
"loss": 0.3673,
"step": 1813
},
{
"epoch": 2.9024,
"grad_norm": 0.6136831172005237,
"learning_rate": 1.0764143896528967e-07,
"loss": 0.4292,
"step": 1814
},
{
"epoch": 2.904,
"grad_norm": 0.6035412046125581,
"learning_rate": 1.0420915219964023e-07,
"loss": 0.3758,
"step": 1815
},
{
"epoch": 2.9055999999999997,
"grad_norm": 0.5939026598018314,
"learning_rate": 1.008319009462172e-07,
"loss": 0.3907,
"step": 1816
},
{
"epoch": 2.9072,
"grad_norm": 0.5915764633109246,
"learning_rate": 9.75097231700295e-08,
"loss": 0.4082,
"step": 1817
},
{
"epoch": 2.9088000000000003,
"grad_norm": 0.5878852986273059,
"learning_rate": 9.424265621698736e-08,
"loss": 0.4037,
"step": 1818
},
{
"epoch": 2.9104,
"grad_norm": 0.6045378997343863,
"learning_rate": 9.103073681347607e-08,
"loss": 0.3877,
"step": 1819
},
{
"epoch": 2.912,
"grad_norm": 0.6137673568136339,
"learning_rate": 8.787400106594568e-08,
"loss": 0.3996,
"step": 1820
},
{
"epoch": 2.9135999999999997,
"grad_norm": 0.5874825425674062,
"learning_rate": 8.477248446050523e-08,
"loss": 0.4075,
"step": 1821
},
{
"epoch": 2.9152,
"grad_norm": 0.6191104133551966,
"learning_rate": 8.172622186252421e-08,
"loss": 0.4128,
"step": 1822
},
{
"epoch": 2.9168,
"grad_norm": 0.5986660459769473,
"learning_rate": 7.873524751624006e-08,
"loss": 0.3945,
"step": 1823
},
{
"epoch": 2.9184,
"grad_norm": 0.5814253850836604,
"learning_rate": 7.579959504437184e-08,
"loss": 0.3744,
"step": 1824
},
{
"epoch": 2.92,
"grad_norm": 0.6408180368914321,
"learning_rate": 7.291929744774495e-08,
"loss": 0.3957,
"step": 1825
},
{
"epoch": 2.9215999999999998,
"grad_norm": 0.5940898505743869,
"learning_rate": 7.009438710491978e-08,
"loss": 0.3974,
"step": 1826
},
{
"epoch": 2.9232,
"grad_norm": 0.5748885742134016,
"learning_rate": 6.732489577182422e-08,
"loss": 0.3863,
"step": 1827
},
{
"epoch": 2.9248,
"grad_norm": 0.5928273265231164,
"learning_rate": 6.461085458140059e-08,
"loss": 0.4054,
"step": 1828
},
{
"epoch": 2.9264,
"grad_norm": 0.61406609505904,
"learning_rate": 6.195229404325542e-08,
"loss": 0.4038,
"step": 1829
},
{
"epoch": 2.928,
"grad_norm": 0.5982594770367062,
"learning_rate": 5.934924404331355e-08,
"loss": 0.3942,
"step": 1830
},
{
"epoch": 2.9295999999999998,
"grad_norm": 0.5833835611307522,
"learning_rate": 5.680173384348453e-08,
"loss": 0.3814,
"step": 1831
},
{
"epoch": 2.9312,
"grad_norm": 0.5953616192217653,
"learning_rate": 5.4309792081334024e-08,
"loss": 0.378,
"step": 1832
},
{
"epoch": 2.9328,
"grad_norm": 0.5788430998553257,
"learning_rate": 5.187344676976014e-08,
"loss": 0.374,
"step": 1833
},
{
"epoch": 2.9344,
"grad_norm": 0.5983250963476545,
"learning_rate": 4.949272529667926e-08,
"loss": 0.3908,
"step": 1834
},
{
"epoch": 2.936,
"grad_norm": 0.6329342686357748,
"learning_rate": 4.716765442471849e-08,
"loss": 0.4017,
"step": 1835
},
{
"epoch": 2.9375999999999998,
"grad_norm": 0.6104543471470529,
"learning_rate": 4.489826029091593e-08,
"loss": 0.3935,
"step": 1836
},
{
"epoch": 2.9392,
"grad_norm": 0.6183565501070613,
"learning_rate": 4.2684568406423656e-08,
"loss": 0.3858,
"step": 1837
},
{
"epoch": 2.9408,
"grad_norm": 0.6040788483466192,
"learning_rate": 4.0526603656223515e-08,
"loss": 0.3921,
"step": 1838
},
{
"epoch": 2.9424,
"grad_norm": 0.59530129492777,
"learning_rate": 3.8424390298846815e-08,
"loss": 0.3872,
"step": 1839
},
{
"epoch": 2.944,
"grad_norm": 0.5905525722251078,
"learning_rate": 3.637795196610228e-08,
"loss": 0.4187,
"step": 1840
},
{
"epoch": 2.9455999999999998,
"grad_norm": 0.5954984827514153,
"learning_rate": 3.4387311662807396e-08,
"loss": 0.3882,
"step": 1841
},
{
"epoch": 2.9472,
"grad_norm": 0.5893859257500954,
"learning_rate": 3.24524917665342e-08,
"loss": 0.3845,
"step": 1842
},
{
"epoch": 2.9488,
"grad_norm": 0.5854141760253769,
"learning_rate": 3.0573514027355535e-08,
"loss": 0.385,
"step": 1843
},
{
"epoch": 2.9504,
"grad_norm": 0.5973858670348429,
"learning_rate": 2.8750399567599174e-08,
"loss": 0.3865,
"step": 1844
},
{
"epoch": 2.952,
"grad_norm": 0.5688150622764607,
"learning_rate": 2.6983168881611897e-08,
"loss": 0.3834,
"step": 1845
},
{
"epoch": 2.9536,
"grad_norm": 0.5866592155654836,
"learning_rate": 2.527184183553022e-08,
"loss": 0.3843,
"step": 1846
},
{
"epoch": 2.9552,
"grad_norm": 0.5943566654704281,
"learning_rate": 2.3616437667055014e-08,
"loss": 0.3991,
"step": 1847
},
{
"epoch": 2.9568,
"grad_norm": 0.5610313719080197,
"learning_rate": 2.2016974985236695e-08,
"loss": 0.3783,
"step": 1848
},
{
"epoch": 2.9584,
"grad_norm": 0.574680562576564,
"learning_rate": 2.047347177026371e-08,
"loss": 0.3757,
"step": 1849
},
{
"epoch": 2.96,
"grad_norm": 0.5775662336109608,
"learning_rate": 1.898594537326437e-08,
"loss": 0.3792,
"step": 1850
},
{
"epoch": 2.9616,
"grad_norm": 0.5896786369541753,
"learning_rate": 1.7554412516108678e-08,
"loss": 0.3852,
"step": 1851
},
{
"epoch": 2.9632,
"grad_norm": 0.5635671620269883,
"learning_rate": 1.6178889291220135e-08,
"loss": 0.3837,
"step": 1852
},
{
"epoch": 2.9648,
"grad_norm": 0.5856894931575174,
"learning_rate": 1.4859391161397008e-08,
"loss": 0.3809,
"step": 1853
},
{
"epoch": 2.9664,
"grad_norm": 0.5700795764285082,
"learning_rate": 1.3595932959638015e-08,
"loss": 0.3938,
"step": 1854
},
{
"epoch": 2.968,
"grad_norm": 0.5617189246959258,
"learning_rate": 1.2388528888973017e-08,
"loss": 0.3528,
"step": 1855
},
{
"epoch": 2.9696,
"grad_norm": 0.6603954428250471,
"learning_rate": 1.1237192522307594e-08,
"loss": 0.3925,
"step": 1856
},
{
"epoch": 2.9712,
"grad_norm": 0.6090255027413676,
"learning_rate": 1.014193680226594e-08,
"loss": 0.3624,
"step": 1857
},
{
"epoch": 2.9728,
"grad_norm": 0.5901397399448918,
"learning_rate": 9.102774041049867e-09,
"loss": 0.4002,
"step": 1858
},
{
"epoch": 2.9744,
"grad_norm": 0.5963203048784672,
"learning_rate": 8.119715920296145e-09,
"loss": 0.3811,
"step": 1859
},
{
"epoch": 2.976,
"grad_norm": 0.5860303110406762,
"learning_rate": 7.1927734909488235e-09,
"loss": 0.3954,
"step": 1860
},
{
"epoch": 2.9776,
"grad_norm": 0.5906482897156549,
"learning_rate": 6.321957173132665e-09,
"loss": 0.3966,
"step": 1861
},
{
"epoch": 2.9792,
"grad_norm": 0.6018549097676631,
"learning_rate": 5.507276756036018e-09,
"loss": 0.3831,
"step": 1862
},
{
"epoch": 2.9808,
"grad_norm": 0.5958485094010272,
"learning_rate": 4.74874139780257e-09,
"loss": 0.3624,
"step": 1863
},
{
"epoch": 2.9824,
"grad_norm": 0.5802974238961234,
"learning_rate": 4.046359625426988e-09,
"loss": 0.3913,
"step": 1864
},
{
"epoch": 2.984,
"grad_norm": 0.5941602413219743,
"learning_rate": 3.400139334658881e-09,
"loss": 0.4107,
"step": 1865
},
{
"epoch": 2.9856,
"grad_norm": 0.5949666726420274,
"learning_rate": 2.81008778991565e-09,
"loss": 0.3898,
"step": 1866
},
{
"epoch": 2.9872,
"grad_norm": 0.5706501152872799,
"learning_rate": 2.27621162419811e-09,
"loss": 0.3801,
"step": 1867
},
{
"epoch": 2.9888,
"grad_norm": 0.5920080363506407,
"learning_rate": 1.7985168390194375e-09,
"loss": 0.4038,
"step": 1868
},
{
"epoch": 2.9904,
"grad_norm": 0.5923134764523963,
"learning_rate": 1.3770088043335573e-09,
"loss": 0.4083,
"step": 1869
},
{
"epoch": 2.992,
"grad_norm": 0.5650835745974974,
"learning_rate": 1.01169225847908e-09,
"loss": 0.3843,
"step": 1870
},
{
"epoch": 2.9936,
"grad_norm": 0.577196003203904,
"learning_rate": 7.025713081232343e-10,
"loss": 0.3686,
"step": 1871
},
{
"epoch": 2.9952,
"grad_norm": 0.5902254985223518,
"learning_rate": 4.496494282157926e-10,
"loss": 0.3921,
"step": 1872
},
{
"epoch": 2.9968,
"grad_norm": 0.5784728891063586,
"learning_rate": 2.529294619513234e-10,
"loss": 0.4188,
"step": 1873
},
{
"epoch": 2.9984,
"grad_norm": 0.5636393556114256,
"learning_rate": 1.1241362073588502e-10,
"loss": 0.3836,
"step": 1874
},
{
"epoch": 3.0,
"grad_norm": 0.5863281249393232,
"learning_rate": 2.8103484164820894e-11,
"loss": 0.3853,
"step": 1875
}
],
"logging_steps": 1,
"max_steps": 1875,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1063007016714240.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}