egor1-7b-sft-step600 / trainer_state.json
FlippyDora's picture
Add files using upload-large-folder tool
3e3474e verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.2,
"eval_steps": 500,
"global_step": 600,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002,
"grad_norm": 9.074012756347656,
"learning_rate": 0.0,
"loss": 1.1121,
"step": 1
},
{
"epoch": 0.004,
"grad_norm": 10.233168601989746,
"learning_rate": 6.666666666666668e-08,
"loss": 1.2991,
"step": 2
},
{
"epoch": 0.006,
"grad_norm": 10.353023529052734,
"learning_rate": 1.3333333333333336e-07,
"loss": 1.3845,
"step": 3
},
{
"epoch": 0.008,
"grad_norm": 9.337516784667969,
"learning_rate": 2.0000000000000002e-07,
"loss": 1.2514,
"step": 4
},
{
"epoch": 0.01,
"grad_norm": 10.30583667755127,
"learning_rate": 2.666666666666667e-07,
"loss": 1.3357,
"step": 5
},
{
"epoch": 0.012,
"grad_norm": 9.072500228881836,
"learning_rate": 3.3333333333333335e-07,
"loss": 1.1423,
"step": 6
},
{
"epoch": 0.014,
"grad_norm": 9.9359712600708,
"learning_rate": 4.0000000000000003e-07,
"loss": 1.2476,
"step": 7
},
{
"epoch": 0.016,
"grad_norm": 9.5555419921875,
"learning_rate": 4.666666666666667e-07,
"loss": 1.2722,
"step": 8
},
{
"epoch": 0.018,
"grad_norm": 11.1956148147583,
"learning_rate": 5.333333333333335e-07,
"loss": 1.4133,
"step": 9
},
{
"epoch": 0.02,
"grad_norm": 9.675064086914062,
"learning_rate": 6.000000000000001e-07,
"loss": 1.3254,
"step": 10
},
{
"epoch": 0.022,
"grad_norm": 8.407377243041992,
"learning_rate": 6.666666666666667e-07,
"loss": 1.2715,
"step": 11
},
{
"epoch": 0.024,
"grad_norm": 8.663851737976074,
"learning_rate": 7.333333333333334e-07,
"loss": 1.2211,
"step": 12
},
{
"epoch": 0.026,
"grad_norm": 6.422065258026123,
"learning_rate": 8.000000000000001e-07,
"loss": 1.1171,
"step": 13
},
{
"epoch": 0.028,
"grad_norm": 5.890502452850342,
"learning_rate": 8.666666666666668e-07,
"loss": 1.1156,
"step": 14
},
{
"epoch": 0.03,
"grad_norm": 6.316369533538818,
"learning_rate": 9.333333333333334e-07,
"loss": 1.2838,
"step": 15
},
{
"epoch": 0.032,
"grad_norm": 5.3838911056518555,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.0919,
"step": 16
},
{
"epoch": 0.034,
"grad_norm": 3.512526273727417,
"learning_rate": 1.066666666666667e-06,
"loss": 0.9868,
"step": 17
},
{
"epoch": 0.036,
"grad_norm": 3.633479595184326,
"learning_rate": 1.1333333333333334e-06,
"loss": 1.154,
"step": 18
},
{
"epoch": 0.038,
"grad_norm": 3.0341765880584717,
"learning_rate": 1.2000000000000002e-06,
"loss": 0.9785,
"step": 19
},
{
"epoch": 0.04,
"grad_norm": 3.596338987350464,
"learning_rate": 1.2666666666666669e-06,
"loss": 1.0783,
"step": 20
},
{
"epoch": 0.042,
"grad_norm": 3.751676082611084,
"learning_rate": 1.3333333333333334e-06,
"loss": 1.0437,
"step": 21
},
{
"epoch": 0.044,
"grad_norm": 2.806690216064453,
"learning_rate": 1.4000000000000001e-06,
"loss": 1.0281,
"step": 22
},
{
"epoch": 0.046,
"grad_norm": 2.822889566421509,
"learning_rate": 1.4666666666666669e-06,
"loss": 0.9858,
"step": 23
},
{
"epoch": 0.048,
"grad_norm": 4.2820024490356445,
"learning_rate": 1.5333333333333334e-06,
"loss": 1.1258,
"step": 24
},
{
"epoch": 0.05,
"grad_norm": 4.147153377532959,
"learning_rate": 1.6000000000000001e-06,
"loss": 0.9762,
"step": 25
},
{
"epoch": 0.052,
"grad_norm": 3.698709726333618,
"learning_rate": 1.6666666666666667e-06,
"loss": 0.9955,
"step": 26
},
{
"epoch": 0.054,
"grad_norm": 3.4206106662750244,
"learning_rate": 1.7333333333333336e-06,
"loss": 0.9448,
"step": 27
},
{
"epoch": 0.056,
"grad_norm": 2.970322608947754,
"learning_rate": 1.8000000000000001e-06,
"loss": 0.9204,
"step": 28
},
{
"epoch": 0.058,
"grad_norm": 2.804030179977417,
"learning_rate": 1.8666666666666669e-06,
"loss": 0.9797,
"step": 29
},
{
"epoch": 0.06,
"grad_norm": 2.511726140975952,
"learning_rate": 1.9333333333333336e-06,
"loss": 0.9715,
"step": 30
},
{
"epoch": 0.062,
"grad_norm": 2.303981304168701,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.9446,
"step": 31
},
{
"epoch": 0.064,
"grad_norm": 2.4346749782562256,
"learning_rate": 2.0666666666666666e-06,
"loss": 0.9981,
"step": 32
},
{
"epoch": 0.066,
"grad_norm": 2.267148494720459,
"learning_rate": 2.133333333333334e-06,
"loss": 0.9105,
"step": 33
},
{
"epoch": 0.068,
"grad_norm": 2.305480718612671,
"learning_rate": 2.2e-06,
"loss": 0.9894,
"step": 34
},
{
"epoch": 0.07,
"grad_norm": 2.2832963466644287,
"learning_rate": 2.266666666666667e-06,
"loss": 0.9252,
"step": 35
},
{
"epoch": 0.072,
"grad_norm": 2.43243408203125,
"learning_rate": 2.3333333333333336e-06,
"loss": 0.9257,
"step": 36
},
{
"epoch": 0.074,
"grad_norm": 2.2071030139923096,
"learning_rate": 2.4000000000000003e-06,
"loss": 0.9159,
"step": 37
},
{
"epoch": 0.076,
"grad_norm": 2.364295482635498,
"learning_rate": 2.466666666666667e-06,
"loss": 0.9814,
"step": 38
},
{
"epoch": 0.078,
"grad_norm": 2.3316822052001953,
"learning_rate": 2.5333333333333338e-06,
"loss": 0.8988,
"step": 39
},
{
"epoch": 0.08,
"grad_norm": 2.122209310531616,
"learning_rate": 2.6e-06,
"loss": 0.843,
"step": 40
},
{
"epoch": 0.082,
"grad_norm": 2.1770570278167725,
"learning_rate": 2.666666666666667e-06,
"loss": 0.9904,
"step": 41
},
{
"epoch": 0.084,
"grad_norm": 1.8280588388442993,
"learning_rate": 2.7333333333333336e-06,
"loss": 0.8662,
"step": 42
},
{
"epoch": 0.086,
"grad_norm": 2.038935422897339,
"learning_rate": 2.8000000000000003e-06,
"loss": 0.9375,
"step": 43
},
{
"epoch": 0.088,
"grad_norm": 1.918784260749817,
"learning_rate": 2.866666666666667e-06,
"loss": 0.8811,
"step": 44
},
{
"epoch": 0.09,
"grad_norm": 2.156174421310425,
"learning_rate": 2.9333333333333338e-06,
"loss": 0.9608,
"step": 45
},
{
"epoch": 0.092,
"grad_norm": 1.9383772611618042,
"learning_rate": 3e-06,
"loss": 0.7929,
"step": 46
},
{
"epoch": 0.094,
"grad_norm": 2.064772605895996,
"learning_rate": 3.066666666666667e-06,
"loss": 0.7918,
"step": 47
},
{
"epoch": 0.096,
"grad_norm": 1.6674835681915283,
"learning_rate": 3.133333333333334e-06,
"loss": 0.781,
"step": 48
},
{
"epoch": 0.098,
"grad_norm": 1.9640334844589233,
"learning_rate": 3.2000000000000003e-06,
"loss": 0.8781,
"step": 49
},
{
"epoch": 0.1,
"grad_norm": 1.7003456354141235,
"learning_rate": 3.266666666666667e-06,
"loss": 0.8295,
"step": 50
},
{
"epoch": 0.102,
"grad_norm": 2.007157325744629,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.7991,
"step": 51
},
{
"epoch": 0.104,
"grad_norm": 1.923763632774353,
"learning_rate": 3.4000000000000005e-06,
"loss": 0.8986,
"step": 52
},
{
"epoch": 0.106,
"grad_norm": 1.9844627380371094,
"learning_rate": 3.4666666666666672e-06,
"loss": 0.8552,
"step": 53
},
{
"epoch": 0.108,
"grad_norm": 1.7688173055648804,
"learning_rate": 3.5333333333333335e-06,
"loss": 0.8094,
"step": 54
},
{
"epoch": 0.11,
"grad_norm": 2.014064311981201,
"learning_rate": 3.6000000000000003e-06,
"loss": 0.8492,
"step": 55
},
{
"epoch": 0.112,
"grad_norm": 1.7487848997116089,
"learning_rate": 3.6666666666666666e-06,
"loss": 0.8027,
"step": 56
},
{
"epoch": 0.114,
"grad_norm": 2.024191379547119,
"learning_rate": 3.7333333333333337e-06,
"loss": 0.7526,
"step": 57
},
{
"epoch": 0.116,
"grad_norm": 1.862096905708313,
"learning_rate": 3.8000000000000005e-06,
"loss": 0.822,
"step": 58
},
{
"epoch": 0.118,
"grad_norm": 1.8116499185562134,
"learning_rate": 3.866666666666667e-06,
"loss": 0.8448,
"step": 59
},
{
"epoch": 0.12,
"grad_norm": 2.121875762939453,
"learning_rate": 3.9333333333333335e-06,
"loss": 0.8509,
"step": 60
},
{
"epoch": 0.122,
"grad_norm": 1.9175318479537964,
"learning_rate": 4.000000000000001e-06,
"loss": 0.7965,
"step": 61
},
{
"epoch": 0.124,
"grad_norm": 1.937557339668274,
"learning_rate": 4.066666666666667e-06,
"loss": 0.8288,
"step": 62
},
{
"epoch": 0.126,
"grad_norm": 1.920708417892456,
"learning_rate": 4.133333333333333e-06,
"loss": 0.837,
"step": 63
},
{
"epoch": 0.128,
"grad_norm": 1.8257769346237183,
"learning_rate": 4.2000000000000004e-06,
"loss": 0.8358,
"step": 64
},
{
"epoch": 0.13,
"grad_norm": 2.02504825592041,
"learning_rate": 4.266666666666668e-06,
"loss": 0.8389,
"step": 65
},
{
"epoch": 0.132,
"grad_norm": 1.7633429765701294,
"learning_rate": 4.333333333333334e-06,
"loss": 0.838,
"step": 66
},
{
"epoch": 0.134,
"grad_norm": 1.7253111600875854,
"learning_rate": 4.4e-06,
"loss": 0.7638,
"step": 67
},
{
"epoch": 0.136,
"grad_norm": 1.757811188697815,
"learning_rate": 4.4666666666666665e-06,
"loss": 0.8302,
"step": 68
},
{
"epoch": 0.138,
"grad_norm": 1.8107619285583496,
"learning_rate": 4.533333333333334e-06,
"loss": 0.8787,
"step": 69
},
{
"epoch": 0.14,
"grad_norm": 2.0448291301727295,
"learning_rate": 4.600000000000001e-06,
"loss": 0.844,
"step": 70
},
{
"epoch": 0.142,
"grad_norm": 1.9952377080917358,
"learning_rate": 4.666666666666667e-06,
"loss": 0.8028,
"step": 71
},
{
"epoch": 0.144,
"grad_norm": 1.8734209537506104,
"learning_rate": 4.7333333333333335e-06,
"loss": 0.8019,
"step": 72
},
{
"epoch": 0.146,
"grad_norm": 1.9044350385665894,
"learning_rate": 4.800000000000001e-06,
"loss": 0.8009,
"step": 73
},
{
"epoch": 0.148,
"grad_norm": 1.6113317012786865,
"learning_rate": 4.866666666666667e-06,
"loss": 0.7548,
"step": 74
},
{
"epoch": 0.15,
"grad_norm": 1.8096058368682861,
"learning_rate": 4.933333333333334e-06,
"loss": 0.8315,
"step": 75
},
{
"epoch": 0.152,
"grad_norm": 1.917984962463379,
"learning_rate": 5e-06,
"loss": 0.6815,
"step": 76
},
{
"epoch": 0.154,
"grad_norm": 1.8071333169937134,
"learning_rate": 5.0666666666666676e-06,
"loss": 0.8477,
"step": 77
},
{
"epoch": 0.156,
"grad_norm": 1.7420942783355713,
"learning_rate": 5.133333333333334e-06,
"loss": 0.8223,
"step": 78
},
{
"epoch": 0.158,
"grad_norm": 1.6158325672149658,
"learning_rate": 5.2e-06,
"loss": 0.7425,
"step": 79
},
{
"epoch": 0.16,
"grad_norm": 1.6784685850143433,
"learning_rate": 5.2666666666666665e-06,
"loss": 0.7937,
"step": 80
},
{
"epoch": 0.162,
"grad_norm": 1.7116400003433228,
"learning_rate": 5.333333333333334e-06,
"loss": 0.7972,
"step": 81
},
{
"epoch": 0.164,
"grad_norm": 1.817854881286621,
"learning_rate": 5.400000000000001e-06,
"loss": 0.7349,
"step": 82
},
{
"epoch": 0.166,
"grad_norm": 1.6824537515640259,
"learning_rate": 5.466666666666667e-06,
"loss": 0.7843,
"step": 83
},
{
"epoch": 0.168,
"grad_norm": 1.8841086626052856,
"learning_rate": 5.533333333333334e-06,
"loss": 0.7751,
"step": 84
},
{
"epoch": 0.17,
"grad_norm": 1.7656822204589844,
"learning_rate": 5.600000000000001e-06,
"loss": 0.8324,
"step": 85
},
{
"epoch": 0.172,
"grad_norm": 1.8588966131210327,
"learning_rate": 5.666666666666667e-06,
"loss": 0.7741,
"step": 86
},
{
"epoch": 0.174,
"grad_norm": 1.826100468635559,
"learning_rate": 5.733333333333334e-06,
"loss": 0.7904,
"step": 87
},
{
"epoch": 0.176,
"grad_norm": 2.0713677406311035,
"learning_rate": 5.8e-06,
"loss": 0.7186,
"step": 88
},
{
"epoch": 0.178,
"grad_norm": 1.842650294303894,
"learning_rate": 5.8666666666666675e-06,
"loss": 0.7625,
"step": 89
},
{
"epoch": 0.18,
"grad_norm": 1.7636773586273193,
"learning_rate": 5.933333333333335e-06,
"loss": 0.7325,
"step": 90
},
{
"epoch": 0.182,
"grad_norm": 1.9637341499328613,
"learning_rate": 6e-06,
"loss": 0.7395,
"step": 91
},
{
"epoch": 0.184,
"grad_norm": 2.0242807865142822,
"learning_rate": 6.066666666666667e-06,
"loss": 0.7709,
"step": 92
},
{
"epoch": 0.186,
"grad_norm": 1.6189954280853271,
"learning_rate": 6.133333333333334e-06,
"loss": 0.7162,
"step": 93
},
{
"epoch": 0.188,
"grad_norm": 1.6651357412338257,
"learning_rate": 6.200000000000001e-06,
"loss": 0.6606,
"step": 94
},
{
"epoch": 0.19,
"grad_norm": 1.5688307285308838,
"learning_rate": 6.266666666666668e-06,
"loss": 0.6288,
"step": 95
},
{
"epoch": 0.192,
"grad_norm": 1.7620747089385986,
"learning_rate": 6.333333333333333e-06,
"loss": 0.7872,
"step": 96
},
{
"epoch": 0.194,
"grad_norm": 1.949406385421753,
"learning_rate": 6.4000000000000006e-06,
"loss": 0.7401,
"step": 97
},
{
"epoch": 0.196,
"grad_norm": 1.9228166341781616,
"learning_rate": 6.466666666666667e-06,
"loss": 0.8483,
"step": 98
},
{
"epoch": 0.198,
"grad_norm": 1.688018798828125,
"learning_rate": 6.533333333333334e-06,
"loss": 0.7293,
"step": 99
},
{
"epoch": 0.2,
"grad_norm": 1.861033320426941,
"learning_rate": 6.600000000000001e-06,
"loss": 0.7434,
"step": 100
},
{
"epoch": 0.202,
"grad_norm": 1.5343120098114014,
"learning_rate": 6.666666666666667e-06,
"loss": 0.7408,
"step": 101
},
{
"epoch": 0.204,
"grad_norm": 2.037724256515503,
"learning_rate": 6.733333333333334e-06,
"loss": 0.7478,
"step": 102
},
{
"epoch": 0.206,
"grad_norm": 1.756672739982605,
"learning_rate": 6.800000000000001e-06,
"loss": 0.7826,
"step": 103
},
{
"epoch": 0.208,
"grad_norm": 1.841848373413086,
"learning_rate": 6.866666666666667e-06,
"loss": 0.6668,
"step": 104
},
{
"epoch": 0.21,
"grad_norm": 1.9532805681228638,
"learning_rate": 6.9333333333333344e-06,
"loss": 0.7651,
"step": 105
},
{
"epoch": 0.212,
"grad_norm": 2.2724688053131104,
"learning_rate": 7e-06,
"loss": 0.8298,
"step": 106
},
{
"epoch": 0.214,
"grad_norm": 2.0979325771331787,
"learning_rate": 7.066666666666667e-06,
"loss": 0.7218,
"step": 107
},
{
"epoch": 0.216,
"grad_norm": 2.240551710128784,
"learning_rate": 7.133333333333334e-06,
"loss": 0.7275,
"step": 108
},
{
"epoch": 0.218,
"grad_norm": 1.7819173336029053,
"learning_rate": 7.2000000000000005e-06,
"loss": 0.7546,
"step": 109
},
{
"epoch": 0.22,
"grad_norm": 1.8862335681915283,
"learning_rate": 7.266666666666668e-06,
"loss": 0.7409,
"step": 110
},
{
"epoch": 0.222,
"grad_norm": 2.2147364616394043,
"learning_rate": 7.333333333333333e-06,
"loss": 0.8455,
"step": 111
},
{
"epoch": 0.224,
"grad_norm": 1.6574546098709106,
"learning_rate": 7.4e-06,
"loss": 0.6964,
"step": 112
},
{
"epoch": 0.226,
"grad_norm": 1.7560558319091797,
"learning_rate": 7.4666666666666675e-06,
"loss": 0.6683,
"step": 113
},
{
"epoch": 0.228,
"grad_norm": 1.7830218076705933,
"learning_rate": 7.533333333333334e-06,
"loss": 0.7328,
"step": 114
},
{
"epoch": 0.23,
"grad_norm": 1.626004695892334,
"learning_rate": 7.600000000000001e-06,
"loss": 0.7094,
"step": 115
},
{
"epoch": 0.232,
"grad_norm": 1.7532685995101929,
"learning_rate": 7.666666666666667e-06,
"loss": 0.6798,
"step": 116
},
{
"epoch": 0.234,
"grad_norm": 1.8709031343460083,
"learning_rate": 7.733333333333334e-06,
"loss": 0.7504,
"step": 117
},
{
"epoch": 0.236,
"grad_norm": 1.9467530250549316,
"learning_rate": 7.800000000000002e-06,
"loss": 0.6752,
"step": 118
},
{
"epoch": 0.238,
"grad_norm": 1.9866005182266235,
"learning_rate": 7.866666666666667e-06,
"loss": 0.7465,
"step": 119
},
{
"epoch": 0.24,
"grad_norm": 2.031965970993042,
"learning_rate": 7.933333333333334e-06,
"loss": 0.6898,
"step": 120
},
{
"epoch": 0.242,
"grad_norm": 2.1669931411743164,
"learning_rate": 8.000000000000001e-06,
"loss": 0.8142,
"step": 121
},
{
"epoch": 0.244,
"grad_norm": 2.127737045288086,
"learning_rate": 8.066666666666667e-06,
"loss": 0.6916,
"step": 122
},
{
"epoch": 0.246,
"grad_norm": 1.7279045581817627,
"learning_rate": 8.133333333333334e-06,
"loss": 0.6211,
"step": 123
},
{
"epoch": 0.248,
"grad_norm": 1.8746819496154785,
"learning_rate": 8.2e-06,
"loss": 0.7103,
"step": 124
},
{
"epoch": 0.25,
"grad_norm": 1.6384533643722534,
"learning_rate": 8.266666666666667e-06,
"loss": 0.7519,
"step": 125
},
{
"epoch": 0.252,
"grad_norm": 1.7692192792892456,
"learning_rate": 8.333333333333334e-06,
"loss": 0.6414,
"step": 126
},
{
"epoch": 0.254,
"grad_norm": 2.125847339630127,
"learning_rate": 8.400000000000001e-06,
"loss": 0.7211,
"step": 127
},
{
"epoch": 0.256,
"grad_norm": 1.9361572265625,
"learning_rate": 8.466666666666668e-06,
"loss": 0.7447,
"step": 128
},
{
"epoch": 0.258,
"grad_norm": 1.7561177015304565,
"learning_rate": 8.533333333333335e-06,
"loss": 0.8059,
"step": 129
},
{
"epoch": 0.26,
"grad_norm": 1.6317267417907715,
"learning_rate": 8.6e-06,
"loss": 0.6754,
"step": 130
},
{
"epoch": 0.262,
"grad_norm": 1.769873023033142,
"learning_rate": 8.666666666666668e-06,
"loss": 0.7687,
"step": 131
},
{
"epoch": 0.264,
"grad_norm": 1.6142659187316895,
"learning_rate": 8.733333333333333e-06,
"loss": 0.6884,
"step": 132
},
{
"epoch": 0.266,
"grad_norm": 1.97073495388031,
"learning_rate": 8.8e-06,
"loss": 0.6229,
"step": 133
},
{
"epoch": 0.268,
"grad_norm": 1.826964020729065,
"learning_rate": 8.866666666666668e-06,
"loss": 0.6991,
"step": 134
},
{
"epoch": 0.27,
"grad_norm": 1.6792782545089722,
"learning_rate": 8.933333333333333e-06,
"loss": 0.6273,
"step": 135
},
{
"epoch": 0.272,
"grad_norm": 2.159935474395752,
"learning_rate": 9e-06,
"loss": 0.6598,
"step": 136
},
{
"epoch": 0.274,
"grad_norm": 1.8708237409591675,
"learning_rate": 9.066666666666667e-06,
"loss": 0.6693,
"step": 137
},
{
"epoch": 0.276,
"grad_norm": 1.965039849281311,
"learning_rate": 9.133333333333335e-06,
"loss": 0.7231,
"step": 138
},
{
"epoch": 0.278,
"grad_norm": 1.9504064321517944,
"learning_rate": 9.200000000000002e-06,
"loss": 0.7306,
"step": 139
},
{
"epoch": 0.28,
"grad_norm": 1.9902204275131226,
"learning_rate": 9.266666666666667e-06,
"loss": 0.712,
"step": 140
},
{
"epoch": 0.282,
"grad_norm": 2.1614973545074463,
"learning_rate": 9.333333333333334e-06,
"loss": 0.6246,
"step": 141
},
{
"epoch": 0.284,
"grad_norm": 2.2333343029022217,
"learning_rate": 9.4e-06,
"loss": 0.6273,
"step": 142
},
{
"epoch": 0.286,
"grad_norm": 1.8635379076004028,
"learning_rate": 9.466666666666667e-06,
"loss": 0.6877,
"step": 143
},
{
"epoch": 0.288,
"grad_norm": 2.0924274921417236,
"learning_rate": 9.533333333333334e-06,
"loss": 0.6652,
"step": 144
},
{
"epoch": 0.29,
"grad_norm": 1.9547358751296997,
"learning_rate": 9.600000000000001e-06,
"loss": 0.6344,
"step": 145
},
{
"epoch": 0.292,
"grad_norm": 2.028376817703247,
"learning_rate": 9.666666666666667e-06,
"loss": 0.6835,
"step": 146
},
{
"epoch": 0.294,
"grad_norm": 2.3274097442626953,
"learning_rate": 9.733333333333334e-06,
"loss": 0.6882,
"step": 147
},
{
"epoch": 0.296,
"grad_norm": 3.013546943664551,
"learning_rate": 9.800000000000001e-06,
"loss": 0.7537,
"step": 148
},
{
"epoch": 0.298,
"grad_norm": 1.912866234779358,
"learning_rate": 9.866666666666668e-06,
"loss": 0.6437,
"step": 149
},
{
"epoch": 0.3,
"grad_norm": 1.8834062814712524,
"learning_rate": 9.933333333333334e-06,
"loss": 0.6877,
"step": 150
},
{
"epoch": 0.302,
"grad_norm": 2.4105477333068848,
"learning_rate": 1e-05,
"loss": 0.7029,
"step": 151
},
{
"epoch": 0.304,
"grad_norm": 2.0727202892303467,
"learning_rate": 9.99998646145412e-06,
"loss": 0.6817,
"step": 152
},
{
"epoch": 0.306,
"grad_norm": 1.970862627029419,
"learning_rate": 9.999945845889795e-06,
"loss": 0.7434,
"step": 153
},
{
"epoch": 0.308,
"grad_norm": 1.9560319185256958,
"learning_rate": 9.999878153526974e-06,
"loss": 0.6499,
"step": 154
},
{
"epoch": 0.31,
"grad_norm": 1.9579132795333862,
"learning_rate": 9.999783384732242e-06,
"loss": 0.6387,
"step": 155
},
{
"epoch": 0.312,
"grad_norm": 1.92261803150177,
"learning_rate": 9.999661540018812e-06,
"loss": 0.7011,
"step": 156
},
{
"epoch": 0.314,
"grad_norm": 2.063448905944824,
"learning_rate": 9.999512620046523e-06,
"loss": 0.6473,
"step": 157
},
{
"epoch": 0.316,
"grad_norm": 1.7672314643859863,
"learning_rate": 9.999336625621836e-06,
"loss": 0.5952,
"step": 158
},
{
"epoch": 0.318,
"grad_norm": 2.16186261177063,
"learning_rate": 9.99913355769784e-06,
"loss": 0.6737,
"step": 159
},
{
"epoch": 0.32,
"grad_norm": 2.319474935531616,
"learning_rate": 9.998903417374228e-06,
"loss": 0.6113,
"step": 160
},
{
"epoch": 0.322,
"grad_norm": 1.719564437866211,
"learning_rate": 9.99864620589731e-06,
"loss": 0.5909,
"step": 161
},
{
"epoch": 0.324,
"grad_norm": 2.073621988296509,
"learning_rate": 9.998361924659989e-06,
"loss": 0.6285,
"step": 162
},
{
"epoch": 0.326,
"grad_norm": 2.088226079940796,
"learning_rate": 9.998050575201772e-06,
"loss": 0.6524,
"step": 163
},
{
"epoch": 0.328,
"grad_norm": 2.1609091758728027,
"learning_rate": 9.997712159208745e-06,
"loss": 0.6309,
"step": 164
},
{
"epoch": 0.33,
"grad_norm": 1.558556079864502,
"learning_rate": 9.99734667851357e-06,
"loss": 0.5617,
"step": 165
},
{
"epoch": 0.332,
"grad_norm": 2.1843314170837402,
"learning_rate": 9.99695413509548e-06,
"loss": 0.6855,
"step": 166
},
{
"epoch": 0.334,
"grad_norm": 1.97001051902771,
"learning_rate": 9.99653453108026e-06,
"loss": 0.5322,
"step": 167
},
{
"epoch": 0.336,
"grad_norm": 2.2219862937927246,
"learning_rate": 9.996087868740244e-06,
"loss": 0.7239,
"step": 168
},
{
"epoch": 0.338,
"grad_norm": 1.6730774641036987,
"learning_rate": 9.995614150494293e-06,
"loss": 0.5811,
"step": 169
},
{
"epoch": 0.34,
"grad_norm": 2.2813098430633545,
"learning_rate": 9.995113378907791e-06,
"loss": 0.65,
"step": 170
},
{
"epoch": 0.342,
"grad_norm": 1.806111454963684,
"learning_rate": 9.994585556692624e-06,
"loss": 0.5377,
"step": 171
},
{
"epoch": 0.344,
"grad_norm": 1.8797581195831299,
"learning_rate": 9.994030686707171e-06,
"loss": 0.6768,
"step": 172
},
{
"epoch": 0.346,
"grad_norm": 2.2682533264160156,
"learning_rate": 9.993448771956285e-06,
"loss": 0.697,
"step": 173
},
{
"epoch": 0.348,
"grad_norm": 2.145071029663086,
"learning_rate": 9.99283981559128e-06,
"loss": 0.6764,
"step": 174
},
{
"epoch": 0.35,
"grad_norm": 2.1207141876220703,
"learning_rate": 9.992203820909906e-06,
"loss": 0.6779,
"step": 175
},
{
"epoch": 0.352,
"grad_norm": 2.1582446098327637,
"learning_rate": 9.991540791356342e-06,
"loss": 0.5686,
"step": 176
},
{
"epoch": 0.354,
"grad_norm": 2.3938088417053223,
"learning_rate": 9.99085073052117e-06,
"loss": 0.5409,
"step": 177
},
{
"epoch": 0.356,
"grad_norm": 1.7042397260665894,
"learning_rate": 9.990133642141359e-06,
"loss": 0.5424,
"step": 178
},
{
"epoch": 0.358,
"grad_norm": 1.9700382947921753,
"learning_rate": 9.989389530100242e-06,
"loss": 0.5174,
"step": 179
},
{
"epoch": 0.36,
"grad_norm": 1.8847728967666626,
"learning_rate": 9.988618398427495e-06,
"loss": 0.6533,
"step": 180
},
{
"epoch": 0.362,
"grad_norm": 1.59050452709198,
"learning_rate": 9.987820251299121e-06,
"loss": 0.4797,
"step": 181
},
{
"epoch": 0.364,
"grad_norm": 2.219714641571045,
"learning_rate": 9.986995093037422e-06,
"loss": 0.6777,
"step": 182
},
{
"epoch": 0.366,
"grad_norm": 1.7240313291549683,
"learning_rate": 9.986142928110972e-06,
"loss": 0.5583,
"step": 183
},
{
"epoch": 0.368,
"grad_norm": 1.6769351959228516,
"learning_rate": 9.985263761134602e-06,
"loss": 0.4817,
"step": 184
},
{
"epoch": 0.37,
"grad_norm": 1.8481873273849487,
"learning_rate": 9.984357596869369e-06,
"loss": 0.6243,
"step": 185
},
{
"epoch": 0.372,
"grad_norm": 1.861271619796753,
"learning_rate": 9.98342444022253e-06,
"loss": 0.7299,
"step": 186
},
{
"epoch": 0.374,
"grad_norm": 1.8269773721694946,
"learning_rate": 9.982464296247523e-06,
"loss": 0.5756,
"step": 187
},
{
"epoch": 0.376,
"grad_norm": 1.9181562662124634,
"learning_rate": 9.981477170143924e-06,
"loss": 0.5722,
"step": 188
},
{
"epoch": 0.378,
"grad_norm": 1.8730833530426025,
"learning_rate": 9.980463067257437e-06,
"loss": 0.6558,
"step": 189
},
{
"epoch": 0.38,
"grad_norm": 1.6602306365966797,
"learning_rate": 9.979421993079853e-06,
"loss": 0.5833,
"step": 190
},
{
"epoch": 0.382,
"grad_norm": 2.296027421951294,
"learning_rate": 9.978353953249023e-06,
"loss": 0.6008,
"step": 191
},
{
"epoch": 0.384,
"grad_norm": 1.9284954071044922,
"learning_rate": 9.977258953548831e-06,
"loss": 0.5399,
"step": 192
},
{
"epoch": 0.386,
"grad_norm": 1.9524036645889282,
"learning_rate": 9.976136999909156e-06,
"loss": 0.6628,
"step": 193
},
{
"epoch": 0.388,
"grad_norm": 1.8157129287719727,
"learning_rate": 9.97498809840585e-06,
"loss": 0.551,
"step": 194
},
{
"epoch": 0.39,
"grad_norm": 2.375074625015259,
"learning_rate": 9.973812255260692e-06,
"loss": 0.6485,
"step": 195
},
{
"epoch": 0.392,
"grad_norm": 1.9661322832107544,
"learning_rate": 9.972609476841368e-06,
"loss": 0.5929,
"step": 196
},
{
"epoch": 0.394,
"grad_norm": 1.6226578950881958,
"learning_rate": 9.971379769661422e-06,
"loss": 0.482,
"step": 197
},
{
"epoch": 0.396,
"grad_norm": 1.7707046270370483,
"learning_rate": 9.970123140380237e-06,
"loss": 0.5459,
"step": 198
},
{
"epoch": 0.398,
"grad_norm": 1.897963047027588,
"learning_rate": 9.968839595802982e-06,
"loss": 0.543,
"step": 199
},
{
"epoch": 0.4,
"grad_norm": 2.0392866134643555,
"learning_rate": 9.967529142880592e-06,
"loss": 0.6598,
"step": 200
},
{
"epoch": 0.402,
"grad_norm": 1.7297391891479492,
"learning_rate": 9.966191788709716e-06,
"loss": 0.5494,
"step": 201
},
{
"epoch": 0.404,
"grad_norm": 1.7709267139434814,
"learning_rate": 9.964827540532685e-06,
"loss": 0.534,
"step": 202
},
{
"epoch": 0.406,
"grad_norm": 1.688926339149475,
"learning_rate": 9.963436405737476e-06,
"loss": 0.4496,
"step": 203
},
{
"epoch": 0.408,
"grad_norm": 1.7673691511154175,
"learning_rate": 9.962018391857665e-06,
"loss": 0.4809,
"step": 204
},
{
"epoch": 0.41,
"grad_norm": 1.7127712965011597,
"learning_rate": 9.960573506572391e-06,
"loss": 0.5793,
"step": 205
},
{
"epoch": 0.412,
"grad_norm": 2.070152997970581,
"learning_rate": 9.959101757706308e-06,
"loss": 0.6257,
"step": 206
},
{
"epoch": 0.414,
"grad_norm": 1.820627212524414,
"learning_rate": 9.957603153229559e-06,
"loss": 0.5028,
"step": 207
},
{
"epoch": 0.416,
"grad_norm": 2.0696165561676025,
"learning_rate": 9.95607770125771e-06,
"loss": 0.484,
"step": 208
},
{
"epoch": 0.418,
"grad_norm": 2.1388514041900635,
"learning_rate": 9.95452541005172e-06,
"loss": 0.565,
"step": 209
},
{
"epoch": 0.42,
"grad_norm": 1.8014533519744873,
"learning_rate": 9.952946288017899e-06,
"loss": 0.5564,
"step": 210
},
{
"epoch": 0.422,
"grad_norm": 1.9076919555664062,
"learning_rate": 9.951340343707852e-06,
"loss": 0.4663,
"step": 211
},
{
"epoch": 0.424,
"grad_norm": 2.1348037719726562,
"learning_rate": 9.94970758581844e-06,
"loss": 0.5435,
"step": 212
},
{
"epoch": 0.426,
"grad_norm": 1.8880589008331299,
"learning_rate": 9.948048023191728e-06,
"loss": 0.4695,
"step": 213
},
{
"epoch": 0.428,
"grad_norm": 1.757129192352295,
"learning_rate": 9.946361664814942e-06,
"loss": 0.6256,
"step": 214
},
{
"epoch": 0.43,
"grad_norm": 2.1104235649108887,
"learning_rate": 9.94464851982042e-06,
"loss": 0.4596,
"step": 215
},
{
"epoch": 0.432,
"grad_norm": 1.8526471853256226,
"learning_rate": 9.942908597485558e-06,
"loss": 0.5644,
"step": 216
},
{
"epoch": 0.434,
"grad_norm": 1.7613459825515747,
"learning_rate": 9.941141907232766e-06,
"loss": 0.4666,
"step": 217
},
{
"epoch": 0.436,
"grad_norm": 1.5490107536315918,
"learning_rate": 9.939348458629406e-06,
"loss": 0.4919,
"step": 218
},
{
"epoch": 0.438,
"grad_norm": 1.9331456422805786,
"learning_rate": 9.937528261387753e-06,
"loss": 0.5037,
"step": 219
},
{
"epoch": 0.44,
"grad_norm": 2.422027826309204,
"learning_rate": 9.93568132536494e-06,
"loss": 0.5772,
"step": 220
},
{
"epoch": 0.442,
"grad_norm": 1.615033745765686,
"learning_rate": 9.933807660562898e-06,
"loss": 0.4155,
"step": 221
},
{
"epoch": 0.444,
"grad_norm": 1.6419297456741333,
"learning_rate": 9.9319072771283e-06,
"loss": 0.4746,
"step": 222
},
{
"epoch": 0.446,
"grad_norm": 2.0711774826049805,
"learning_rate": 9.929980185352525e-06,
"loss": 0.5218,
"step": 223
},
{
"epoch": 0.448,
"grad_norm": 1.5442270040512085,
"learning_rate": 9.928026395671577e-06,
"loss": 0.4077,
"step": 224
},
{
"epoch": 0.45,
"grad_norm": 1.8072466850280762,
"learning_rate": 9.926045918666045e-06,
"loss": 0.5136,
"step": 225
},
{
"epoch": 0.452,
"grad_norm": 2.0246639251708984,
"learning_rate": 9.924038765061042e-06,
"loss": 0.4485,
"step": 226
},
{
"epoch": 0.454,
"grad_norm": 2.249467372894287,
"learning_rate": 9.92200494572614e-06,
"loss": 0.4849,
"step": 227
},
{
"epoch": 0.456,
"grad_norm": 1.6570943593978882,
"learning_rate": 9.919944471675328e-06,
"loss": 0.4854,
"step": 228
},
{
"epoch": 0.458,
"grad_norm": 1.7272253036499023,
"learning_rate": 9.91785735406693e-06,
"loss": 0.4429,
"step": 229
},
{
"epoch": 0.46,
"grad_norm": 1.8478769063949585,
"learning_rate": 9.915743604203563e-06,
"loss": 0.404,
"step": 230
},
{
"epoch": 0.462,
"grad_norm": 1.7986191511154175,
"learning_rate": 9.913603233532067e-06,
"loss": 0.4054,
"step": 231
},
{
"epoch": 0.464,
"grad_norm": 2.0111937522888184,
"learning_rate": 9.911436253643445e-06,
"loss": 0.4901,
"step": 232
},
{
"epoch": 0.466,
"grad_norm": 1.6492829322814941,
"learning_rate": 9.909242676272797e-06,
"loss": 0.4451,
"step": 233
},
{
"epoch": 0.468,
"grad_norm": 1.6549859046936035,
"learning_rate": 9.907022513299264e-06,
"loss": 0.4552,
"step": 234
},
{
"epoch": 0.47,
"grad_norm": 1.8157479763031006,
"learning_rate": 9.904775776745959e-06,
"loss": 0.5326,
"step": 235
},
{
"epoch": 0.472,
"grad_norm": 1.8481768369674683,
"learning_rate": 9.902502478779897e-06,
"loss": 0.4379,
"step": 236
},
{
"epoch": 0.474,
"grad_norm": 1.4958271980285645,
"learning_rate": 9.90020263171194e-06,
"loss": 0.3648,
"step": 237
},
{
"epoch": 0.476,
"grad_norm": 1.71196711063385,
"learning_rate": 9.89787624799672e-06,
"loss": 0.462,
"step": 238
},
{
"epoch": 0.478,
"grad_norm": 2.0301215648651123,
"learning_rate": 9.89552334023258e-06,
"loss": 0.4472,
"step": 239
},
{
"epoch": 0.48,
"grad_norm": 1.8186923265457153,
"learning_rate": 9.893143921161501e-06,
"loss": 0.495,
"step": 240
},
{
"epoch": 0.482,
"grad_norm": 1.9236863851547241,
"learning_rate": 9.890738003669029e-06,
"loss": 0.4073,
"step": 241
},
{
"epoch": 0.484,
"grad_norm": 2.024024486541748,
"learning_rate": 9.888305600784217e-06,
"loss": 0.4675,
"step": 242
},
{
"epoch": 0.486,
"grad_norm": 1.7005952596664429,
"learning_rate": 9.88584672567954e-06,
"loss": 0.383,
"step": 243
},
{
"epoch": 0.488,
"grad_norm": 1.9218895435333252,
"learning_rate": 9.883361391670841e-06,
"loss": 0.4722,
"step": 244
},
{
"epoch": 0.49,
"grad_norm": 1.7836483716964722,
"learning_rate": 9.880849612217238e-06,
"loss": 0.3517,
"step": 245
},
{
"epoch": 0.492,
"grad_norm": 1.9461677074432373,
"learning_rate": 9.878311400921072e-06,
"loss": 0.5054,
"step": 246
},
{
"epoch": 0.494,
"grad_norm": 1.580214500427246,
"learning_rate": 9.875746771527817e-06,
"loss": 0.4537,
"step": 247
},
{
"epoch": 0.496,
"grad_norm": 1.7891846895217896,
"learning_rate": 9.873155737926014e-06,
"loss": 0.527,
"step": 248
},
{
"epoch": 0.498,
"grad_norm": 1.7294811010360718,
"learning_rate": 9.870538314147194e-06,
"loss": 0.3914,
"step": 249
},
{
"epoch": 0.5,
"grad_norm": 1.6117743253707886,
"learning_rate": 9.867894514365802e-06,
"loss": 0.3725,
"step": 250
},
{
"epoch": 0.502,
"grad_norm": 1.5580772161483765,
"learning_rate": 9.86522435289912e-06,
"loss": 0.3674,
"step": 251
},
{
"epoch": 0.504,
"grad_norm": 2.0612754821777344,
"learning_rate": 9.862527844207189e-06,
"loss": 0.4836,
"step": 252
},
{
"epoch": 0.506,
"grad_norm": 1.9967230558395386,
"learning_rate": 9.859805002892733e-06,
"loss": 0.4445,
"step": 253
},
{
"epoch": 0.508,
"grad_norm": 1.7749967575073242,
"learning_rate": 9.857055843701073e-06,
"loss": 0.3683,
"step": 254
},
{
"epoch": 0.51,
"grad_norm": 1.5155335664749146,
"learning_rate": 9.85428038152006e-06,
"loss": 0.2863,
"step": 255
},
{
"epoch": 0.512,
"grad_norm": 2.0063834190368652,
"learning_rate": 9.851478631379982e-06,
"loss": 0.4551,
"step": 256
},
{
"epoch": 0.514,
"grad_norm": 1.7783546447753906,
"learning_rate": 9.84865060845349e-06,
"loss": 0.3491,
"step": 257
},
{
"epoch": 0.516,
"grad_norm": 2.130695343017578,
"learning_rate": 9.845796328055505e-06,
"loss": 0.3395,
"step": 258
},
{
"epoch": 0.518,
"grad_norm": 1.6611360311508179,
"learning_rate": 9.842915805643156e-06,
"loss": 0.3728,
"step": 259
},
{
"epoch": 0.52,
"grad_norm": 1.8808437585830688,
"learning_rate": 9.840009056815674e-06,
"loss": 0.4014,
"step": 260
},
{
"epoch": 0.522,
"grad_norm": 1.5398544073104858,
"learning_rate": 9.83707609731432e-06,
"loss": 0.3664,
"step": 261
},
{
"epoch": 0.524,
"grad_norm": 1.8447811603546143,
"learning_rate": 9.834116943022299e-06,
"loss": 0.4879,
"step": 262
},
{
"epoch": 0.526,
"grad_norm": 1.7809211015701294,
"learning_rate": 9.831131609964664e-06,
"loss": 0.3631,
"step": 263
},
{
"epoch": 0.528,
"grad_norm": 1.5172531604766846,
"learning_rate": 9.828120114308248e-06,
"loss": 0.3715,
"step": 264
},
{
"epoch": 0.53,
"grad_norm": 1.573677659034729,
"learning_rate": 9.825082472361558e-06,
"loss": 0.2771,
"step": 265
},
{
"epoch": 0.532,
"grad_norm": 1.915945291519165,
"learning_rate": 9.822018700574696e-06,
"loss": 0.4118,
"step": 266
},
{
"epoch": 0.534,
"grad_norm": 1.861373782157898,
"learning_rate": 9.818928815539266e-06,
"loss": 0.3969,
"step": 267
},
{
"epoch": 0.536,
"grad_norm": 1.3833045959472656,
"learning_rate": 9.815812833988292e-06,
"loss": 0.2481,
"step": 268
},
{
"epoch": 0.538,
"grad_norm": 1.7370811700820923,
"learning_rate": 9.812670772796113e-06,
"loss": 0.4429,
"step": 269
},
{
"epoch": 0.54,
"grad_norm": 1.5217926502227783,
"learning_rate": 9.809502648978311e-06,
"loss": 0.3402,
"step": 270
},
{
"epoch": 0.542,
"grad_norm": 1.653145432472229,
"learning_rate": 9.806308479691595e-06,
"loss": 0.3477,
"step": 271
},
{
"epoch": 0.544,
"grad_norm": 1.70926034450531,
"learning_rate": 9.803088282233733e-06,
"loss": 0.4009,
"step": 272
},
{
"epoch": 0.546,
"grad_norm": 1.609721064567566,
"learning_rate": 9.799842074043438e-06,
"loss": 0.3397,
"step": 273
},
{
"epoch": 0.548,
"grad_norm": 1.60042142868042,
"learning_rate": 9.796569872700287e-06,
"loss": 0.3678,
"step": 274
},
{
"epoch": 0.55,
"grad_norm": 1.543382167816162,
"learning_rate": 9.793271695924621e-06,
"loss": 0.3365,
"step": 275
},
{
"epoch": 0.552,
"grad_norm": 2.711608409881592,
"learning_rate": 9.789947561577445e-06,
"loss": 0.4371,
"step": 276
},
{
"epoch": 0.554,
"grad_norm": 1.8913193941116333,
"learning_rate": 9.786597487660336e-06,
"loss": 0.4662,
"step": 277
},
{
"epoch": 0.556,
"grad_norm": 1.6751612424850464,
"learning_rate": 9.78322149231535e-06,
"loss": 0.4259,
"step": 278
},
{
"epoch": 0.558,
"grad_norm": 1.5974494218826294,
"learning_rate": 9.779819593824909e-06,
"loss": 0.4305,
"step": 279
},
{
"epoch": 0.56,
"grad_norm": 1.4549661874771118,
"learning_rate": 9.776391810611719e-06,
"loss": 0.3568,
"step": 280
},
{
"epoch": 0.562,
"grad_norm": 1.9823698997497559,
"learning_rate": 9.77293816123866e-06,
"loss": 0.4721,
"step": 281
},
{
"epoch": 0.564,
"grad_norm": 2.018462657928467,
"learning_rate": 9.769458664408689e-06,
"loss": 0.4735,
"step": 282
},
{
"epoch": 0.566,
"grad_norm": 1.8653925657272339,
"learning_rate": 9.765953338964736e-06,
"loss": 0.3503,
"step": 283
},
{
"epoch": 0.568,
"grad_norm": 1.7118836641311646,
"learning_rate": 9.762422203889604e-06,
"loss": 0.3659,
"step": 284
},
{
"epoch": 0.57,
"grad_norm": 1.6338058710098267,
"learning_rate": 9.75886527830587e-06,
"loss": 0.2939,
"step": 285
},
{
"epoch": 0.572,
"grad_norm": 1.5765326023101807,
"learning_rate": 9.755282581475769e-06,
"loss": 0.2898,
"step": 286
},
{
"epoch": 0.574,
"grad_norm": 1.7563010454177856,
"learning_rate": 9.751674132801106e-06,
"loss": 0.305,
"step": 287
},
{
"epoch": 0.576,
"grad_norm": 1.5117406845092773,
"learning_rate": 9.748039951823141e-06,
"loss": 0.2965,
"step": 288
},
{
"epoch": 0.578,
"grad_norm": 1.775891900062561,
"learning_rate": 9.744380058222483e-06,
"loss": 0.3359,
"step": 289
},
{
"epoch": 0.58,
"grad_norm": 1.7677249908447266,
"learning_rate": 9.740694471818988e-06,
"loss": 0.3332,
"step": 290
},
{
"epoch": 0.582,
"grad_norm": 1.7065130472183228,
"learning_rate": 9.736983212571646e-06,
"loss": 0.3361,
"step": 291
},
{
"epoch": 0.584,
"grad_norm": 2.0350592136383057,
"learning_rate": 9.733246300578482e-06,
"loss": 0.3061,
"step": 292
},
{
"epoch": 0.586,
"grad_norm": 1.881553053855896,
"learning_rate": 9.729483756076436e-06,
"loss": 0.4417,
"step": 293
},
{
"epoch": 0.588,
"grad_norm": 1.845169186592102,
"learning_rate": 9.72569559944126e-06,
"loss": 0.401,
"step": 294
},
{
"epoch": 0.59,
"grad_norm": 1.8087979555130005,
"learning_rate": 9.721881851187406e-06,
"loss": 0.4629,
"step": 295
},
{
"epoch": 0.592,
"grad_norm": 1.7599945068359375,
"learning_rate": 9.718042531967918e-06,
"loss": 0.2951,
"step": 296
},
{
"epoch": 0.594,
"grad_norm": 1.8704352378845215,
"learning_rate": 9.714177662574316e-06,
"loss": 0.4033,
"step": 297
},
{
"epoch": 0.596,
"grad_norm": 1.8199055194854736,
"learning_rate": 9.710287263936485e-06,
"loss": 0.4836,
"step": 298
},
{
"epoch": 0.598,
"grad_norm": 1.7064282894134521,
"learning_rate": 9.70637135712256e-06,
"loss": 0.3298,
"step": 299
},
{
"epoch": 0.6,
"grad_norm": 2.22114896774292,
"learning_rate": 9.702429963338812e-06,
"loss": 0.399,
"step": 300
},
{
"epoch": 0.602,
"grad_norm": 1.8880242109298706,
"learning_rate": 9.698463103929542e-06,
"loss": 0.3465,
"step": 301
},
{
"epoch": 0.604,
"grad_norm": 1.886757493019104,
"learning_rate": 9.694470800376951e-06,
"loss": 0.3468,
"step": 302
},
{
"epoch": 0.606,
"grad_norm": 1.476881742477417,
"learning_rate": 9.690453074301035e-06,
"loss": 0.2446,
"step": 303
},
{
"epoch": 0.608,
"grad_norm": 1.8497904539108276,
"learning_rate": 9.68640994745946e-06,
"loss": 0.4195,
"step": 304
},
{
"epoch": 0.61,
"grad_norm": 1.8314309120178223,
"learning_rate": 9.682341441747446e-06,
"loss": 0.3153,
"step": 305
},
{
"epoch": 0.612,
"grad_norm": 1.9367709159851074,
"learning_rate": 9.678247579197658e-06,
"loss": 0.3185,
"step": 306
},
{
"epoch": 0.614,
"grad_norm": 1.4805539846420288,
"learning_rate": 9.674128381980073e-06,
"loss": 0.2507,
"step": 307
},
{
"epoch": 0.616,
"grad_norm": 1.6094000339508057,
"learning_rate": 9.669983872401868e-06,
"loss": 0.2187,
"step": 308
},
{
"epoch": 0.618,
"grad_norm": 1.5348446369171143,
"learning_rate": 9.665814072907293e-06,
"loss": 0.2326,
"step": 309
},
{
"epoch": 0.62,
"grad_norm": 1.8826684951782227,
"learning_rate": 9.661619006077562e-06,
"loss": 0.3359,
"step": 310
},
{
"epoch": 0.622,
"grad_norm": 1.7773888111114502,
"learning_rate": 9.657398694630713e-06,
"loss": 0.2697,
"step": 311
},
{
"epoch": 0.624,
"grad_norm": 1.7263822555541992,
"learning_rate": 9.653153161421497e-06,
"loss": 0.2422,
"step": 312
},
{
"epoch": 0.626,
"grad_norm": 2.116281747817993,
"learning_rate": 9.648882429441258e-06,
"loss": 0.3718,
"step": 313
},
{
"epoch": 0.628,
"grad_norm": 1.7101234197616577,
"learning_rate": 9.644586521817792e-06,
"loss": 0.3674,
"step": 314
},
{
"epoch": 0.63,
"grad_norm": 2.12149977684021,
"learning_rate": 9.640265461815235e-06,
"loss": 0.5202,
"step": 315
},
{
"epoch": 0.632,
"grad_norm": 2.06618070602417,
"learning_rate": 9.635919272833938e-06,
"loss": 0.3975,
"step": 316
},
{
"epoch": 0.634,
"grad_norm": 1.9198070764541626,
"learning_rate": 9.63154797841033e-06,
"loss": 0.343,
"step": 317
},
{
"epoch": 0.636,
"grad_norm": 1.5635013580322266,
"learning_rate": 9.627151602216801e-06,
"loss": 0.3303,
"step": 318
},
{
"epoch": 0.638,
"grad_norm": 1.5643494129180908,
"learning_rate": 9.622730168061568e-06,
"loss": 0.2927,
"step": 319
},
{
"epoch": 0.64,
"grad_norm": 1.8530386686325073,
"learning_rate": 9.618283699888543e-06,
"loss": 0.2539,
"step": 320
},
{
"epoch": 0.642,
"grad_norm": 2.2413387298583984,
"learning_rate": 9.613812221777212e-06,
"loss": 0.3513,
"step": 321
},
{
"epoch": 0.644,
"grad_norm": 2.0326476097106934,
"learning_rate": 9.609315757942504e-06,
"loss": 0.2588,
"step": 322
},
{
"epoch": 0.646,
"grad_norm": 2.1528379917144775,
"learning_rate": 9.604794332734647e-06,
"loss": 0.2732,
"step": 323
},
{
"epoch": 0.648,
"grad_norm": 1.8883870840072632,
"learning_rate": 9.600247970639053e-06,
"loss": 0.3172,
"step": 324
},
{
"epoch": 0.65,
"grad_norm": 1.8538719415664673,
"learning_rate": 9.595676696276173e-06,
"loss": 0.3115,
"step": 325
},
{
"epoch": 0.652,
"grad_norm": 1.676763653755188,
"learning_rate": 9.591080534401371e-06,
"loss": 0.3002,
"step": 326
},
{
"epoch": 0.654,
"grad_norm": 1.8477318286895752,
"learning_rate": 9.586459509904786e-06,
"loss": 0.3321,
"step": 327
},
{
"epoch": 0.656,
"grad_norm": 1.7304288148880005,
"learning_rate": 9.581813647811199e-06,
"loss": 0.2766,
"step": 328
},
{
"epoch": 0.658,
"grad_norm": 1.8328994512557983,
"learning_rate": 9.577142973279896e-06,
"loss": 0.3712,
"step": 329
},
{
"epoch": 0.66,
"grad_norm": 1.7225216627120972,
"learning_rate": 9.572447511604536e-06,
"loss": 0.3262,
"step": 330
},
{
"epoch": 0.662,
"grad_norm": 1.5904541015625,
"learning_rate": 9.567727288213005e-06,
"loss": 0.2373,
"step": 331
},
{
"epoch": 0.664,
"grad_norm": 1.7042733430862427,
"learning_rate": 9.56298232866729e-06,
"loss": 0.3051,
"step": 332
},
{
"epoch": 0.666,
"grad_norm": 1.7410995960235596,
"learning_rate": 9.55821265866333e-06,
"loss": 0.3074,
"step": 333
},
{
"epoch": 0.668,
"grad_norm": 1.7952817678451538,
"learning_rate": 9.553418304030886e-06,
"loss": 0.3572,
"step": 334
},
{
"epoch": 0.67,
"grad_norm": 1.4647603034973145,
"learning_rate": 9.548599290733393e-06,
"loss": 0.2161,
"step": 335
},
{
"epoch": 0.672,
"grad_norm": 2.135939359664917,
"learning_rate": 9.543755644867823e-06,
"loss": 0.4055,
"step": 336
},
{
"epoch": 0.674,
"grad_norm": 1.717570424079895,
"learning_rate": 9.538887392664544e-06,
"loss": 0.2576,
"step": 337
},
{
"epoch": 0.676,
"grad_norm": 1.7470098733901978,
"learning_rate": 9.53399456048718e-06,
"loss": 0.3408,
"step": 338
},
{
"epoch": 0.678,
"grad_norm": 1.8334686756134033,
"learning_rate": 9.529077174832466e-06,
"loss": 0.3347,
"step": 339
},
{
"epoch": 0.68,
"grad_norm": 2.086667537689209,
"learning_rate": 9.524135262330098e-06,
"loss": 0.3498,
"step": 340
},
{
"epoch": 0.682,
"grad_norm": 2.0446317195892334,
"learning_rate": 9.519168849742603e-06,
"loss": 0.4212,
"step": 341
},
{
"epoch": 0.684,
"grad_norm": 1.5848861932754517,
"learning_rate": 9.514177963965181e-06,
"loss": 0.2083,
"step": 342
},
{
"epoch": 0.686,
"grad_norm": 1.6067830324172974,
"learning_rate": 9.50916263202557e-06,
"loss": 0.3383,
"step": 343
},
{
"epoch": 0.688,
"grad_norm": 1.7842918634414673,
"learning_rate": 9.504122881083886e-06,
"loss": 0.376,
"step": 344
},
{
"epoch": 0.69,
"grad_norm": 1.6721829175949097,
"learning_rate": 9.499058738432492e-06,
"loss": 0.2289,
"step": 345
},
{
"epoch": 0.692,
"grad_norm": 1.4510334730148315,
"learning_rate": 9.493970231495836e-06,
"loss": 0.2032,
"step": 346
},
{
"epoch": 0.694,
"grad_norm": 1.9582585096359253,
"learning_rate": 9.488857387830315e-06,
"loss": 0.2712,
"step": 347
},
{
"epoch": 0.696,
"grad_norm": 1.8526630401611328,
"learning_rate": 9.483720235124113e-06,
"loss": 0.4167,
"step": 348
},
{
"epoch": 0.698,
"grad_norm": 1.9199018478393555,
"learning_rate": 9.478558801197065e-06,
"loss": 0.2576,
"step": 349
},
{
"epoch": 0.7,
"grad_norm": 1.4618760347366333,
"learning_rate": 9.473373114000493e-06,
"loss": 0.1777,
"step": 350
},
{
"epoch": 0.702,
"grad_norm": 2.0231404304504395,
"learning_rate": 9.468163201617063e-06,
"loss": 0.254,
"step": 351
},
{
"epoch": 0.704,
"grad_norm": 1.3274458646774292,
"learning_rate": 9.46292909226063e-06,
"loss": 0.1577,
"step": 352
},
{
"epoch": 0.706,
"grad_norm": 2.0219509601593018,
"learning_rate": 9.457670814276083e-06,
"loss": 0.278,
"step": 353
},
{
"epoch": 0.708,
"grad_norm": 1.5874745845794678,
"learning_rate": 9.452388396139202e-06,
"loss": 0.2046,
"step": 354
},
{
"epoch": 0.71,
"grad_norm": 1.2424635887145996,
"learning_rate": 9.44708186645649e-06,
"loss": 0.1533,
"step": 355
},
{
"epoch": 0.712,
"grad_norm": 1.6024327278137207,
"learning_rate": 9.441751253965022e-06,
"loss": 0.2354,
"step": 356
},
{
"epoch": 0.714,
"grad_norm": 1.4302263259887695,
"learning_rate": 9.436396587532297e-06,
"loss": 0.181,
"step": 357
},
{
"epoch": 0.716,
"grad_norm": 1.3045355081558228,
"learning_rate": 9.431017896156074e-06,
"loss": 0.1141,
"step": 358
},
{
"epoch": 0.718,
"grad_norm": 1.9596627950668335,
"learning_rate": 9.425615208964217e-06,
"loss": 0.248,
"step": 359
},
{
"epoch": 0.72,
"grad_norm": 1.5831594467163086,
"learning_rate": 9.420188555214537e-06,
"loss": 0.2521,
"step": 360
},
{
"epoch": 0.722,
"grad_norm": 1.524751901626587,
"learning_rate": 9.414737964294636e-06,
"loss": 0.3191,
"step": 361
},
{
"epoch": 0.724,
"grad_norm": 1.6688232421875,
"learning_rate": 9.40926346572174e-06,
"loss": 0.2146,
"step": 362
},
{
"epoch": 0.726,
"grad_norm": 1.8539663553237915,
"learning_rate": 9.403765089142554e-06,
"loss": 0.3524,
"step": 363
},
{
"epoch": 0.728,
"grad_norm": 1.5958819389343262,
"learning_rate": 9.398242864333084e-06,
"loss": 0.2913,
"step": 364
},
{
"epoch": 0.73,
"grad_norm": 1.3911813497543335,
"learning_rate": 9.392696821198488e-06,
"loss": 0.2529,
"step": 365
},
{
"epoch": 0.732,
"grad_norm": 1.3639315366744995,
"learning_rate": 9.38712698977291e-06,
"loss": 0.1505,
"step": 366
},
{
"epoch": 0.734,
"grad_norm": 1.8511894941329956,
"learning_rate": 9.381533400219319e-06,
"loss": 0.2977,
"step": 367
},
{
"epoch": 0.736,
"grad_norm": 1.9356679916381836,
"learning_rate": 9.375916082829341e-06,
"loss": 0.2784,
"step": 368
},
{
"epoch": 0.738,
"grad_norm": 1.784163236618042,
"learning_rate": 9.370275068023097e-06,
"loss": 0.3302,
"step": 369
},
{
"epoch": 0.74,
"grad_norm": 1.436231255531311,
"learning_rate": 9.364610386349048e-06,
"loss": 0.1248,
"step": 370
},
{
"epoch": 0.742,
"grad_norm": 2.2542660236358643,
"learning_rate": 9.358922068483813e-06,
"loss": 0.3149,
"step": 371
},
{
"epoch": 0.744,
"grad_norm": 1.5600509643554688,
"learning_rate": 9.35321014523201e-06,
"loss": 0.1924,
"step": 372
},
{
"epoch": 0.746,
"grad_norm": 1.7589770555496216,
"learning_rate": 9.347474647526095e-06,
"loss": 0.2127,
"step": 373
},
{
"epoch": 0.748,
"grad_norm": 1.6427364349365234,
"learning_rate": 9.34171560642619e-06,
"loss": 0.1524,
"step": 374
},
{
"epoch": 0.75,
"grad_norm": 1.5470690727233887,
"learning_rate": 9.335933053119906e-06,
"loss": 0.2051,
"step": 375
},
{
"epoch": 0.752,
"grad_norm": 1.5658862590789795,
"learning_rate": 9.330127018922195e-06,
"loss": 0.2415,
"step": 376
},
{
"epoch": 0.754,
"grad_norm": 1.5468968152999878,
"learning_rate": 9.324297535275156e-06,
"loss": 0.1993,
"step": 377
},
{
"epoch": 0.756,
"grad_norm": 1.397178053855896,
"learning_rate": 9.318444633747884e-06,
"loss": 0.1774,
"step": 378
},
{
"epoch": 0.758,
"grad_norm": 1.6021531820297241,
"learning_rate": 9.312568346036288e-06,
"loss": 0.2997,
"step": 379
},
{
"epoch": 0.76,
"grad_norm": 1.580793023109436,
"learning_rate": 9.306668703962927e-06,
"loss": 0.196,
"step": 380
},
{
"epoch": 0.762,
"grad_norm": 1.7833058834075928,
"learning_rate": 9.30074573947683e-06,
"loss": 0.2005,
"step": 381
},
{
"epoch": 0.764,
"grad_norm": 1.6787939071655273,
"learning_rate": 9.294799484653323e-06,
"loss": 0.1089,
"step": 382
},
{
"epoch": 0.766,
"grad_norm": 1.4611705541610718,
"learning_rate": 9.288829971693869e-06,
"loss": 0.2504,
"step": 383
},
{
"epoch": 0.768,
"grad_norm": 1.9019439220428467,
"learning_rate": 9.282837232925876e-06,
"loss": 0.1973,
"step": 384
},
{
"epoch": 0.77,
"grad_norm": 1.9683581590652466,
"learning_rate": 9.276821300802535e-06,
"loss": 0.241,
"step": 385
},
{
"epoch": 0.772,
"grad_norm": 1.6254339218139648,
"learning_rate": 9.27078220790263e-06,
"loss": 0.2799,
"step": 386
},
{
"epoch": 0.774,
"grad_norm": 1.6056698560714722,
"learning_rate": 9.264719986930376e-06,
"loss": 0.1804,
"step": 387
},
{
"epoch": 0.776,
"grad_norm": 1.4303953647613525,
"learning_rate": 9.25863467071524e-06,
"loss": 0.2681,
"step": 388
},
{
"epoch": 0.778,
"grad_norm": 1.707230806350708,
"learning_rate": 9.25252629221175e-06,
"loss": 0.2711,
"step": 389
},
{
"epoch": 0.78,
"grad_norm": 1.709281086921692,
"learning_rate": 9.246394884499334e-06,
"loss": 0.256,
"step": 390
},
{
"epoch": 0.782,
"grad_norm": 1.6412250995635986,
"learning_rate": 9.24024048078213e-06,
"loss": 0.2271,
"step": 391
},
{
"epoch": 0.784,
"grad_norm": 1.3375557661056519,
"learning_rate": 9.234063114388809e-06,
"loss": 0.2052,
"step": 392
},
{
"epoch": 0.786,
"grad_norm": 1.449644923210144,
"learning_rate": 9.227862818772392e-06,
"loss": 0.2519,
"step": 393
},
{
"epoch": 0.788,
"grad_norm": 1.5053601264953613,
"learning_rate": 9.221639627510076e-06,
"loss": 0.2186,
"step": 394
},
{
"epoch": 0.79,
"grad_norm": 1.5800226926803589,
"learning_rate": 9.215393574303043e-06,
"loss": 0.2031,
"step": 395
},
{
"epoch": 0.792,
"grad_norm": 1.5477964878082275,
"learning_rate": 9.209124692976287e-06,
"loss": 0.1865,
"step": 396
},
{
"epoch": 0.794,
"grad_norm": 1.7126036882400513,
"learning_rate": 9.202833017478421e-06,
"loss": 0.2517,
"step": 397
},
{
"epoch": 0.796,
"grad_norm": 1.5103005170822144,
"learning_rate": 9.196518581881502e-06,
"loss": 0.2488,
"step": 398
},
{
"epoch": 0.798,
"grad_norm": 1.8108140230178833,
"learning_rate": 9.190181420380838e-06,
"loss": 0.2877,
"step": 399
},
{
"epoch": 0.8,
"grad_norm": 1.5517218112945557,
"learning_rate": 9.18382156729481e-06,
"loss": 0.1755,
"step": 400
},
{
"epoch": 0.802,
"grad_norm": 1.5021002292633057,
"learning_rate": 9.177439057064684e-06,
"loss": 0.1731,
"step": 401
},
{
"epoch": 0.804,
"grad_norm": 1.3245618343353271,
"learning_rate": 9.171033924254421e-06,
"loss": 0.1414,
"step": 402
},
{
"epoch": 0.806,
"grad_norm": 1.6147440671920776,
"learning_rate": 9.164606203550498e-06,
"loss": 0.1876,
"step": 403
},
{
"epoch": 0.808,
"grad_norm": 1.5092687606811523,
"learning_rate": 9.15815592976171e-06,
"loss": 0.138,
"step": 404
},
{
"epoch": 0.81,
"grad_norm": 1.7393368482589722,
"learning_rate": 9.151683137818989e-06,
"loss": 0.2142,
"step": 405
},
{
"epoch": 0.812,
"grad_norm": 1.7157940864562988,
"learning_rate": 9.145187862775208e-06,
"loss": 0.1615,
"step": 406
},
{
"epoch": 0.814,
"grad_norm": 1.4408173561096191,
"learning_rate": 9.138670139805004e-06,
"loss": 0.139,
"step": 407
},
{
"epoch": 0.816,
"grad_norm": 1.648302435874939,
"learning_rate": 9.132130004204569e-06,
"loss": 0.1163,
"step": 408
},
{
"epoch": 0.818,
"grad_norm": 1.5025349855422974,
"learning_rate": 9.125567491391476e-06,
"loss": 0.1426,
"step": 409
},
{
"epoch": 0.82,
"grad_norm": 1.495948076248169,
"learning_rate": 9.118982636904476e-06,
"loss": 0.1804,
"step": 410
},
{
"epoch": 0.822,
"grad_norm": 1.4459843635559082,
"learning_rate": 9.112375476403313e-06,
"loss": 0.1603,
"step": 411
},
{
"epoch": 0.824,
"grad_norm": 1.8957642316818237,
"learning_rate": 9.10574604566852e-06,
"loss": 0.2707,
"step": 412
},
{
"epoch": 0.826,
"grad_norm": 2.036060094833374,
"learning_rate": 9.099094380601244e-06,
"loss": 0.3142,
"step": 413
},
{
"epoch": 0.828,
"grad_norm": 1.4369035959243774,
"learning_rate": 9.09242051722303e-06,
"loss": 0.2311,
"step": 414
},
{
"epoch": 0.83,
"grad_norm": 1.3601306676864624,
"learning_rate": 9.085724491675642e-06,
"loss": 0.1153,
"step": 415
},
{
"epoch": 0.832,
"grad_norm": 1.539316177368164,
"learning_rate": 9.079006340220862e-06,
"loss": 0.1519,
"step": 416
},
{
"epoch": 0.834,
"grad_norm": 1.6601051092147827,
"learning_rate": 9.072266099240286e-06,
"loss": 0.1848,
"step": 417
},
{
"epoch": 0.836,
"grad_norm": 1.8635293245315552,
"learning_rate": 9.065503805235139e-06,
"loss": 0.1731,
"step": 418
},
{
"epoch": 0.838,
"grad_norm": 2.097959518432617,
"learning_rate": 9.058719494826076e-06,
"loss": 0.404,
"step": 419
},
{
"epoch": 0.84,
"grad_norm": 2.099665641784668,
"learning_rate": 9.051913204752972e-06,
"loss": 0.2034,
"step": 420
},
{
"epoch": 0.842,
"grad_norm": 1.8327354192733765,
"learning_rate": 9.045084971874738e-06,
"loss": 0.2435,
"step": 421
},
{
"epoch": 0.844,
"grad_norm": 1.6721452474594116,
"learning_rate": 9.03823483316911e-06,
"loss": 0.2248,
"step": 422
},
{
"epoch": 0.846,
"grad_norm": 1.5078072547912598,
"learning_rate": 9.031362825732456e-06,
"loss": 0.1608,
"step": 423
},
{
"epoch": 0.848,
"grad_norm": 1.7546452283859253,
"learning_rate": 9.02446898677957e-06,
"loss": 0.1649,
"step": 424
},
{
"epoch": 0.85,
"grad_norm": 1.5506815910339355,
"learning_rate": 9.017553353643479e-06,
"loss": 0.245,
"step": 425
},
{
"epoch": 0.852,
"grad_norm": 1.6377143859863281,
"learning_rate": 9.01061596377522e-06,
"loss": 0.1609,
"step": 426
},
{
"epoch": 0.854,
"grad_norm": 1.3670685291290283,
"learning_rate": 9.003656854743667e-06,
"loss": 0.1507,
"step": 427
},
{
"epoch": 0.856,
"grad_norm": 1.7809122800827026,
"learning_rate": 8.996676064235308e-06,
"loss": 0.2064,
"step": 428
},
{
"epoch": 0.858,
"grad_norm": 1.5835070610046387,
"learning_rate": 8.989673630054044e-06,
"loss": 0.1358,
"step": 429
},
{
"epoch": 0.86,
"grad_norm": 1.5268651247024536,
"learning_rate": 8.982649590120982e-06,
"loss": 0.1558,
"step": 430
},
{
"epoch": 0.862,
"grad_norm": 2.022331714630127,
"learning_rate": 8.97560398247424e-06,
"loss": 0.1557,
"step": 431
},
{
"epoch": 0.864,
"grad_norm": 1.5841243267059326,
"learning_rate": 8.96853684526873e-06,
"loss": 0.0923,
"step": 432
},
{
"epoch": 0.866,
"grad_norm": 1.6304723024368286,
"learning_rate": 8.961448216775955e-06,
"loss": 0.1563,
"step": 433
},
{
"epoch": 0.868,
"grad_norm": 1.7919025421142578,
"learning_rate": 8.954338135383804e-06,
"loss": 0.1692,
"step": 434
},
{
"epoch": 0.87,
"grad_norm": 1.8628870248794556,
"learning_rate": 8.947206639596346e-06,
"loss": 0.2082,
"step": 435
},
{
"epoch": 0.872,
"grad_norm": 1.5078816413879395,
"learning_rate": 8.94005376803361e-06,
"loss": 0.2606,
"step": 436
},
{
"epoch": 0.874,
"grad_norm": 1.5765806436538696,
"learning_rate": 8.932879559431392e-06,
"loss": 0.1426,
"step": 437
},
{
"epoch": 0.876,
"grad_norm": 1.343929648399353,
"learning_rate": 8.925684052641027e-06,
"loss": 0.1908,
"step": 438
},
{
"epoch": 0.878,
"grad_norm": 1.702836513519287,
"learning_rate": 8.9184672866292e-06,
"loss": 0.2557,
"step": 439
},
{
"epoch": 0.88,
"grad_norm": 1.7433876991271973,
"learning_rate": 8.911229300477716e-06,
"loss": 0.2205,
"step": 440
},
{
"epoch": 0.882,
"grad_norm": 1.3122010231018066,
"learning_rate": 8.903970133383297e-06,
"loss": 0.1327,
"step": 441
},
{
"epoch": 0.884,
"grad_norm": 1.5010037422180176,
"learning_rate": 8.896689824657371e-06,
"loss": 0.1909,
"step": 442
},
{
"epoch": 0.886,
"grad_norm": 1.507695198059082,
"learning_rate": 8.889388413725857e-06,
"loss": 0.1751,
"step": 443
},
{
"epoch": 0.888,
"grad_norm": 1.7989376783370972,
"learning_rate": 8.882065940128946e-06,
"loss": 0.2674,
"step": 444
},
{
"epoch": 0.89,
"grad_norm": 1.7190501689910889,
"learning_rate": 8.874722443520898e-06,
"loss": 0.2409,
"step": 445
},
{
"epoch": 0.892,
"grad_norm": 1.787239909172058,
"learning_rate": 8.867357963669821e-06,
"loss": 0.1394,
"step": 446
},
{
"epoch": 0.894,
"grad_norm": 1.6416536569595337,
"learning_rate": 8.859972540457451e-06,
"loss": 0.1698,
"step": 447
},
{
"epoch": 0.896,
"grad_norm": 1.502375841140747,
"learning_rate": 8.852566213878947e-06,
"loss": 0.1149,
"step": 448
},
{
"epoch": 0.898,
"grad_norm": 1.5897934436798096,
"learning_rate": 8.845139024042664e-06,
"loss": 0.1707,
"step": 449
},
{
"epoch": 0.9,
"grad_norm": 1.2941950559616089,
"learning_rate": 8.837691011169944e-06,
"loss": 0.104,
"step": 450
},
{
"epoch": 0.902,
"grad_norm": 1.708298683166504,
"learning_rate": 8.83022221559489e-06,
"loss": 0.2151,
"step": 451
},
{
"epoch": 0.904,
"grad_norm": 2.3875808715820312,
"learning_rate": 8.822732677764158e-06,
"loss": 0.2799,
"step": 452
},
{
"epoch": 0.906,
"grad_norm": 1.5689988136291504,
"learning_rate": 8.815222438236726e-06,
"loss": 0.2035,
"step": 453
},
{
"epoch": 0.908,
"grad_norm": 1.1103384494781494,
"learning_rate": 8.807691537683685e-06,
"loss": 0.0694,
"step": 454
},
{
"epoch": 0.91,
"grad_norm": 1.6364744901657104,
"learning_rate": 8.800140016888009e-06,
"loss": 0.1829,
"step": 455
},
{
"epoch": 0.912,
"grad_norm": 1.566340446472168,
"learning_rate": 8.792567916744346e-06,
"loss": 0.2079,
"step": 456
},
{
"epoch": 0.914,
"grad_norm": 1.5354089736938477,
"learning_rate": 8.784975278258783e-06,
"loss": 0.138,
"step": 457
},
{
"epoch": 0.916,
"grad_norm": 1.6658852100372314,
"learning_rate": 8.777362142548636e-06,
"loss": 0.1845,
"step": 458
},
{
"epoch": 0.918,
"grad_norm": 1.4948214292526245,
"learning_rate": 8.769728550842217e-06,
"loss": 0.1475,
"step": 459
},
{
"epoch": 0.92,
"grad_norm": 1.6014717817306519,
"learning_rate": 8.762074544478622e-06,
"loss": 0.1891,
"step": 460
},
{
"epoch": 0.922,
"grad_norm": 1.6478428840637207,
"learning_rate": 8.754400164907496e-06,
"loss": 0.1808,
"step": 461
},
{
"epoch": 0.924,
"grad_norm": 2.035034418106079,
"learning_rate": 8.746705453688815e-06,
"loss": 0.1773,
"step": 462
},
{
"epoch": 0.926,
"grad_norm": 1.97378408908844,
"learning_rate": 8.73899045249266e-06,
"loss": 0.2585,
"step": 463
},
{
"epoch": 0.928,
"grad_norm": 1.7488819360733032,
"learning_rate": 8.73125520309899e-06,
"loss": 0.1975,
"step": 464
},
{
"epoch": 0.93,
"grad_norm": 1.4043940305709839,
"learning_rate": 8.723499747397415e-06,
"loss": 0.1401,
"step": 465
},
{
"epoch": 0.932,
"grad_norm": 1.358202576637268,
"learning_rate": 8.715724127386971e-06,
"loss": 0.1652,
"step": 466
},
{
"epoch": 0.934,
"grad_norm": 1.294463038444519,
"learning_rate": 8.707928385175898e-06,
"loss": 0.1187,
"step": 467
},
{
"epoch": 0.936,
"grad_norm": 1.3944119215011597,
"learning_rate": 8.700112562981398e-06,
"loss": 0.1893,
"step": 468
},
{
"epoch": 0.938,
"grad_norm": 1.4225037097930908,
"learning_rate": 8.692276703129421e-06,
"loss": 0.1225,
"step": 469
},
{
"epoch": 0.94,
"grad_norm": 1.2248589992523193,
"learning_rate": 8.68442084805442e-06,
"loss": 0.1162,
"step": 470
},
{
"epoch": 0.942,
"grad_norm": 1.177075982093811,
"learning_rate": 8.676545040299145e-06,
"loss": 0.1072,
"step": 471
},
{
"epoch": 0.944,
"grad_norm": 2.0721774101257324,
"learning_rate": 8.668649322514382e-06,
"loss": 0.1961,
"step": 472
},
{
"epoch": 0.946,
"grad_norm": 1.694319725036621,
"learning_rate": 8.660733737458751e-06,
"loss": 0.1608,
"step": 473
},
{
"epoch": 0.948,
"grad_norm": 1.47614586353302,
"learning_rate": 8.652798327998458e-06,
"loss": 0.1236,
"step": 474
},
{
"epoch": 0.95,
"grad_norm": 1.8103671073913574,
"learning_rate": 8.644843137107058e-06,
"loss": 0.2321,
"step": 475
},
{
"epoch": 0.952,
"grad_norm": 1.6554206609725952,
"learning_rate": 8.636868207865244e-06,
"loss": 0.2501,
"step": 476
},
{
"epoch": 0.954,
"grad_norm": 2.072645664215088,
"learning_rate": 8.628873583460593e-06,
"loss": 0.2904,
"step": 477
},
{
"epoch": 0.956,
"grad_norm": 1.444131851196289,
"learning_rate": 8.620859307187339e-06,
"loss": 0.1377,
"step": 478
},
{
"epoch": 0.958,
"grad_norm": 1.2418184280395508,
"learning_rate": 8.61282542244614e-06,
"loss": 0.0772,
"step": 479
},
{
"epoch": 0.96,
"grad_norm": 1.3381738662719727,
"learning_rate": 8.604771972743848e-06,
"loss": 0.0934,
"step": 480
},
{
"epoch": 0.962,
"grad_norm": 1.494141936302185,
"learning_rate": 8.596699001693257e-06,
"loss": 0.1627,
"step": 481
},
{
"epoch": 0.964,
"grad_norm": 1.06540048122406,
"learning_rate": 8.588606553012884e-06,
"loss": 0.058,
"step": 482
},
{
"epoch": 0.966,
"grad_norm": 1.520519733428955,
"learning_rate": 8.580494670526725e-06,
"loss": 0.1542,
"step": 483
},
{
"epoch": 0.968,
"grad_norm": 1.3340709209442139,
"learning_rate": 8.572363398164017e-06,
"loss": 0.1246,
"step": 484
},
{
"epoch": 0.97,
"grad_norm": 1.5186916589736938,
"learning_rate": 8.564212779959003e-06,
"loss": 0.1466,
"step": 485
},
{
"epoch": 0.972,
"grad_norm": 1.2340925931930542,
"learning_rate": 8.556042860050686e-06,
"loss": 0.0813,
"step": 486
},
{
"epoch": 0.974,
"grad_norm": 1.3246345520019531,
"learning_rate": 8.547853682682605e-06,
"loss": 0.1439,
"step": 487
},
{
"epoch": 0.976,
"grad_norm": 1.258293867111206,
"learning_rate": 8.539645292202579e-06,
"loss": 0.1046,
"step": 488
},
{
"epoch": 0.978,
"grad_norm": 1.4723703861236572,
"learning_rate": 8.531417733062476e-06,
"loss": 0.1719,
"step": 489
},
{
"epoch": 0.98,
"grad_norm": 1.373752236366272,
"learning_rate": 8.523171049817974e-06,
"loss": 0.1234,
"step": 490
},
{
"epoch": 0.982,
"grad_norm": 1.7229127883911133,
"learning_rate": 8.51490528712831e-06,
"loss": 0.1663,
"step": 491
},
{
"epoch": 0.984,
"grad_norm": 1.5550212860107422,
"learning_rate": 8.506620489756045e-06,
"loss": 0.1655,
"step": 492
},
{
"epoch": 0.986,
"grad_norm": 1.3695895671844482,
"learning_rate": 8.498316702566828e-06,
"loss": 0.1345,
"step": 493
},
{
"epoch": 0.988,
"grad_norm": 1.432288646697998,
"learning_rate": 8.489993970529137e-06,
"loss": 0.111,
"step": 494
},
{
"epoch": 0.99,
"grad_norm": 1.3665354251861572,
"learning_rate": 8.481652338714048e-06,
"loss": 0.1865,
"step": 495
},
{
"epoch": 0.992,
"grad_norm": 1.4727295637130737,
"learning_rate": 8.473291852294986e-06,
"loss": 0.196,
"step": 496
},
{
"epoch": 0.994,
"grad_norm": 1.5306068658828735,
"learning_rate": 8.464912556547486e-06,
"loss": 0.196,
"step": 497
},
{
"epoch": 0.996,
"grad_norm": 1.512803554534912,
"learning_rate": 8.456514496848938e-06,
"loss": 0.0902,
"step": 498
},
{
"epoch": 0.998,
"grad_norm": 1.4982142448425293,
"learning_rate": 8.44809771867835e-06,
"loss": 0.1175,
"step": 499
},
{
"epoch": 1.0,
"grad_norm": 1.9177815914154053,
"learning_rate": 8.439662267616093e-06,
"loss": 0.2034,
"step": 500
},
{
"epoch": 1.002,
"grad_norm": 1.4074664115905762,
"learning_rate": 8.43120818934367e-06,
"loss": 0.0809,
"step": 501
},
{
"epoch": 1.004,
"grad_norm": 1.3982102870941162,
"learning_rate": 8.422735529643445e-06,
"loss": 0.0911,
"step": 502
},
{
"epoch": 1.006,
"grad_norm": 1.4477176666259766,
"learning_rate": 8.414244334398418e-06,
"loss": 0.1385,
"step": 503
},
{
"epoch": 1.008,
"grad_norm": 1.540793538093567,
"learning_rate": 8.405734649591964e-06,
"loss": 0.156,
"step": 504
},
{
"epoch": 1.01,
"grad_norm": 1.245995283126831,
"learning_rate": 8.397206521307584e-06,
"loss": 0.0915,
"step": 505
},
{
"epoch": 1.012,
"grad_norm": 1.2165762186050415,
"learning_rate": 8.388659995728662e-06,
"loss": 0.0792,
"step": 506
},
{
"epoch": 1.014,
"grad_norm": 1.4472664594650269,
"learning_rate": 8.380095119138209e-06,
"loss": 0.0765,
"step": 507
},
{
"epoch": 1.016,
"grad_norm": 1.1731630563735962,
"learning_rate": 8.371511937918616e-06,
"loss": 0.0772,
"step": 508
},
{
"epoch": 1.018,
"grad_norm": 1.2897391319274902,
"learning_rate": 8.362910498551402e-06,
"loss": 0.0827,
"step": 509
},
{
"epoch": 1.02,
"grad_norm": 1.1869486570358276,
"learning_rate": 8.354290847616954e-06,
"loss": 0.0553,
"step": 510
},
{
"epoch": 1.022,
"grad_norm": 1.0010886192321777,
"learning_rate": 8.345653031794292e-06,
"loss": 0.0357,
"step": 511
},
{
"epoch": 1.024,
"grad_norm": 1.442855715751648,
"learning_rate": 8.3369970978608e-06,
"loss": 0.1053,
"step": 512
},
{
"epoch": 1.026,
"grad_norm": 1.6988791227340698,
"learning_rate": 8.328323092691985e-06,
"loss": 0.1412,
"step": 513
},
{
"epoch": 1.028,
"grad_norm": 1.2314667701721191,
"learning_rate": 8.319631063261209e-06,
"loss": 0.0752,
"step": 514
},
{
"epoch": 1.03,
"grad_norm": 1.2584779262542725,
"learning_rate": 8.310921056639451e-06,
"loss": 0.0802,
"step": 515
},
{
"epoch": 1.032,
"grad_norm": 1.3586853742599487,
"learning_rate": 8.302193119995038e-06,
"loss": 0.0683,
"step": 516
},
{
"epoch": 1.034,
"grad_norm": 1.622609257698059,
"learning_rate": 8.293447300593402e-06,
"loss": 0.0936,
"step": 517
},
{
"epoch": 1.036,
"grad_norm": 1.5385489463806152,
"learning_rate": 8.284683645796814e-06,
"loss": 0.1009,
"step": 518
},
{
"epoch": 1.038,
"grad_norm": 1.3496136665344238,
"learning_rate": 8.275902203064125e-06,
"loss": 0.117,
"step": 519
},
{
"epoch": 1.04,
"grad_norm": 1.3850297927856445,
"learning_rate": 8.267103019950529e-06,
"loss": 0.0871,
"step": 520
},
{
"epoch": 1.042,
"grad_norm": 1.3631035089492798,
"learning_rate": 8.258286144107277e-06,
"loss": 0.1119,
"step": 521
},
{
"epoch": 1.044,
"grad_norm": 1.0882558822631836,
"learning_rate": 8.249451623281444e-06,
"loss": 0.0629,
"step": 522
},
{
"epoch": 1.046,
"grad_norm": 1.2716962099075317,
"learning_rate": 8.240599505315656e-06,
"loss": 0.1426,
"step": 523
},
{
"epoch": 1.048,
"grad_norm": 0.9287087321281433,
"learning_rate": 8.231729838147833e-06,
"loss": 0.0439,
"step": 524
},
{
"epoch": 1.05,
"grad_norm": 1.3585246801376343,
"learning_rate": 8.222842669810936e-06,
"loss": 0.0997,
"step": 525
},
{
"epoch": 1.052,
"grad_norm": 1.942495584487915,
"learning_rate": 8.213938048432697e-06,
"loss": 0.1591,
"step": 526
},
{
"epoch": 1.054,
"grad_norm": 0.990554928779602,
"learning_rate": 8.205016022235368e-06,
"loss": 0.0452,
"step": 527
},
{
"epoch": 1.056,
"grad_norm": 1.2105649709701538,
"learning_rate": 8.196076639535453e-06,
"loss": 0.0484,
"step": 528
},
{
"epoch": 1.058,
"grad_norm": 1.478988766670227,
"learning_rate": 8.18711994874345e-06,
"loss": 0.1172,
"step": 529
},
{
"epoch": 1.06,
"grad_norm": 1.5153695344924927,
"learning_rate": 8.178145998363585e-06,
"loss": 0.1165,
"step": 530
},
{
"epoch": 1.062,
"grad_norm": 1.9413139820098877,
"learning_rate": 8.16915483699355e-06,
"loss": 0.1477,
"step": 531
},
{
"epoch": 1.064,
"grad_norm": 1.581265926361084,
"learning_rate": 8.160146513324256e-06,
"loss": 0.125,
"step": 532
},
{
"epoch": 1.066,
"grad_norm": 1.3135696649551392,
"learning_rate": 8.151121076139534e-06,
"loss": 0.0889,
"step": 533
},
{
"epoch": 1.068,
"grad_norm": 1.047702670097351,
"learning_rate": 8.142078574315907e-06,
"loss": 0.0592,
"step": 534
},
{
"epoch": 1.07,
"grad_norm": 1.2292808294296265,
"learning_rate": 8.133019056822303e-06,
"loss": 0.0882,
"step": 535
},
{
"epoch": 1.072,
"grad_norm": 1.0881916284561157,
"learning_rate": 8.123942572719801e-06,
"loss": 0.0667,
"step": 536
},
{
"epoch": 1.074,
"grad_norm": 1.3366835117340088,
"learning_rate": 8.11484917116136e-06,
"loss": 0.0785,
"step": 537
},
{
"epoch": 1.076,
"grad_norm": 1.179837942123413,
"learning_rate": 8.105738901391553e-06,
"loss": 0.0914,
"step": 538
},
{
"epoch": 1.078,
"grad_norm": 1.1984621286392212,
"learning_rate": 8.096611812746302e-06,
"loss": 0.0604,
"step": 539
},
{
"epoch": 1.08,
"grad_norm": 1.167480230331421,
"learning_rate": 8.087467954652608e-06,
"loss": 0.0792,
"step": 540
},
{
"epoch": 1.082,
"grad_norm": 1.8635348081588745,
"learning_rate": 8.078307376628292e-06,
"loss": 0.0813,
"step": 541
},
{
"epoch": 1.084,
"grad_norm": 1.6677511930465698,
"learning_rate": 8.069130128281714e-06,
"loss": 0.0936,
"step": 542
},
{
"epoch": 1.086,
"grad_norm": 1.1988495588302612,
"learning_rate": 8.059936259311514e-06,
"loss": 0.0647,
"step": 543
},
{
"epoch": 1.088,
"grad_norm": 1.2146140336990356,
"learning_rate": 8.05072581950634e-06,
"loss": 0.1185,
"step": 544
},
{
"epoch": 1.09,
"grad_norm": 1.4924564361572266,
"learning_rate": 8.041498858744572e-06,
"loss": 0.1183,
"step": 545
},
{
"epoch": 1.092,
"grad_norm": 1.1881672143936157,
"learning_rate": 8.032255426994069e-06,
"loss": 0.0465,
"step": 546
},
{
"epoch": 1.094,
"grad_norm": 1.723354697227478,
"learning_rate": 8.022995574311876e-06,
"loss": 0.1093,
"step": 547
},
{
"epoch": 1.096,
"grad_norm": 1.3465012311935425,
"learning_rate": 8.013719350843969e-06,
"loss": 0.0956,
"step": 548
},
{
"epoch": 1.098,
"grad_norm": 1.4414770603179932,
"learning_rate": 8.004426806824985e-06,
"loss": 0.1028,
"step": 549
},
{
"epoch": 1.1,
"grad_norm": 1.053121566772461,
"learning_rate": 7.99511799257793e-06,
"loss": 0.0597,
"step": 550
},
{
"epoch": 1.102,
"grad_norm": 1.1670550107955933,
"learning_rate": 7.985792958513932e-06,
"loss": 0.0658,
"step": 551
},
{
"epoch": 1.104,
"grad_norm": 1.426408052444458,
"learning_rate": 7.97645175513195e-06,
"loss": 0.0767,
"step": 552
},
{
"epoch": 1.106,
"grad_norm": 1.1832488775253296,
"learning_rate": 7.967094433018508e-06,
"loss": 0.089,
"step": 553
},
{
"epoch": 1.108,
"grad_norm": 1.3124687671661377,
"learning_rate": 7.95772104284742e-06,
"loss": 0.0674,
"step": 554
},
{
"epoch": 1.11,
"grad_norm": 1.193248987197876,
"learning_rate": 7.948331635379517e-06,
"loss": 0.0602,
"step": 555
},
{
"epoch": 1.112,
"grad_norm": 1.0218859910964966,
"learning_rate": 7.938926261462366e-06,
"loss": 0.0692,
"step": 556
},
{
"epoch": 1.114,
"grad_norm": 1.4290659427642822,
"learning_rate": 7.929504972030003e-06,
"loss": 0.1171,
"step": 557
},
{
"epoch": 1.116,
"grad_norm": 1.4135209321975708,
"learning_rate": 7.920067818102652e-06,
"loss": 0.0841,
"step": 558
},
{
"epoch": 1.1179999999999999,
"grad_norm": 1.4746184349060059,
"learning_rate": 7.910614850786448e-06,
"loss": 0.0894,
"step": 559
},
{
"epoch": 1.12,
"grad_norm": 1.4937189817428589,
"learning_rate": 7.901146121273165e-06,
"loss": 0.1206,
"step": 560
},
{
"epoch": 1.1219999999999999,
"grad_norm": 1.5291107892990112,
"learning_rate": 7.891661680839932e-06,
"loss": 0.131,
"step": 561
},
{
"epoch": 1.124,
"grad_norm": 1.065200924873352,
"learning_rate": 7.882161580848966e-06,
"loss": 0.0466,
"step": 562
},
{
"epoch": 1.126,
"grad_norm": 1.4422272443771362,
"learning_rate": 7.872645872747281e-06,
"loss": 0.0745,
"step": 563
},
{
"epoch": 1.1280000000000001,
"grad_norm": 0.9669689536094666,
"learning_rate": 7.863114608066417e-06,
"loss": 0.0744,
"step": 564
},
{
"epoch": 1.13,
"grad_norm": 0.9402322173118591,
"learning_rate": 7.85356783842216e-06,
"loss": 0.062,
"step": 565
},
{
"epoch": 1.1320000000000001,
"grad_norm": 1.339349627494812,
"learning_rate": 7.84400561551426e-06,
"loss": 0.1108,
"step": 566
},
{
"epoch": 1.134,
"grad_norm": 1.3024260997772217,
"learning_rate": 7.834427991126155e-06,
"loss": 0.0687,
"step": 567
},
{
"epoch": 1.1360000000000001,
"grad_norm": 1.167820692062378,
"learning_rate": 7.82483501712469e-06,
"loss": 0.0736,
"step": 568
},
{
"epoch": 1.138,
"grad_norm": 1.4007395505905151,
"learning_rate": 7.815226745459831e-06,
"loss": 0.079,
"step": 569
},
{
"epoch": 1.1400000000000001,
"grad_norm": 1.4969840049743652,
"learning_rate": 7.80560322816439e-06,
"loss": 0.0921,
"step": 570
},
{
"epoch": 1.142,
"grad_norm": 1.6289410591125488,
"learning_rate": 7.795964517353734e-06,
"loss": 0.1482,
"step": 571
},
{
"epoch": 1.144,
"grad_norm": 1.1523656845092773,
"learning_rate": 7.786310665225522e-06,
"loss": 0.06,
"step": 572
},
{
"epoch": 1.146,
"grad_norm": 1.2471307516098022,
"learning_rate": 7.776641724059398e-06,
"loss": 0.0959,
"step": 573
},
{
"epoch": 1.148,
"grad_norm": 1.285406470298767,
"learning_rate": 7.76695774621672e-06,
"loss": 0.0868,
"step": 574
},
{
"epoch": 1.15,
"grad_norm": 1.1705622673034668,
"learning_rate": 7.757258784140286e-06,
"loss": 0.0666,
"step": 575
},
{
"epoch": 1.152,
"grad_norm": 1.426995873451233,
"learning_rate": 7.747544890354031e-06,
"loss": 0.1463,
"step": 576
},
{
"epoch": 1.154,
"grad_norm": 1.361234188079834,
"learning_rate": 7.737816117462752e-06,
"loss": 0.1009,
"step": 577
},
{
"epoch": 1.156,
"grad_norm": 1.0927761793136597,
"learning_rate": 7.728072518151826e-06,
"loss": 0.0795,
"step": 578
},
{
"epoch": 1.158,
"grad_norm": 1.3462144136428833,
"learning_rate": 7.718314145186918e-06,
"loss": 0.0921,
"step": 579
},
{
"epoch": 1.16,
"grad_norm": 1.0117051601409912,
"learning_rate": 7.7085410514137e-06,
"loss": 0.0425,
"step": 580
},
{
"epoch": 1.162,
"grad_norm": 1.2194225788116455,
"learning_rate": 7.698753289757565e-06,
"loss": 0.0533,
"step": 581
},
{
"epoch": 1.164,
"grad_norm": 1.6970041990280151,
"learning_rate": 7.688950913223336e-06,
"loss": 0.1019,
"step": 582
},
{
"epoch": 1.166,
"grad_norm": 1.1927522420883179,
"learning_rate": 7.679133974894984e-06,
"loss": 0.0902,
"step": 583
},
{
"epoch": 1.168,
"grad_norm": 1.0492830276489258,
"learning_rate": 7.669302527935334e-06,
"loss": 0.06,
"step": 584
},
{
"epoch": 1.17,
"grad_norm": 1.0530539751052856,
"learning_rate": 7.65945662558579e-06,
"loss": 0.0676,
"step": 585
},
{
"epoch": 1.172,
"grad_norm": 1.158579707145691,
"learning_rate": 7.649596321166024e-06,
"loss": 0.0676,
"step": 586
},
{
"epoch": 1.174,
"grad_norm": 1.1233559846878052,
"learning_rate": 7.639721668073718e-06,
"loss": 0.045,
"step": 587
},
{
"epoch": 1.176,
"grad_norm": 1.0013350248336792,
"learning_rate": 7.629832719784245e-06,
"loss": 0.0714,
"step": 588
},
{
"epoch": 1.178,
"grad_norm": 1.3075058460235596,
"learning_rate": 7.619929529850397e-06,
"loss": 0.065,
"step": 589
},
{
"epoch": 1.18,
"grad_norm": 1.1243176460266113,
"learning_rate": 7.610012151902091e-06,
"loss": 0.0575,
"step": 590
},
{
"epoch": 1.182,
"grad_norm": 1.3878341913223267,
"learning_rate": 7.600080639646077e-06,
"loss": 0.0544,
"step": 591
},
{
"epoch": 1.184,
"grad_norm": 1.202048420906067,
"learning_rate": 7.590135046865652e-06,
"loss": 0.1045,
"step": 592
},
{
"epoch": 1.186,
"grad_norm": 1.0473711490631104,
"learning_rate": 7.580175427420358e-06,
"loss": 0.043,
"step": 593
},
{
"epoch": 1.188,
"grad_norm": 1.1366267204284668,
"learning_rate": 7.570201835245703e-06,
"loss": 0.0529,
"step": 594
},
{
"epoch": 1.19,
"grad_norm": 1.1529453992843628,
"learning_rate": 7.560214324352858e-06,
"loss": 0.0696,
"step": 595
},
{
"epoch": 1.192,
"grad_norm": 1.0182459354400635,
"learning_rate": 7.550212948828377e-06,
"loss": 0.0387,
"step": 596
},
{
"epoch": 1.194,
"grad_norm": 1.2447912693023682,
"learning_rate": 7.54019776283389e-06,
"loss": 0.0706,
"step": 597
},
{
"epoch": 1.196,
"grad_norm": 1.0753467082977295,
"learning_rate": 7.530168820605819e-06,
"loss": 0.0521,
"step": 598
},
{
"epoch": 1.198,
"grad_norm": 1.341948390007019,
"learning_rate": 7.520126176455084e-06,
"loss": 0.0926,
"step": 599
},
{
"epoch": 1.2,
"grad_norm": 1.0409612655639648,
"learning_rate": 7.510069884766802e-06,
"loss": 0.0802,
"step": 600
}
],
"logging_steps": 1,
"max_steps": 1500,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.5412817212850831e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}