sft_lora / workmem /checkpoint-1096 /trainer_state.json
SUJQ's picture
Upload folder using huggingface_hub
389c0ee verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.078762306610408,
"eval_steps": 50,
"global_step": 1096,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0028129395218002813,
"grad_norm": 75.0997085571289,
"learning_rate": 0.0,
"loss": 6.7334,
"step": 1
},
{
"epoch": 0.005625879043600563,
"grad_norm": 77.64845275878906,
"learning_rate": 1.8181818181818183e-07,
"loss": 7.1245,
"step": 2
},
{
"epoch": 0.008438818565400843,
"grad_norm": 72.08865356445312,
"learning_rate": 3.6363636363636366e-07,
"loss": 7.1721,
"step": 3
},
{
"epoch": 0.011251758087201125,
"grad_norm": 86.44847106933594,
"learning_rate": 5.454545454545455e-07,
"loss": 7.4339,
"step": 4
},
{
"epoch": 0.014064697609001406,
"grad_norm": 75.51171875,
"learning_rate": 7.272727272727273e-07,
"loss": 6.9712,
"step": 5
},
{
"epoch": 0.016877637130801686,
"grad_norm": 73.0139389038086,
"learning_rate": 9.090909090909091e-07,
"loss": 7.0641,
"step": 6
},
{
"epoch": 0.01969057665260197,
"grad_norm": 78.8460464477539,
"learning_rate": 1.090909090909091e-06,
"loss": 7.3246,
"step": 7
},
{
"epoch": 0.02250351617440225,
"grad_norm": 73.03649139404297,
"learning_rate": 1.2727272727272728e-06,
"loss": 6.5645,
"step": 8
},
{
"epoch": 0.02531645569620253,
"grad_norm": 83.64058685302734,
"learning_rate": 1.4545454545454546e-06,
"loss": 7.0709,
"step": 9
},
{
"epoch": 0.02812939521800281,
"grad_norm": 98.82685089111328,
"learning_rate": 1.6363636363636365e-06,
"loss": 7.9752,
"step": 10
},
{
"epoch": 0.030942334739803096,
"grad_norm": 77.33248901367188,
"learning_rate": 1.8181818181818183e-06,
"loss": 6.7595,
"step": 11
},
{
"epoch": 0.03375527426160337,
"grad_norm": 75.38268280029297,
"learning_rate": 2.0000000000000003e-06,
"loss": 7.2824,
"step": 12
},
{
"epoch": 0.03656821378340366,
"grad_norm": 84.62842559814453,
"learning_rate": 2.181818181818182e-06,
"loss": 7.3766,
"step": 13
},
{
"epoch": 0.03938115330520394,
"grad_norm": 79.66527557373047,
"learning_rate": 2.363636363636364e-06,
"loss": 6.902,
"step": 14
},
{
"epoch": 0.04219409282700422,
"grad_norm": 88.1312026977539,
"learning_rate": 2.5454545454545456e-06,
"loss": 7.5326,
"step": 15
},
{
"epoch": 0.0450070323488045,
"grad_norm": 67.1671371459961,
"learning_rate": 2.7272727272727272e-06,
"loss": 6.6224,
"step": 16
},
{
"epoch": 0.04781997187060478,
"grad_norm": 78.15252685546875,
"learning_rate": 2.9090909090909093e-06,
"loss": 7.2991,
"step": 17
},
{
"epoch": 0.05063291139240506,
"grad_norm": 89.14740753173828,
"learning_rate": 3.090909090909091e-06,
"loss": 7.316,
"step": 18
},
{
"epoch": 0.053445850914205346,
"grad_norm": 82.18356323242188,
"learning_rate": 3.272727272727273e-06,
"loss": 7.2328,
"step": 19
},
{
"epoch": 0.05625879043600562,
"grad_norm": 75.57778930664062,
"learning_rate": 3.454545454545455e-06,
"loss": 6.8493,
"step": 20
},
{
"epoch": 0.05907172995780591,
"grad_norm": 91.4236068725586,
"learning_rate": 3.6363636363636366e-06,
"loss": 7.251,
"step": 21
},
{
"epoch": 0.06188466947960619,
"grad_norm": 69.89977264404297,
"learning_rate": 3.818181818181819e-06,
"loss": 6.1767,
"step": 22
},
{
"epoch": 0.06469760900140648,
"grad_norm": 79.23346710205078,
"learning_rate": 4.000000000000001e-06,
"loss": 6.5113,
"step": 23
},
{
"epoch": 0.06751054852320675,
"grad_norm": 75.04463958740234,
"learning_rate": 4.181818181818182e-06,
"loss": 6.3854,
"step": 24
},
{
"epoch": 0.07032348804500703,
"grad_norm": 90.92483520507812,
"learning_rate": 4.363636363636364e-06,
"loss": 7.383,
"step": 25
},
{
"epoch": 0.07313642756680731,
"grad_norm": 69.97665405273438,
"learning_rate": 4.5454545454545455e-06,
"loss": 6.1346,
"step": 26
},
{
"epoch": 0.0759493670886076,
"grad_norm": 89.80615234375,
"learning_rate": 4.727272727272728e-06,
"loss": 6.9564,
"step": 27
},
{
"epoch": 0.07876230661040788,
"grad_norm": 81.48190307617188,
"learning_rate": 4.90909090909091e-06,
"loss": 6.7021,
"step": 28
},
{
"epoch": 0.08157524613220815,
"grad_norm": 79.94600677490234,
"learning_rate": 5.090909090909091e-06,
"loss": 5.7781,
"step": 29
},
{
"epoch": 0.08438818565400844,
"grad_norm": 80.19660949707031,
"learning_rate": 5.272727272727273e-06,
"loss": 5.7521,
"step": 30
},
{
"epoch": 0.08720112517580872,
"grad_norm": 74.73751831054688,
"learning_rate": 5.4545454545454545e-06,
"loss": 5.2344,
"step": 31
},
{
"epoch": 0.090014064697609,
"grad_norm": 84.3858871459961,
"learning_rate": 5.636363636363636e-06,
"loss": 5.6553,
"step": 32
},
{
"epoch": 0.09282700421940929,
"grad_norm": 93.59162902832031,
"learning_rate": 5.8181818181818185e-06,
"loss": 5.0521,
"step": 33
},
{
"epoch": 0.09563994374120956,
"grad_norm": 68.47096252441406,
"learning_rate": 6e-06,
"loss": 5.225,
"step": 34
},
{
"epoch": 0.09845288326300984,
"grad_norm": 62.65687942504883,
"learning_rate": 6.181818181818182e-06,
"loss": 4.338,
"step": 35
},
{
"epoch": 0.10126582278481013,
"grad_norm": 66.28219604492188,
"learning_rate": 6.363636363636364e-06,
"loss": 4.714,
"step": 36
},
{
"epoch": 0.10407876230661041,
"grad_norm": 64.53064727783203,
"learning_rate": 6.545454545454546e-06,
"loss": 4.8437,
"step": 37
},
{
"epoch": 0.10689170182841069,
"grad_norm": 40.499000549316406,
"learning_rate": 6.7272727272727275e-06,
"loss": 4.5761,
"step": 38
},
{
"epoch": 0.10970464135021098,
"grad_norm": 40.0291633605957,
"learning_rate": 6.90909090909091e-06,
"loss": 4.1693,
"step": 39
},
{
"epoch": 0.11251758087201125,
"grad_norm": 19.737794876098633,
"learning_rate": 7.0909090909090916e-06,
"loss": 3.7602,
"step": 40
},
{
"epoch": 0.11533052039381153,
"grad_norm": 18.727174758911133,
"learning_rate": 7.272727272727273e-06,
"loss": 3.941,
"step": 41
},
{
"epoch": 0.11814345991561181,
"grad_norm": 15.440817832946777,
"learning_rate": 7.454545454545456e-06,
"loss": 3.676,
"step": 42
},
{
"epoch": 0.1209563994374121,
"grad_norm": 49.393409729003906,
"learning_rate": 7.636363636363638e-06,
"loss": 3.5502,
"step": 43
},
{
"epoch": 0.12376933895921238,
"grad_norm": 29.366811752319336,
"learning_rate": 7.81818181818182e-06,
"loss": 3.174,
"step": 44
},
{
"epoch": 0.12658227848101267,
"grad_norm": 26.623790740966797,
"learning_rate": 8.000000000000001e-06,
"loss": 3.1249,
"step": 45
},
{
"epoch": 0.12939521800281295,
"grad_norm": 23.555133819580078,
"learning_rate": 8.181818181818183e-06,
"loss": 3.5312,
"step": 46
},
{
"epoch": 0.13220815752461323,
"grad_norm": 33.85753631591797,
"learning_rate": 8.363636363636365e-06,
"loss": 3.4659,
"step": 47
},
{
"epoch": 0.1350210970464135,
"grad_norm": 17.72439193725586,
"learning_rate": 8.545454545454546e-06,
"loss": 2.7741,
"step": 48
},
{
"epoch": 0.13783403656821377,
"grad_norm": 17.903911590576172,
"learning_rate": 8.727272727272728e-06,
"loss": 3.334,
"step": 49
},
{
"epoch": 0.14064697609001406,
"grad_norm": 15.8783597946167,
"learning_rate": 8.90909090909091e-06,
"loss": 2.7859,
"step": 50
},
{
"epoch": 0.14064697609001406,
"eval_loss": 1.4928081035614014,
"eval_runtime": 2.8043,
"eval_samples_per_second": 9.271,
"eval_steps_per_second": 1.426,
"step": 50
},
{
"epoch": 0.14064697609001406,
"eval_active_sample_count": 30,
"eval_avg_loss": 805.75,
"eval_avg_mem_token_accuracy": 0.25177304964539005,
"eval_avg_mem_token_gt_count": 9.4,
"eval_avg_mem_token_precision": 0.007424448394855171,
"eval_avg_mem_token_rate": 0.6361338388877802,
"eval_avg_mem_token_recall(Accuracy)": 0.25177304964539005,
"eval_avg_slot_norm_mean": 197.63333333333333,
"eval_avg_slot_sim_mean": 0.996875,
"eval_global_step": 50,
"eval_loss": 1.4928081035614014,
"eval_num_samples": 30,
"eval_runtime": 2.8043,
"eval_samples_per_second": 9.271,
"eval_sim_active_sample_count": 30,
"eval_steps_per_second": 1.426,
"eval_total_correct_count": 71,
"eval_total_gt_mem_token_count": 282,
"eval_total_positions": 15033,
"eval_total_pred_mem_token_count": 9563,
"step": 50
},
{
"epoch": 0.14345991561181434,
"grad_norm": 9.468743324279785,
"learning_rate": 9.090909090909091e-06,
"loss": 2.8193,
"step": 51
},
{
"epoch": 0.14627285513361463,
"grad_norm": 8.735793113708496,
"learning_rate": 9.272727272727273e-06,
"loss": 2.624,
"step": 52
},
{
"epoch": 0.1490857946554149,
"grad_norm": 11.669722557067871,
"learning_rate": 9.454545454545456e-06,
"loss": 2.8725,
"step": 53
},
{
"epoch": 0.1518987341772152,
"grad_norm": 16.81114959716797,
"learning_rate": 9.636363636363638e-06,
"loss": 2.4657,
"step": 54
},
{
"epoch": 0.15471167369901548,
"grad_norm": 19.379348754882812,
"learning_rate": 9.81818181818182e-06,
"loss": 2.1279,
"step": 55
},
{
"epoch": 0.15752461322081576,
"grad_norm": 13.823864936828613,
"learning_rate": 1e-05,
"loss": 2.3733,
"step": 56
},
{
"epoch": 0.16033755274261605,
"grad_norm": 14.514190673828125,
"learning_rate": 9.999977231314128e-06,
"loss": 2.1855,
"step": 57
},
{
"epoch": 0.1631504922644163,
"grad_norm": 24.133705139160156,
"learning_rate": 9.99990892546387e-06,
"loss": 2.2268,
"step": 58
},
{
"epoch": 0.1659634317862166,
"grad_norm": 13.885165214538574,
"learning_rate": 9.999795083071328e-06,
"loss": 2.1062,
"step": 59
},
{
"epoch": 0.16877637130801687,
"grad_norm": 6.023658752441406,
"learning_rate": 9.999635705173312e-06,
"loss": 1.9233,
"step": 60
},
{
"epoch": 0.17158931082981715,
"grad_norm": 5.1499104499816895,
"learning_rate": 9.999430793221356e-06,
"loss": 2.117,
"step": 61
},
{
"epoch": 0.17440225035161744,
"grad_norm": 5.638373851776123,
"learning_rate": 9.999180349081688e-06,
"loss": 2.2507,
"step": 62
},
{
"epoch": 0.17721518987341772,
"grad_norm": 5.992455959320068,
"learning_rate": 9.998884375035221e-06,
"loss": 1.9682,
"step": 63
},
{
"epoch": 0.180028129395218,
"grad_norm": 4.536100387573242,
"learning_rate": 9.998542873777534e-06,
"loss": 1.955,
"step": 64
},
{
"epoch": 0.1828410689170183,
"grad_norm": 11.286314964294434,
"learning_rate": 9.99815584841884e-06,
"loss": 2.1629,
"step": 65
},
{
"epoch": 0.18565400843881857,
"grad_norm": 9.133061408996582,
"learning_rate": 9.99772330248396e-06,
"loss": 1.4339,
"step": 66
},
{
"epoch": 0.18846694796061886,
"grad_norm": 7.25726842880249,
"learning_rate": 9.997245239912299e-06,
"loss": 2.0025,
"step": 67
},
{
"epoch": 0.19127988748241911,
"grad_norm": 5.315834045410156,
"learning_rate": 9.996721665057796e-06,
"loss": 1.7737,
"step": 68
},
{
"epoch": 0.1940928270042194,
"grad_norm": 3.770214080810547,
"learning_rate": 9.996152582688899e-06,
"loss": 1.8984,
"step": 69
},
{
"epoch": 0.19690576652601968,
"grad_norm": 4.797364711761475,
"learning_rate": 9.995537997988507e-06,
"loss": 2.0319,
"step": 70
},
{
"epoch": 0.19971870604781997,
"grad_norm": 5.449586391448975,
"learning_rate": 9.994877916553937e-06,
"loss": 1.7875,
"step": 71
},
{
"epoch": 0.20253164556962025,
"grad_norm": 3.06927227973938,
"learning_rate": 9.994172344396866e-06,
"loss": 1.5467,
"step": 72
},
{
"epoch": 0.20534458509142053,
"grad_norm": 3.089805841445923,
"learning_rate": 9.99342128794327e-06,
"loss": 1.3562,
"step": 73
},
{
"epoch": 0.20815752461322082,
"grad_norm": 3.4402778148651123,
"learning_rate": 9.992624754033377e-06,
"loss": 1.7436,
"step": 74
},
{
"epoch": 0.2109704641350211,
"grad_norm": 2.948519706726074,
"learning_rate": 9.991782749921601e-06,
"loss": 1.5302,
"step": 75
},
{
"epoch": 0.21378340365682139,
"grad_norm": 6.839716911315918,
"learning_rate": 9.990895283276472e-06,
"loss": 1.6953,
"step": 76
},
{
"epoch": 0.21659634317862167,
"grad_norm": 4.01812219619751,
"learning_rate": 9.98996236218057e-06,
"loss": 1.7822,
"step": 77
},
{
"epoch": 0.21940928270042195,
"grad_norm": 4.928662300109863,
"learning_rate": 9.98898399513045e-06,
"loss": 1.4248,
"step": 78
},
{
"epoch": 0.2222222222222222,
"grad_norm": 3.146573305130005,
"learning_rate": 9.987960191036564e-06,
"loss": 1.6365,
"step": 79
},
{
"epoch": 0.2250351617440225,
"grad_norm": 4.380753993988037,
"learning_rate": 9.986890959223181e-06,
"loss": 1.7186,
"step": 80
},
{
"epoch": 0.22784810126582278,
"grad_norm": 2.831251621246338,
"learning_rate": 9.985776309428306e-06,
"loss": 1.4852,
"step": 81
},
{
"epoch": 0.23066104078762306,
"grad_norm": 3.742809772491455,
"learning_rate": 9.984616251803577e-06,
"loss": 1.5631,
"step": 82
},
{
"epoch": 0.23347398030942335,
"grad_norm": 3.9068987369537354,
"learning_rate": 9.983410796914197e-06,
"loss": 1.482,
"step": 83
},
{
"epoch": 0.23628691983122363,
"grad_norm": 3.327174663543701,
"learning_rate": 9.982159955738808e-06,
"loss": 1.608,
"step": 84
},
{
"epoch": 0.2390998593530239,
"grad_norm": 3.083757162094116,
"learning_rate": 9.980863739669419e-06,
"loss": 1.5167,
"step": 85
},
{
"epoch": 0.2419127988748242,
"grad_norm": 2.9441981315612793,
"learning_rate": 9.979522160511282e-06,
"loss": 1.6137,
"step": 86
},
{
"epoch": 0.24472573839662448,
"grad_norm": 2.8649449348449707,
"learning_rate": 9.978135230482797e-06,
"loss": 1.665,
"step": 87
},
{
"epoch": 0.24753867791842477,
"grad_norm": 3.0601882934570312,
"learning_rate": 9.97670296221539e-06,
"loss": 1.5845,
"step": 88
},
{
"epoch": 0.25035161744022505,
"grad_norm": 4.856632232666016,
"learning_rate": 9.975225368753412e-06,
"loss": 1.5959,
"step": 89
},
{
"epoch": 0.25316455696202533,
"grad_norm": 3.0896317958831787,
"learning_rate": 9.973702463554004e-06,
"loss": 1.2724,
"step": 90
},
{
"epoch": 0.2559774964838256,
"grad_norm": 2.862079381942749,
"learning_rate": 9.972134260486989e-06,
"loss": 1.73,
"step": 91
},
{
"epoch": 0.2587904360056259,
"grad_norm": 2.281548500061035,
"learning_rate": 9.970520773834734e-06,
"loss": 1.4366,
"step": 92
},
{
"epoch": 0.2616033755274262,
"grad_norm": 2.9218814373016357,
"learning_rate": 9.968862018292025e-06,
"loss": 1.7853,
"step": 93
},
{
"epoch": 0.26441631504922647,
"grad_norm": 3.361042022705078,
"learning_rate": 9.967158008965942e-06,
"loss": 1.5868,
"step": 94
},
{
"epoch": 0.2672292545710267,
"grad_norm": 2.6090950965881348,
"learning_rate": 9.965408761375702e-06,
"loss": 1.6479,
"step": 95
},
{
"epoch": 0.270042194092827,
"grad_norm": 2.4182980060577393,
"learning_rate": 9.963614291452532e-06,
"loss": 1.4854,
"step": 96
},
{
"epoch": 0.27285513361462727,
"grad_norm": 2.7494289875030518,
"learning_rate": 9.961774615539523e-06,
"loss": 1.6097,
"step": 97
},
{
"epoch": 0.27566807313642755,
"grad_norm": 3.082038402557373,
"learning_rate": 9.959889750391474e-06,
"loss": 1.3752,
"step": 98
},
{
"epoch": 0.27848101265822783,
"grad_norm": 3.282862663269043,
"learning_rate": 9.957959713174748e-06,
"loss": 1.3782,
"step": 99
},
{
"epoch": 0.2812939521800281,
"grad_norm": 2.0881476402282715,
"learning_rate": 9.955984521467108e-06,
"loss": 1.3952,
"step": 100
},
{
"epoch": 0.2812939521800281,
"eval_loss": 0.7734614014625549,
"eval_runtime": 2.8846,
"eval_samples_per_second": 9.013,
"eval_steps_per_second": 1.387,
"step": 100
},
{
"epoch": 0.2812939521800281,
"eval_active_sample_count": 30,
"eval_avg_loss": 638.5,
"eval_avg_mem_token_accuracy": 0.24822695035460993,
"eval_avg_mem_token_gt_count": 9.4,
"eval_avg_mem_token_precision": 0.008693492300049677,
"eval_avg_mem_token_rate": 0.5356216324087009,
"eval_avg_mem_token_recall(Accuracy)": 0.24822695035460993,
"eval_avg_slot_norm_mean": 197.63333333333333,
"eval_avg_slot_sim_mean": 0.996875,
"eval_global_step": 100,
"eval_loss": 0.7734614014625549,
"eval_num_samples": 30,
"eval_runtime": 2.8846,
"eval_samples_per_second": 9.013,
"eval_sim_active_sample_count": 30,
"eval_steps_per_second": 1.387,
"eval_total_correct_count": 70,
"eval_total_gt_mem_token_count": 282,
"eval_total_positions": 15033,
"eval_total_pred_mem_token_count": 8052,
"step": 100
},
{
"epoch": 0.2841068917018284,
"grad_norm": 2.6222341060638428,
"learning_rate": 9.953964193257563e-06,
"loss": 1.5721,
"step": 101
},
{
"epoch": 0.2869198312236287,
"grad_norm": 2.3316454887390137,
"learning_rate": 9.951898746946201e-06,
"loss": 1.3596,
"step": 102
},
{
"epoch": 0.28973277074542897,
"grad_norm": 2.174182176589966,
"learning_rate": 9.949788201344019e-06,
"loss": 1.2779,
"step": 103
},
{
"epoch": 0.29254571026722925,
"grad_norm": 2.538205862045288,
"learning_rate": 9.947632575672758e-06,
"loss": 1.3406,
"step": 104
},
{
"epoch": 0.29535864978902954,
"grad_norm": 1.902901291847229,
"learning_rate": 9.945431889564724e-06,
"loss": 1.1408,
"step": 105
},
{
"epoch": 0.2981715893108298,
"grad_norm": 2.382870674133301,
"learning_rate": 9.943186163062607e-06,
"loss": 1.3498,
"step": 106
},
{
"epoch": 0.3009845288326301,
"grad_norm": 2.490842342376709,
"learning_rate": 9.940895416619308e-06,
"loss": 1.401,
"step": 107
},
{
"epoch": 0.3037974683544304,
"grad_norm": 2.9286532402038574,
"learning_rate": 9.938559671097739e-06,
"loss": 1.5762,
"step": 108
},
{
"epoch": 0.3066104078762307,
"grad_norm": 2.838031530380249,
"learning_rate": 9.93617894777064e-06,
"loss": 1.5001,
"step": 109
},
{
"epoch": 0.30942334739803096,
"grad_norm": 2.0874297618865967,
"learning_rate": 9.933753268320391e-06,
"loss": 1.3123,
"step": 110
},
{
"epoch": 0.31223628691983124,
"grad_norm": 2.5237607955932617,
"learning_rate": 9.931282654838803e-06,
"loss": 1.2764,
"step": 111
},
{
"epoch": 0.3150492264416315,
"grad_norm": 2.4033403396606445,
"learning_rate": 9.928767129826929e-06,
"loss": 1.3374,
"step": 112
},
{
"epoch": 0.3178621659634318,
"grad_norm": 2.2955803871154785,
"learning_rate": 9.926206716194842e-06,
"loss": 1.3878,
"step": 113
},
{
"epoch": 0.3206751054852321,
"grad_norm": 3.3657052516937256,
"learning_rate": 9.92360143726145e-06,
"loss": 1.288,
"step": 114
},
{
"epoch": 0.3234880450070324,
"grad_norm": 3.1771109104156494,
"learning_rate": 9.920951316754259e-06,
"loss": 1.4854,
"step": 115
},
{
"epoch": 0.3263009845288326,
"grad_norm": 2.6639983654022217,
"learning_rate": 9.918256378809178e-06,
"loss": 1.5049,
"step": 116
},
{
"epoch": 0.3291139240506329,
"grad_norm": 2.107646942138672,
"learning_rate": 9.915516647970283e-06,
"loss": 1.2783,
"step": 117
},
{
"epoch": 0.3319268635724332,
"grad_norm": 2.307697296142578,
"learning_rate": 9.9127321491896e-06,
"loss": 1.3444,
"step": 118
},
{
"epoch": 0.33473980309423346,
"grad_norm": 2.160855293273926,
"learning_rate": 9.909902907826884e-06,
"loss": 1.112,
"step": 119
},
{
"epoch": 0.33755274261603374,
"grad_norm": 2.300719976425171,
"learning_rate": 9.907028949649376e-06,
"loss": 1.3957,
"step": 120
},
{
"epoch": 0.340365682137834,
"grad_norm": 2.3513684272766113,
"learning_rate": 9.904110300831577e-06,
"loss": 1.224,
"step": 121
},
{
"epoch": 0.3431786216596343,
"grad_norm": 2.0586118698120117,
"learning_rate": 9.901146987955008e-06,
"loss": 1.1874,
"step": 122
},
{
"epoch": 0.3459915611814346,
"grad_norm": 2.517422676086426,
"learning_rate": 9.898139038007962e-06,
"loss": 1.2165,
"step": 123
},
{
"epoch": 0.3488045007032349,
"grad_norm": 2.1542768478393555,
"learning_rate": 9.895086478385267e-06,
"loss": 1.3451,
"step": 124
},
{
"epoch": 0.35161744022503516,
"grad_norm": 2.022313356399536,
"learning_rate": 9.891989336888033e-06,
"loss": 1.2169,
"step": 125
},
{
"epoch": 0.35443037974683544,
"grad_norm": 2.6460540294647217,
"learning_rate": 9.888847641723394e-06,
"loss": 1.4583,
"step": 126
},
{
"epoch": 0.35724331926863573,
"grad_norm": 2.2727549076080322,
"learning_rate": 9.88566142150426e-06,
"loss": 1.2032,
"step": 127
},
{
"epoch": 0.360056258790436,
"grad_norm": 2.1075050830841064,
"learning_rate": 9.88243070524905e-06,
"loss": 1.1943,
"step": 128
},
{
"epoch": 0.3628691983122363,
"grad_norm": 2.352522611618042,
"learning_rate": 9.87915552238143e-06,
"loss": 1.3522,
"step": 129
},
{
"epoch": 0.3656821378340366,
"grad_norm": 2.469947338104248,
"learning_rate": 9.87583590273004e-06,
"loss": 1.1493,
"step": 130
},
{
"epoch": 0.36849507735583686,
"grad_norm": 2.1671838760375977,
"learning_rate": 9.872471876528235e-06,
"loss": 1.3792,
"step": 131
},
{
"epoch": 0.37130801687763715,
"grad_norm": 2.235957622528076,
"learning_rate": 9.869063474413798e-06,
"loss": 1.3672,
"step": 132
},
{
"epoch": 0.37412095639943743,
"grad_norm": 2.241083860397339,
"learning_rate": 9.865610727428661e-06,
"loss": 1.1784,
"step": 133
},
{
"epoch": 0.3769338959212377,
"grad_norm": 2.1455912590026855,
"learning_rate": 9.862113667018628e-06,
"loss": 1.2497,
"step": 134
},
{
"epoch": 0.379746835443038,
"grad_norm": 2.49971342086792,
"learning_rate": 9.858572325033089e-06,
"loss": 1.4471,
"step": 135
},
{
"epoch": 0.38255977496483823,
"grad_norm": 2.6926071643829346,
"learning_rate": 9.854986733724724e-06,
"loss": 1.1595,
"step": 136
},
{
"epoch": 0.3853727144866385,
"grad_norm": 2.2876596450805664,
"learning_rate": 9.851356925749218e-06,
"loss": 1.1668,
"step": 137
},
{
"epoch": 0.3881856540084388,
"grad_norm": 2.018536329269409,
"learning_rate": 9.847682934164948e-06,
"loss": 1.1446,
"step": 138
},
{
"epoch": 0.3909985935302391,
"grad_norm": 2.660203456878662,
"learning_rate": 9.843964792432701e-06,
"loss": 1.3112,
"step": 139
},
{
"epoch": 0.39381153305203936,
"grad_norm": 2.4841043949127197,
"learning_rate": 9.840202534415358e-06,
"loss": 1.3684,
"step": 140
},
{
"epoch": 0.39662447257383965,
"grad_norm": 2.1534616947174072,
"learning_rate": 9.836396194377587e-06,
"loss": 1.2795,
"step": 141
},
{
"epoch": 0.39943741209563993,
"grad_norm": 2.2963688373565674,
"learning_rate": 9.832545806985532e-06,
"loss": 1.298,
"step": 142
},
{
"epoch": 0.4022503516174402,
"grad_norm": 2.911456346511841,
"learning_rate": 9.828651407306495e-06,
"loss": 1.3186,
"step": 143
},
{
"epoch": 0.4050632911392405,
"grad_norm": 3.0715761184692383,
"learning_rate": 9.824713030808626e-06,
"loss": 1.378,
"step": 144
},
{
"epoch": 0.4078762306610408,
"grad_norm": 2.150747537612915,
"learning_rate": 9.820730713360585e-06,
"loss": 1.1809,
"step": 145
},
{
"epoch": 0.41068917018284107,
"grad_norm": 2.1824264526367188,
"learning_rate": 9.816704491231226e-06,
"loss": 1.0561,
"step": 146
},
{
"epoch": 0.41350210970464135,
"grad_norm": 2.2817230224609375,
"learning_rate": 9.812634401089265e-06,
"loss": 1.2782,
"step": 147
},
{
"epoch": 0.41631504922644164,
"grad_norm": 2.196108341217041,
"learning_rate": 9.808520480002942e-06,
"loss": 1.1196,
"step": 148
},
{
"epoch": 0.4191279887482419,
"grad_norm": 2.3351998329162598,
"learning_rate": 9.804362765439688e-06,
"loss": 1.4752,
"step": 149
},
{
"epoch": 0.4219409282700422,
"grad_norm": 1.8851360082626343,
"learning_rate": 9.800161295265782e-06,
"loss": 1.1407,
"step": 150
},
{
"epoch": 0.4219409282700422,
"eval_loss": 0.7094771862030029,
"eval_runtime": 2.855,
"eval_samples_per_second": 9.107,
"eval_steps_per_second": 1.401,
"step": 150
},
{
"epoch": 0.4219409282700422,
"eval_active_sample_count": 30,
"eval_avg_loss": 615.25,
"eval_avg_mem_token_accuracy": 0.23404255319148937,
"eval_avg_mem_token_gt_count": 9.4,
"eval_avg_mem_token_precision": 0.007896625987078248,
"eval_avg_mem_token_rate": 0.5559768509279585,
"eval_avg_mem_token_recall(Accuracy)": 0.23404255319148937,
"eval_avg_slot_norm_mean": 197.63333333333333,
"eval_avg_slot_sim_mean": 0.996875,
"eval_global_step": 150,
"eval_loss": 0.7094771862030029,
"eval_num_samples": 30,
"eval_runtime": 2.855,
"eval_samples_per_second": 9.107,
"eval_sim_active_sample_count": 30,
"eval_steps_per_second": 1.401,
"eval_total_correct_count": 66,
"eval_total_gt_mem_token_count": 282,
"eval_total_positions": 15033,
"eval_total_pred_mem_token_count": 8358,
"step": 150
},
{
"epoch": 0.4247538677918425,
"grad_norm": 2.1879961490631104,
"learning_rate": 9.795916107746009e-06,
"loss": 1.1632,
"step": 151
},
{
"epoch": 0.42756680731364277,
"grad_norm": 2.7381277084350586,
"learning_rate": 9.7916272415433e-06,
"loss": 1.3305,
"step": 152
},
{
"epoch": 0.43037974683544306,
"grad_norm": 2.1921334266662598,
"learning_rate": 9.787294735718397e-06,
"loss": 1.1759,
"step": 153
},
{
"epoch": 0.43319268635724334,
"grad_norm": 2.2524077892303467,
"learning_rate": 9.782918629729486e-06,
"loss": 1.1278,
"step": 154
},
{
"epoch": 0.4360056258790436,
"grad_norm": 2.3991479873657227,
"learning_rate": 9.778498963431838e-06,
"loss": 1.2304,
"step": 155
},
{
"epoch": 0.4388185654008439,
"grad_norm": 2.4503281116485596,
"learning_rate": 9.774035777077452e-06,
"loss": 1.3168,
"step": 156
},
{
"epoch": 0.44163150492264414,
"grad_norm": 2.1630754470825195,
"learning_rate": 9.769529111314683e-06,
"loss": 1.1698,
"step": 157
},
{
"epoch": 0.4444444444444444,
"grad_norm": 2.1806483268737793,
"learning_rate": 9.764979007187874e-06,
"loss": 1.1485,
"step": 158
},
{
"epoch": 0.4472573839662447,
"grad_norm": 2.1980652809143066,
"learning_rate": 9.760385506136982e-06,
"loss": 1.3419,
"step": 159
},
{
"epoch": 0.450070323488045,
"grad_norm": 4.968358039855957,
"learning_rate": 9.755748649997197e-06,
"loss": 1.19,
"step": 160
},
{
"epoch": 0.45288326300984527,
"grad_norm": 2.15004563331604,
"learning_rate": 9.751068480998572e-06,
"loss": 1.2162,
"step": 161
},
{
"epoch": 0.45569620253164556,
"grad_norm": 2.2927024364471436,
"learning_rate": 9.746345041765624e-06,
"loss": 1.2539,
"step": 162
},
{
"epoch": 0.45850914205344584,
"grad_norm": 2.2658493518829346,
"learning_rate": 9.741578375316953e-06,
"loss": 1.4352,
"step": 163
},
{
"epoch": 0.4613220815752461,
"grad_norm": 2.3411777019500732,
"learning_rate": 9.736768525064852e-06,
"loss": 1.4317,
"step": 164
},
{
"epoch": 0.4641350210970464,
"grad_norm": 2.0097508430480957,
"learning_rate": 9.731915534814912e-06,
"loss": 1.1761,
"step": 165
},
{
"epoch": 0.4669479606188467,
"grad_norm": 2.312138080596924,
"learning_rate": 9.727019448765613e-06,
"loss": 1.2183,
"step": 166
},
{
"epoch": 0.469760900140647,
"grad_norm": 2.3369953632354736,
"learning_rate": 9.722080311507938e-06,
"loss": 1.3209,
"step": 167
},
{
"epoch": 0.47257383966244726,
"grad_norm": 2.1543290615081787,
"learning_rate": 9.717098168024948e-06,
"loss": 1.2806,
"step": 168
},
{
"epoch": 0.47538677918424754,
"grad_norm": 2.3597400188446045,
"learning_rate": 9.712073063691388e-06,
"loss": 1.2461,
"step": 169
},
{
"epoch": 0.4781997187060478,
"grad_norm": 2.410320520401001,
"learning_rate": 9.707005044273268e-06,
"loss": 1.3153,
"step": 170
},
{
"epoch": 0.4810126582278481,
"grad_norm": 2.5447475910186768,
"learning_rate": 9.701894155927445e-06,
"loss": 1.3782,
"step": 171
},
{
"epoch": 0.4838255977496484,
"grad_norm": 2.600811004638672,
"learning_rate": 9.696740445201202e-06,
"loss": 1.5061,
"step": 172
},
{
"epoch": 0.4866385372714487,
"grad_norm": 2.225473642349243,
"learning_rate": 9.691543959031831e-06,
"loss": 1.3204,
"step": 173
},
{
"epoch": 0.48945147679324896,
"grad_norm": 2.2354350090026855,
"learning_rate": 9.68630474474619e-06,
"loss": 1.3342,
"step": 174
},
{
"epoch": 0.49226441631504925,
"grad_norm": 2.4795658588409424,
"learning_rate": 9.681022850060297e-06,
"loss": 1.2004,
"step": 175
},
{
"epoch": 0.49507735583684953,
"grad_norm": 2.111879348754883,
"learning_rate": 9.675698323078865e-06,
"loss": 1.0086,
"step": 176
},
{
"epoch": 0.4978902953586498,
"grad_norm": 2.0163023471832275,
"learning_rate": 9.67033121229489e-06,
"loss": 1.0946,
"step": 177
},
{
"epoch": 0.5007032348804501,
"grad_norm": 2.2219393253326416,
"learning_rate": 9.664921566589195e-06,
"loss": 1.3935,
"step": 178
},
{
"epoch": 0.5035161744022504,
"grad_norm": 2.128089189529419,
"learning_rate": 9.659469435229992e-06,
"loss": 1.1659,
"step": 179
},
{
"epoch": 0.5063291139240507,
"grad_norm": 2.5307302474975586,
"learning_rate": 9.653974867872424e-06,
"loss": 1.1473,
"step": 180
},
{
"epoch": 0.509142053445851,
"grad_norm": 2.2050728797912598,
"learning_rate": 9.648437914558126e-06,
"loss": 1.3126,
"step": 181
},
{
"epoch": 0.5119549929676512,
"grad_norm": 2.1602675914764404,
"learning_rate": 9.642858625714753e-06,
"loss": 1.0508,
"step": 182
},
{
"epoch": 0.5147679324894515,
"grad_norm": 2.3411359786987305,
"learning_rate": 9.637237052155541e-06,
"loss": 1.2805,
"step": 183
},
{
"epoch": 0.5175808720112518,
"grad_norm": 2.3061892986297607,
"learning_rate": 9.631573245078823e-06,
"loss": 1.324,
"step": 184
},
{
"epoch": 0.5203938115330521,
"grad_norm": 2.0462026596069336,
"learning_rate": 9.625867256067577e-06,
"loss": 1.2376,
"step": 185
},
{
"epoch": 0.5232067510548524,
"grad_norm": 2.2104408740997314,
"learning_rate": 9.620119137088954e-06,
"loss": 1.2963,
"step": 186
},
{
"epoch": 0.5260196905766527,
"grad_norm": 2.5065929889678955,
"learning_rate": 9.614328940493797e-06,
"loss": 1.3735,
"step": 187
},
{
"epoch": 0.5288326300984529,
"grad_norm": 2.349320888519287,
"learning_rate": 9.608496719016176e-06,
"loss": 1.2742,
"step": 188
},
{
"epoch": 0.5316455696202531,
"grad_norm": 2.519850730895996,
"learning_rate": 9.602622525772895e-06,
"loss": 1.4212,
"step": 189
},
{
"epoch": 0.5344585091420534,
"grad_norm": 2.0543527603149414,
"learning_rate": 9.596706414263022e-06,
"loss": 1.1391,
"step": 190
},
{
"epoch": 0.5372714486638537,
"grad_norm": 2.289496898651123,
"learning_rate": 9.59074843836739e-06,
"loss": 1.2401,
"step": 191
},
{
"epoch": 0.540084388185654,
"grad_norm": 2.350924491882324,
"learning_rate": 9.584748652348107e-06,
"loss": 1.3712,
"step": 192
},
{
"epoch": 0.5428973277074542,
"grad_norm": 2.23681640625,
"learning_rate": 9.578707110848077e-06,
"loss": 1.1505,
"step": 193
},
{
"epoch": 0.5457102672292545,
"grad_norm": 2.008516788482666,
"learning_rate": 9.572623868890482e-06,
"loss": 1.0241,
"step": 194
},
{
"epoch": 0.5485232067510548,
"grad_norm": 2.3972671031951904,
"learning_rate": 9.566498981878289e-06,
"loss": 1.4334,
"step": 195
},
{
"epoch": 0.5513361462728551,
"grad_norm": 1.9378750324249268,
"learning_rate": 9.560332505593754e-06,
"loss": 1.0679,
"step": 196
},
{
"epoch": 0.5541490857946554,
"grad_norm": 2.3928143978118896,
"learning_rate": 9.554124496197899e-06,
"loss": 1.0903,
"step": 197
},
{
"epoch": 0.5569620253164557,
"grad_norm": 2.4164905548095703,
"learning_rate": 9.547875010230009e-06,
"loss": 1.3779,
"step": 198
},
{
"epoch": 0.559774964838256,
"grad_norm": 2.0729787349700928,
"learning_rate": 9.54158410460712e-06,
"loss": 1.114,
"step": 199
},
{
"epoch": 0.5625879043600562,
"grad_norm": 1.9305024147033691,
"learning_rate": 9.535251836623491e-06,
"loss": 1.1579,
"step": 200
},
{
"epoch": 0.5625879043600562,
"eval_loss": 0.6872708797454834,
"eval_runtime": 2.8553,
"eval_samples_per_second": 9.106,
"eval_steps_per_second": 1.401,
"step": 200
},
{
"epoch": 0.5625879043600562,
"eval_active_sample_count": 30,
"eval_avg_loss": 608.5,
"eval_avg_mem_token_accuracy": 0.2198581560283688,
"eval_avg_mem_token_gt_count": 9.4,
"eval_avg_mem_token_precision": 0.007418930238123729,
"eval_avg_mem_token_rate": 0.5559103306060001,
"eval_avg_mem_token_recall(Accuracy)": 0.2198581560283688,
"eval_avg_slot_norm_mean": 197.63333333333333,
"eval_avg_slot_sim_mean": 0.996875,
"eval_global_step": 200,
"eval_loss": 0.6872708797454834,
"eval_num_samples": 30,
"eval_runtime": 2.8553,
"eval_samples_per_second": 9.106,
"eval_sim_active_sample_count": 30,
"eval_steps_per_second": 1.401,
"eval_total_correct_count": 62,
"eval_total_gt_mem_token_count": 282,
"eval_total_positions": 15033,
"eval_total_pred_mem_token_count": 8357,
"step": 200
},
{
"epoch": 0.5654008438818565,
"grad_norm": 2.2860162258148193,
"learning_rate": 9.528878263950094e-06,
"loss": 1.2892,
"step": 201
},
{
"epoch": 0.5682137834036568,
"grad_norm": 2.314282178878784,
"learning_rate": 9.522463444634075e-06,
"loss": 1.0782,
"step": 202
},
{
"epoch": 0.5710267229254571,
"grad_norm": 15.191813468933105,
"learning_rate": 9.516007437098238e-06,
"loss": 1.2559,
"step": 203
},
{
"epoch": 0.5738396624472574,
"grad_norm": 1.9443162679672241,
"learning_rate": 9.509510300140506e-06,
"loss": 0.8679,
"step": 204
},
{
"epoch": 0.5766526019690577,
"grad_norm": 2.5310826301574707,
"learning_rate": 9.502972092933384e-06,
"loss": 1.2779,
"step": 205
},
{
"epoch": 0.5794655414908579,
"grad_norm": 2.4394469261169434,
"learning_rate": 9.496392875023433e-06,
"loss": 1.1331,
"step": 206
},
{
"epoch": 0.5822784810126582,
"grad_norm": 2.40698504447937,
"learning_rate": 9.489772706330707e-06,
"loss": 1.4669,
"step": 207
},
{
"epoch": 0.5850914205344585,
"grad_norm": 2.0934903621673584,
"learning_rate": 9.483111647148223e-06,
"loss": 1.2372,
"step": 208
},
{
"epoch": 0.5879043600562588,
"grad_norm": 2.2789113521575928,
"learning_rate": 9.476409758141404e-06,
"loss": 1.3838,
"step": 209
},
{
"epoch": 0.5907172995780591,
"grad_norm": 2.0439610481262207,
"learning_rate": 9.469667100347539e-06,
"loss": 1.1897,
"step": 210
},
{
"epoch": 0.5935302390998594,
"grad_norm": 2.5594871044158936,
"learning_rate": 9.462883735175205e-06,
"loss": 1.2361,
"step": 211
},
{
"epoch": 0.5963431786216596,
"grad_norm": 2.417461395263672,
"learning_rate": 9.45605972440373e-06,
"loss": 1.3818,
"step": 212
},
{
"epoch": 0.5991561181434599,
"grad_norm": 2.030989170074463,
"learning_rate": 9.449195130182614e-06,
"loss": 1.2072,
"step": 213
},
{
"epoch": 0.6019690576652602,
"grad_norm": 1.9220385551452637,
"learning_rate": 9.442290015030974e-06,
"loss": 1.1057,
"step": 214
},
{
"epoch": 0.6047819971870605,
"grad_norm": 2.4362001419067383,
"learning_rate": 9.43534444183697e-06,
"loss": 1.3472,
"step": 215
},
{
"epoch": 0.6075949367088608,
"grad_norm": 1.9925367832183838,
"learning_rate": 9.42835847385723e-06,
"loss": 1.2851,
"step": 216
},
{
"epoch": 0.6104078762306611,
"grad_norm": 2.3182199001312256,
"learning_rate": 9.42133217471628e-06,
"loss": 1.2026,
"step": 217
},
{
"epoch": 0.6132208157524613,
"grad_norm": 2.7779831886291504,
"learning_rate": 9.414265608405956e-06,
"loss": 1.2488,
"step": 218
},
{
"epoch": 0.6160337552742616,
"grad_norm": 2.6299376487731934,
"learning_rate": 9.407158839284836e-06,
"loss": 1.3019,
"step": 219
},
{
"epoch": 0.6188466947960619,
"grad_norm": 3.4749839305877686,
"learning_rate": 9.40001193207763e-06,
"loss": 1.4892,
"step": 220
},
{
"epoch": 0.6216596343178622,
"grad_norm": 2.2574360370635986,
"learning_rate": 9.392824951874618e-06,
"loss": 1.2897,
"step": 221
},
{
"epoch": 0.6244725738396625,
"grad_norm": 2.16740083694458,
"learning_rate": 9.385597964131033e-06,
"loss": 1.2792,
"step": 222
},
{
"epoch": 0.6272855133614628,
"grad_norm": 2.0155792236328125,
"learning_rate": 9.378331034666483e-06,
"loss": 1.2584,
"step": 223
},
{
"epoch": 0.630098452883263,
"grad_norm": 2.4452121257781982,
"learning_rate": 9.371024229664342e-06,
"loss": 1.4524,
"step": 224
},
{
"epoch": 0.6329113924050633,
"grad_norm": 2.295438766479492,
"learning_rate": 9.363677615671148e-06,
"loss": 1.2677,
"step": 225
},
{
"epoch": 0.6357243319268636,
"grad_norm": 2.1375696659088135,
"learning_rate": 9.356291259596e-06,
"loss": 1.265,
"step": 226
},
{
"epoch": 0.6385372714486639,
"grad_norm": 2.3946800231933594,
"learning_rate": 9.348865228709947e-06,
"loss": 1.3528,
"step": 227
},
{
"epoch": 0.6413502109704642,
"grad_norm": 2.332805871963501,
"learning_rate": 9.341399590645373e-06,
"loss": 1.3119,
"step": 228
},
{
"epoch": 0.6441631504922645,
"grad_norm": 2.3480770587921143,
"learning_rate": 9.333894413395388e-06,
"loss": 1.33,
"step": 229
},
{
"epoch": 0.6469760900140648,
"grad_norm": 2.432349681854248,
"learning_rate": 9.326349765313199e-06,
"loss": 1.1957,
"step": 230
},
{
"epoch": 0.6497890295358649,
"grad_norm": 2.0219781398773193,
"learning_rate": 9.318765715111497e-06,
"loss": 1.2202,
"step": 231
},
{
"epoch": 0.6526019690576652,
"grad_norm": 2.8865296840667725,
"learning_rate": 9.311142331861821e-06,
"loss": 1.5149,
"step": 232
},
{
"epoch": 0.6554149085794655,
"grad_norm": 2.1823160648345947,
"learning_rate": 9.303479684993943e-06,
"loss": 1.2677,
"step": 233
},
{
"epoch": 0.6582278481012658,
"grad_norm": 2.011133909225464,
"learning_rate": 9.295777844295219e-06,
"loss": 1.0202,
"step": 234
},
{
"epoch": 0.6610407876230661,
"grad_norm": 2.2680437564849854,
"learning_rate": 9.288036879909967e-06,
"loss": 1.2755,
"step": 235
},
{
"epoch": 0.6638537271448663,
"grad_norm": 2.297574520111084,
"learning_rate": 9.280256862338822e-06,
"loss": 1.2567,
"step": 236
},
{
"epoch": 0.6666666666666666,
"grad_norm": 2.2774109840393066,
"learning_rate": 9.272437862438095e-06,
"loss": 1.1645,
"step": 237
},
{
"epoch": 0.6694796061884669,
"grad_norm": 2.4613051414489746,
"learning_rate": 9.264579951419126e-06,
"loss": 1.3841,
"step": 238
},
{
"epoch": 0.6722925457102672,
"grad_norm": 2.2511165142059326,
"learning_rate": 9.256683200847638e-06,
"loss": 1.2692,
"step": 239
},
{
"epoch": 0.6751054852320675,
"grad_norm": 2.209132432937622,
"learning_rate": 9.248747682643085e-06,
"loss": 1.2905,
"step": 240
},
{
"epoch": 0.6779184247538678,
"grad_norm": 2.3346107006073,
"learning_rate": 9.240773469077994e-06,
"loss": 1.189,
"step": 241
},
{
"epoch": 0.680731364275668,
"grad_norm": 2.3697586059570312,
"learning_rate": 9.232760632777311e-06,
"loss": 1.236,
"step": 242
},
{
"epoch": 0.6835443037974683,
"grad_norm": 2.7163619995117188,
"learning_rate": 9.22470924671774e-06,
"loss": 1.3411,
"step": 243
},
{
"epoch": 0.6863572433192686,
"grad_norm": 2.210554838180542,
"learning_rate": 9.216619384227068e-06,
"loss": 1.2791,
"step": 244
},
{
"epoch": 0.6891701828410689,
"grad_norm": 2.2112317085266113,
"learning_rate": 9.208491118983515e-06,
"loss": 1.2984,
"step": 245
},
{
"epoch": 0.6919831223628692,
"grad_norm": 2.247898817062378,
"learning_rate": 9.200324525015046e-06,
"loss": 1.2766,
"step": 246
},
{
"epoch": 0.6947960618846695,
"grad_norm": 2.2993924617767334,
"learning_rate": 9.192119676698703e-06,
"loss": 1.1908,
"step": 247
},
{
"epoch": 0.6976090014064698,
"grad_norm": 2.4729530811309814,
"learning_rate": 9.183876648759937e-06,
"loss": 1.364,
"step": 248
},
{
"epoch": 0.70042194092827,
"grad_norm": 2.201533794403076,
"learning_rate": 9.175595516271911e-06,
"loss": 1.344,
"step": 249
},
{
"epoch": 0.7032348804500703,
"grad_norm": 2.3106961250305176,
"learning_rate": 9.167276354654827e-06,
"loss": 1.313,
"step": 250
},
{
"epoch": 0.7032348804500703,
"eval_loss": 0.6741299033164978,
"eval_runtime": 2.8499,
"eval_samples_per_second": 9.123,
"eval_steps_per_second": 1.404,
"step": 250
},
{
"epoch": 0.7032348804500703,
"eval_active_sample_count": 30,
"eval_avg_loss": 590.625,
"eval_avg_mem_token_accuracy": 0.23049645390070922,
"eval_avg_mem_token_gt_count": 9.4,
"eval_avg_mem_token_precision": 0.007840772014475271,
"eval_avg_mem_token_rate": 0.5514534690347901,
"eval_avg_mem_token_recall(Accuracy)": 0.23049645390070922,
"eval_avg_slot_norm_mean": 197.63333333333333,
"eval_avg_slot_sim_mean": 0.996875,
"eval_global_step": 250,
"eval_loss": 0.6741299033164978,
"eval_num_samples": 30,
"eval_runtime": 2.8499,
"eval_samples_per_second": 9.123,
"eval_sim_active_sample_count": 30,
"eval_steps_per_second": 1.404,
"eval_total_correct_count": 65,
"eval_total_gt_mem_token_count": 282,
"eval_total_positions": 15033,
"eval_total_pred_mem_token_count": 8290,
"step": 250
},
{
"epoch": 0.7060478199718706,
"grad_norm": 2.2657763957977295,
"learning_rate": 9.158919239675237e-06,
"loss": 0.9924,
"step": 251
},
{
"epoch": 0.7088607594936709,
"grad_norm": 2.8294458389282227,
"learning_rate": 9.150524247445346e-06,
"loss": 1.5447,
"step": 252
},
{
"epoch": 0.7116736990154712,
"grad_norm": 2.327502489089966,
"learning_rate": 9.14209145442234e-06,
"loss": 1.3784,
"step": 253
},
{
"epoch": 0.7144866385372715,
"grad_norm": 2.2193102836608887,
"learning_rate": 9.133620937407656e-06,
"loss": 1.2874,
"step": 254
},
{
"epoch": 0.7172995780590717,
"grad_norm": 2.400413990020752,
"learning_rate": 9.125112773546315e-06,
"loss": 1.2711,
"step": 255
},
{
"epoch": 0.720112517580872,
"grad_norm": 2.1976544857025146,
"learning_rate": 9.1165670403262e-06,
"loss": 1.399,
"step": 256
},
{
"epoch": 0.7229254571026723,
"grad_norm": 2.2996156215667725,
"learning_rate": 9.107983815577359e-06,
"loss": 1.4082,
"step": 257
},
{
"epoch": 0.7257383966244726,
"grad_norm": 2.307288408279419,
"learning_rate": 9.09936317747129e-06,
"loss": 1.275,
"step": 258
},
{
"epoch": 0.7285513361462729,
"grad_norm": 2.204585552215576,
"learning_rate": 9.090705204520231e-06,
"loss": 1.3542,
"step": 259
},
{
"epoch": 0.7313642756680732,
"grad_norm": 2.3391809463500977,
"learning_rate": 9.082009975576452e-06,
"loss": 1.231,
"step": 260
},
{
"epoch": 0.7341772151898734,
"grad_norm": 2.5154929161071777,
"learning_rate": 9.073277569831526e-06,
"loss": 1.3549,
"step": 261
},
{
"epoch": 0.7369901547116737,
"grad_norm": 2.1306750774383545,
"learning_rate": 9.064508066815614e-06,
"loss": 1.1,
"step": 262
},
{
"epoch": 0.739803094233474,
"grad_norm": 1.9493396282196045,
"learning_rate": 9.05570154639674e-06,
"loss": 1.0767,
"step": 263
},
{
"epoch": 0.7426160337552743,
"grad_norm": 2.2229723930358887,
"learning_rate": 9.046858088780064e-06,
"loss": 1.1945,
"step": 264
},
{
"epoch": 0.7454289732770746,
"grad_norm": 2.0410044193267822,
"learning_rate": 9.03797777450715e-06,
"loss": 1.2284,
"step": 265
},
{
"epoch": 0.7482419127988749,
"grad_norm": 2.533954381942749,
"learning_rate": 9.02906068445523e-06,
"loss": 1.4345,
"step": 266
},
{
"epoch": 0.7510548523206751,
"grad_norm": 2.324066162109375,
"learning_rate": 9.020106899836471e-06,
"loss": 1.2716,
"step": 267
},
{
"epoch": 0.7538677918424754,
"grad_norm": 2.0535366535186768,
"learning_rate": 9.011116502197243e-06,
"loss": 1.1823,
"step": 268
},
{
"epoch": 0.7566807313642757,
"grad_norm": 2.3328094482421875,
"learning_rate": 9.002089573417356e-06,
"loss": 1.2959,
"step": 269
},
{
"epoch": 0.759493670886076,
"grad_norm": 2.3262429237365723,
"learning_rate": 8.993026195709337e-06,
"loss": 0.965,
"step": 270
},
{
"epoch": 0.7623066104078763,
"grad_norm": 2.247913122177124,
"learning_rate": 8.983926451617664e-06,
"loss": 1.291,
"step": 271
},
{
"epoch": 0.7651195499296765,
"grad_norm": 2.140726089477539,
"learning_rate": 8.974790424018022e-06,
"loss": 1.2708,
"step": 272
},
{
"epoch": 0.7679324894514767,
"grad_norm": 2.0828731060028076,
"learning_rate": 8.96561819611655e-06,
"loss": 1.2937,
"step": 273
},
{
"epoch": 0.770745428973277,
"grad_norm": 2.237555742263794,
"learning_rate": 8.956409851449076e-06,
"loss": 1.1241,
"step": 274
},
{
"epoch": 0.7735583684950773,
"grad_norm": 1.906575083732605,
"learning_rate": 8.947165473880364e-06,
"loss": 1.0149,
"step": 275
},
{
"epoch": 0.7763713080168776,
"grad_norm": 2.204448699951172,
"learning_rate": 8.937885147603345e-06,
"loss": 1.2036,
"step": 276
},
{
"epoch": 0.7791842475386779,
"grad_norm": 2.151160717010498,
"learning_rate": 8.928568957138356e-06,
"loss": 1.2992,
"step": 277
},
{
"epoch": 0.7819971870604782,
"grad_norm": 2.286642551422119,
"learning_rate": 8.919216987332357e-06,
"loss": 1.2701,
"step": 278
},
{
"epoch": 0.7848101265822784,
"grad_norm": 3.3560984134674072,
"learning_rate": 8.909829323358177e-06,
"loss": 1.3486,
"step": 279
},
{
"epoch": 0.7876230661040787,
"grad_norm": 1.9844144582748413,
"learning_rate": 8.900406050713723e-06,
"loss": 0.967,
"step": 280
},
{
"epoch": 0.790436005625879,
"grad_norm": 2.1631999015808105,
"learning_rate": 8.89094725522121e-06,
"loss": 1.2139,
"step": 281
},
{
"epoch": 0.7932489451476793,
"grad_norm": 2.1446194648742676,
"learning_rate": 8.881453023026373e-06,
"loss": 1.2743,
"step": 282
},
{
"epoch": 0.7960618846694796,
"grad_norm": 1.9020416736602783,
"learning_rate": 8.871923440597694e-06,
"loss": 1.0834,
"step": 283
},
{
"epoch": 0.7988748241912799,
"grad_norm": 2.1618247032165527,
"learning_rate": 8.862358594725595e-06,
"loss": 1.151,
"step": 284
},
{
"epoch": 0.8016877637130801,
"grad_norm": 2.3456199169158936,
"learning_rate": 8.852758572521666e-06,
"loss": 1.206,
"step": 285
},
{
"epoch": 0.8045007032348804,
"grad_norm": 2.2839531898498535,
"learning_rate": 8.843123461417864e-06,
"loss": 1.248,
"step": 286
},
{
"epoch": 0.8073136427566807,
"grad_norm": 2.277515411376953,
"learning_rate": 8.833453349165713e-06,
"loss": 1.3061,
"step": 287
},
{
"epoch": 0.810126582278481,
"grad_norm": 2.3145205974578857,
"learning_rate": 8.823748323835517e-06,
"loss": 1.4309,
"step": 288
},
{
"epoch": 0.8129395218002813,
"grad_norm": 2.298470973968506,
"learning_rate": 8.814008473815542e-06,
"loss": 1.1581,
"step": 289
},
{
"epoch": 0.8157524613220816,
"grad_norm": 2.4578652381896973,
"learning_rate": 8.804233887811224e-06,
"loss": 1.328,
"step": 290
},
{
"epoch": 0.8185654008438819,
"grad_norm": 2.162040948867798,
"learning_rate": 8.794424654844352e-06,
"loss": 1.041,
"step": 291
},
{
"epoch": 0.8213783403656821,
"grad_norm": 2.1940865516662598,
"learning_rate": 8.784580864252266e-06,
"loss": 1.2024,
"step": 292
},
{
"epoch": 0.8241912798874824,
"grad_norm": 2.127418041229248,
"learning_rate": 8.774702605687036e-06,
"loss": 1.1357,
"step": 293
},
{
"epoch": 0.8270042194092827,
"grad_norm": 2.259040355682373,
"learning_rate": 8.764789969114647e-06,
"loss": 1.2494,
"step": 294
},
{
"epoch": 0.829817158931083,
"grad_norm": 2.398115634918213,
"learning_rate": 8.754843044814183e-06,
"loss": 1.3409,
"step": 295
},
{
"epoch": 0.8326300984528833,
"grad_norm": 1.94135320186615,
"learning_rate": 8.744861923377e-06,
"loss": 1.0011,
"step": 296
},
{
"epoch": 0.8354430379746836,
"grad_norm": 2.3360581398010254,
"learning_rate": 8.734846695705912e-06,
"loss": 1.3973,
"step": 297
},
{
"epoch": 0.8382559774964838,
"grad_norm": 2.0555343627929688,
"learning_rate": 8.724797453014342e-06,
"loss": 1.0796,
"step": 298
},
{
"epoch": 0.8410689170182841,
"grad_norm": 2.26999831199646,
"learning_rate": 8.714714286825512e-06,
"loss": 1.2569,
"step": 299
},
{
"epoch": 0.8438818565400844,
"grad_norm": 2.004324197769165,
"learning_rate": 8.704597288971598e-06,
"loss": 1.1934,
"step": 300
},
{
"epoch": 0.8438818565400844,
"eval_loss": 0.6666268110275269,
"eval_runtime": 2.761,
"eval_samples_per_second": 9.417,
"eval_steps_per_second": 1.449,
"step": 300
},
{
"epoch": 0.8438818565400844,
"eval_active_sample_count": 30,
"eval_avg_loss": 590.375,
"eval_avg_mem_token_accuracy": 0.22340425531914893,
"eval_avg_mem_token_gt_count": 9.4,
"eval_avg_mem_token_precision": 0.007591276057356308,
"eval_avg_mem_token_rate": 0.5520521519324153,
"eval_avg_mem_token_recall(Accuracy)": 0.22340425531914893,
"eval_avg_slot_norm_mean": 197.63333333333333,
"eval_avg_slot_sim_mean": 0.996875,
"eval_global_step": 300,
"eval_loss": 0.6666268110275269,
"eval_num_samples": 30,
"eval_runtime": 2.761,
"eval_samples_per_second": 9.417,
"eval_sim_active_sample_count": 30,
"eval_steps_per_second": 1.449,
"eval_total_correct_count": 63,
"eval_total_gt_mem_token_count": 282,
"eval_total_positions": 15033,
"eval_total_pred_mem_token_count": 8299,
"step": 300
},
{
"epoch": 0.8466947960618847,
"grad_norm": 2.1731441020965576,
"learning_rate": 8.6944465515929e-06,
"loss": 1.1642,
"step": 301
},
{
"epoch": 0.849507735583685,
"grad_norm": 1.9805549383163452,
"learning_rate": 8.684262167136999e-06,
"loss": 1.1963,
"step": 302
},
{
"epoch": 0.8523206751054853,
"grad_norm": 1.985160231590271,
"learning_rate": 8.674044228357915e-06,
"loss": 1.0271,
"step": 303
},
{
"epoch": 0.8551336146272855,
"grad_norm": 2.233934164047241,
"learning_rate": 8.663792828315259e-06,
"loss": 1.3379,
"step": 304
},
{
"epoch": 0.8579465541490858,
"grad_norm": 2.1742870807647705,
"learning_rate": 8.6535080603734e-06,
"loss": 1.2982,
"step": 305
},
{
"epoch": 0.8607594936708861,
"grad_norm": 2.2393639087677,
"learning_rate": 8.643190018200595e-06,
"loss": 1.2925,
"step": 306
},
{
"epoch": 0.8635724331926864,
"grad_norm": 2.395679473876953,
"learning_rate": 8.632838795768149e-06,
"loss": 1.3027,
"step": 307
},
{
"epoch": 0.8663853727144867,
"grad_norm": 1.976331353187561,
"learning_rate": 8.622454487349556e-06,
"loss": 1.1242,
"step": 308
},
{
"epoch": 0.869198312236287,
"grad_norm": 2.1286044120788574,
"learning_rate": 8.612037187519635e-06,
"loss": 1.1868,
"step": 309
},
{
"epoch": 0.8720112517580872,
"grad_norm": 2.2224793434143066,
"learning_rate": 8.601586991153681e-06,
"loss": 1.2595,
"step": 310
},
{
"epoch": 0.8748241912798875,
"grad_norm": 2.282410144805908,
"learning_rate": 8.591103993426588e-06,
"loss": 1.1068,
"step": 311
},
{
"epoch": 0.8776371308016878,
"grad_norm": 2.000074625015259,
"learning_rate": 8.580588289811987e-06,
"loss": 1.1547,
"step": 312
},
{
"epoch": 0.8804500703234881,
"grad_norm": 2.108109474182129,
"learning_rate": 8.570039976081382e-06,
"loss": 1.1654,
"step": 313
},
{
"epoch": 0.8832630098452883,
"grad_norm": 2.2698593139648438,
"learning_rate": 8.559459148303268e-06,
"loss": 1.0082,
"step": 314
},
{
"epoch": 0.8860759493670886,
"grad_norm": 2.04703426361084,
"learning_rate": 8.548845902842264e-06,
"loss": 1.2114,
"step": 315
},
{
"epoch": 0.8888888888888888,
"grad_norm": 1.9669705629348755,
"learning_rate": 8.538200336358227e-06,
"loss": 1.0822,
"step": 316
},
{
"epoch": 0.8917018284106891,
"grad_norm": 2.058732271194458,
"learning_rate": 8.527522545805386e-06,
"loss": 1.056,
"step": 317
},
{
"epoch": 0.8945147679324894,
"grad_norm": 2.1475107669830322,
"learning_rate": 8.51681262843144e-06,
"loss": 1.2073,
"step": 318
},
{
"epoch": 0.8973277074542897,
"grad_norm": 1.9537756443023682,
"learning_rate": 8.50607068177669e-06,
"loss": 1.026,
"step": 319
},
{
"epoch": 0.90014064697609,
"grad_norm": 2.14225172996521,
"learning_rate": 8.495296803673138e-06,
"loss": 1.3038,
"step": 320
},
{
"epoch": 0.9029535864978903,
"grad_norm": 2.2561981678009033,
"learning_rate": 8.484491092243603e-06,
"loss": 1.0576,
"step": 321
},
{
"epoch": 0.9057665260196905,
"grad_norm": 1.9777567386627197,
"learning_rate": 8.473653645900825e-06,
"loss": 1.1675,
"step": 322
},
{
"epoch": 0.9085794655414908,
"grad_norm": 2.2552154064178467,
"learning_rate": 8.462784563346567e-06,
"loss": 1.2568,
"step": 323
},
{
"epoch": 0.9113924050632911,
"grad_norm": 2.19797945022583,
"learning_rate": 8.451883943570722e-06,
"loss": 1.1247,
"step": 324
},
{
"epoch": 0.9142053445850914,
"grad_norm": 2.176769971847534,
"learning_rate": 8.440951885850402e-06,
"loss": 1.0333,
"step": 325
},
{
"epoch": 0.9170182841068917,
"grad_norm": 2.011472463607788,
"learning_rate": 8.429988489749045e-06,
"loss": 1.2882,
"step": 326
},
{
"epoch": 0.919831223628692,
"grad_norm": 2.276411294937134,
"learning_rate": 8.418993855115498e-06,
"loss": 1.2682,
"step": 327
},
{
"epoch": 0.9226441631504922,
"grad_norm": 1.9374414682388306,
"learning_rate": 8.407968082083116e-06,
"loss": 1.198,
"step": 328
},
{
"epoch": 0.9254571026722925,
"grad_norm": 2.0080978870391846,
"learning_rate": 8.396911271068842e-06,
"loss": 1.0495,
"step": 329
},
{
"epoch": 0.9282700421940928,
"grad_norm": 2.410945415496826,
"learning_rate": 8.385823522772299e-06,
"loss": 1.3558,
"step": 330
},
{
"epoch": 0.9310829817158931,
"grad_norm": 2.205632448196411,
"learning_rate": 8.37470493817487e-06,
"loss": 1.1552,
"step": 331
},
{
"epoch": 0.9338959212376934,
"grad_norm": 1.9957945346832275,
"learning_rate": 8.36355561853878e-06,
"loss": 1.2074,
"step": 332
},
{
"epoch": 0.9367088607594937,
"grad_norm": 1.889917254447937,
"learning_rate": 8.352375665406171e-06,
"loss": 0.8613,
"step": 333
},
{
"epoch": 0.939521800281294,
"grad_norm": 2.4653337001800537,
"learning_rate": 8.341165180598182e-06,
"loss": 1.3945,
"step": 334
},
{
"epoch": 0.9423347398030942,
"grad_norm": 2.15743088722229,
"learning_rate": 8.32992426621401e-06,
"loss": 1.1899,
"step": 335
},
{
"epoch": 0.9451476793248945,
"grad_norm": 2.014369010925293,
"learning_rate": 8.318653024629999e-06,
"loss": 1.2004,
"step": 336
},
{
"epoch": 0.9479606188466948,
"grad_norm": 2.475370168685913,
"learning_rate": 8.307351558498692e-06,
"loss": 1.0919,
"step": 337
},
{
"epoch": 0.9507735583684951,
"grad_norm": 2.288590669631958,
"learning_rate": 8.296019970747901e-06,
"loss": 1.054,
"step": 338
},
{
"epoch": 0.9535864978902954,
"grad_norm": 2.0414512157440186,
"learning_rate": 8.284658364579771e-06,
"loss": 1.2336,
"step": 339
},
{
"epoch": 0.9563994374120957,
"grad_norm": 2.192631483078003,
"learning_rate": 8.27326684346984e-06,
"loss": 1.2078,
"step": 340
},
{
"epoch": 0.9592123769338959,
"grad_norm": 2.109923839569092,
"learning_rate": 8.261845511166092e-06,
"loss": 1.2295,
"step": 341
},
{
"epoch": 0.9620253164556962,
"grad_norm": 1.7825968265533447,
"learning_rate": 8.250394471688018e-06,
"loss": 1.1074,
"step": 342
},
{
"epoch": 0.9648382559774965,
"grad_norm": 1.9041146039962769,
"learning_rate": 8.23891382932567e-06,
"loss": 1.1283,
"step": 343
},
{
"epoch": 0.9676511954992968,
"grad_norm": 2.0874454975128174,
"learning_rate": 8.2274036886387e-06,
"loss": 1.1228,
"step": 344
},
{
"epoch": 0.9704641350210971,
"grad_norm": 1.9520052671432495,
"learning_rate": 8.215864154455421e-06,
"loss": 1.2209,
"step": 345
},
{
"epoch": 0.9732770745428974,
"grad_norm": 2.6171762943267822,
"learning_rate": 8.204295331871844e-06,
"loss": 1.6231,
"step": 346
},
{
"epoch": 0.9760900140646976,
"grad_norm": 2.0320959091186523,
"learning_rate": 8.192697326250722e-06,
"loss": 1.153,
"step": 347
},
{
"epoch": 0.9789029535864979,
"grad_norm": 1.8297227621078491,
"learning_rate": 8.1810702432206e-06,
"loss": 0.9717,
"step": 348
},
{
"epoch": 0.9817158931082982,
"grad_norm": 2.077699661254883,
"learning_rate": 8.169414188674829e-06,
"loss": 0.9804,
"step": 349
},
{
"epoch": 0.9845288326300985,
"grad_norm": 2.002263069152832,
"learning_rate": 8.157729268770636e-06,
"loss": 1.1233,
"step": 350
},
{
"epoch": 0.9845288326300985,
"eval_loss": 0.6594013571739197,
"eval_runtime": 2.8213,
"eval_samples_per_second": 9.216,
"eval_steps_per_second": 1.418,
"step": 350
},
{
"epoch": 0.9845288326300985,
"eval_active_sample_count": 30,
"eval_avg_loss": 591.0,
"eval_avg_mem_token_accuracy": 0.22340425531914893,
"eval_avg_mem_token_gt_count": 9.4,
"eval_avg_mem_token_precision": 0.007495538370017847,
"eval_avg_mem_token_rate": 0.5591033060600014,
"eval_avg_mem_token_recall(Accuracy)": 0.22340425531914893,
"eval_avg_slot_norm_mean": 197.63333333333333,
"eval_avg_slot_sim_mean": 0.996875,
"eval_global_step": 350,
"eval_loss": 0.6594013571739197,
"eval_num_samples": 30,
"eval_runtime": 2.8213,
"eval_samples_per_second": 9.216,
"eval_sim_active_sample_count": 30,
"eval_steps_per_second": 1.418,
"eval_total_correct_count": 63,
"eval_total_gt_mem_token_count": 282,
"eval_total_positions": 15033,
"eval_total_pred_mem_token_count": 8405,
"step": 350
},
{
"epoch": 0.9873417721518988,
"grad_norm": 2.112032890319824,
"learning_rate": 8.146015589928123e-06,
"loss": 1.1559,
"step": 351
},
{
"epoch": 0.9901547116736991,
"grad_norm": 2.227578639984131,
"learning_rate": 8.134273258829322e-06,
"loss": 1.2947,
"step": 352
},
{
"epoch": 0.9929676511954993,
"grad_norm": 2.0214011669158936,
"learning_rate": 8.122502382417211e-06,
"loss": 1.3415,
"step": 353
},
{
"epoch": 0.9957805907172996,
"grad_norm": 2.176740884780884,
"learning_rate": 8.110703067894747e-06,
"loss": 1.3129,
"step": 354
},
{
"epoch": 0.9985935302390999,
"grad_norm": 1.9748849868774414,
"learning_rate": 8.098875422723884e-06,
"loss": 1.0268,
"step": 355
},
{
"epoch": 1.0,
"grad_norm": 1.5412670373916626,
"learning_rate": 8.087019554624595e-06,
"loss": 0.657,
"step": 356
},
{
"epoch": 1.0028129395218002,
"grad_norm": 2.013446092605591,
"learning_rate": 8.075135571573898e-06,
"loss": 1.1009,
"step": 357
},
{
"epoch": 1.0056258790436006,
"grad_norm": 2.034468412399292,
"learning_rate": 8.06322358180486e-06,
"loss": 1.1514,
"step": 358
},
{
"epoch": 1.0084388185654007,
"grad_norm": 2.1513798236846924,
"learning_rate": 8.051283693805624e-06,
"loss": 1.1312,
"step": 359
},
{
"epoch": 1.0112517580872011,
"grad_norm": 1.8825079202651978,
"learning_rate": 8.039316016318415e-06,
"loss": 0.9748,
"step": 360
},
{
"epoch": 1.0140646976090013,
"grad_norm": 2.040106773376465,
"learning_rate": 8.027320658338547e-06,
"loss": 1.2061,
"step": 361
},
{
"epoch": 1.0168776371308017,
"grad_norm": 2.0149614810943604,
"learning_rate": 8.015297729113436e-06,
"loss": 1.0372,
"step": 362
},
{
"epoch": 1.0196905766526019,
"grad_norm": 1.8744758367538452,
"learning_rate": 8.0032473381416e-06,
"loss": 1.1538,
"step": 363
},
{
"epoch": 1.0225035161744023,
"grad_norm": 2.2196671962738037,
"learning_rate": 7.991169595171669e-06,
"loss": 1.1131,
"step": 364
},
{
"epoch": 1.0253164556962024,
"grad_norm": 2.2530996799468994,
"learning_rate": 7.979064610201372e-06,
"loss": 1.3786,
"step": 365
},
{
"epoch": 1.0281293952180028,
"grad_norm": 2.0854427814483643,
"learning_rate": 7.966932493476554e-06,
"loss": 1.0615,
"step": 366
},
{
"epoch": 1.030942334739803,
"grad_norm": 2.3596975803375244,
"learning_rate": 7.954773355490155e-06,
"loss": 1.366,
"step": 367
},
{
"epoch": 1.0337552742616034,
"grad_norm": 1.9892560243606567,
"learning_rate": 7.942587306981213e-06,
"loss": 1.0439,
"step": 368
},
{
"epoch": 1.0365682137834036,
"grad_norm": 1.8899530172348022,
"learning_rate": 7.930374458933852e-06,
"loss": 1.0212,
"step": 369
},
{
"epoch": 1.039381153305204,
"grad_norm": 2.1707684993743896,
"learning_rate": 7.918134922576271e-06,
"loss": 1.1767,
"step": 370
},
{
"epoch": 1.0421940928270041,
"grad_norm": 2.041611671447754,
"learning_rate": 7.905868809379735e-06,
"loss": 1.2155,
"step": 371
},
{
"epoch": 1.0450070323488045,
"grad_norm": 1.939260482788086,
"learning_rate": 7.893576231057553e-06,
"loss": 1.0179,
"step": 372
},
{
"epoch": 1.0478199718706047,
"grad_norm": 1.9848639965057373,
"learning_rate": 7.88125729956407e-06,
"loss": 1.0099,
"step": 373
},
{
"epoch": 1.0506329113924051,
"grad_norm": 2.0023953914642334,
"learning_rate": 7.868912127093638e-06,
"loss": 1.119,
"step": 374
},
{
"epoch": 1.0534458509142053,
"grad_norm": 1.7961069345474243,
"learning_rate": 7.856540826079595e-06,
"loss": 0.7417,
"step": 375
},
{
"epoch": 1.0562587904360057,
"grad_norm": 1.8289830684661865,
"learning_rate": 7.844143509193252e-06,
"loss": 1.0566,
"step": 376
},
{
"epoch": 1.0590717299578059,
"grad_norm": 1.8681098222732544,
"learning_rate": 7.831720289342853e-06,
"loss": 0.9817,
"step": 377
},
{
"epoch": 1.0618846694796062,
"grad_norm": 1.9967904090881348,
"learning_rate": 7.819271279672553e-06,
"loss": 0.9361,
"step": 378
},
{
"epoch": 1.0646976090014064,
"grad_norm": 1.7474114894866943,
"learning_rate": 7.806796593561389e-06,
"loss": 0.9923,
"step": 379
},
{
"epoch": 1.0675105485232068,
"grad_norm": 2.514089822769165,
"learning_rate": 7.794296344622246e-06,
"loss": 1.2647,
"step": 380
},
{
"epoch": 1.070323488045007,
"grad_norm": 2.2486379146575928,
"learning_rate": 7.78177064670082e-06,
"loss": 1.1741,
"step": 381
},
{
"epoch": 1.0731364275668074,
"grad_norm": 2.0108935832977295,
"learning_rate": 7.769219613874581e-06,
"loss": 1.0724,
"step": 382
},
{
"epoch": 1.0759493670886076,
"grad_norm": 2.316124677658081,
"learning_rate": 7.756643360451744e-06,
"loss": 1.2943,
"step": 383
},
{
"epoch": 1.078762306610408,
"grad_norm": 2.3428173065185547,
"learning_rate": 7.744042000970207e-06,
"loss": 1.2522,
"step": 384
},
{
"epoch": 1.0815752461322081,
"grad_norm": 2.087315797805786,
"learning_rate": 7.731415650196535e-06,
"loss": 1.0241,
"step": 385
},
{
"epoch": 1.0843881856540085,
"grad_norm": 2.1546409130096436,
"learning_rate": 7.718764423124892e-06,
"loss": 1.2256,
"step": 386
},
{
"epoch": 1.0872011251758087,
"grad_norm": 2.559561252593994,
"learning_rate": 7.706088434976e-06,
"loss": 1.4538,
"step": 387
},
{
"epoch": 1.090014064697609,
"grad_norm": 2.023336410522461,
"learning_rate": 7.6933878011961e-06,
"loss": 1.1043,
"step": 388
},
{
"epoch": 1.0928270042194093,
"grad_norm": 2.1914350986480713,
"learning_rate": 7.68066263745589e-06,
"loss": 1.1997,
"step": 389
},
{
"epoch": 1.0956399437412097,
"grad_norm": 1.8683468103408813,
"learning_rate": 7.667913059649468e-06,
"loss": 1.0576,
"step": 390
},
{
"epoch": 1.0984528832630098,
"grad_norm": 2.2158288955688477,
"learning_rate": 7.65513918389329e-06,
"loss": 1.2133,
"step": 391
},
{
"epoch": 1.1012658227848102,
"grad_norm": 2.4496500492095947,
"learning_rate": 7.6423411265251e-06,
"loss": 1.309,
"step": 392
},
{
"epoch": 1.1040787623066104,
"grad_norm": 2.3594353199005127,
"learning_rate": 7.629519004102876e-06,
"loss": 1.2893,
"step": 393
},
{
"epoch": 1.1068917018284108,
"grad_norm": 2.0072391033172607,
"learning_rate": 7.616672933403772e-06,
"loss": 0.9854,
"step": 394
},
{
"epoch": 1.109704641350211,
"grad_norm": 2.1165082454681396,
"learning_rate": 7.603803031423046e-06,
"loss": 1.0648,
"step": 395
},
{
"epoch": 1.1125175808720114,
"grad_norm": 2.136019229888916,
"learning_rate": 7.590909415373e-06,
"loss": 1.2763,
"step": 396
},
{
"epoch": 1.1153305203938115,
"grad_norm": 2.089963912963867,
"learning_rate": 7.577992202681912e-06,
"loss": 1.1498,
"step": 397
},
{
"epoch": 1.1181434599156117,
"grad_norm": 2.0347511768341064,
"learning_rate": 7.565051510992964e-06,
"loss": 1.0931,
"step": 398
},
{
"epoch": 1.120956399437412,
"grad_norm": 1.902830958366394,
"learning_rate": 7.552087458163177e-06,
"loss": 1.0382,
"step": 399
},
{
"epoch": 1.1237693389592125,
"grad_norm": 2.3222129344940186,
"learning_rate": 7.539100162262325e-06,
"loss": 1.3173,
"step": 400
},
{
"epoch": 1.1237693389592125,
"eval_loss": 0.6541261672973633,
"eval_runtime": 2.7652,
"eval_samples_per_second": 9.403,
"eval_steps_per_second": 1.447,
"step": 400
},
{
"epoch": 1.1237693389592125,
"eval_active_sample_count": 30,
"eval_avg_loss": 579.625,
"eval_avg_mem_token_accuracy": 0.23049645390070922,
"eval_avg_mem_token_gt_count": 9.4,
"eval_avg_mem_token_precision": 0.007830381881701,
"eval_avg_mem_token_rate": 0.552185192576332,
"eval_avg_mem_token_recall(Accuracy)": 0.23049645390070922,
"eval_avg_slot_norm_mean": 197.63333333333333,
"eval_avg_slot_sim_mean": 0.996875,
"eval_global_step": 400,
"eval_loss": 0.6541261672973633,
"eval_num_samples": 30,
"eval_runtime": 2.7652,
"eval_samples_per_second": 9.403,
"eval_sim_active_sample_count": 30,
"eval_steps_per_second": 1.447,
"eval_total_correct_count": 65,
"eval_total_gt_mem_token_count": 282,
"eval_total_positions": 15033,
"eval_total_pred_mem_token_count": 8301,
"step": 400
},
{
"epoch": 1.1265822784810127,
"grad_norm": 2.1172144412994385,
"learning_rate": 7.526089741571876e-06,
"loss": 1.2135,
"step": 401
},
{
"epoch": 1.1293952180028128,
"grad_norm": 2.117197036743164,
"learning_rate": 7.5130563145838994e-06,
"loss": 1.2903,
"step": 402
},
{
"epoch": 1.1322081575246132,
"grad_norm": 1.8641384840011597,
"learning_rate": 7.500000000000001e-06,
"loss": 0.902,
"step": 403
},
{
"epoch": 1.1350210970464134,
"grad_norm": 2.043870449066162,
"learning_rate": 7.486920916730228e-06,
"loss": 1.14,
"step": 404
},
{
"epoch": 1.1378340365682138,
"grad_norm": 2.371406078338623,
"learning_rate": 7.473819183891997e-06,
"loss": 1.168,
"step": 405
},
{
"epoch": 1.140646976090014,
"grad_norm": 2.017378807067871,
"learning_rate": 7.460694920809004e-06,
"loss": 1.2308,
"step": 406
},
{
"epoch": 1.1434599156118144,
"grad_norm": 3.638538122177124,
"learning_rate": 7.447548247010137e-06,
"loss": 1.1636,
"step": 407
},
{
"epoch": 1.1462728551336145,
"grad_norm": 1.9470067024230957,
"learning_rate": 7.434379282228393e-06,
"loss": 1.1502,
"step": 408
},
{
"epoch": 1.149085794655415,
"grad_norm": 2.1175174713134766,
"learning_rate": 7.421188146399776e-06,
"loss": 1.0217,
"step": 409
},
{
"epoch": 1.1518987341772151,
"grad_norm": 1.9489398002624512,
"learning_rate": 7.407974959662222e-06,
"loss": 1.223,
"step": 410
},
{
"epoch": 1.1547116736990155,
"grad_norm": 2.227391242980957,
"learning_rate": 7.394739842354489e-06,
"loss": 1.1757,
"step": 411
},
{
"epoch": 1.1575246132208157,
"grad_norm": 1.961480736732483,
"learning_rate": 7.381482915015068e-06,
"loss": 1.1204,
"step": 412
},
{
"epoch": 1.160337552742616,
"grad_norm": 1.8854504823684692,
"learning_rate": 7.368204298381085e-06,
"loss": 1.0732,
"step": 413
},
{
"epoch": 1.1631504922644162,
"grad_norm": 2.4665989875793457,
"learning_rate": 7.3549041133872004e-06,
"loss": 1.2208,
"step": 414
},
{
"epoch": 1.1659634317862166,
"grad_norm": 2.293067216873169,
"learning_rate": 7.341582481164508e-06,
"loss": 1.0995,
"step": 415
},
{
"epoch": 1.1687763713080168,
"grad_norm": 1.636135458946228,
"learning_rate": 7.328239523039431e-06,
"loss": 1.0113,
"step": 416
},
{
"epoch": 1.1715893108298172,
"grad_norm": 2.080463171005249,
"learning_rate": 7.314875360532618e-06,
"loss": 1.2187,
"step": 417
},
{
"epoch": 1.1744022503516174,
"grad_norm": 2.316681146621704,
"learning_rate": 7.301490115357837e-06,
"loss": 1.0254,
"step": 418
},
{
"epoch": 1.1772151898734178,
"grad_norm": 1.9154740571975708,
"learning_rate": 7.288083909420866e-06,
"loss": 1.0994,
"step": 419
},
{
"epoch": 1.180028129395218,
"grad_norm": 2.2701125144958496,
"learning_rate": 7.274656864818379e-06,
"loss": 1.193,
"step": 420
},
{
"epoch": 1.1828410689170183,
"grad_norm": 2.259373188018799,
"learning_rate": 7.261209103836843e-06,
"loss": 1.2083,
"step": 421
},
{
"epoch": 1.1856540084388185,
"grad_norm": 2.170278787612915,
"learning_rate": 7.247740748951394e-06,
"loss": 1.108,
"step": 422
},
{
"epoch": 1.188466947960619,
"grad_norm": 2.3180534839630127,
"learning_rate": 7.234251922824731e-06,
"loss": 1.0838,
"step": 423
},
{
"epoch": 1.191279887482419,
"grad_norm": 2.200087308883667,
"learning_rate": 7.220742748305989e-06,
"loss": 1.2188,
"step": 424
},
{
"epoch": 1.1940928270042195,
"grad_norm": 2.148313045501709,
"learning_rate": 7.20721334842963e-06,
"loss": 1.1162,
"step": 425
},
{
"epoch": 1.1969057665260197,
"grad_norm": 2.109539270401001,
"learning_rate": 7.193663846414318e-06,
"loss": 1.126,
"step": 426
},
{
"epoch": 1.19971870604782,
"grad_norm": 2.3250086307525635,
"learning_rate": 7.180094365661793e-06,
"loss": 1.216,
"step": 427
},
{
"epoch": 1.2025316455696202,
"grad_norm": 2.1778461933135986,
"learning_rate": 7.166505029755753e-06,
"loss": 1.1582,
"step": 428
},
{
"epoch": 1.2053445850914206,
"grad_norm": 2.0346758365631104,
"learning_rate": 7.152895962460727e-06,
"loss": 1.0597,
"step": 429
},
{
"epoch": 1.2081575246132208,
"grad_norm": 2.2523462772369385,
"learning_rate": 7.139267287720945e-06,
"loss": 1.3096,
"step": 430
},
{
"epoch": 1.2109704641350212,
"grad_norm": 2.1248557567596436,
"learning_rate": 7.125619129659215e-06,
"loss": 1.2255,
"step": 431
},
{
"epoch": 1.2137834036568214,
"grad_norm": 2.402777671813965,
"learning_rate": 7.111951612575783e-06,
"loss": 1.2178,
"step": 432
},
{
"epoch": 1.2165963431786218,
"grad_norm": 2.1899073123931885,
"learning_rate": 7.0982648609472135e-06,
"loss": 1.1086,
"step": 433
},
{
"epoch": 1.219409282700422,
"grad_norm": 2.306647777557373,
"learning_rate": 7.084558999425245e-06,
"loss": 1.2791,
"step": 434
},
{
"epoch": 1.2222222222222223,
"grad_norm": 2.1083829402923584,
"learning_rate": 7.0708341528356585e-06,
"loss": 1.2203,
"step": 435
},
{
"epoch": 1.2250351617440225,
"grad_norm": 1.9246402978897095,
"learning_rate": 7.0570904461771426e-06,
"loss": 1.1293,
"step": 436
},
{
"epoch": 1.2278481012658227,
"grad_norm": 2.0863969326019287,
"learning_rate": 7.043328004620154e-06,
"loss": 1.1112,
"step": 437
},
{
"epoch": 1.230661040787623,
"grad_norm": 2.237459421157837,
"learning_rate": 7.029546953505776e-06,
"loss": 1.1374,
"step": 438
},
{
"epoch": 1.2334739803094235,
"grad_norm": 1.9015916585922241,
"learning_rate": 7.015747418344578e-06,
"loss": 1.0886,
"step": 439
},
{
"epoch": 1.2362869198312236,
"grad_norm": 2.1524229049682617,
"learning_rate": 7.0019295248154714e-06,
"loss": 1.1271,
"step": 440
},
{
"epoch": 1.2390998593530238,
"grad_norm": 2.171227216720581,
"learning_rate": 6.98809339876457e-06,
"loss": 1.2677,
"step": 441
},
{
"epoch": 1.2419127988748242,
"grad_norm": 2.0763444900512695,
"learning_rate": 6.974239166204034e-06,
"loss": 1.0989,
"step": 442
},
{
"epoch": 1.2447257383966246,
"grad_norm": 2.1066906452178955,
"learning_rate": 6.960366953310931e-06,
"loss": 1.2027,
"step": 443
},
{
"epoch": 1.2475386779184248,
"grad_norm": 2.748056650161743,
"learning_rate": 6.946476886426087e-06,
"loss": 1.0004,
"step": 444
},
{
"epoch": 1.250351617440225,
"grad_norm": 1.90733003616333,
"learning_rate": 6.932569092052927e-06,
"loss": 0.9063,
"step": 445
},
{
"epoch": 1.2531645569620253,
"grad_norm": 2.3296380043029785,
"learning_rate": 6.918643696856333e-06,
"loss": 1.2053,
"step": 446
},
{
"epoch": 1.2559774964838257,
"grad_norm": 2.194408416748047,
"learning_rate": 6.904700827661484e-06,
"loss": 1.2663,
"step": 447
},
{
"epoch": 1.258790436005626,
"grad_norm": 2.2270679473876953,
"learning_rate": 6.890740611452705e-06,
"loss": 1.1718,
"step": 448
},
{
"epoch": 1.261603375527426,
"grad_norm": 1.8598543405532837,
"learning_rate": 6.876763175372306e-06,
"loss": 0.958,
"step": 449
},
{
"epoch": 1.2644163150492265,
"grad_norm": 2.112734079360962,
"learning_rate": 6.862768646719425e-06,
"loss": 1.2674,
"step": 450
},
{
"epoch": 1.2644163150492265,
"eval_loss": 0.6488014459609985,
"eval_runtime": 2.7966,
"eval_samples_per_second": 9.297,
"eval_steps_per_second": 1.43,
"step": 450
},
{
"epoch": 1.2644163150492265,
"eval_active_sample_count": 30,
"eval_avg_loss": 588.0,
"eval_avg_mem_token_accuracy": 0.23404255319148937,
"eval_avg_mem_token_gt_count": 9.4,
"eval_avg_mem_token_precision": 0.007794048181388758,
"eval_avg_mem_token_rate": 0.5632940863433779,
"eval_avg_mem_token_recall(Accuracy)": 0.23404255319148937,
"eval_avg_slot_norm_mean": 197.63333333333333,
"eval_avg_slot_sim_mean": 0.996875,
"eval_global_step": 450,
"eval_loss": 0.6488014459609985,
"eval_num_samples": 30,
"eval_runtime": 2.7966,
"eval_samples_per_second": 9.297,
"eval_sim_active_sample_count": 30,
"eval_steps_per_second": 1.43,
"eval_total_correct_count": 66,
"eval_total_gt_mem_token_count": 282,
"eval_total_positions": 15033,
"eval_total_pred_mem_token_count": 8468,
"step": 450
},
{
"epoch": 1.2672292545710266,
"grad_norm": 1.943136215209961,
"learning_rate": 6.848757152948876e-06,
"loss": 1.0877,
"step": 451
},
{
"epoch": 1.270042194092827,
"grad_norm": 2.015427589416504,
"learning_rate": 6.834728821669978e-06,
"loss": 1.0226,
"step": 452
},
{
"epoch": 1.2728551336146272,
"grad_norm": 2.0203545093536377,
"learning_rate": 6.820683780645397e-06,
"loss": 1.0537,
"step": 453
},
{
"epoch": 1.2756680731364276,
"grad_norm": 1.9082456827163696,
"learning_rate": 6.806622157789989e-06,
"loss": 1.0811,
"step": 454
},
{
"epoch": 1.2784810126582278,
"grad_norm": 2.0107004642486572,
"learning_rate": 6.7925440811696165e-06,
"loss": 1.1643,
"step": 455
},
{
"epoch": 1.2812939521800282,
"grad_norm": 1.968511700630188,
"learning_rate": 6.778449679000006e-06,
"loss": 0.9849,
"step": 456
},
{
"epoch": 1.2841068917018283,
"grad_norm": 2.0401535034179688,
"learning_rate": 6.764339079645561e-06,
"loss": 1.1488,
"step": 457
},
{
"epoch": 1.2869198312236287,
"grad_norm": 1.788967251777649,
"learning_rate": 6.7502124116182066e-06,
"loss": 0.8775,
"step": 458
},
{
"epoch": 1.289732770745429,
"grad_norm": 1.8958114385604858,
"learning_rate": 6.736069803576205e-06,
"loss": 1.1991,
"step": 459
},
{
"epoch": 1.2925457102672293,
"grad_norm": 2.1174044609069824,
"learning_rate": 6.721911384323e-06,
"loss": 1.2373,
"step": 460
},
{
"epoch": 1.2953586497890295,
"grad_norm": 2.2091267108917236,
"learning_rate": 6.7077372828060294e-06,
"loss": 1.1511,
"step": 461
},
{
"epoch": 1.2981715893108299,
"grad_norm": 1.904528021812439,
"learning_rate": 6.693547628115561e-06,
"loss": 0.9815,
"step": 462
},
{
"epoch": 1.30098452883263,
"grad_norm": 2.0216708183288574,
"learning_rate": 6.67934254948351e-06,
"loss": 1.0773,
"step": 463
},
{
"epoch": 1.3037974683544304,
"grad_norm": 2.3458025455474854,
"learning_rate": 6.6651221762822635e-06,
"loss": 1.2122,
"step": 464
},
{
"epoch": 1.3066104078762306,
"grad_norm": 2.210007905960083,
"learning_rate": 6.650886638023508e-06,
"loss": 1.2001,
"step": 465
},
{
"epoch": 1.309423347398031,
"grad_norm": 2.168041229248047,
"learning_rate": 6.636636064357045e-06,
"loss": 1.1748,
"step": 466
},
{
"epoch": 1.3122362869198312,
"grad_norm": 2.1177752017974854,
"learning_rate": 6.622370585069605e-06,
"loss": 1.1441,
"step": 467
},
{
"epoch": 1.3150492264416316,
"grad_norm": 4.13400411605835,
"learning_rate": 6.608090330083677e-06,
"loss": 1.0154,
"step": 468
},
{
"epoch": 1.3178621659634318,
"grad_norm": 1.8855236768722534,
"learning_rate": 6.593795429456317e-06,
"loss": 1.1638,
"step": 469
},
{
"epoch": 1.3206751054852321,
"grad_norm": 2.1128952503204346,
"learning_rate": 6.579486013377963e-06,
"loss": 1.2435,
"step": 470
},
{
"epoch": 1.3234880450070323,
"grad_norm": 2.091977119445801,
"learning_rate": 6.565162212171257e-06,
"loss": 1.1948,
"step": 471
},
{
"epoch": 1.3263009845288325,
"grad_norm": 1.8725004196166992,
"learning_rate": 6.550824156289852e-06,
"loss": 0.9448,
"step": 472
},
{
"epoch": 1.3291139240506329,
"grad_norm": 2.134361982345581,
"learning_rate": 6.536471976317223e-06,
"loss": 1.1985,
"step": 473
},
{
"epoch": 1.3319268635724333,
"grad_norm": 2.0700531005859375,
"learning_rate": 6.5221058029654815e-06,
"loss": 1.1321,
"step": 474
},
{
"epoch": 1.3347398030942335,
"grad_norm": 2.336487054824829,
"learning_rate": 6.507725767074181e-06,
"loss": 1.2447,
"step": 475
},
{
"epoch": 1.3375527426160336,
"grad_norm": 2.1936490535736084,
"learning_rate": 6.493331999609132e-06,
"loss": 1.3264,
"step": 476
},
{
"epoch": 1.340365682137834,
"grad_norm": 1.8957630395889282,
"learning_rate": 6.4789246316612e-06,
"loss": 1.0029,
"step": 477
},
{
"epoch": 1.3431786216596344,
"grad_norm": 2.329432249069214,
"learning_rate": 6.464503794445121e-06,
"loss": 1.3139,
"step": 478
},
{
"epoch": 1.3459915611814346,
"grad_norm": 2.2381882667541504,
"learning_rate": 6.450069619298299e-06,
"loss": 1.0446,
"step": 479
},
{
"epoch": 1.3488045007032348,
"grad_norm": 2.235319137573242,
"learning_rate": 6.435622237679615e-06,
"loss": 1.1327,
"step": 480
},
{
"epoch": 1.3516174402250352,
"grad_norm": 2.1776840686798096,
"learning_rate": 6.421161781168226e-06,
"loss": 1.0707,
"step": 481
},
{
"epoch": 1.3544303797468356,
"grad_norm": 2.003654956817627,
"learning_rate": 6.4066883814623674e-06,
"loss": 1.0294,
"step": 482
},
{
"epoch": 1.3572433192686357,
"grad_norm": 2.2653419971466064,
"learning_rate": 6.3922021703781574e-06,
"loss": 1.1558,
"step": 483
},
{
"epoch": 1.360056258790436,
"grad_norm": 1.8952243328094482,
"learning_rate": 6.377703279848393e-06,
"loss": 1.1621,
"step": 484
},
{
"epoch": 1.3628691983122363,
"grad_norm": 1.818117618560791,
"learning_rate": 6.363191841921345e-06,
"loss": 1.1758,
"step": 485
},
{
"epoch": 1.3656821378340367,
"grad_norm": 2.188119411468506,
"learning_rate": 6.3486679887595635e-06,
"loss": 1.4035,
"step": 486
},
{
"epoch": 1.3684950773558369,
"grad_norm": 2.2680625915527344,
"learning_rate": 6.334131852638669e-06,
"loss": 1.3802,
"step": 487
},
{
"epoch": 1.371308016877637,
"grad_norm": 2.239824056625366,
"learning_rate": 6.319583565946147e-06,
"loss": 0.978,
"step": 488
},
{
"epoch": 1.3741209563994374,
"grad_norm": 2.084578275680542,
"learning_rate": 6.305023261180146e-06,
"loss": 1.1592,
"step": 489
},
{
"epoch": 1.3769338959212378,
"grad_norm": 2.074716329574585,
"learning_rate": 6.290451070948269e-06,
"loss": 1.1417,
"step": 490
},
{
"epoch": 1.379746835443038,
"grad_norm": 2.2187070846557617,
"learning_rate": 6.275867127966364e-06,
"loss": 1.3134,
"step": 491
},
{
"epoch": 1.3825597749648382,
"grad_norm": 1.9704614877700806,
"learning_rate": 6.261271565057318e-06,
"loss": 1.2947,
"step": 492
},
{
"epoch": 1.3853727144866386,
"grad_norm": 2.0791146755218506,
"learning_rate": 6.246664515149845e-06,
"loss": 1.1796,
"step": 493
},
{
"epoch": 1.3881856540084387,
"grad_norm": 2.070108413696289,
"learning_rate": 6.232046111277277e-06,
"loss": 1.016,
"step": 494
},
{
"epoch": 1.3909985935302391,
"grad_norm": 2.40295147895813,
"learning_rate": 6.217416486576354e-06,
"loss": 1.247,
"step": 495
},
{
"epoch": 1.3938115330520393,
"grad_norm": 1.9346283674240112,
"learning_rate": 6.202775774286007e-06,
"loss": 1.0943,
"step": 496
},
{
"epoch": 1.3966244725738397,
"grad_norm": 1.88413667678833,
"learning_rate": 6.188124107746148e-06,
"loss": 1.0378,
"step": 497
},
{
"epoch": 1.3994374120956399,
"grad_norm": 2.3754115104675293,
"learning_rate": 6.173461620396453e-06,
"loss": 1.1976,
"step": 498
},
{
"epoch": 1.4022503516174403,
"grad_norm": 2.2472076416015625,
"learning_rate": 6.158788445775151e-06,
"loss": 1.348,
"step": 499
},
{
"epoch": 1.4050632911392404,
"grad_norm": 2.299577474594116,
"learning_rate": 6.1441047175178025e-06,
"loss": 1.3543,
"step": 500
},
{
"epoch": 1.4050632911392404,
"eval_loss": 0.6480849385261536,
"eval_runtime": 2.7664,
"eval_samples_per_second": 9.398,
"eval_steps_per_second": 1.446,
"step": 500
},
{
"epoch": 1.4050632911392404,
"eval_active_sample_count": 30,
"eval_avg_loss": 593.75,
"eval_avg_mem_token_accuracy": 0.2375886524822695,
"eval_avg_mem_token_gt_count": 9.4,
"eval_avg_mem_token_precision": 0.007852789498359119,
"eval_avg_mem_token_rate": 0.5675513869487129,
"eval_avg_mem_token_recall(Accuracy)": 0.2375886524822695,
"eval_avg_slot_norm_mean": 197.63333333333333,
"eval_avg_slot_sim_mean": 0.996875,
"eval_global_step": 500,
"eval_loss": 0.6480849385261536,
"eval_num_samples": 30,
"eval_runtime": 2.7664,
"eval_samples_per_second": 9.398,
"eval_sim_active_sample_count": 30,
"eval_steps_per_second": 1.446,
"eval_total_correct_count": 67,
"eval_total_gt_mem_token_count": 282,
"eval_total_positions": 15033,
"eval_total_pred_mem_token_count": 8532,
"step": 500
},
{
"epoch": 1.4078762306610408,
"grad_norm": 2.4926252365112305,
"learning_rate": 6.129410569356086e-06,
"loss": 1.1548,
"step": 501
},
{
"epoch": 1.410689170182841,
"grad_norm": 1.9530552625656128,
"learning_rate": 6.11470613511658e-06,
"loss": 0.9438,
"step": 502
},
{
"epoch": 1.4135021097046414,
"grad_norm": 2.046297788619995,
"learning_rate": 6.0999915487195395e-06,
"loss": 1.0105,
"step": 503
},
{
"epoch": 1.4163150492264416,
"grad_norm": 2.359480619430542,
"learning_rate": 6.085266944177686e-06,
"loss": 1.2237,
"step": 504
},
{
"epoch": 1.419127988748242,
"grad_norm": 2.0814826488494873,
"learning_rate": 6.070532455594974e-06,
"loss": 1.3641,
"step": 505
},
{
"epoch": 1.4219409282700421,
"grad_norm": 2.5021960735321045,
"learning_rate": 6.055788217165384e-06,
"loss": 1.1271,
"step": 506
},
{
"epoch": 1.4247538677918425,
"grad_norm": 2.1782703399658203,
"learning_rate": 6.0410343631716865e-06,
"loss": 1.1237,
"step": 507
},
{
"epoch": 1.4275668073136427,
"grad_norm": 1.9032992124557495,
"learning_rate": 6.0262710279842305e-06,
"loss": 1.2318,
"step": 508
},
{
"epoch": 1.4303797468354431,
"grad_norm": 1.969860315322876,
"learning_rate": 6.011498346059712e-06,
"loss": 1.0196,
"step": 509
},
{
"epoch": 1.4331926863572433,
"grad_norm": 2.1782121658325195,
"learning_rate": 5.99671645193995e-06,
"loss": 1.1725,
"step": 510
},
{
"epoch": 1.4360056258790437,
"grad_norm": 2.0659401416778564,
"learning_rate": 5.98192548025067e-06,
"loss": 1.1655,
"step": 511
},
{
"epoch": 1.4388185654008439,
"grad_norm": 2.1270692348480225,
"learning_rate": 5.967125565700266e-06,
"loss": 0.9583,
"step": 512
},
{
"epoch": 1.4416315049226442,
"grad_norm": 2.146409034729004,
"learning_rate": 5.952316843078579e-06,
"loss": 1.1295,
"step": 513
},
{
"epoch": 1.4444444444444444,
"grad_norm": 2.323197364807129,
"learning_rate": 5.9374994472556715e-06,
"loss": 1.1557,
"step": 514
},
{
"epoch": 1.4472573839662446,
"grad_norm": 2.1008739471435547,
"learning_rate": 5.922673513180596e-06,
"loss": 1.24,
"step": 515
},
{
"epoch": 1.450070323488045,
"grad_norm": 2.4466872215270996,
"learning_rate": 5.9078391758801646e-06,
"loss": 1.2434,
"step": 516
},
{
"epoch": 1.4528832630098454,
"grad_norm": 2.210320234298706,
"learning_rate": 5.8929965704577275e-06,
"loss": 1.136,
"step": 517
},
{
"epoch": 1.4556962025316456,
"grad_norm": 2.259718894958496,
"learning_rate": 5.878145832091929e-06,
"loss": 1.3789,
"step": 518
},
{
"epoch": 1.4585091420534457,
"grad_norm": 2.305795431137085,
"learning_rate": 5.863287096035491e-06,
"loss": 1.0189,
"step": 519
},
{
"epoch": 1.4613220815752461,
"grad_norm": 2.283437967300415,
"learning_rate": 5.848420497613969e-06,
"loss": 1.1944,
"step": 520
},
{
"epoch": 1.4641350210970465,
"grad_norm": 2.0504446029663086,
"learning_rate": 5.833546172224527e-06,
"loss": 1.22,
"step": 521
},
{
"epoch": 1.4669479606188467,
"grad_norm": 2.018839120864868,
"learning_rate": 5.818664255334702e-06,
"loss": 1.0634,
"step": 522
},
{
"epoch": 1.4697609001406469,
"grad_norm": 2.3706552982330322,
"learning_rate": 5.803774882481171e-06,
"loss": 1.1355,
"step": 523
},
{
"epoch": 1.4725738396624473,
"grad_norm": 2.355933427810669,
"learning_rate": 5.788878189268516e-06,
"loss": 1.2492,
"step": 524
},
{
"epoch": 1.4753867791842477,
"grad_norm": 2.439201831817627,
"learning_rate": 5.773974311367987e-06,
"loss": 1.3196,
"step": 525
},
{
"epoch": 1.4781997187060478,
"grad_norm": 2.0663866996765137,
"learning_rate": 5.759063384516271e-06,
"loss": 1.1885,
"step": 526
},
{
"epoch": 1.481012658227848,
"grad_norm": 2.264146327972412,
"learning_rate": 5.7441455445142505e-06,
"loss": 1.2146,
"step": 527
},
{
"epoch": 1.4838255977496484,
"grad_norm": 1.8687844276428223,
"learning_rate": 5.729220927225769e-06,
"loss": 0.9485,
"step": 528
},
{
"epoch": 1.4866385372714488,
"grad_norm": 2.1123878955841064,
"learning_rate": 5.714289668576401e-06,
"loss": 1.0617,
"step": 529
},
{
"epoch": 1.489451476793249,
"grad_norm": 2.460676670074463,
"learning_rate": 5.699351904552196e-06,
"loss": 1.5609,
"step": 530
},
{
"epoch": 1.4922644163150491,
"grad_norm": 2.3636927604675293,
"learning_rate": 5.68440777119846e-06,
"loss": 1.2612,
"step": 531
},
{
"epoch": 1.4950773558368495,
"grad_norm": 1.9600480794906616,
"learning_rate": 5.669457404618502e-06,
"loss": 0.9536,
"step": 532
},
{
"epoch": 1.49789029535865,
"grad_norm": 1.95573091506958,
"learning_rate": 5.654500940972405e-06,
"loss": 1.0379,
"step": 533
},
{
"epoch": 1.50070323488045,
"grad_norm": 1.8376390933990479,
"learning_rate": 5.639538516475775e-06,
"loss": 1.1431,
"step": 534
},
{
"epoch": 1.5035161744022503,
"grad_norm": 1.8683063983917236,
"learning_rate": 5.624570267398511e-06,
"loss": 1.0917,
"step": 535
},
{
"epoch": 1.5063291139240507,
"grad_norm": 2.060288906097412,
"learning_rate": 5.6095963300635585e-06,
"loss": 1.0954,
"step": 536
},
{
"epoch": 1.509142053445851,
"grad_norm": 2.148991107940674,
"learning_rate": 5.594616840845666e-06,
"loss": 1.0198,
"step": 537
},
{
"epoch": 1.5119549929676512,
"grad_norm": 2.234832286834717,
"learning_rate": 5.579631936170147e-06,
"loss": 1.1007,
"step": 538
},
{
"epoch": 1.5147679324894514,
"grad_norm": 2.1892640590667725,
"learning_rate": 5.564641752511637e-06,
"loss": 1.0431,
"step": 539
},
{
"epoch": 1.5175808720112518,
"grad_norm": 2.029608726501465,
"learning_rate": 5.54964642639285e-06,
"loss": 1.0874,
"step": 540
},
{
"epoch": 1.5203938115330522,
"grad_norm": 2.019705057144165,
"learning_rate": 5.534646094383333e-06,
"loss": 1.0566,
"step": 541
},
{
"epoch": 1.5232067510548524,
"grad_norm": 2.067397117614746,
"learning_rate": 5.519640893098227e-06,
"loss": 1.1467,
"step": 542
},
{
"epoch": 1.5260196905766525,
"grad_norm": 2.2218313217163086,
"learning_rate": 5.504630959197014e-06,
"loss": 1.2784,
"step": 543
},
{
"epoch": 1.528832630098453,
"grad_norm": 2.1426005363464355,
"learning_rate": 5.489616429382285e-06,
"loss": 1.217,
"step": 544
},
{
"epoch": 1.5316455696202531,
"grad_norm": 2.0496666431427,
"learning_rate": 5.474597440398483e-06,
"loss": 1.1561,
"step": 545
},
{
"epoch": 1.5344585091420533,
"grad_norm": 1.9886417388916016,
"learning_rate": 5.459574129030669e-06,
"loss": 1.2286,
"step": 546
},
{
"epoch": 1.5372714486638537,
"grad_norm": 1.9588450193405151,
"learning_rate": 5.444546632103262e-06,
"loss": 1.1474,
"step": 547
},
{
"epoch": 1.540084388185654,
"grad_norm": 2.0006983280181885,
"learning_rate": 5.429515086478805e-06,
"loss": 1.1519,
"step": 548
},
{
"epoch": 1.5428973277074542,
"grad_norm": 2.1134023666381836,
"learning_rate": 5.414479629056717e-06,
"loss": 1.1426,
"step": 549
},
{
"epoch": 1.5457102672292544,
"grad_norm": 2.110901355743408,
"learning_rate": 5.3994403967720366e-06,
"loss": 1.0726,
"step": 550
},
{
"epoch": 1.5457102672292544,
"eval_loss": 0.6454769372940063,
"eval_runtime": 2.82,
"eval_samples_per_second": 9.22,
"eval_steps_per_second": 1.418,
"step": 550
},
{
"epoch": 1.5457102672292544,
"eval_active_sample_count": 30,
"eval_avg_loss": 591.125,
"eval_avg_mem_token_accuracy": 0.24113475177304963,
"eval_avg_mem_token_gt_count": 9.4,
"eval_avg_mem_token_precision": 0.008006593665371483,
"eval_avg_mem_token_rate": 0.5649570943923369,
"eval_avg_mem_token_recall(Accuracy)": 0.24113475177304963,
"eval_avg_slot_norm_mean": 197.63333333333333,
"eval_avg_slot_sim_mean": 0.996875,
"eval_global_step": 550,
"eval_loss": 0.6454769372940063,
"eval_num_samples": 30,
"eval_runtime": 2.82,
"eval_samples_per_second": 9.22,
"eval_sim_active_sample_count": 30,
"eval_steps_per_second": 1.418,
"eval_total_correct_count": 68,
"eval_total_gt_mem_token_count": 282,
"eval_total_positions": 15033,
"eval_total_pred_mem_token_count": 8493,
"step": 550
},
{
"epoch": 1.5485232067510548,
"grad_norm": 1.9120993614196777,
"learning_rate": 5.3843975265941896e-06,
"loss": 1.1199,
"step": 551
},
{
"epoch": 1.5513361462728552,
"grad_norm": 2.0266835689544678,
"learning_rate": 5.369351155525729e-06,
"loss": 1.1231,
"step": 552
},
{
"epoch": 1.5541490857946554,
"grad_norm": 2.3950095176696777,
"learning_rate": 5.354301420601095e-06,
"loss": 1.2016,
"step": 553
},
{
"epoch": 1.5569620253164556,
"grad_norm": 2.245199680328369,
"learning_rate": 5.33924845888536e-06,
"loss": 1.1973,
"step": 554
},
{
"epoch": 1.559774964838256,
"grad_norm": 2.302870988845825,
"learning_rate": 5.3241924074729865e-06,
"loss": 1.1057,
"step": 555
},
{
"epoch": 1.5625879043600563,
"grad_norm": 2.439229726791382,
"learning_rate": 5.30913340348658e-06,
"loss": 1.0278,
"step": 556
},
{
"epoch": 1.5654008438818565,
"grad_norm": 2.243025779724121,
"learning_rate": 5.294071584075628e-06,
"loss": 1.2353,
"step": 557
},
{
"epoch": 1.5682137834036567,
"grad_norm": 2.1339046955108643,
"learning_rate": 5.279007086415268e-06,
"loss": 1.2753,
"step": 558
},
{
"epoch": 1.571026722925457,
"grad_norm": 2.055248260498047,
"learning_rate": 5.263940047705026e-06,
"loss": 1.0207,
"step": 559
},
{
"epoch": 1.5738396624472575,
"grad_norm": 2.2932729721069336,
"learning_rate": 5.24887060516757e-06,
"loss": 1.0904,
"step": 560
},
{
"epoch": 1.5766526019690577,
"grad_norm": 2.3540918827056885,
"learning_rate": 5.233798896047461e-06,
"loss": 1.045,
"step": 561
},
{
"epoch": 1.5794655414908578,
"grad_norm": 1.941489338874817,
"learning_rate": 5.218725057609901e-06,
"loss": 0.9543,
"step": 562
},
{
"epoch": 1.5822784810126582,
"grad_norm": 1.9541575908660889,
"learning_rate": 5.2036492271394915e-06,
"loss": 0.9803,
"step": 563
},
{
"epoch": 1.5850914205344586,
"grad_norm": 2.066892147064209,
"learning_rate": 5.188571541938968e-06,
"loss": 1.1598,
"step": 564
},
{
"epoch": 1.5879043600562588,
"grad_norm": 2.207688093185425,
"learning_rate": 5.1734921393279644e-06,
"loss": 1.14,
"step": 565
},
{
"epoch": 1.590717299578059,
"grad_norm": 2.2512924671173096,
"learning_rate": 5.158411156641752e-06,
"loss": 1.2269,
"step": 566
},
{
"epoch": 1.5935302390998594,
"grad_norm": 1.9499599933624268,
"learning_rate": 5.143328731229994e-06,
"loss": 0.9949,
"step": 567
},
{
"epoch": 1.5963431786216598,
"grad_norm": 2.176727056503296,
"learning_rate": 5.128245000455493e-06,
"loss": 1.1866,
"step": 568
},
{
"epoch": 1.59915611814346,
"grad_norm": 2.0169143676757812,
"learning_rate": 5.113160101692939e-06,
"loss": 1.1554,
"step": 569
},
{
"epoch": 1.60196905766526,
"grad_norm": 2.1123158931732178,
"learning_rate": 5.098074172327661e-06,
"loss": 0.9758,
"step": 570
},
{
"epoch": 1.6047819971870605,
"grad_norm": 1.8653483390808105,
"learning_rate": 5.082987349754376e-06,
"loss": 1.009,
"step": 571
},
{
"epoch": 1.6075949367088609,
"grad_norm": 2.3386378288269043,
"learning_rate": 5.0678997713759305e-06,
"loss": 1.1193,
"step": 572
},
{
"epoch": 1.610407876230661,
"grad_norm": 2.200810432434082,
"learning_rate": 5.052811574602059e-06,
"loss": 1.2255,
"step": 573
},
{
"epoch": 1.6132208157524612,
"grad_norm": 2.702786922454834,
"learning_rate": 5.0377228968481274e-06,
"loss": 1.2351,
"step": 574
},
{
"epoch": 1.6160337552742616,
"grad_norm": 2.252342462539673,
"learning_rate": 5.022633875533879e-06,
"loss": 1.095,
"step": 575
},
{
"epoch": 1.618846694796062,
"grad_norm": 2.326218605041504,
"learning_rate": 5.00754464808219e-06,
"loss": 1.1578,
"step": 576
},
{
"epoch": 1.6216596343178622,
"grad_norm": 2.0061216354370117,
"learning_rate": 4.992455351917812e-06,
"loss": 0.974,
"step": 577
},
{
"epoch": 1.6244725738396624,
"grad_norm": 2.0241732597351074,
"learning_rate": 4.977366124466122e-06,
"loss": 1.0518,
"step": 578
},
{
"epoch": 1.6272855133614628,
"grad_norm": 2.2035324573516846,
"learning_rate": 4.962277103151876e-06,
"loss": 1.0806,
"step": 579
},
{
"epoch": 1.6300984528832632,
"grad_norm": 1.9597488641738892,
"learning_rate": 4.947188425397942e-06,
"loss": 0.9929,
"step": 580
},
{
"epoch": 1.6329113924050633,
"grad_norm": 1.8797650337219238,
"learning_rate": 4.932100228624072e-06,
"loss": 1.0142,
"step": 581
},
{
"epoch": 1.6357243319268635,
"grad_norm": 2.195955514907837,
"learning_rate": 4.917012650245626e-06,
"loss": 1.2481,
"step": 582
},
{
"epoch": 1.638537271448664,
"grad_norm": 2.0398526191711426,
"learning_rate": 4.901925827672341e-06,
"loss": 0.9249,
"step": 583
},
{
"epoch": 1.6413502109704643,
"grad_norm": 2.003324508666992,
"learning_rate": 4.886839898307062e-06,
"loss": 1.0438,
"step": 584
},
{
"epoch": 1.6441631504922645,
"grad_norm": 1.6683696508407593,
"learning_rate": 4.8717549995445105e-06,
"loss": 0.8833,
"step": 585
},
{
"epoch": 1.6469760900140646,
"grad_norm": 2.1678078174591064,
"learning_rate": 4.856671268770007e-06,
"loss": 1.1291,
"step": 586
},
{
"epoch": 1.649789029535865,
"grad_norm": 1.9070981740951538,
"learning_rate": 4.841588843358251e-06,
"loss": 0.9658,
"step": 587
},
{
"epoch": 1.6526019690576652,
"grad_norm": 1.897820234298706,
"learning_rate": 4.826507860672036e-06,
"loss": 0.9903,
"step": 588
},
{
"epoch": 1.6554149085794654,
"grad_norm": 2.141012668609619,
"learning_rate": 4.811428458061033e-06,
"loss": 1.3183,
"step": 589
},
{
"epoch": 1.6582278481012658,
"grad_norm": 1.9511604309082031,
"learning_rate": 4.796350772860511e-06,
"loss": 1.2011,
"step": 590
},
{
"epoch": 1.6610407876230662,
"grad_norm": 2.517437696456909,
"learning_rate": 4.7812749423901e-06,
"loss": 1.1229,
"step": 591
},
{
"epoch": 1.6638537271448663,
"grad_norm": 1.9676152467727661,
"learning_rate": 4.7662011039525416e-06,
"loss": 1.1357,
"step": 592
},
{
"epoch": 1.6666666666666665,
"grad_norm": 1.9041470289230347,
"learning_rate": 4.7511293948324325e-06,
"loss": 1.0166,
"step": 593
},
{
"epoch": 1.669479606188467,
"grad_norm": 2.15259051322937,
"learning_rate": 4.736059952294975e-06,
"loss": 1.011,
"step": 594
},
{
"epoch": 1.6722925457102673,
"grad_norm": 2.361236333847046,
"learning_rate": 4.720992913584732e-06,
"loss": 1.3296,
"step": 595
},
{
"epoch": 1.6751054852320675,
"grad_norm": 2.3137876987457275,
"learning_rate": 4.7059284159243725e-06,
"loss": 1.3602,
"step": 596
},
{
"epoch": 1.6779184247538677,
"grad_norm": 2.085984230041504,
"learning_rate": 4.690866596513421e-06,
"loss": 1.247,
"step": 597
},
{
"epoch": 1.680731364275668,
"grad_norm": 2.2906124591827393,
"learning_rate": 4.675807592527014e-06,
"loss": 1.2777,
"step": 598
},
{
"epoch": 1.6835443037974684,
"grad_norm": 2.461681842803955,
"learning_rate": 4.660751541114641e-06,
"loss": 1.3176,
"step": 599
},
{
"epoch": 1.6863572433192686,
"grad_norm": 2.259167194366455,
"learning_rate": 4.645698579398907e-06,
"loss": 1.145,
"step": 600
},
{
"epoch": 1.6863572433192686,
"eval_loss": 0.6439154744148254,
"eval_runtime": 2.7846,
"eval_samples_per_second": 9.337,
"eval_steps_per_second": 1.436,
"step": 600
},
{
"epoch": 1.6863572433192686,
"eval_active_sample_count": 30,
"eval_avg_loss": 596.375,
"eval_avg_mem_token_accuracy": 0.24822695035460993,
"eval_avg_mem_token_gt_count": 9.4,
"eval_avg_mem_token_precision": 0.008178525528683258,
"eval_avg_mem_token_rate": 0.5693474356415885,
"eval_avg_mem_token_recall(Accuracy)": 0.24822695035460993,
"eval_avg_slot_norm_mean": 197.63333333333333,
"eval_avg_slot_sim_mean": 0.996875,
"eval_global_step": 600,
"eval_loss": 0.6439154744148254,
"eval_num_samples": 30,
"eval_runtime": 2.7846,
"eval_samples_per_second": 9.337,
"eval_sim_active_sample_count": 30,
"eval_steps_per_second": 1.436,
"eval_total_correct_count": 70,
"eval_total_gt_mem_token_count": 282,
"eval_total_positions": 15033,
"eval_total_pred_mem_token_count": 8559,
"step": 600
},
{
"epoch": 1.6891701828410688,
"grad_norm": 2.261350154876709,
"learning_rate": 4.630648844474271e-06,
"loss": 1.3461,
"step": 601
},
{
"epoch": 1.6919831223628692,
"grad_norm": 2.463414192199707,
"learning_rate": 4.615602473405812e-06,
"loss": 1.1112,
"step": 602
},
{
"epoch": 1.6947960618846696,
"grad_norm": 2.262482166290283,
"learning_rate": 4.600559603227963e-06,
"loss": 1.208,
"step": 603
},
{
"epoch": 1.6976090014064698,
"grad_norm": 2.235854387283325,
"learning_rate": 4.585520370943285e-06,
"loss": 0.8357,
"step": 604
},
{
"epoch": 1.70042194092827,
"grad_norm": 2.0354301929473877,
"learning_rate": 4.570484913521196e-06,
"loss": 0.9843,
"step": 605
},
{
"epoch": 1.7032348804500703,
"grad_norm": 2.3465640544891357,
"learning_rate": 4.55545336789674e-06,
"loss": 1.2206,
"step": 606
},
{
"epoch": 1.7060478199718707,
"grad_norm": 1.846433162689209,
"learning_rate": 4.540425870969332e-06,
"loss": 0.9545,
"step": 607
},
{
"epoch": 1.7088607594936709,
"grad_norm": 2.3210694789886475,
"learning_rate": 4.5254025596015175e-06,
"loss": 1.2733,
"step": 608
},
{
"epoch": 1.711673699015471,
"grad_norm": 2.5384347438812256,
"learning_rate": 4.510383570617716e-06,
"loss": 1.2064,
"step": 609
},
{
"epoch": 1.7144866385372715,
"grad_norm": 2.0778439044952393,
"learning_rate": 4.495369040802988e-06,
"loss": 1.1119,
"step": 610
},
{
"epoch": 1.7172995780590719,
"grad_norm": 2.212078332901001,
"learning_rate": 4.480359106901775e-06,
"loss": 1.1948,
"step": 611
},
{
"epoch": 1.720112517580872,
"grad_norm": 2.3751208782196045,
"learning_rate": 4.465353905616668e-06,
"loss": 1.2253,
"step": 612
},
{
"epoch": 1.7229254571026722,
"grad_norm": 2.196316957473755,
"learning_rate": 4.4503535736071505e-06,
"loss": 1.159,
"step": 613
},
{
"epoch": 1.7257383966244726,
"grad_norm": 2.1474740505218506,
"learning_rate": 4.435358247488365e-06,
"loss": 1.143,
"step": 614
},
{
"epoch": 1.728551336146273,
"grad_norm": 2.5476577281951904,
"learning_rate": 4.420368063829854e-06,
"loss": 1.157,
"step": 615
},
{
"epoch": 1.7313642756680732,
"grad_norm": 2.186852216720581,
"learning_rate": 4.405383159154337e-06,
"loss": 1.1052,
"step": 616
},
{
"epoch": 1.7341772151898733,
"grad_norm": 2.162107467651367,
"learning_rate": 4.390403669936443e-06,
"loss": 1.1342,
"step": 617
},
{
"epoch": 1.7369901547116737,
"grad_norm": 2.093745470046997,
"learning_rate": 4.37542973260149e-06,
"loss": 0.9557,
"step": 618
},
{
"epoch": 1.7398030942334741,
"grad_norm": 1.8521722555160522,
"learning_rate": 4.3604614835242255e-06,
"loss": 1.0542,
"step": 619
},
{
"epoch": 1.7426160337552743,
"grad_norm": 2.1983838081359863,
"learning_rate": 4.3454990590275966e-06,
"loss": 0.7818,
"step": 620
},
{
"epoch": 1.7454289732770745,
"grad_norm": 2.261500597000122,
"learning_rate": 4.3305425953814985e-06,
"loss": 1.1948,
"step": 621
},
{
"epoch": 1.7482419127988749,
"grad_norm": 2.4740712642669678,
"learning_rate": 4.315592228801543e-06,
"loss": 1.3438,
"step": 622
},
{
"epoch": 1.7510548523206753,
"grad_norm": 2.277127981185913,
"learning_rate": 4.300648095447806e-06,
"loss": 1.2477,
"step": 623
},
{
"epoch": 1.7538677918424754,
"grad_norm": 2.1069774627685547,
"learning_rate": 4.285710331423603e-06,
"loss": 1.208,
"step": 624
},
{
"epoch": 1.7566807313642756,
"grad_norm": 2.1714632511138916,
"learning_rate": 4.2707790727742315e-06,
"loss": 1.2219,
"step": 625
},
{
"epoch": 1.759493670886076,
"grad_norm": 2.2100682258605957,
"learning_rate": 4.255854455485753e-06,
"loss": 1.284,
"step": 626
},
{
"epoch": 1.7623066104078764,
"grad_norm": 2.0882930755615234,
"learning_rate": 4.24093661548373e-06,
"loss": 1.1695,
"step": 627
},
{
"epoch": 1.7651195499296763,
"grad_norm": 2.3131346702575684,
"learning_rate": 4.226025688632013e-06,
"loss": 1.1353,
"step": 628
},
{
"epoch": 1.7679324894514767,
"grad_norm": 2.0631368160247803,
"learning_rate": 4.211121810731485e-06,
"loss": 1.175,
"step": 629
},
{
"epoch": 1.7707454289732771,
"grad_norm": 2.4987428188323975,
"learning_rate": 4.196225117518828e-06,
"loss": 1.2522,
"step": 630
},
{
"epoch": 1.7735583684950773,
"grad_norm": 1.8051552772521973,
"learning_rate": 4.181335744665299e-06,
"loss": 1.0842,
"step": 631
},
{
"epoch": 1.7763713080168775,
"grad_norm": 2.0841329097747803,
"learning_rate": 4.166453827775474e-06,
"loss": 1.331,
"step": 632
},
{
"epoch": 1.7791842475386779,
"grad_norm": 2.309027910232544,
"learning_rate": 4.1515795023860325e-06,
"loss": 1.2727,
"step": 633
},
{
"epoch": 1.7819971870604783,
"grad_norm": 2.1550230979919434,
"learning_rate": 4.136712903964511e-06,
"loss": 1.2984,
"step": 634
},
{
"epoch": 1.7848101265822784,
"grad_norm": 1.9745640754699707,
"learning_rate": 4.121854167908072e-06,
"loss": 0.8655,
"step": 635
},
{
"epoch": 1.7876230661040786,
"grad_norm": 1.838762879371643,
"learning_rate": 4.107003429542273e-06,
"loss": 0.8657,
"step": 636
},
{
"epoch": 1.790436005625879,
"grad_norm": 3.8649277687072754,
"learning_rate": 4.092160824119836e-06,
"loss": 1.0927,
"step": 637
},
{
"epoch": 1.7932489451476794,
"grad_norm": 1.946352481842041,
"learning_rate": 4.077326486819405e-06,
"loss": 0.922,
"step": 638
},
{
"epoch": 1.7960618846694796,
"grad_norm": 1.9564697742462158,
"learning_rate": 4.06250055274433e-06,
"loss": 1.1767,
"step": 639
},
{
"epoch": 1.7988748241912798,
"grad_norm": 2.0671567916870117,
"learning_rate": 4.047683156921422e-06,
"loss": 1.1347,
"step": 640
},
{
"epoch": 1.8016877637130801,
"grad_norm": 2.086289167404175,
"learning_rate": 4.0328744342997355e-06,
"loss": 1.2172,
"step": 641
},
{
"epoch": 1.8045007032348805,
"grad_norm": 1.74513578414917,
"learning_rate": 4.0180745197493295e-06,
"loss": 1.1084,
"step": 642
},
{
"epoch": 1.8073136427566807,
"grad_norm": 2.2042808532714844,
"learning_rate": 4.0032835480600516e-06,
"loss": 1.3802,
"step": 643
},
{
"epoch": 1.810126582278481,
"grad_norm": 2.1729772090911865,
"learning_rate": 3.9885016539402896e-06,
"loss": 1.1866,
"step": 644
},
{
"epoch": 1.8129395218002813,
"grad_norm": 2.0441439151763916,
"learning_rate": 3.973728972015771e-06,
"loss": 1.1282,
"step": 645
},
{
"epoch": 1.8157524613220817,
"grad_norm": 2.044088125228882,
"learning_rate": 3.958965636828314e-06,
"loss": 1.0972,
"step": 646
},
{
"epoch": 1.8185654008438819,
"grad_norm": 2.1738321781158447,
"learning_rate": 3.944211782834618e-06,
"loss": 1.1018,
"step": 647
},
{
"epoch": 1.821378340365682,
"grad_norm": 2.4498589038848877,
"learning_rate": 3.929467544405027e-06,
"loss": 1.1727,
"step": 648
},
{
"epoch": 1.8241912798874824,
"grad_norm": 2.110391616821289,
"learning_rate": 3.9147330558223175e-06,
"loss": 1.2465,
"step": 649
},
{
"epoch": 1.8270042194092828,
"grad_norm": 2.273608684539795,
"learning_rate": 3.900008451280462e-06,
"loss": 1.1749,
"step": 650
},
{
"epoch": 1.8270042194092828,
"eval_loss": 0.6407925486564636,
"eval_runtime": 2.8075,
"eval_samples_per_second": 9.261,
"eval_steps_per_second": 1.425,
"step": 650
},
{
"epoch": 1.8270042194092828,
"eval_active_sample_count": 30,
"eval_avg_loss": 601.5,
"eval_avg_mem_token_accuracy": 0.25177304964539005,
"eval_avg_mem_token_gt_count": 9.4,
"eval_avg_mem_token_precision": 0.0082385704339754,
"eval_avg_mem_token_rate": 0.5732721346371317,
"eval_avg_mem_token_recall(Accuracy)": 0.25177304964539005,
"eval_avg_slot_norm_mean": 197.63333333333333,
"eval_avg_slot_sim_mean": 0.996875,
"eval_global_step": 650,
"eval_loss": 0.6407925486564636,
"eval_num_samples": 30,
"eval_runtime": 2.8075,
"eval_samples_per_second": 9.261,
"eval_sim_active_sample_count": 30,
"eval_steps_per_second": 1.425,
"eval_total_correct_count": 71,
"eval_total_gt_mem_token_count": 282,
"eval_total_positions": 15033,
"eval_total_pred_mem_token_count": 8618,
"step": 650
},
{
"epoch": 1.829817158931083,
"grad_norm": 2.3315672874450684,
"learning_rate": 3.885293864883423e-06,
"loss": 1.1839,
"step": 651
},
{
"epoch": 1.8326300984528832,
"grad_norm": 2.203946828842163,
"learning_rate": 3.870589430643915e-06,
"loss": 1.1069,
"step": 652
},
{
"epoch": 1.8354430379746836,
"grad_norm": 2.159895896911621,
"learning_rate": 3.8558952824822e-06,
"loss": 1.147,
"step": 653
},
{
"epoch": 1.838255977496484,
"grad_norm": 2.023045301437378,
"learning_rate": 3.84121155422485e-06,
"loss": 0.888,
"step": 654
},
{
"epoch": 1.8410689170182841,
"grad_norm": 2.383005380630493,
"learning_rate": 3.826538379603549e-06,
"loss": 1.4156,
"step": 655
},
{
"epoch": 1.8438818565400843,
"grad_norm": 2.3636224269866943,
"learning_rate": 3.8118758922538533e-06,
"loss": 1.0916,
"step": 656
},
{
"epoch": 1.8466947960618847,
"grad_norm": 2.039092779159546,
"learning_rate": 3.7972242257139953e-06,
"loss": 1.2214,
"step": 657
},
{
"epoch": 1.849507735583685,
"grad_norm": 2.0451226234436035,
"learning_rate": 3.782583513423647e-06,
"loss": 1.3025,
"step": 658
},
{
"epoch": 1.8523206751054853,
"grad_norm": 2.2477307319641113,
"learning_rate": 3.7679538887227247e-06,
"loss": 1.3284,
"step": 659
},
{
"epoch": 1.8551336146272854,
"grad_norm": 2.366098165512085,
"learning_rate": 3.753335484850157e-06,
"loss": 1.2683,
"step": 660
},
{
"epoch": 1.8579465541490858,
"grad_norm": 2.1643450260162354,
"learning_rate": 3.738728434942684e-06,
"loss": 1.1879,
"step": 661
},
{
"epoch": 1.8607594936708862,
"grad_norm": 2.3253345489501953,
"learning_rate": 3.7241328720336377e-06,
"loss": 1.2502,
"step": 662
},
{
"epoch": 1.8635724331926864,
"grad_norm": 1.8580361604690552,
"learning_rate": 3.709548929051732e-06,
"loss": 0.9708,
"step": 663
},
{
"epoch": 1.8663853727144866,
"grad_norm": 2.173644542694092,
"learning_rate": 3.6949767388198554e-06,
"loss": 1.2449,
"step": 664
},
{
"epoch": 1.869198312236287,
"grad_norm": 1.964975357055664,
"learning_rate": 3.680416434053854e-06,
"loss": 1.1799,
"step": 665
},
{
"epoch": 1.8720112517580874,
"grad_norm": 2.169707775115967,
"learning_rate": 3.6658681473613333e-06,
"loss": 1.2694,
"step": 666
},
{
"epoch": 1.8748241912798875,
"grad_norm": 1.9698622226715088,
"learning_rate": 3.651332011240437e-06,
"loss": 1.1431,
"step": 667
},
{
"epoch": 1.8776371308016877,
"grad_norm": 2.4650795459747314,
"learning_rate": 3.636808158078656e-06,
"loss": 1.3374,
"step": 668
},
{
"epoch": 1.880450070323488,
"grad_norm": 1.978132724761963,
"learning_rate": 3.622296720151608e-06,
"loss": 0.9086,
"step": 669
},
{
"epoch": 1.8832630098452883,
"grad_norm": 1.8494510650634766,
"learning_rate": 3.607797829621843e-06,
"loss": 1.0412,
"step": 670
},
{
"epoch": 1.8860759493670884,
"grad_norm": 2.31000018119812,
"learning_rate": 3.5933116185376325e-06,
"loss": 1.2616,
"step": 671
},
{
"epoch": 1.8888888888888888,
"grad_norm": 2.1177399158477783,
"learning_rate": 3.578838218831776e-06,
"loss": 1.0584,
"step": 672
},
{
"epoch": 1.8917018284106892,
"grad_norm": 2.711202621459961,
"learning_rate": 3.5643777623203857e-06,
"loss": 1.4235,
"step": 673
},
{
"epoch": 1.8945147679324894,
"grad_norm": 2.0394771099090576,
"learning_rate": 3.5499303807017018e-06,
"loss": 1.0978,
"step": 674
},
{
"epoch": 1.8973277074542896,
"grad_norm": 1.9236093759536743,
"learning_rate": 3.5354962055548802e-06,
"loss": 1.0943,
"step": 675
},
{
"epoch": 1.90014064697609,
"grad_norm": 2.159970283508301,
"learning_rate": 3.5210753683388014e-06,
"loss": 1.1188,
"step": 676
},
{
"epoch": 1.9029535864978904,
"grad_norm": 2.201075315475464,
"learning_rate": 3.5066680003908695e-06,
"loss": 1.0096,
"step": 677
},
{
"epoch": 1.9057665260196905,
"grad_norm": 2.2006876468658447,
"learning_rate": 3.4922742329258207e-06,
"loss": 1.2433,
"step": 678
},
{
"epoch": 1.9085794655414907,
"grad_norm": 2.1321656703948975,
"learning_rate": 3.47789419703452e-06,
"loss": 1.2714,
"step": 679
},
{
"epoch": 1.9113924050632911,
"grad_norm": 2.141841173171997,
"learning_rate": 3.463528023682779e-06,
"loss": 1.0148,
"step": 680
},
{
"epoch": 1.9142053445850915,
"grad_norm": 2.4476535320281982,
"learning_rate": 3.4491758437101487e-06,
"loss": 1.2952,
"step": 681
},
{
"epoch": 1.9170182841068917,
"grad_norm": 2.855252742767334,
"learning_rate": 3.4348377878287443e-06,
"loss": 1.0821,
"step": 682
},
{
"epoch": 1.9198312236286919,
"grad_norm": 2.2479875087738037,
"learning_rate": 3.4205139866220384e-06,
"loss": 0.9025,
"step": 683
},
{
"epoch": 1.9226441631504922,
"grad_norm": 1.734316349029541,
"learning_rate": 3.4062045705436863e-06,
"loss": 0.9917,
"step": 684
},
{
"epoch": 1.9254571026722926,
"grad_norm": 1.7392464876174927,
"learning_rate": 3.391909669916324e-06,
"loss": 0.6617,
"step": 685
},
{
"epoch": 1.9282700421940928,
"grad_norm": 2.1003048419952393,
"learning_rate": 3.3776294149303956e-06,
"loss": 1.2154,
"step": 686
},
{
"epoch": 1.931082981715893,
"grad_norm": 2.3303074836730957,
"learning_rate": 3.3633639356429564e-06,
"loss": 1.2461,
"step": 687
},
{
"epoch": 1.9338959212376934,
"grad_norm": 2.2976810932159424,
"learning_rate": 3.3491133619764925e-06,
"loss": 1.3707,
"step": 688
},
{
"epoch": 1.9367088607594938,
"grad_norm": 1.9439120292663574,
"learning_rate": 3.334877823717737e-06,
"loss": 0.9291,
"step": 689
},
{
"epoch": 1.939521800281294,
"grad_norm": 2.5753273963928223,
"learning_rate": 3.3206574505164934e-06,
"loss": 1.0634,
"step": 690
},
{
"epoch": 1.9423347398030941,
"grad_norm": 2.2259931564331055,
"learning_rate": 3.306452371884441e-06,
"loss": 1.1333,
"step": 691
},
{
"epoch": 1.9451476793248945,
"grad_norm": 2.0289406776428223,
"learning_rate": 3.2922627171939726e-06,
"loss": 1.138,
"step": 692
},
{
"epoch": 1.947960618846695,
"grad_norm": 2.4240784645080566,
"learning_rate": 3.2780886156770016e-06,
"loss": 1.1418,
"step": 693
},
{
"epoch": 1.950773558368495,
"grad_norm": 2.215083122253418,
"learning_rate": 3.263930196423797e-06,
"loss": 1.42,
"step": 694
},
{
"epoch": 1.9535864978902953,
"grad_norm": 2.2829818725585938,
"learning_rate": 3.2497875883817955e-06,
"loss": 1.1413,
"step": 695
},
{
"epoch": 1.9563994374120957,
"grad_norm": 2.153489828109741,
"learning_rate": 3.2356609203544387e-06,
"loss": 1.2167,
"step": 696
},
{
"epoch": 1.959212376933896,
"grad_norm": 1.974264144897461,
"learning_rate": 3.2215503209999952e-06,
"loss": 1.1241,
"step": 697
},
{
"epoch": 1.9620253164556962,
"grad_norm": 1.9400849342346191,
"learning_rate": 3.207455918830384e-06,
"loss": 1.036,
"step": 698
},
{
"epoch": 1.9648382559774964,
"grad_norm": 2.141404628753662,
"learning_rate": 3.193377842210014e-06,
"loss": 1.1286,
"step": 699
},
{
"epoch": 1.9676511954992968,
"grad_norm": 2.2581005096435547,
"learning_rate": 3.179316219354602e-06,
"loss": 1.385,
"step": 700
},
{
"epoch": 1.9676511954992968,
"eval_loss": 0.6409177184104919,
"eval_runtime": 2.8079,
"eval_samples_per_second": 9.26,
"eval_steps_per_second": 1.425,
"step": 700
},
{
"epoch": 1.9676511954992968,
"eval_active_sample_count": 30,
"eval_avg_loss": 603.5,
"eval_avg_mem_token_accuracy": 0.25177304964539005,
"eval_avg_mem_token_gt_count": 9.4,
"eval_avg_mem_token_precision": 0.008195775135634306,
"eval_avg_mem_token_rate": 0.5762655491252577,
"eval_avg_mem_token_recall(Accuracy)": 0.25177304964539005,
"eval_avg_slot_norm_mean": 197.63333333333333,
"eval_avg_slot_sim_mean": 0.996875,
"eval_global_step": 700,
"eval_loss": 0.6409177184104919,
"eval_num_samples": 30,
"eval_runtime": 2.8079,
"eval_samples_per_second": 9.26,
"eval_sim_active_sample_count": 30,
"eval_steps_per_second": 1.425,
"eval_total_correct_count": 71,
"eval_total_gt_mem_token_count": 282,
"eval_total_positions": 15033,
"eval_total_pred_mem_token_count": 8663,
"step": 700
},
{
"epoch": 1.9704641350210972,
"grad_norm": 2.2288899421691895,
"learning_rate": 3.1652711783300234e-06,
"loss": 1.3147,
"step": 701
},
{
"epoch": 1.9732770745428974,
"grad_norm": 2.327530860900879,
"learning_rate": 3.1512428470511257e-06,
"loss": 1.2538,
"step": 702
},
{
"epoch": 1.9760900140646975,
"grad_norm": 1.8935436010360718,
"learning_rate": 3.1372313532805766e-06,
"loss": 0.8867,
"step": 703
},
{
"epoch": 1.978902953586498,
"grad_norm": 2.1964917182922363,
"learning_rate": 3.1232368246276956e-06,
"loss": 1.1226,
"step": 704
},
{
"epoch": 1.9817158931082983,
"grad_norm": 2.11517333984375,
"learning_rate": 3.1092593885472965e-06,
"loss": 1.1076,
"step": 705
},
{
"epoch": 1.9845288326300985,
"grad_norm": 2.36454439163208,
"learning_rate": 3.0952991723385152e-06,
"loss": 1.1308,
"step": 706
},
{
"epoch": 1.9873417721518987,
"grad_norm": 4.362302780151367,
"learning_rate": 3.0813563031436676e-06,
"loss": 1.3241,
"step": 707
},
{
"epoch": 1.990154711673699,
"grad_norm": 2.1657958030700684,
"learning_rate": 3.067430907947073e-06,
"loss": 1.1269,
"step": 708
},
{
"epoch": 1.9929676511954995,
"grad_norm": 1.7424006462097168,
"learning_rate": 3.053523113573914e-06,
"loss": 0.9743,
"step": 709
},
{
"epoch": 1.9957805907172996,
"grad_norm": 2.1487817764282227,
"learning_rate": 3.039633046689069e-06,
"loss": 1.0117,
"step": 710
},
{
"epoch": 1.9985935302390998,
"grad_norm": 2.059786319732666,
"learning_rate": 3.0257608337959683e-06,
"loss": 1.0671,
"step": 711
},
{
"epoch": 2.0,
"grad_norm": 1.65206778049469,
"learning_rate": 3.0119066012354316e-06,
"loss": 0.5849,
"step": 712
},
{
"epoch": 2.0028129395218004,
"grad_norm": 1.849442720413208,
"learning_rate": 2.9980704751845302e-06,
"loss": 1.0254,
"step": 713
},
{
"epoch": 2.0056258790436003,
"grad_norm": 2.224947690963745,
"learning_rate": 2.9842525816554237e-06,
"loss": 1.3435,
"step": 714
},
{
"epoch": 2.0084388185654007,
"grad_norm": 2.0207643508911133,
"learning_rate": 2.9704530464942254e-06,
"loss": 1.1889,
"step": 715
},
{
"epoch": 2.011251758087201,
"grad_norm": 1.9327627420425415,
"learning_rate": 2.9566719953798474e-06,
"loss": 0.9725,
"step": 716
},
{
"epoch": 2.0140646976090015,
"grad_norm": 2.2062811851501465,
"learning_rate": 2.942909553822859e-06,
"loss": 1.1318,
"step": 717
},
{
"epoch": 2.0168776371308015,
"grad_norm": 1.9610023498535156,
"learning_rate": 2.929165847164343e-06,
"loss": 1.02,
"step": 718
},
{
"epoch": 2.019690576652602,
"grad_norm": 2.012442111968994,
"learning_rate": 2.9154410005747586e-06,
"loss": 1.073,
"step": 719
},
{
"epoch": 2.0225035161744023,
"grad_norm": 1.9642077684402466,
"learning_rate": 2.901735139052787e-06,
"loss": 1.0427,
"step": 720
},
{
"epoch": 2.0253164556962027,
"grad_norm": 2.19358491897583,
"learning_rate": 2.888048387424218e-06,
"loss": 1.1162,
"step": 721
},
{
"epoch": 2.0281293952180026,
"grad_norm": 1.9871453046798706,
"learning_rate": 2.8743808703407866e-06,
"loss": 1.1066,
"step": 722
},
{
"epoch": 2.030942334739803,
"grad_norm": 2.278085947036743,
"learning_rate": 2.8607327122790555e-06,
"loss": 1.1253,
"step": 723
},
{
"epoch": 2.0337552742616034,
"grad_norm": 1.7093780040740967,
"learning_rate": 2.8471040375392745e-06,
"loss": 1.0754,
"step": 724
},
{
"epoch": 2.036568213783404,
"grad_norm": 2.088590621948242,
"learning_rate": 2.833494970244248e-06,
"loss": 1.2312,
"step": 725
},
{
"epoch": 2.0393811533052038,
"grad_norm": 1.8987199068069458,
"learning_rate": 2.819905634338208e-06,
"loss": 0.9913,
"step": 726
},
{
"epoch": 2.042194092827004,
"grad_norm": 2.069563627243042,
"learning_rate": 2.8063361535856838e-06,
"loss": 1.1635,
"step": 727
},
{
"epoch": 2.0450070323488045,
"grad_norm": 2.440237522125244,
"learning_rate": 2.7927866515703705e-06,
"loss": 1.2113,
"step": 728
},
{
"epoch": 2.047819971870605,
"grad_norm": 2.0094406604766846,
"learning_rate": 2.7792572516940108e-06,
"loss": 0.9271,
"step": 729
},
{
"epoch": 2.050632911392405,
"grad_norm": 2.2327640056610107,
"learning_rate": 2.765748077175272e-06,
"loss": 1.1026,
"step": 730
},
{
"epoch": 2.0534458509142053,
"grad_norm": 2.1008453369140625,
"learning_rate": 2.752259251048606e-06,
"loss": 1.1666,
"step": 731
},
{
"epoch": 2.0562587904360057,
"grad_norm": 1.8837400674819946,
"learning_rate": 2.7387908961631597e-06,
"loss": 0.8817,
"step": 732
},
{
"epoch": 2.059071729957806,
"grad_norm": 1.993558645248413,
"learning_rate": 2.725343135181622e-06,
"loss": 1.0745,
"step": 733
},
{
"epoch": 2.061884669479606,
"grad_norm": 2.122399091720581,
"learning_rate": 2.711916090579137e-06,
"loss": 1.1435,
"step": 734
},
{
"epoch": 2.0646976090014064,
"grad_norm": 2.0384397506713867,
"learning_rate": 2.698509884642163e-06,
"loss": 1.181,
"step": 735
},
{
"epoch": 2.067510548523207,
"grad_norm": 2.315969944000244,
"learning_rate": 2.6851246394673822e-06,
"loss": 0.9172,
"step": 736
},
{
"epoch": 2.070323488045007,
"grad_norm": 1.8696023225784302,
"learning_rate": 2.67176047696057e-06,
"loss": 0.9634,
"step": 737
},
{
"epoch": 2.073136427566807,
"grad_norm": 2.3400771617889404,
"learning_rate": 2.6584175188354934e-06,
"loss": 1.1388,
"step": 738
},
{
"epoch": 2.0759493670886076,
"grad_norm": 2.0902152061462402,
"learning_rate": 2.6450958866128e-06,
"loss": 0.9649,
"step": 739
},
{
"epoch": 2.078762306610408,
"grad_norm": 1.8135625123977661,
"learning_rate": 2.6317957016189155e-06,
"loss": 1.1267,
"step": 740
},
{
"epoch": 2.0815752461322083,
"grad_norm": 1.949086308479309,
"learning_rate": 2.618517084984933e-06,
"loss": 1.056,
"step": 741
},
{
"epoch": 2.0843881856540083,
"grad_norm": 2.1474437713623047,
"learning_rate": 2.6052601576455116e-06,
"loss": 1.1126,
"step": 742
},
{
"epoch": 2.0872011251758087,
"grad_norm": 2.2054314613342285,
"learning_rate": 2.592025040337779e-06,
"loss": 1.1921,
"step": 743
},
{
"epoch": 2.090014064697609,
"grad_norm": 1.9321085214614868,
"learning_rate": 2.578811853600226e-06,
"loss": 0.9129,
"step": 744
},
{
"epoch": 2.0928270042194095,
"grad_norm": 2.050908327102661,
"learning_rate": 2.5656207177716107e-06,
"loss": 1.0466,
"step": 745
},
{
"epoch": 2.0956399437412094,
"grad_norm": 2.335043430328369,
"learning_rate": 2.552451752989865e-06,
"loss": 0.9907,
"step": 746
},
{
"epoch": 2.09845288326301,
"grad_norm": 2.1719613075256348,
"learning_rate": 2.539305079190999e-06,
"loss": 1.1855,
"step": 747
},
{
"epoch": 2.1012658227848102,
"grad_norm": 2.2501490116119385,
"learning_rate": 2.5261808161080047e-06,
"loss": 1.1693,
"step": 748
},
{
"epoch": 2.1040787623066106,
"grad_norm": 2.1329755783081055,
"learning_rate": 2.513079083269774e-06,
"loss": 1.1507,
"step": 749
},
{
"epoch": 2.1068917018284106,
"grad_norm": 1.9924427270889282,
"learning_rate": 2.5000000000000015e-06,
"loss": 1.035,
"step": 750
},
{
"epoch": 2.1068917018284106,
"eval_loss": 0.6396089792251587,
"eval_runtime": 2.846,
"eval_samples_per_second": 9.136,
"eval_steps_per_second": 1.405,
"step": 750
},
{
"epoch": 2.1068917018284106,
"eval_active_sample_count": 30,
"eval_avg_loss": 596.5,
"eval_avg_mem_token_accuracy": 0.2553191489361702,
"eval_avg_mem_token_gt_count": 9.4,
"eval_avg_mem_token_precision": 0.00837404047452896,
"eval_avg_mem_token_rate": 0.5719417281979645,
"eval_avg_mem_token_recall(Accuracy)": 0.2553191489361702,
"eval_avg_slot_norm_mean": 197.63333333333333,
"eval_avg_slot_sim_mean": 0.996875,
"eval_global_step": 750,
"eval_loss": 0.6396089792251587,
"eval_num_samples": 30,
"eval_runtime": 2.846,
"eval_samples_per_second": 9.136,
"eval_sim_active_sample_count": 30,
"eval_steps_per_second": 1.405,
"eval_total_correct_count": 72,
"eval_total_gt_mem_token_count": 282,
"eval_total_positions": 15033,
"eval_total_pred_mem_token_count": 8598,
"step": 750
},
{
"epoch": 2.109704641350211,
"grad_norm": 1.8336857557296753,
"learning_rate": 2.4869436854161e-06,
"loss": 0.9486,
"step": 751
},
{
"epoch": 2.1125175808720114,
"grad_norm": 2.0312447547912598,
"learning_rate": 2.4739102584281268e-06,
"loss": 1.2113,
"step": 752
},
{
"epoch": 2.1153305203938118,
"grad_norm": 2.0355682373046875,
"learning_rate": 2.4608998377376752e-06,
"loss": 1.1002,
"step": 753
},
{
"epoch": 2.1181434599156117,
"grad_norm": 2.6331751346588135,
"learning_rate": 2.447912541836826e-06,
"loss": 1.1891,
"step": 754
},
{
"epoch": 2.120956399437412,
"grad_norm": 2.6497652530670166,
"learning_rate": 2.4349484890070357e-06,
"loss": 1.0924,
"step": 755
},
{
"epoch": 2.1237693389592125,
"grad_norm": 1.9082729816436768,
"learning_rate": 2.4220077973180906e-06,
"loss": 1.0445,
"step": 756
},
{
"epoch": 2.1265822784810124,
"grad_norm": 1.8643864393234253,
"learning_rate": 2.4090905846270006e-06,
"loss": 1.0385,
"step": 757
},
{
"epoch": 2.129395218002813,
"grad_norm": 2.0666754245758057,
"learning_rate": 2.396196968576957e-06,
"loss": 1.2737,
"step": 758
},
{
"epoch": 2.1322081575246132,
"grad_norm": 2.5806944370269775,
"learning_rate": 2.3833270665962293e-06,
"loss": 0.9353,
"step": 759
},
{
"epoch": 2.1350210970464136,
"grad_norm": 2.0371792316436768,
"learning_rate": 2.370480995897127e-06,
"loss": 1.1003,
"step": 760
},
{
"epoch": 2.1378340365682136,
"grad_norm": 1.9753756523132324,
"learning_rate": 2.3576588734749022e-06,
"loss": 0.9872,
"step": 761
},
{
"epoch": 2.140646976090014,
"grad_norm": 2.2429325580596924,
"learning_rate": 2.3448608161067117e-06,
"loss": 1.0195,
"step": 762
},
{
"epoch": 2.1434599156118144,
"grad_norm": 1.8056210279464722,
"learning_rate": 2.3320869403505324e-06,
"loss": 0.9248,
"step": 763
},
{
"epoch": 2.1462728551336148,
"grad_norm": 1.9145182371139526,
"learning_rate": 2.3193373625441113e-06,
"loss": 0.9601,
"step": 764
},
{
"epoch": 2.1490857946554147,
"grad_norm": 2.0845413208007812,
"learning_rate": 2.3066121988038996e-06,
"loss": 1.1699,
"step": 765
},
{
"epoch": 2.151898734177215,
"grad_norm": 1.9216276407241821,
"learning_rate": 2.2939115650240008e-06,
"loss": 1.0108,
"step": 766
},
{
"epoch": 2.1547116736990155,
"grad_norm": 2.0462570190429688,
"learning_rate": 2.2812355768751106e-06,
"loss": 0.8837,
"step": 767
},
{
"epoch": 2.157524613220816,
"grad_norm": 2.385082721710205,
"learning_rate": 2.268584349803464e-06,
"loss": 1.1446,
"step": 768
},
{
"epoch": 2.160337552742616,
"grad_norm": 2.243379592895508,
"learning_rate": 2.2559579990297943e-06,
"loss": 1.2207,
"step": 769
},
{
"epoch": 2.1631504922644162,
"grad_norm": 2.170370101928711,
"learning_rate": 2.2433566395482577e-06,
"loss": 1.3006,
"step": 770
},
{
"epoch": 2.1659634317862166,
"grad_norm": 2.1776270866394043,
"learning_rate": 2.2307803861254207e-06,
"loss": 1.1889,
"step": 771
},
{
"epoch": 2.168776371308017,
"grad_norm": 2.114034652709961,
"learning_rate": 2.218229353299181e-06,
"loss": 1.2131,
"step": 772
},
{
"epoch": 2.171589310829817,
"grad_norm": 2.2640528678894043,
"learning_rate": 2.2057036553777565e-06,
"loss": 1.3633,
"step": 773
},
{
"epoch": 2.1744022503516174,
"grad_norm": 1.7782313823699951,
"learning_rate": 2.1932034064386113e-06,
"loss": 0.9327,
"step": 774
},
{
"epoch": 2.1772151898734178,
"grad_norm": 2.046961545944214,
"learning_rate": 2.1807287203274504e-06,
"loss": 1.2086,
"step": 775
},
{
"epoch": 2.180028129395218,
"grad_norm": 2.103487491607666,
"learning_rate": 2.168279710657149e-06,
"loss": 1.0986,
"step": 776
},
{
"epoch": 2.182841068917018,
"grad_norm": 2.1570355892181396,
"learning_rate": 2.1558564908067497e-06,
"loss": 1.0043,
"step": 777
},
{
"epoch": 2.1856540084388185,
"grad_norm": 1.9457972049713135,
"learning_rate": 2.1434591739204062e-06,
"loss": 1.067,
"step": 778
},
{
"epoch": 2.188466947960619,
"grad_norm": 2.141794204711914,
"learning_rate": 2.1310878729063645e-06,
"loss": 1.1144,
"step": 779
},
{
"epoch": 2.1912798874824193,
"grad_norm": 1.9879792928695679,
"learning_rate": 2.118742700435931e-06,
"loss": 1.0625,
"step": 780
},
{
"epoch": 2.1940928270042193,
"grad_norm": 2.3529539108276367,
"learning_rate": 2.1064237689424483e-06,
"loss": 1.2867,
"step": 781
},
{
"epoch": 2.1969057665260197,
"grad_norm": 2.0593795776367188,
"learning_rate": 2.0941311906202672e-06,
"loss": 1.3383,
"step": 782
},
{
"epoch": 2.19971870604782,
"grad_norm": 2.1530141830444336,
"learning_rate": 2.081865077423731e-06,
"loss": 1.2258,
"step": 783
},
{
"epoch": 2.2025316455696204,
"grad_norm": 1.9634898900985718,
"learning_rate": 2.06962554106615e-06,
"loss": 1.1629,
"step": 784
},
{
"epoch": 2.2053445850914204,
"grad_norm": 2.2565033435821533,
"learning_rate": 2.0574126930187882e-06,
"loss": 1.3058,
"step": 785
},
{
"epoch": 2.208157524613221,
"grad_norm": 2.420267105102539,
"learning_rate": 2.0452266445098457e-06,
"loss": 1.2447,
"step": 786
},
{
"epoch": 2.210970464135021,
"grad_norm": 2.2069785594940186,
"learning_rate": 2.0330675065234466e-06,
"loss": 1.1835,
"step": 787
},
{
"epoch": 2.2137834036568216,
"grad_norm": 2.1070237159729004,
"learning_rate": 2.0209353897986288e-06,
"loss": 1.1873,
"step": 788
},
{
"epoch": 2.2165963431786215,
"grad_norm": 1.9886164665222168,
"learning_rate": 2.0088304048283337e-06,
"loss": 1.0022,
"step": 789
},
{
"epoch": 2.219409282700422,
"grad_norm": 2.1714046001434326,
"learning_rate": 1.9967526618584016e-06,
"loss": 1.1458,
"step": 790
},
{
"epoch": 2.2222222222222223,
"grad_norm": 2.1026611328125,
"learning_rate": 1.984702270886566e-06,
"loss": 1.1671,
"step": 791
},
{
"epoch": 2.2250351617440227,
"grad_norm": 2.3853933811187744,
"learning_rate": 1.9726793416614532e-06,
"loss": 1.2162,
"step": 792
},
{
"epoch": 2.2278481012658227,
"grad_norm": 2.1531338691711426,
"learning_rate": 1.9606839836815872e-06,
"loss": 1.2844,
"step": 793
},
{
"epoch": 2.230661040787623,
"grad_norm": 2.198315143585205,
"learning_rate": 1.948716306194376e-06,
"loss": 1.1015,
"step": 794
},
{
"epoch": 2.2334739803094235,
"grad_norm": 1.9941608905792236,
"learning_rate": 1.9367764181951403e-06,
"loss": 0.9099,
"step": 795
},
{
"epoch": 2.2362869198312234,
"grad_norm": 2.1348161697387695,
"learning_rate": 1.924864428426103e-06,
"loss": 0.9096,
"step": 796
},
{
"epoch": 2.239099859353024,
"grad_norm": 2.182652235031128,
"learning_rate": 1.9129804453754053e-06,
"loss": 1.2748,
"step": 797
},
{
"epoch": 2.241912798874824,
"grad_norm": 2.1464662551879883,
"learning_rate": 1.9011245772761173e-06,
"loss": 1.2931,
"step": 798
},
{
"epoch": 2.2447257383966246,
"grad_norm": 2.15000319480896,
"learning_rate": 1.889296932105254e-06,
"loss": 0.9775,
"step": 799
},
{
"epoch": 2.247538677918425,
"grad_norm": 2.1729373931884766,
"learning_rate": 1.8774976175827898e-06,
"loss": 1.182,
"step": 800
},
{
"epoch": 2.247538677918425,
"eval_loss": 0.6389347910881042,
"eval_runtime": 2.7883,
"eval_samples_per_second": 9.325,
"eval_steps_per_second": 1.435,
"step": 800
},
{
"epoch": 2.247538677918425,
"eval_active_sample_count": 30,
"eval_avg_loss": 600.75,
"eval_avg_mem_token_accuracy": 0.24822695035460993,
"eval_avg_mem_token_gt_count": 9.4,
"eval_avg_mem_token_precision": 0.008117824423054622,
"eval_avg_mem_token_rate": 0.5736047362469234,
"eval_avg_mem_token_recall(Accuracy)": 0.24822695035460993,
"eval_avg_slot_norm_mean": 197.63333333333333,
"eval_avg_slot_sim_mean": 0.996875,
"eval_global_step": 800,
"eval_loss": 0.6389347910881042,
"eval_num_samples": 30,
"eval_runtime": 2.7883,
"eval_samples_per_second": 9.325,
"eval_sim_active_sample_count": 30,
"eval_steps_per_second": 1.435,
"eval_total_correct_count": 70,
"eval_total_gt_mem_token_count": 282,
"eval_total_positions": 15033,
"eval_total_pred_mem_token_count": 8623,
"step": 800
},
{
"epoch": 2.250351617440225,
"grad_norm": 2.0819458961486816,
"learning_rate": 1.8657267411706802e-06,
"loss": 1.0104,
"step": 801
},
{
"epoch": 2.2531645569620253,
"grad_norm": 2.394252061843872,
"learning_rate": 1.853984410071879e-06,
"loss": 1.249,
"step": 802
},
{
"epoch": 2.2559774964838257,
"grad_norm": 2.0108907222747803,
"learning_rate": 1.8422707312293663e-06,
"loss": 1.0054,
"step": 803
},
{
"epoch": 2.2587904360056257,
"grad_norm": 2.035367488861084,
"learning_rate": 1.8305858113251717e-06,
"loss": 1.052,
"step": 804
},
{
"epoch": 2.261603375527426,
"grad_norm": 2.199094772338867,
"learning_rate": 1.8189297567794029e-06,
"loss": 1.2031,
"step": 805
},
{
"epoch": 2.2644163150492265,
"grad_norm": 2.0634264945983887,
"learning_rate": 1.8073026737492783e-06,
"loss": 1.1867,
"step": 806
},
{
"epoch": 2.267229254571027,
"grad_norm": 2.314810037612915,
"learning_rate": 1.7957046681281582e-06,
"loss": 1.2492,
"step": 807
},
{
"epoch": 2.270042194092827,
"grad_norm": 2.0201666355133057,
"learning_rate": 1.7841358455445807e-06,
"loss": 1.1079,
"step": 808
},
{
"epoch": 2.272855133614627,
"grad_norm": 2.239051342010498,
"learning_rate": 1.7725963113612998e-06,
"loss": 1.1677,
"step": 809
},
{
"epoch": 2.2756680731364276,
"grad_norm": 2.3143956661224365,
"learning_rate": 1.7610861706743316e-06,
"loss": 1.1724,
"step": 810
},
{
"epoch": 2.278481012658228,
"grad_norm": 2.3886356353759766,
"learning_rate": 1.7496055283119812e-06,
"loss": 1.2109,
"step": 811
},
{
"epoch": 2.281293952180028,
"grad_norm": 2.2909440994262695,
"learning_rate": 1.7381544888339103e-06,
"loss": 1.0614,
"step": 812
},
{
"epoch": 2.2841068917018283,
"grad_norm": 2.069227695465088,
"learning_rate": 1.726733156530161e-06,
"loss": 1.0202,
"step": 813
},
{
"epoch": 2.2869198312236287,
"grad_norm": 2.242708683013916,
"learning_rate": 1.7153416354202307e-06,
"loss": 1.0972,
"step": 814
},
{
"epoch": 2.289732770745429,
"grad_norm": 2.0846173763275146,
"learning_rate": 1.7039800292520997e-06,
"loss": 1.1095,
"step": 815
},
{
"epoch": 2.292545710267229,
"grad_norm": 1.924421787261963,
"learning_rate": 1.69264844150131e-06,
"loss": 1.1585,
"step": 816
},
{
"epoch": 2.2953586497890295,
"grad_norm": 1.7929229736328125,
"learning_rate": 1.6813469753700013e-06,
"loss": 0.9856,
"step": 817
},
{
"epoch": 2.29817158931083,
"grad_norm": 1.9918988943099976,
"learning_rate": 1.6700757337859907e-06,
"loss": 1.0617,
"step": 818
},
{
"epoch": 2.3009845288326303,
"grad_norm": 2.357882499694824,
"learning_rate": 1.6588348194018205e-06,
"loss": 1.0826,
"step": 819
},
{
"epoch": 2.3037974683544302,
"grad_norm": 2.163602828979492,
"learning_rate": 1.6476243345938293e-06,
"loss": 1.342,
"step": 820
},
{
"epoch": 2.3066104078762306,
"grad_norm": 1.7069376707077026,
"learning_rate": 1.6364443814612207e-06,
"loss": 0.933,
"step": 821
},
{
"epoch": 2.309423347398031,
"grad_norm": 2.1436493396759033,
"learning_rate": 1.6252950618251311e-06,
"loss": 1.2028,
"step": 822
},
{
"epoch": 2.3122362869198314,
"grad_norm": 2.0016818046569824,
"learning_rate": 1.614176477227703e-06,
"loss": 1.1039,
"step": 823
},
{
"epoch": 2.3150492264416314,
"grad_norm": 2.098785400390625,
"learning_rate": 1.6030887289311604e-06,
"loss": 1.0678,
"step": 824
},
{
"epoch": 2.3178621659634318,
"grad_norm": 2.156809091567993,
"learning_rate": 1.5920319179168859e-06,
"loss": 1.2103,
"step": 825
},
{
"epoch": 2.320675105485232,
"grad_norm": 2.111753463745117,
"learning_rate": 1.5810061448845028e-06,
"loss": 1.1346,
"step": 826
},
{
"epoch": 2.3234880450070325,
"grad_norm": 2.18839693069458,
"learning_rate": 1.5700115102509562e-06,
"loss": 1.1966,
"step": 827
},
{
"epoch": 2.3263009845288325,
"grad_norm": 2.2580389976501465,
"learning_rate": 1.5590481141495988e-06,
"loss": 1.2102,
"step": 828
},
{
"epoch": 2.329113924050633,
"grad_norm": 2.530665874481201,
"learning_rate": 1.5481160564292802e-06,
"loss": 1.3096,
"step": 829
},
{
"epoch": 2.3319268635724333,
"grad_norm": 2.008321523666382,
"learning_rate": 1.5372154366534325e-06,
"loss": 1.0493,
"step": 830
},
{
"epoch": 2.3347398030942337,
"grad_norm": 1.8788542747497559,
"learning_rate": 1.5263463540991769e-06,
"loss": 1.1453,
"step": 831
},
{
"epoch": 2.3375527426160336,
"grad_norm": 2.1390604972839355,
"learning_rate": 1.5155089077563968e-06,
"loss": 0.9813,
"step": 832
},
{
"epoch": 2.340365682137834,
"grad_norm": 2.1308085918426514,
"learning_rate": 1.5047031963268617e-06,
"loss": 1.3274,
"step": 833
},
{
"epoch": 2.3431786216596344,
"grad_norm": 2.2323601245880127,
"learning_rate": 1.49392931822331e-06,
"loss": 1.1762,
"step": 834
},
{
"epoch": 2.3459915611814344,
"grad_norm": 2.2134149074554443,
"learning_rate": 1.4831873715685597e-06,
"loss": 1.1039,
"step": 835
},
{
"epoch": 2.3488045007032348,
"grad_norm": 1.834775686264038,
"learning_rate": 1.4724774541946145e-06,
"loss": 0.9826,
"step": 836
},
{
"epoch": 2.351617440225035,
"grad_norm": 1.9355462789535522,
"learning_rate": 1.461799663641773e-06,
"loss": 1.0111,
"step": 837
},
{
"epoch": 2.3544303797468356,
"grad_norm": 2.2236545085906982,
"learning_rate": 1.4511540971577377e-06,
"loss": 1.1159,
"step": 838
},
{
"epoch": 2.357243319268636,
"grad_norm": 2.29103946685791,
"learning_rate": 1.440540851696733e-06,
"loss": 1.3618,
"step": 839
},
{
"epoch": 2.360056258790436,
"grad_norm": 2.335484743118286,
"learning_rate": 1.429960023918619e-06,
"loss": 1.165,
"step": 840
},
{
"epoch": 2.3628691983122363,
"grad_norm": 2.207131862640381,
"learning_rate": 1.4194117101880134e-06,
"loss": 1.11,
"step": 841
},
{
"epoch": 2.3656821378340367,
"grad_norm": 1.7570301294326782,
"learning_rate": 1.4088960065734137e-06,
"loss": 0.9707,
"step": 842
},
{
"epoch": 2.3684950773558366,
"grad_norm": 2.027989149093628,
"learning_rate": 1.3984130088463204e-06,
"loss": 1.1416,
"step": 843
},
{
"epoch": 2.371308016877637,
"grad_norm": 2.0788614749908447,
"learning_rate": 1.3879628124803662e-06,
"loss": 1.0461,
"step": 844
},
{
"epoch": 2.3741209563994374,
"grad_norm": 1.9784637689590454,
"learning_rate": 1.3775455126504466e-06,
"loss": 1.0517,
"step": 845
},
{
"epoch": 2.376933895921238,
"grad_norm": 1.6520678997039795,
"learning_rate": 1.3671612042318527e-06,
"loss": 0.8804,
"step": 846
},
{
"epoch": 2.379746835443038,
"grad_norm": 2.11843204498291,
"learning_rate": 1.3568099817994068e-06,
"loss": 1.0982,
"step": 847
},
{
"epoch": 2.382559774964838,
"grad_norm": 2.0866153240203857,
"learning_rate": 1.3464919396266018e-06,
"loss": 1.1652,
"step": 848
},
{
"epoch": 2.3853727144866386,
"grad_norm": 2.224863052368164,
"learning_rate": 1.3362071716847424e-06,
"loss": 1.2356,
"step": 849
},
{
"epoch": 2.388185654008439,
"grad_norm": 2.0457394123077393,
"learning_rate": 1.3259557716420868e-06,
"loss": 1.2145,
"step": 850
},
{
"epoch": 2.388185654008439,
"eval_loss": 0.6384085416793823,
"eval_runtime": 2.8184,
"eval_samples_per_second": 9.225,
"eval_steps_per_second": 1.419,
"step": 850
},
{
"epoch": 2.388185654008439,
"eval_active_sample_count": 30,
"eval_avg_loss": 600.875,
"eval_avg_mem_token_accuracy": 0.25177304964539005,
"eval_avg_mem_token_gt_count": 9.4,
"eval_avg_mem_token_precision": 0.008220446914437884,
"eval_avg_mem_token_rate": 0.5745360207543404,
"eval_avg_mem_token_recall(Accuracy)": 0.25177304964539005,
"eval_avg_slot_norm_mean": 197.63333333333333,
"eval_avg_slot_sim_mean": 0.996875,
"eval_global_step": 850,
"eval_loss": 0.6384085416793823,
"eval_num_samples": 30,
"eval_runtime": 2.8184,
"eval_samples_per_second": 9.225,
"eval_sim_active_sample_count": 30,
"eval_steps_per_second": 1.419,
"eval_total_correct_count": 71,
"eval_total_gt_mem_token_count": 282,
"eval_total_positions": 15033,
"eval_total_pred_mem_token_count": 8637,
"step": 850
},
{
"epoch": 2.390998593530239,
"grad_norm": 2.302175760269165,
"learning_rate": 1.3157378328630027e-06,
"loss": 1.2569,
"step": 851
},
{
"epoch": 2.3938115330520393,
"grad_norm": 2.0771360397338867,
"learning_rate": 1.3055534484070997e-06,
"loss": 1.0361,
"step": 852
},
{
"epoch": 2.3966244725738397,
"grad_norm": 2.1782445907592773,
"learning_rate": 1.2954027110284035e-06,
"loss": 1.1286,
"step": 853
},
{
"epoch": 2.39943741209564,
"grad_norm": 2.210466146469116,
"learning_rate": 1.285285713174489e-06,
"loss": 1.0967,
"step": 854
},
{
"epoch": 2.40225035161744,
"grad_norm": 2.1318819522857666,
"learning_rate": 1.2752025469856598e-06,
"loss": 1.1318,
"step": 855
},
{
"epoch": 2.4050632911392404,
"grad_norm": 2.405397653579712,
"learning_rate": 1.2651533042940883e-06,
"loss": 1.1057,
"step": 856
},
{
"epoch": 2.407876230661041,
"grad_norm": 1.8459330797195435,
"learning_rate": 1.2551380766230003e-06,
"loss": 0.9308,
"step": 857
},
{
"epoch": 2.4106891701828412,
"grad_norm": 2.1533725261688232,
"learning_rate": 1.2451569551858183e-06,
"loss": 1.1996,
"step": 858
},
{
"epoch": 2.413502109704641,
"grad_norm": 2.2185754776000977,
"learning_rate": 1.2352100308853548e-06,
"loss": 1.3325,
"step": 859
},
{
"epoch": 2.4163150492264416,
"grad_norm": 1.8294565677642822,
"learning_rate": 1.225297394312966e-06,
"loss": 0.8245,
"step": 860
},
{
"epoch": 2.419127988748242,
"grad_norm": 2.1881840229034424,
"learning_rate": 1.2154191357477352e-06,
"loss": 1.1655,
"step": 861
},
{
"epoch": 2.4219409282700424,
"grad_norm": 1.8707904815673828,
"learning_rate": 1.205575345155649e-06,
"loss": 0.9647,
"step": 862
},
{
"epoch": 2.4247538677918423,
"grad_norm": 1.8865529298782349,
"learning_rate": 1.1957661121887782e-06,
"loss": 0.972,
"step": 863
},
{
"epoch": 2.4275668073136427,
"grad_norm": 2.1275415420532227,
"learning_rate": 1.1859915261844596e-06,
"loss": 0.9982,
"step": 864
},
{
"epoch": 2.430379746835443,
"grad_norm": 2.7815465927124023,
"learning_rate": 1.1762516761644831e-06,
"loss": 0.9779,
"step": 865
},
{
"epoch": 2.4331926863572435,
"grad_norm": 2.201364517211914,
"learning_rate": 1.1665466508342876e-06,
"loss": 1.1864,
"step": 866
},
{
"epoch": 2.4360056258790435,
"grad_norm": 1.9111566543579102,
"learning_rate": 1.1568765385821373e-06,
"loss": 1.1079,
"step": 867
},
{
"epoch": 2.438818565400844,
"grad_norm": 2.0928750038146973,
"learning_rate": 1.147241427478336e-06,
"loss": 0.8893,
"step": 868
},
{
"epoch": 2.4416315049226442,
"grad_norm": 2.2094082832336426,
"learning_rate": 1.1376414052744055e-06,
"loss": 1.1135,
"step": 869
},
{
"epoch": 2.4444444444444446,
"grad_norm": 2.2001736164093018,
"learning_rate": 1.128076559402308e-06,
"loss": 1.0784,
"step": 870
},
{
"epoch": 2.4472573839662446,
"grad_norm": 2.1906962394714355,
"learning_rate": 1.1185469769736262e-06,
"loss": 1.0625,
"step": 871
},
{
"epoch": 2.450070323488045,
"grad_norm": 2.111055612564087,
"learning_rate": 1.1090527447787924e-06,
"loss": 1.0759,
"step": 872
},
{
"epoch": 2.4528832630098454,
"grad_norm": 2.1977760791778564,
"learning_rate": 1.0995939492862783e-06,
"loss": 1.156,
"step": 873
},
{
"epoch": 2.4556962025316453,
"grad_norm": 2.4149186611175537,
"learning_rate": 1.0901706766418247e-06,
"loss": 1.0938,
"step": 874
},
{
"epoch": 2.4585091420534457,
"grad_norm": 1.9314627647399902,
"learning_rate": 1.0807830126676444e-06,
"loss": 0.8718,
"step": 875
},
{
"epoch": 2.461322081575246,
"grad_norm": 2.219050168991089,
"learning_rate": 1.0714310428616464e-06,
"loss": 0.9997,
"step": 876
},
{
"epoch": 2.4641350210970465,
"grad_norm": 1.7131034135818481,
"learning_rate": 1.0621148523966552e-06,
"loss": 0.8264,
"step": 877
},
{
"epoch": 2.466947960618847,
"grad_norm": 2.0101089477539062,
"learning_rate": 1.052834526119637e-06,
"loss": 1.0334,
"step": 878
},
{
"epoch": 2.469760900140647,
"grad_norm": 2.2573459148406982,
"learning_rate": 1.0435901485509254e-06,
"loss": 1.2282,
"step": 879
},
{
"epoch": 2.4725738396624473,
"grad_norm": 2.196690797805786,
"learning_rate": 1.0343818038834513e-06,
"loss": 1.013,
"step": 880
},
{
"epoch": 2.4753867791842477,
"grad_norm": 2.2671730518341064,
"learning_rate": 1.0252095759819785e-06,
"loss": 1.1514,
"step": 881
},
{
"epoch": 2.4781997187060476,
"grad_norm": 2.392235279083252,
"learning_rate": 1.016073548382337e-06,
"loss": 1.2227,
"step": 882
},
{
"epoch": 2.481012658227848,
"grad_norm": 2.245374917984009,
"learning_rate": 1.0069738042906635e-06,
"loss": 1.2656,
"step": 883
},
{
"epoch": 2.4838255977496484,
"grad_norm": 1.7064595222473145,
"learning_rate": 9.979104265826438e-07,
"loss": 0.9954,
"step": 884
},
{
"epoch": 2.486638537271449,
"grad_norm": 1.9993723630905151,
"learning_rate": 9.888834978027589e-07,
"loss": 1.0137,
"step": 885
},
{
"epoch": 2.489451476793249,
"grad_norm": 2.405082941055298,
"learning_rate": 9.798931001635298e-07,
"loss": 1.1,
"step": 886
},
{
"epoch": 2.492264416315049,
"grad_norm": 2.263054132461548,
"learning_rate": 9.709393155447734e-07,
"loss": 1.1043,
"step": 887
},
{
"epoch": 2.4950773558368495,
"grad_norm": 2.4851043224334717,
"learning_rate": 9.62022225492853e-07,
"loss": 1.4185,
"step": 888
},
{
"epoch": 2.49789029535865,
"grad_norm": 2.131120443344116,
"learning_rate": 9.531419112199375e-07,
"loss": 1.0574,
"step": 889
},
{
"epoch": 2.50070323488045,
"grad_norm": 2.3178141117095947,
"learning_rate": 9.442984536032612e-07,
"loss": 1.1726,
"step": 890
},
{
"epoch": 2.5035161744022503,
"grad_norm": 2.0481185913085938,
"learning_rate": 9.354919331843865e-07,
"loss": 1.1169,
"step": 891
},
{
"epoch": 2.5063291139240507,
"grad_norm": 2.4421157836914062,
"learning_rate": 9.267224301684763e-07,
"loss": 1.27,
"step": 892
},
{
"epoch": 2.509142053445851,
"grad_norm": 2.13606333732605,
"learning_rate": 9.17990024423549e-07,
"loss": 1.2005,
"step": 893
},
{
"epoch": 2.5119549929676515,
"grad_norm": 2.085256576538086,
"learning_rate": 9.09294795479771e-07,
"loss": 0.9328,
"step": 894
},
{
"epoch": 2.5147679324894514,
"grad_norm": 1.9264284372329712,
"learning_rate": 9.006368225287116e-07,
"loss": 0.8267,
"step": 895
},
{
"epoch": 2.517580872011252,
"grad_norm": 1.8938343524932861,
"learning_rate": 8.920161844226416e-07,
"loss": 0.9883,
"step": 896
},
{
"epoch": 2.520393811533052,
"grad_norm": 2.379265308380127,
"learning_rate": 8.834329596737995e-07,
"loss": 1.2038,
"step": 897
},
{
"epoch": 2.523206751054852,
"grad_norm": 1.9732309579849243,
"learning_rate": 8.748872264536856e-07,
"loss": 1.0939,
"step": 898
},
{
"epoch": 2.5260196905766525,
"grad_norm": 1.9441081285476685,
"learning_rate": 8.663790625923451e-07,
"loss": 1.1116,
"step": 899
},
{
"epoch": 2.528832630098453,
"grad_norm": 1.8037775754928589,
"learning_rate": 8.57908545577662e-07,
"loss": 0.9497,
"step": 900
},
{
"epoch": 2.528832630098453,
"eval_loss": 0.6382944583892822,
"eval_runtime": 2.8103,
"eval_samples_per_second": 9.252,
"eval_steps_per_second": 1.423,
"step": 900
},
{
"epoch": 2.528832630098453,
"eval_active_sample_count": 30,
"eval_avg_loss": 600.5,
"eval_avg_mem_token_accuracy": 0.24822695035460993,
"eval_avg_mem_token_gt_count": 9.4,
"eval_avg_mem_token_precision": 0.008113120074177098,
"eval_avg_mem_token_rate": 0.5739373378567152,
"eval_avg_mem_token_recall(Accuracy)": 0.24822695035460993,
"eval_avg_slot_norm_mean": 197.63333333333333,
"eval_avg_slot_sim_mean": 0.996875,
"eval_global_step": 900,
"eval_loss": 0.6382944583892822,
"eval_num_samples": 30,
"eval_runtime": 2.8103,
"eval_samples_per_second": 9.252,
"eval_sim_active_sample_count": 30,
"eval_steps_per_second": 1.423,
"eval_total_correct_count": 70,
"eval_total_gt_mem_token_count": 282,
"eval_total_positions": 15033,
"eval_total_pred_mem_token_count": 8628,
"step": 900
},
{
"epoch": 2.5316455696202533,
"grad_norm": 2.0587706565856934,
"learning_rate": 8.494757525546538e-07,
"loss": 0.989,
"step": 901
},
{
"epoch": 2.5344585091420533,
"grad_norm": 2.0397393703460693,
"learning_rate": 8.410807603247656e-07,
"loss": 0.9581,
"step": 902
},
{
"epoch": 2.5372714486638537,
"grad_norm": 1.872904896736145,
"learning_rate": 8.327236453451743e-07,
"loss": 0.9432,
"step": 903
},
{
"epoch": 2.540084388185654,
"grad_norm": 2.3130741119384766,
"learning_rate": 8.244044837280901e-07,
"loss": 1.2045,
"step": 904
},
{
"epoch": 2.542897327707454,
"grad_norm": 2.1820616722106934,
"learning_rate": 8.161233512400641e-07,
"loss": 1.1755,
"step": 905
},
{
"epoch": 2.5457102672292544,
"grad_norm": 1.8425172567367554,
"learning_rate": 8.078803233012966e-07,
"loss": 0.8806,
"step": 906
},
{
"epoch": 2.548523206751055,
"grad_norm": 2.0481603145599365,
"learning_rate": 7.996754749849567e-07,
"loss": 1.0307,
"step": 907
},
{
"epoch": 2.551336146272855,
"grad_norm": 1.9898444414138794,
"learning_rate": 7.915088810164856e-07,
"loss": 0.9326,
"step": 908
},
{
"epoch": 2.5541490857946556,
"grad_norm": 2.014399766921997,
"learning_rate": 7.833806157729329e-07,
"loss": 1.0494,
"step": 909
},
{
"epoch": 2.5569620253164556,
"grad_norm": 1.9588618278503418,
"learning_rate": 7.752907532822613e-07,
"loss": 1.0299,
"step": 910
},
{
"epoch": 2.559774964838256,
"grad_norm": 2.05118465423584,
"learning_rate": 7.672393672226902e-07,
"loss": 1.2032,
"step": 911
},
{
"epoch": 2.5625879043600563,
"grad_norm": 2.032313585281372,
"learning_rate": 7.592265309220071e-07,
"loss": 1.2313,
"step": 912
},
{
"epoch": 2.5654008438818563,
"grad_norm": 2.2414844036102295,
"learning_rate": 7.512523173569175e-07,
"loss": 1.1436,
"step": 913
},
{
"epoch": 2.5682137834036567,
"grad_norm": 2.186063289642334,
"learning_rate": 7.433167991523632e-07,
"loss": 1.1053,
"step": 914
},
{
"epoch": 2.571026722925457,
"grad_norm": 2.098294734954834,
"learning_rate": 7.354200485808749e-07,
"loss": 1.1406,
"step": 915
},
{
"epoch": 2.5738396624472575,
"grad_norm": 2.103463888168335,
"learning_rate": 7.275621375619058e-07,
"loss": 1.2908,
"step": 916
},
{
"epoch": 2.576652601969058,
"grad_norm": 2.070359706878662,
"learning_rate": 7.197431376611785e-07,
"loss": 0.9896,
"step": 917
},
{
"epoch": 2.579465541490858,
"grad_norm": 1.8880215883255005,
"learning_rate": 7.11963120090034e-07,
"loss": 0.9669,
"step": 918
},
{
"epoch": 2.5822784810126582,
"grad_norm": 1.9502841234207153,
"learning_rate": 7.042221557047823e-07,
"loss": 0.9554,
"step": 919
},
{
"epoch": 2.5850914205344586,
"grad_norm": 2.4192519187927246,
"learning_rate": 6.96520315006059e-07,
"loss": 1.4215,
"step": 920
},
{
"epoch": 2.5879043600562586,
"grad_norm": 2.0227794647216797,
"learning_rate": 6.888576681381798e-07,
"loss": 1.0162,
"step": 921
},
{
"epoch": 2.590717299578059,
"grad_norm": 2.049302101135254,
"learning_rate": 6.81234284888505e-07,
"loss": 1.1344,
"step": 922
},
{
"epoch": 2.5935302390998594,
"grad_norm": 2.3195278644561768,
"learning_rate": 6.736502346868018e-07,
"loss": 1.1883,
"step": 923
},
{
"epoch": 2.5963431786216598,
"grad_norm": 1.9605528116226196,
"learning_rate": 6.661055866046134e-07,
"loss": 0.9725,
"step": 924
},
{
"epoch": 2.59915611814346,
"grad_norm": 2.021388530731201,
"learning_rate": 6.586004093546277e-07,
"loss": 1.1272,
"step": 925
},
{
"epoch": 2.60196905766526,
"grad_norm": 1.7564787864685059,
"learning_rate": 6.511347712900545e-07,
"loss": 0.9292,
"step": 926
},
{
"epoch": 2.6047819971870605,
"grad_norm": 1.886629581451416,
"learning_rate": 6.437087404040016e-07,
"loss": 1.027,
"step": 927
},
{
"epoch": 2.607594936708861,
"grad_norm": 2.0022552013397217,
"learning_rate": 6.363223843288535e-07,
"loss": 1.0797,
"step": 928
},
{
"epoch": 2.610407876230661,
"grad_norm": 2.084672451019287,
"learning_rate": 6.289757703356597e-07,
"loss": 1.164,
"step": 929
},
{
"epoch": 2.6132208157524612,
"grad_norm": 2.0323879718780518,
"learning_rate": 6.216689653335184e-07,
"loss": 1.172,
"step": 930
},
{
"epoch": 2.6160337552742616,
"grad_norm": 1.9796019792556763,
"learning_rate": 6.144020358689679e-07,
"loss": 1.1588,
"step": 931
},
{
"epoch": 2.618846694796062,
"grad_norm": 2.1912734508514404,
"learning_rate": 6.071750481253835e-07,
"loss": 1.0916,
"step": 932
},
{
"epoch": 2.6216596343178624,
"grad_norm": 2.242549419403076,
"learning_rate": 5.999880679223702e-07,
"loss": 1.1584,
"step": 933
},
{
"epoch": 2.6244725738396624,
"grad_norm": 2.412274122238159,
"learning_rate": 5.928411607151651e-07,
"loss": 1.2867,
"step": 934
},
{
"epoch": 2.6272855133614628,
"grad_norm": 2.416025161743164,
"learning_rate": 5.857343915940434e-07,
"loss": 1.2418,
"step": 935
},
{
"epoch": 2.630098452883263,
"grad_norm": 2.027195453643799,
"learning_rate": 5.786678252837213e-07,
"loss": 1.1176,
"step": 936
},
{
"epoch": 2.632911392405063,
"grad_norm": 1.915125846862793,
"learning_rate": 5.71641526142771e-07,
"loss": 1.0964,
"step": 937
},
{
"epoch": 2.6357243319268635,
"grad_norm": 1.882155179977417,
"learning_rate": 5.646555581630319e-07,
"loss": 0.9061,
"step": 938
},
{
"epoch": 2.638537271448664,
"grad_norm": 2.08971905708313,
"learning_rate": 5.577099849690276e-07,
"loss": 1.0459,
"step": 939
},
{
"epoch": 2.6413502109704643,
"grad_norm": 2.2240920066833496,
"learning_rate": 5.508048698173879e-07,
"loss": 1.283,
"step": 940
},
{
"epoch": 2.6441631504922647,
"grad_norm": 2.1256864070892334,
"learning_rate": 5.439402755962719e-07,
"loss": 0.9836,
"step": 941
},
{
"epoch": 2.6469760900140646,
"grad_norm": 2.5735840797424316,
"learning_rate": 5.371162648247957e-07,
"loss": 1.3213,
"step": 942
},
{
"epoch": 2.649789029535865,
"grad_norm": 2.2286038398742676,
"learning_rate": 5.303328996524626e-07,
"loss": 1.2165,
"step": 943
},
{
"epoch": 2.652601969057665,
"grad_norm": 1.9804893732070923,
"learning_rate": 5.235902418585958e-07,
"loss": 1.0179,
"step": 944
},
{
"epoch": 2.6554149085794654,
"grad_norm": 2.038052797317505,
"learning_rate": 5.168883528517793e-07,
"loss": 1.0582,
"step": 945
},
{
"epoch": 2.6582278481012658,
"grad_norm": 2.0677716732025146,
"learning_rate": 5.102272936692948e-07,
"loss": 1.2318,
"step": 946
},
{
"epoch": 2.661040787623066,
"grad_norm": 2.240928888320923,
"learning_rate": 5.036071249765673e-07,
"loss": 0.9381,
"step": 947
},
{
"epoch": 2.6638537271448666,
"grad_norm": 2.2003684043884277,
"learning_rate": 4.970279070666162e-07,
"loss": 1.1822,
"step": 948
},
{
"epoch": 2.6666666666666665,
"grad_norm": 2.238095998764038,
"learning_rate": 4.904896998594955e-07,
"loss": 1.2912,
"step": 949
},
{
"epoch": 2.669479606188467,
"grad_norm": 2.166447639465332,
"learning_rate": 4.839925629017638e-07,
"loss": 1.1712,
"step": 950
},
{
"epoch": 2.669479606188467,
"eval_loss": 0.6378054022789001,
"eval_runtime": 2.8903,
"eval_samples_per_second": 8.996,
"eval_steps_per_second": 1.384,
"step": 950
},
{
"epoch": 2.669479606188467,
"eval_active_sample_count": 30,
"eval_avg_loss": 600.375,
"eval_avg_mem_token_accuracy": 0.2553191489361702,
"eval_avg_mem_token_gt_count": 9.4,
"eval_avg_mem_token_precision": 0.008346858335265477,
"eval_avg_mem_token_rate": 0.5738042972127985,
"eval_avg_mem_token_recall(Accuracy)": 0.2553191489361702,
"eval_avg_slot_norm_mean": 197.63333333333333,
"eval_avg_slot_sim_mean": 0.996875,
"eval_global_step": 950,
"eval_loss": 0.6378054022789001,
"eval_num_samples": 30,
"eval_runtime": 2.8903,
"eval_samples_per_second": 8.996,
"eval_sim_active_sample_count": 30,
"eval_steps_per_second": 1.384,
"eval_total_correct_count": 72,
"eval_total_gt_mem_token_count": 282,
"eval_total_positions": 15033,
"eval_total_pred_mem_token_count": 8626,
"step": 950
},
{
"epoch": 2.6722925457102673,
"grad_norm": 2.2033162117004395,
"learning_rate": 4.775365553659256e-07,
"loss": 1.0523,
"step": 951
},
{
"epoch": 2.6751054852320673,
"grad_norm": 2.277907133102417,
"learning_rate": 4.711217360499082e-07,
"loss": 1.0803,
"step": 952
},
{
"epoch": 2.6779184247538677,
"grad_norm": 1.9675984382629395,
"learning_rate": 4.6474816337650883e-07,
"loss": 1.258,
"step": 953
},
{
"epoch": 2.680731364275668,
"grad_norm": 2.1231744289398193,
"learning_rate": 4.5841589539288187e-07,
"loss": 1.0332,
"step": 954
},
{
"epoch": 2.6835443037974684,
"grad_norm": 2.0946061611175537,
"learning_rate": 4.5212498976999196e-07,
"loss": 1.1456,
"step": 955
},
{
"epoch": 2.686357243319269,
"grad_norm": 2.3890576362609863,
"learning_rate": 4.458755038021029e-07,
"loss": 1.2698,
"step": 956
},
{
"epoch": 2.689170182841069,
"grad_norm": 1.8794134855270386,
"learning_rate": 4.3966749440624736e-07,
"loss": 0.9727,
"step": 957
},
{
"epoch": 2.691983122362869,
"grad_norm": 2.3660783767700195,
"learning_rate": 4.3350101812171143e-07,
"loss": 1.1163,
"step": 958
},
{
"epoch": 2.6947960618846696,
"grad_norm": 2.015714168548584,
"learning_rate": 4.2737613110951924e-07,
"loss": 1.1079,
"step": 959
},
{
"epoch": 2.6976090014064695,
"grad_norm": 2.051121234893799,
"learning_rate": 4.2129288915192355e-07,
"loss": 1.1844,
"step": 960
},
{
"epoch": 2.70042194092827,
"grad_norm": 2.295501708984375,
"learning_rate": 4.152513476518927e-07,
"loss": 1.2118,
"step": 961
},
{
"epoch": 2.7032348804500703,
"grad_norm": 1.991119623184204,
"learning_rate": 4.092515616326126e-07,
"loss": 1.1834,
"step": 962
},
{
"epoch": 2.7060478199718707,
"grad_norm": 1.856577754020691,
"learning_rate": 4.0329358573697906e-07,
"loss": 0.972,
"step": 963
},
{
"epoch": 2.708860759493671,
"grad_norm": 2.042525291442871,
"learning_rate": 3.973774742271047e-07,
"loss": 1.1083,
"step": 964
},
{
"epoch": 2.711673699015471,
"grad_norm": 1.8524376153945923,
"learning_rate": 3.9150328098382593e-07,
"loss": 0.9043,
"step": 965
},
{
"epoch": 2.7144866385372715,
"grad_norm": 2.0273165702819824,
"learning_rate": 3.8567105950620353e-07,
"loss": 0.9573,
"step": 966
},
{
"epoch": 2.717299578059072,
"grad_norm": 2.551295757293701,
"learning_rate": 3.798808629110479e-07,
"loss": 1.0811,
"step": 967
},
{
"epoch": 2.720112517580872,
"grad_norm": 2.2737653255462646,
"learning_rate": 3.7413274393242327e-07,
"loss": 1.1984,
"step": 968
},
{
"epoch": 2.722925457102672,
"grad_norm": 2.330913543701172,
"learning_rate": 3.68426754921179e-07,
"loss": 1.223,
"step": 969
},
{
"epoch": 2.7257383966244726,
"grad_norm": 2.24187970161438,
"learning_rate": 3.6276294784446e-07,
"loss": 1.0989,
"step": 970
},
{
"epoch": 2.728551336146273,
"grad_norm": 2.3575563430786133,
"learning_rate": 3.5714137428524754e-07,
"loss": 1.2727,
"step": 971
},
{
"epoch": 2.7313642756680734,
"grad_norm": 2.3462178707122803,
"learning_rate": 3.5156208544187554e-07,
"loss": 1.2697,
"step": 972
},
{
"epoch": 2.7341772151898733,
"grad_norm": 2.2106142044067383,
"learning_rate": 3.460251321275759e-07,
"loss": 0.9519,
"step": 973
},
{
"epoch": 2.7369901547116737,
"grad_norm": 1.885840654373169,
"learning_rate": 3.4053056477000856e-07,
"loss": 0.8887,
"step": 974
},
{
"epoch": 2.739803094233474,
"grad_norm": 1.8733952045440674,
"learning_rate": 3.350784334108048e-07,
"loss": 1.1189,
"step": 975
},
{
"epoch": 2.742616033755274,
"grad_norm": 2.0802693367004395,
"learning_rate": 3.2966878770511025e-07,
"loss": 1.0736,
"step": 976
},
{
"epoch": 2.7454289732770745,
"grad_norm": 2.003995656967163,
"learning_rate": 3.24301676921136e-07,
"loss": 0.9954,
"step": 977
},
{
"epoch": 2.748241912798875,
"grad_norm": 1.968119740486145,
"learning_rate": 3.189771499397043e-07,
"loss": 1.0114,
"step": 978
},
{
"epoch": 2.7510548523206753,
"grad_norm": 2.2957983016967773,
"learning_rate": 3.136952552538092e-07,
"loss": 1.1369,
"step": 979
},
{
"epoch": 2.7538677918424757,
"grad_norm": 2.131643772125244,
"learning_rate": 3.084560409681703e-07,
"loss": 1.2212,
"step": 980
},
{
"epoch": 2.7566807313642756,
"grad_norm": 1.8769854307174683,
"learning_rate": 3.0325955479879765e-07,
"loss": 0.94,
"step": 981
},
{
"epoch": 2.759493670886076,
"grad_norm": 1.8766363859176636,
"learning_rate": 2.981058440725559e-07,
"loss": 0.9704,
"step": 982
},
{
"epoch": 2.7623066104078764,
"grad_norm": 2.0633304119110107,
"learning_rate": 2.929949557267331e-07,
"loss": 0.9554,
"step": 983
},
{
"epoch": 2.7651195499296763,
"grad_norm": 2.1459577083587646,
"learning_rate": 2.8792693630861345e-07,
"loss": 1.0209,
"step": 984
},
{
"epoch": 2.7679324894514767,
"grad_norm": 2.0213375091552734,
"learning_rate": 2.829018319750543e-07,
"loss": 1.0121,
"step": 985
},
{
"epoch": 2.770745428973277,
"grad_norm": 2.148283004760742,
"learning_rate": 2.779196884920643e-07,
"loss": 1.1324,
"step": 986
},
{
"epoch": 2.7735583684950775,
"grad_norm": 2.2942779064178467,
"learning_rate": 2.729805512343875e-07,
"loss": 1.3349,
"step": 987
},
{
"epoch": 2.7763713080168775,
"grad_norm": 1.860045075416565,
"learning_rate": 2.6808446518508835e-07,
"loss": 0.9753,
"step": 988
},
{
"epoch": 2.779184247538678,
"grad_norm": 2.135307550430298,
"learning_rate": 2.632314749351483e-07,
"loss": 1.2426,
"step": 989
},
{
"epoch": 2.7819971870604783,
"grad_norm": 2.502941131591797,
"learning_rate": 2.5842162468304845e-07,
"loss": 1.3143,
"step": 990
},
{
"epoch": 2.7848101265822782,
"grad_norm": 1.8326023817062378,
"learning_rate": 2.5365495823437834e-07,
"loss": 1.0144,
"step": 991
},
{
"epoch": 2.7876230661040786,
"grad_norm": 2.351020574569702,
"learning_rate": 2.489315190014291e-07,
"loss": 1.2042,
"step": 992
},
{
"epoch": 2.790436005625879,
"grad_norm": 1.9044114351272583,
"learning_rate": 2.4425135000280374e-07,
"loss": 1.0554,
"step": 993
},
{
"epoch": 2.7932489451476794,
"grad_norm": 2.1605467796325684,
"learning_rate": 2.3961449386302017e-07,
"loss": 1.1091,
"step": 994
},
{
"epoch": 2.79606188466948,
"grad_norm": 1.9160940647125244,
"learning_rate": 2.3502099281212775e-07,
"loss": 0.9543,
"step": 995
},
{
"epoch": 2.7988748241912798,
"grad_norm": 2.0379810333251953,
"learning_rate": 2.3047088868531796e-07,
"loss": 1.0654,
"step": 996
},
{
"epoch": 2.80168776371308,
"grad_norm": 2.0998106002807617,
"learning_rate": 2.2596422292254893e-07,
"loss": 1.1908,
"step": 997
},
{
"epoch": 2.8045007032348805,
"grad_norm": 2.1208677291870117,
"learning_rate": 2.2150103656816357e-07,
"loss": 1.0795,
"step": 998
},
{
"epoch": 2.8073136427566805,
"grad_norm": 2.2069194316864014,
"learning_rate": 2.1708137027051601e-07,
"loss": 1.1354,
"step": 999
},
{
"epoch": 2.810126582278481,
"grad_norm": 2.2347195148468018,
"learning_rate": 2.1270526428160466e-07,
"loss": 1.3928,
"step": 1000
},
{
"epoch": 2.810126582278481,
"eval_loss": 0.63798987865448,
"eval_runtime": 2.8525,
"eval_samples_per_second": 9.115,
"eval_steps_per_second": 1.402,
"step": 1000
},
{
"epoch": 2.810126582278481,
"eval_active_sample_count": 30,
"eval_avg_loss": 599.75,
"eval_avg_mem_token_accuracy": 0.24113475177304963,
"eval_avg_mem_token_gt_count": 9.4,
"eval_avg_mem_token_precision": 0.00790054606715464,
"eval_avg_mem_token_rate": 0.5725404110955897,
"eval_avg_mem_token_recall(Accuracy)": 0.24113475177304963,
"eval_avg_slot_norm_mean": 197.63333333333333,
"eval_avg_slot_sim_mean": 0.996875,
"eval_global_step": 1000,
"eval_loss": 0.63798987865448,
"eval_num_samples": 30,
"eval_runtime": 2.8525,
"eval_samples_per_second": 9.115,
"eval_sim_active_sample_count": 30,
"eval_steps_per_second": 1.402,
"eval_total_correct_count": 68,
"eval_total_gt_mem_token_count": 282,
"eval_total_positions": 15033,
"eval_total_pred_mem_token_count": 8607,
"step": 1000
},
{
"epoch": 2.8129395218002813,
"grad_norm": 2.0307729244232178,
"learning_rate": 2.0837275845670135e-07,
"loss": 1.2427,
"step": 1001
},
{
"epoch": 2.8157524613220817,
"grad_norm": 2.4855947494506836,
"learning_rate": 2.0408389225399339e-07,
"loss": 1.1572,
"step": 1002
},
{
"epoch": 2.818565400843882,
"grad_norm": 2.137430429458618,
"learning_rate": 1.9983870473421761e-07,
"loss": 1.1247,
"step": 1003
},
{
"epoch": 2.821378340365682,
"grad_norm": 1.7523655891418457,
"learning_rate": 1.9563723456031303e-07,
"loss": 1.1162,
"step": 1004
},
{
"epoch": 2.8241912798874824,
"grad_norm": 2.1431448459625244,
"learning_rate": 1.9147951999705928e-07,
"loss": 1.2084,
"step": 1005
},
{
"epoch": 2.827004219409283,
"grad_norm": 2.178713798522949,
"learning_rate": 1.8736559891073703e-07,
"loss": 1.2073,
"step": 1006
},
{
"epoch": 2.8298171589310828,
"grad_norm": 2.0820088386535645,
"learning_rate": 1.8329550876877488e-07,
"loss": 1.1191,
"step": 1007
},
{
"epoch": 2.832630098452883,
"grad_norm": 2.0419578552246094,
"learning_rate": 1.7926928663941635e-07,
"loss": 1.0641,
"step": 1008
},
{
"epoch": 2.8354430379746836,
"grad_norm": 2.2004177570343018,
"learning_rate": 1.7528696919137444e-07,
"loss": 1.3558,
"step": 1009
},
{
"epoch": 2.838255977496484,
"grad_norm": 2.3024518489837646,
"learning_rate": 1.7134859269350546e-07,
"loss": 1.2914,
"step": 1010
},
{
"epoch": 2.8410689170182843,
"grad_norm": 2.0407681465148926,
"learning_rate": 1.6745419301446962e-07,
"loss": 1.0491,
"step": 1011
},
{
"epoch": 2.8438818565400843,
"grad_norm": 2.028738498687744,
"learning_rate": 1.6360380562241428e-07,
"loss": 1.2034,
"step": 1012
},
{
"epoch": 2.8466947960618847,
"grad_norm": 2.436655044555664,
"learning_rate": 1.5979746558464237e-07,
"loss": 1.4506,
"step": 1013
},
{
"epoch": 2.849507735583685,
"grad_norm": 2.0717296600341797,
"learning_rate": 1.5603520756729885e-07,
"loss": 1.1103,
"step": 1014
},
{
"epoch": 2.852320675105485,
"grad_norm": 2.195970058441162,
"learning_rate": 1.5231706583505256e-07,
"loss": 1.2775,
"step": 1015
},
{
"epoch": 2.8551336146272854,
"grad_norm": 2.2911033630371094,
"learning_rate": 1.486430742507833e-07,
"loss": 1.1482,
"step": 1016
},
{
"epoch": 2.857946554149086,
"grad_norm": 2.503101348876953,
"learning_rate": 1.4501326627527513e-07,
"loss": 1.4186,
"step": 1017
},
{
"epoch": 2.8607594936708862,
"grad_norm": 1.9371217489242554,
"learning_rate": 1.4142767496691135e-07,
"loss": 0.9705,
"step": 1018
},
{
"epoch": 2.8635724331926866,
"grad_norm": 2.0493252277374268,
"learning_rate": 1.3788633298137288e-07,
"loss": 0.9959,
"step": 1019
},
{
"epoch": 2.8663853727144866,
"grad_norm": 1.987891674041748,
"learning_rate": 1.3438927257134083e-07,
"loss": 0.9549,
"step": 1020
},
{
"epoch": 2.869198312236287,
"grad_norm": 2.177379608154297,
"learning_rate": 1.3093652558620384e-07,
"loss": 1.1057,
"step": 1021
},
{
"epoch": 2.8720112517580874,
"grad_norm": 1.7878172397613525,
"learning_rate": 1.2752812347176514e-07,
"loss": 0.865,
"step": 1022
},
{
"epoch": 2.8748241912798873,
"grad_norm": 2.258223295211792,
"learning_rate": 1.2416409726996037e-07,
"loss": 1.1227,
"step": 1023
},
{
"epoch": 2.8776371308016877,
"grad_norm": 2.103666067123413,
"learning_rate": 1.2084447761857244e-07,
"loss": 1.1573,
"step": 1024
},
{
"epoch": 2.880450070323488,
"grad_norm": 1.982913851737976,
"learning_rate": 1.1756929475095103e-07,
"loss": 1.0078,
"step": 1025
},
{
"epoch": 2.8832630098452885,
"grad_norm": 1.9436091184616089,
"learning_rate": 1.143385784957407e-07,
"loss": 1.0486,
"step": 1026
},
{
"epoch": 2.8860759493670884,
"grad_norm": 2.438931465148926,
"learning_rate": 1.111523582766072e-07,
"loss": 1.2295,
"step": 1027
},
{
"epoch": 2.888888888888889,
"grad_norm": 1.8638874292373657,
"learning_rate": 1.0801066311196872e-07,
"loss": 1.06,
"step": 1028
},
{
"epoch": 2.8917018284106892,
"grad_norm": 1.9490095376968384,
"learning_rate": 1.0491352161473345e-07,
"loss": 1.0883,
"step": 1029
},
{
"epoch": 2.894514767932489,
"grad_norm": 2.201900005340576,
"learning_rate": 1.018609619920391e-07,
"loss": 0.9764,
"step": 1030
},
{
"epoch": 2.8973277074542896,
"grad_norm": 2.4178552627563477,
"learning_rate": 9.885301204499321e-08,
"loss": 1.2852,
"step": 1031
},
{
"epoch": 2.90014064697609,
"grad_norm": 2.231503486633301,
"learning_rate": 9.588969916842272e-08,
"loss": 1.1528,
"step": 1032
},
{
"epoch": 2.9029535864978904,
"grad_norm": 1.870887041091919,
"learning_rate": 9.297105035062426e-08,
"loss": 1.0726,
"step": 1033
},
{
"epoch": 2.9057665260196908,
"grad_norm": 2.3219852447509766,
"learning_rate": 9.009709217311702e-08,
"loss": 1.1784,
"step": 1034
},
{
"epoch": 2.9085794655414907,
"grad_norm": 2.1292107105255127,
"learning_rate": 8.72678508104008e-08,
"loss": 1.2251,
"step": 1035
},
{
"epoch": 2.911392405063291,
"grad_norm": 2.016449451446533,
"learning_rate": 8.448335202971891e-08,
"loss": 0.9478,
"step": 1036
},
{
"epoch": 2.9142053445850915,
"grad_norm": 2.08313250541687,
"learning_rate": 8.174362119082291e-08,
"loss": 1.0649,
"step": 1037
},
{
"epoch": 2.9170182841068915,
"grad_norm": 2.0640265941619873,
"learning_rate": 7.9048683245741e-08,
"loss": 1.1765,
"step": 1038
},
{
"epoch": 2.919831223628692,
"grad_norm": 2.1048390865325928,
"learning_rate": 7.639856273855106e-08,
"loss": 1.0642,
"step": 1039
},
{
"epoch": 2.9226441631504922,
"grad_norm": 2.1916463375091553,
"learning_rate": 7.379328380515805e-08,
"loss": 1.2419,
"step": 1040
},
{
"epoch": 2.9254571026722926,
"grad_norm": 2.252420425415039,
"learning_rate": 7.123287017307302e-08,
"loss": 1.3343,
"step": 1041
},
{
"epoch": 2.928270042194093,
"grad_norm": 2.1169185638427734,
"learning_rate": 6.871734516119721e-08,
"loss": 1.129,
"step": 1042
},
{
"epoch": 2.931082981715893,
"grad_norm": 2.2315621376037598,
"learning_rate": 6.624673167961004e-08,
"loss": 1.1125,
"step": 1043
},
{
"epoch": 2.9338959212376934,
"grad_norm": 1.8748716115951538,
"learning_rate": 6.382105222936085e-08,
"loss": 1.049,
"step": 1044
},
{
"epoch": 2.9367088607594938,
"grad_norm": 1.9676600694656372,
"learning_rate": 6.144032890226304e-08,
"loss": 1.1791,
"step": 1045
},
{
"epoch": 2.9395218002812937,
"grad_norm": 1.765437126159668,
"learning_rate": 5.910458338069192e-08,
"loss": 0.9795,
"step": 1046
},
{
"epoch": 2.942334739803094,
"grad_norm": 2.3168399333953857,
"learning_rate": 5.6813836937392175e-08,
"loss": 1.1186,
"step": 1047
},
{
"epoch": 2.9451476793248945,
"grad_norm": 2.183238983154297,
"learning_rate": 5.456811043527632e-08,
"loss": 1.1833,
"step": 1048
},
{
"epoch": 2.947960618846695,
"grad_norm": 1.8787195682525635,
"learning_rate": 5.236742432724262e-08,
"loss": 0.9953,
"step": 1049
},
{
"epoch": 2.9507735583684953,
"grad_norm": 2.0316836833953857,
"learning_rate": 5.021179865598136e-08,
"loss": 1.0088,
"step": 1050
},
{
"epoch": 2.9507735583684953,
"eval_loss": 0.6373986005783081,
"eval_runtime": 2.8523,
"eval_samples_per_second": 9.115,
"eval_steps_per_second": 1.402,
"step": 1050
},
{
"epoch": 2.9507735583684953,
"eval_active_sample_count": 30,
"eval_avg_loss": 599.625,
"eval_avg_mem_token_accuracy": 0.2553191489361702,
"eval_avg_mem_token_gt_count": 9.4,
"eval_avg_mem_token_precision": 0.008363340689975607,
"eval_avg_mem_token_rate": 0.5726734517395065,
"eval_avg_mem_token_recall(Accuracy)": 0.2553191489361702,
"eval_avg_slot_norm_mean": 197.63333333333333,
"eval_avg_slot_sim_mean": 0.996875,
"eval_global_step": 1050,
"eval_loss": 0.6373986005783081,
"eval_num_samples": 30,
"eval_runtime": 2.8523,
"eval_samples_per_second": 9.115,
"eval_sim_active_sample_count": 30,
"eval_steps_per_second": 1.402,
"eval_total_correct_count": 72,
"eval_total_gt_mem_token_count": 282,
"eval_total_positions": 15033,
"eval_total_pred_mem_token_count": 8609,
"step": 1050
},
{
"epoch": 2.9535864978902953,
"grad_norm": 2.193411111831665,
"learning_rate": 4.810125305379998e-08,
"loss": 1.086,
"step": 1051
},
{
"epoch": 2.9563994374120957,
"grad_norm": 1.7261470556259155,
"learning_rate": 4.6035806742436575e-08,
"loss": 1.004,
"step": 1052
},
{
"epoch": 2.959212376933896,
"grad_norm": 1.943182110786438,
"learning_rate": 4.4015478532891675e-08,
"loss": 1.1523,
"step": 1053
},
{
"epoch": 2.962025316455696,
"grad_norm": 2.992014169692993,
"learning_rate": 4.20402868252523e-08,
"loss": 1.1195,
"step": 1054
},
{
"epoch": 2.9648382559774964,
"grad_norm": 2.0633037090301514,
"learning_rate": 4.01102496085265e-08,
"loss": 1.1554,
"step": 1055
},
{
"epoch": 2.967651195499297,
"grad_norm": 5.867424964904785,
"learning_rate": 3.822538446047852e-08,
"loss": 1.1499,
"step": 1056
},
{
"epoch": 2.970464135021097,
"grad_norm": 2.3555386066436768,
"learning_rate": 3.6385708547468925e-08,
"loss": 1.296,
"step": 1057
},
{
"epoch": 2.9732770745428976,
"grad_norm": 2.298612594604492,
"learning_rate": 3.4591238624299696e-08,
"loss": 1.1622,
"step": 1058
},
{
"epoch": 2.9760900140646975,
"grad_norm": 2.095074415206909,
"learning_rate": 3.284199103405883e-08,
"loss": 1.0392,
"step": 1059
},
{
"epoch": 2.978902953586498,
"grad_norm": 1.7967655658721924,
"learning_rate": 3.113798170797489e-08,
"loss": 0.8557,
"step": 1060
},
{
"epoch": 2.9817158931082983,
"grad_norm": 2.187788963317871,
"learning_rate": 2.9479226165268216e-08,
"loss": 1.2315,
"step": 1061
},
{
"epoch": 2.9845288326300983,
"grad_norm": 2.0555531978607178,
"learning_rate": 2.7865739513012746e-08,
"loss": 1.0719,
"step": 1062
},
{
"epoch": 2.9873417721518987,
"grad_norm": 2.1727023124694824,
"learning_rate": 2.629753644599664e-08,
"loss": 1.0655,
"step": 1063
},
{
"epoch": 2.990154711673699,
"grad_norm": 2.1658568382263184,
"learning_rate": 2.4774631246589075e-08,
"loss": 1.0773,
"step": 1064
},
{
"epoch": 2.9929676511954995,
"grad_norm": 2.12109112739563,
"learning_rate": 2.3297037784609787e-08,
"loss": 1.1639,
"step": 1065
},
{
"epoch": 2.9957805907173,
"grad_norm": 2.118447780609131,
"learning_rate": 2.1864769517204177e-08,
"loss": 1.1426,
"step": 1066
},
{
"epoch": 2.9985935302391,
"grad_norm": 1.9243059158325195,
"learning_rate": 2.0477839488718398e-08,
"loss": 0.9786,
"step": 1067
},
{
"epoch": 3.0,
"grad_norm": 1.6388542652130127,
"learning_rate": 1.913626033058169e-08,
"loss": 0.5664,
"step": 1068
},
{
"epoch": 3.0028129395218004,
"grad_norm": 1.8314422369003296,
"learning_rate": 1.784004426119257e-08,
"loss": 1.0312,
"step": 1069
},
{
"epoch": 3.0056258790436003,
"grad_norm": 2.122387170791626,
"learning_rate": 1.6589203085804473e-08,
"loss": 1.0936,
"step": 1070
},
{
"epoch": 3.0084388185654007,
"grad_norm": 2.0820372104644775,
"learning_rate": 1.538374819642252e-08,
"loss": 1.0541,
"step": 1071
},
{
"epoch": 3.011251758087201,
"grad_norm": 1.9248408079147339,
"learning_rate": 1.4223690571695815e-08,
"loss": 0.9005,
"step": 1072
},
{
"epoch": 3.0140646976090015,
"grad_norm": 1.9669166803359985,
"learning_rate": 1.3109040776819181e-08,
"loss": 1.1376,
"step": 1073
},
{
"epoch": 3.0168776371308015,
"grad_norm": 1.9701210260391235,
"learning_rate": 1.2039808963437705e-08,
"loss": 1.0197,
"step": 1074
},
{
"epoch": 3.019690576652602,
"grad_norm": 2.451758623123169,
"learning_rate": 1.1016004869551788e-08,
"loss": 1.2066,
"step": 1075
},
{
"epoch": 3.0225035161744023,
"grad_norm": 1.9009047746658325,
"learning_rate": 1.0037637819431123e-08,
"loss": 1.0529,
"step": 1076
},
{
"epoch": 3.0253164556962027,
"grad_norm": 2.054837465286255,
"learning_rate": 9.10471672352864e-09,
"loss": 1.1907,
"step": 1077
},
{
"epoch": 3.0281293952180026,
"grad_norm": 2.181744337081909,
"learning_rate": 8.217250078400018e-09,
"loss": 1.1479,
"step": 1078
},
{
"epoch": 3.030942334739803,
"grad_norm": 2.066051721572876,
"learning_rate": 7.375245966623757e-09,
"loss": 1.2419,
"step": 1079
},
{
"epoch": 3.0337552742616034,
"grad_norm": 2.2346465587615967,
"learning_rate": 6.5787120567317734e-09,
"loss": 0.9984,
"step": 1080
},
{
"epoch": 3.036568213783404,
"grad_norm": 1.9933655261993408,
"learning_rate": 5.827655603135585e-09,
"loss": 1.0698,
"step": 1081
},
{
"epoch": 3.0393811533052038,
"grad_norm": 2.1959750652313232,
"learning_rate": 5.122083446062464e-09,
"loss": 1.1049,
"step": 1082
},
{
"epoch": 3.042194092827004,
"grad_norm": 2.2590200901031494,
"learning_rate": 4.462002011493271e-09,
"loss": 1.1198,
"step": 1083
},
{
"epoch": 3.0450070323488045,
"grad_norm": 2.1988589763641357,
"learning_rate": 3.847417311102497e-09,
"loss": 1.1142,
"step": 1084
},
{
"epoch": 3.047819971870605,
"grad_norm": 2.254117727279663,
"learning_rate": 3.2783349422044197e-09,
"loss": 1.199,
"step": 1085
},
{
"epoch": 3.050632911392405,
"grad_norm": 1.9562636613845825,
"learning_rate": 2.7547600877020355e-09,
"loss": 1.0887,
"step": 1086
},
{
"epoch": 3.0534458509142053,
"grad_norm": 1.9559649229049683,
"learning_rate": 2.276697516039872e-09,
"loss": 1.0819,
"step": 1087
},
{
"epoch": 3.0562587904360057,
"grad_norm": 2.017869472503662,
"learning_rate": 1.8441515811612465e-09,
"loss": 0.9884,
"step": 1088
},
{
"epoch": 3.059071729957806,
"grad_norm": 1.8643865585327148,
"learning_rate": 1.4571262224666315e-09,
"loss": 0.9771,
"step": 1089
},
{
"epoch": 3.061884669479606,
"grad_norm": 2.1424920558929443,
"learning_rate": 1.1156249647797934e-09,
"loss": 1.2107,
"step": 1090
},
{
"epoch": 3.0646976090014064,
"grad_norm": 2.071485757827759,
"learning_rate": 8.196509183139301e-10,
"loss": 0.8257,
"step": 1091
},
{
"epoch": 3.067510548523207,
"grad_norm": 1.8392572402954102,
"learning_rate": 5.692067786455813e-10,
"loss": 1.119,
"step": 1092
},
{
"epoch": 3.070323488045007,
"grad_norm": 2.0427193641662598,
"learning_rate": 3.6429482668853824e-10,
"loss": 1.0698,
"step": 1093
},
{
"epoch": 3.073136427566807,
"grad_norm": 2.2885656356811523,
"learning_rate": 2.0491692867330438e-10,
"loss": 1.4175,
"step": 1094
},
{
"epoch": 3.0759493670886076,
"grad_norm": 2.181267499923706,
"learning_rate": 9.107453612933192e-11,
"loss": 1.0596,
"step": 1095
},
{
"epoch": 3.078762306610408,
"grad_norm": 2.340491533279419,
"learning_rate": 2.2768685873364448e-11,
"loss": 1.1616,
"step": 1096
}
],
"logging_steps": 1,
"max_steps": 1096,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}