{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.078762306610408, "eval_steps": 50, "global_step": 1096, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0028129395218002813, "grad_norm": 75.0997085571289, "learning_rate": 0.0, "loss": 6.7334, "step": 1 }, { "epoch": 0.005625879043600563, "grad_norm": 77.64845275878906, "learning_rate": 1.8181818181818183e-07, "loss": 7.1245, "step": 2 }, { "epoch": 0.008438818565400843, "grad_norm": 72.08865356445312, "learning_rate": 3.6363636363636366e-07, "loss": 7.1721, "step": 3 }, { "epoch": 0.011251758087201125, "grad_norm": 86.44847106933594, "learning_rate": 5.454545454545455e-07, "loss": 7.4339, "step": 4 }, { "epoch": 0.014064697609001406, "grad_norm": 75.51171875, "learning_rate": 7.272727272727273e-07, "loss": 6.9712, "step": 5 }, { "epoch": 0.016877637130801686, "grad_norm": 73.0139389038086, "learning_rate": 9.090909090909091e-07, "loss": 7.0641, "step": 6 }, { "epoch": 0.01969057665260197, "grad_norm": 78.8460464477539, "learning_rate": 1.090909090909091e-06, "loss": 7.3246, "step": 7 }, { "epoch": 0.02250351617440225, "grad_norm": 73.03649139404297, "learning_rate": 1.2727272727272728e-06, "loss": 6.5645, "step": 8 }, { "epoch": 0.02531645569620253, "grad_norm": 83.64058685302734, "learning_rate": 1.4545454545454546e-06, "loss": 7.0709, "step": 9 }, { "epoch": 0.02812939521800281, "grad_norm": 98.82685089111328, "learning_rate": 1.6363636363636365e-06, "loss": 7.9752, "step": 10 }, { "epoch": 0.030942334739803096, "grad_norm": 77.33248901367188, "learning_rate": 1.8181818181818183e-06, "loss": 6.7595, "step": 11 }, { "epoch": 0.03375527426160337, "grad_norm": 75.38268280029297, "learning_rate": 2.0000000000000003e-06, "loss": 7.2824, "step": 12 }, { "epoch": 0.03656821378340366, "grad_norm": 84.62842559814453, "learning_rate": 2.181818181818182e-06, "loss": 7.3766, "step": 13 }, { "epoch": 0.03938115330520394, "grad_norm": 79.66527557373047, "learning_rate": 2.363636363636364e-06, "loss": 6.902, "step": 14 }, { "epoch": 0.04219409282700422, "grad_norm": 88.1312026977539, "learning_rate": 2.5454545454545456e-06, "loss": 7.5326, "step": 15 }, { "epoch": 0.0450070323488045, "grad_norm": 67.1671371459961, "learning_rate": 2.7272727272727272e-06, "loss": 6.6224, "step": 16 }, { "epoch": 0.04781997187060478, "grad_norm": 78.15252685546875, "learning_rate": 2.9090909090909093e-06, "loss": 7.2991, "step": 17 }, { "epoch": 0.05063291139240506, "grad_norm": 89.14740753173828, "learning_rate": 3.090909090909091e-06, "loss": 7.316, "step": 18 }, { "epoch": 0.053445850914205346, "grad_norm": 82.18356323242188, "learning_rate": 3.272727272727273e-06, "loss": 7.2328, "step": 19 }, { "epoch": 0.05625879043600562, "grad_norm": 75.57778930664062, "learning_rate": 3.454545454545455e-06, "loss": 6.8493, "step": 20 }, { "epoch": 0.05907172995780591, "grad_norm": 91.4236068725586, "learning_rate": 3.6363636363636366e-06, "loss": 7.251, "step": 21 }, { "epoch": 0.06188466947960619, "grad_norm": 69.89977264404297, "learning_rate": 3.818181818181819e-06, "loss": 6.1767, "step": 22 }, { "epoch": 0.06469760900140648, "grad_norm": 79.23346710205078, "learning_rate": 4.000000000000001e-06, "loss": 6.5113, "step": 23 }, { "epoch": 0.06751054852320675, "grad_norm": 75.04463958740234, "learning_rate": 4.181818181818182e-06, "loss": 6.3854, "step": 24 }, { "epoch": 0.07032348804500703, "grad_norm": 90.92483520507812, "learning_rate": 4.363636363636364e-06, "loss": 7.383, "step": 25 }, { "epoch": 0.07313642756680731, "grad_norm": 69.97665405273438, "learning_rate": 4.5454545454545455e-06, "loss": 6.1346, "step": 26 }, { "epoch": 0.0759493670886076, "grad_norm": 89.80615234375, "learning_rate": 4.727272727272728e-06, "loss": 6.9564, "step": 27 }, { "epoch": 0.07876230661040788, "grad_norm": 81.48190307617188, "learning_rate": 4.90909090909091e-06, "loss": 6.7021, "step": 28 }, { "epoch": 0.08157524613220815, "grad_norm": 79.94600677490234, "learning_rate": 5.090909090909091e-06, "loss": 5.7781, "step": 29 }, { "epoch": 0.08438818565400844, "grad_norm": 80.19660949707031, "learning_rate": 5.272727272727273e-06, "loss": 5.7521, "step": 30 }, { "epoch": 0.08720112517580872, "grad_norm": 74.73751831054688, "learning_rate": 5.4545454545454545e-06, "loss": 5.2344, "step": 31 }, { "epoch": 0.090014064697609, "grad_norm": 84.3858871459961, "learning_rate": 5.636363636363636e-06, "loss": 5.6553, "step": 32 }, { "epoch": 0.09282700421940929, "grad_norm": 93.59162902832031, "learning_rate": 5.8181818181818185e-06, "loss": 5.0521, "step": 33 }, { "epoch": 0.09563994374120956, "grad_norm": 68.47096252441406, "learning_rate": 6e-06, "loss": 5.225, "step": 34 }, { "epoch": 0.09845288326300984, "grad_norm": 62.65687942504883, "learning_rate": 6.181818181818182e-06, "loss": 4.338, "step": 35 }, { "epoch": 0.10126582278481013, "grad_norm": 66.28219604492188, "learning_rate": 6.363636363636364e-06, "loss": 4.714, "step": 36 }, { "epoch": 0.10407876230661041, "grad_norm": 64.53064727783203, "learning_rate": 6.545454545454546e-06, "loss": 4.8437, "step": 37 }, { "epoch": 0.10689170182841069, "grad_norm": 40.499000549316406, "learning_rate": 6.7272727272727275e-06, "loss": 4.5761, "step": 38 }, { "epoch": 0.10970464135021098, "grad_norm": 40.0291633605957, "learning_rate": 6.90909090909091e-06, "loss": 4.1693, "step": 39 }, { "epoch": 0.11251758087201125, "grad_norm": 19.737794876098633, "learning_rate": 7.0909090909090916e-06, "loss": 3.7602, "step": 40 }, { "epoch": 0.11533052039381153, "grad_norm": 18.727174758911133, "learning_rate": 7.272727272727273e-06, "loss": 3.941, "step": 41 }, { "epoch": 0.11814345991561181, "grad_norm": 15.440817832946777, "learning_rate": 7.454545454545456e-06, "loss": 3.676, "step": 42 }, { "epoch": 0.1209563994374121, "grad_norm": 49.393409729003906, "learning_rate": 7.636363636363638e-06, "loss": 3.5502, "step": 43 }, { "epoch": 0.12376933895921238, "grad_norm": 29.366811752319336, "learning_rate": 7.81818181818182e-06, "loss": 3.174, "step": 44 }, { "epoch": 0.12658227848101267, "grad_norm": 26.623790740966797, "learning_rate": 8.000000000000001e-06, "loss": 3.1249, "step": 45 }, { "epoch": 0.12939521800281295, "grad_norm": 23.555133819580078, "learning_rate": 8.181818181818183e-06, "loss": 3.5312, "step": 46 }, { "epoch": 0.13220815752461323, "grad_norm": 33.85753631591797, "learning_rate": 8.363636363636365e-06, "loss": 3.4659, "step": 47 }, { "epoch": 0.1350210970464135, "grad_norm": 17.72439193725586, "learning_rate": 8.545454545454546e-06, "loss": 2.7741, "step": 48 }, { "epoch": 0.13783403656821377, "grad_norm": 17.903911590576172, "learning_rate": 8.727272727272728e-06, "loss": 3.334, "step": 49 }, { "epoch": 0.14064697609001406, "grad_norm": 15.8783597946167, "learning_rate": 8.90909090909091e-06, "loss": 2.7859, "step": 50 }, { "epoch": 0.14064697609001406, "eval_loss": 1.4928081035614014, "eval_runtime": 2.8043, "eval_samples_per_second": 9.271, "eval_steps_per_second": 1.426, "step": 50 }, { "epoch": 0.14064697609001406, "eval_active_sample_count": 30, "eval_avg_loss": 805.75, "eval_avg_mem_token_accuracy": 0.25177304964539005, "eval_avg_mem_token_gt_count": 9.4, "eval_avg_mem_token_precision": 0.007424448394855171, "eval_avg_mem_token_rate": 0.6361338388877802, "eval_avg_mem_token_recall(Accuracy)": 0.25177304964539005, "eval_avg_slot_norm_mean": 197.63333333333333, "eval_avg_slot_sim_mean": 0.996875, "eval_global_step": 50, "eval_loss": 1.4928081035614014, "eval_num_samples": 30, "eval_runtime": 2.8043, "eval_samples_per_second": 9.271, "eval_sim_active_sample_count": 30, "eval_steps_per_second": 1.426, "eval_total_correct_count": 71, "eval_total_gt_mem_token_count": 282, "eval_total_positions": 15033, "eval_total_pred_mem_token_count": 9563, "step": 50 }, { "epoch": 0.14345991561181434, "grad_norm": 9.468743324279785, "learning_rate": 9.090909090909091e-06, "loss": 2.8193, "step": 51 }, { "epoch": 0.14627285513361463, "grad_norm": 8.735793113708496, "learning_rate": 9.272727272727273e-06, "loss": 2.624, "step": 52 }, { "epoch": 0.1490857946554149, "grad_norm": 11.669722557067871, "learning_rate": 9.454545454545456e-06, "loss": 2.8725, "step": 53 }, { "epoch": 0.1518987341772152, "grad_norm": 16.81114959716797, "learning_rate": 9.636363636363638e-06, "loss": 2.4657, "step": 54 }, { "epoch": 0.15471167369901548, "grad_norm": 19.379348754882812, "learning_rate": 9.81818181818182e-06, "loss": 2.1279, "step": 55 }, { "epoch": 0.15752461322081576, "grad_norm": 13.823864936828613, "learning_rate": 1e-05, "loss": 2.3733, "step": 56 }, { "epoch": 0.16033755274261605, "grad_norm": 14.514190673828125, "learning_rate": 9.999977231314128e-06, "loss": 2.1855, "step": 57 }, { "epoch": 0.1631504922644163, "grad_norm": 24.133705139160156, "learning_rate": 9.99990892546387e-06, "loss": 2.2268, "step": 58 }, { "epoch": 0.1659634317862166, "grad_norm": 13.885165214538574, "learning_rate": 9.999795083071328e-06, "loss": 2.1062, "step": 59 }, { "epoch": 0.16877637130801687, "grad_norm": 6.023658752441406, "learning_rate": 9.999635705173312e-06, "loss": 1.9233, "step": 60 }, { "epoch": 0.17158931082981715, "grad_norm": 5.1499104499816895, "learning_rate": 9.999430793221356e-06, "loss": 2.117, "step": 61 }, { "epoch": 0.17440225035161744, "grad_norm": 5.638373851776123, "learning_rate": 9.999180349081688e-06, "loss": 2.2507, "step": 62 }, { "epoch": 0.17721518987341772, "grad_norm": 5.992455959320068, "learning_rate": 9.998884375035221e-06, "loss": 1.9682, "step": 63 }, { "epoch": 0.180028129395218, "grad_norm": 4.536100387573242, "learning_rate": 9.998542873777534e-06, "loss": 1.955, "step": 64 }, { "epoch": 0.1828410689170183, "grad_norm": 11.286314964294434, "learning_rate": 9.99815584841884e-06, "loss": 2.1629, "step": 65 }, { "epoch": 0.18565400843881857, "grad_norm": 9.133061408996582, "learning_rate": 9.99772330248396e-06, "loss": 1.4339, "step": 66 }, { "epoch": 0.18846694796061886, "grad_norm": 7.25726842880249, "learning_rate": 9.997245239912299e-06, "loss": 2.0025, "step": 67 }, { "epoch": 0.19127988748241911, "grad_norm": 5.315834045410156, "learning_rate": 9.996721665057796e-06, "loss": 1.7737, "step": 68 }, { "epoch": 0.1940928270042194, "grad_norm": 3.770214080810547, "learning_rate": 9.996152582688899e-06, "loss": 1.8984, "step": 69 }, { "epoch": 0.19690576652601968, "grad_norm": 4.797364711761475, "learning_rate": 9.995537997988507e-06, "loss": 2.0319, "step": 70 }, { "epoch": 0.19971870604781997, "grad_norm": 5.449586391448975, "learning_rate": 9.994877916553937e-06, "loss": 1.7875, "step": 71 }, { "epoch": 0.20253164556962025, "grad_norm": 3.06927227973938, "learning_rate": 9.994172344396866e-06, "loss": 1.5467, "step": 72 }, { "epoch": 0.20534458509142053, "grad_norm": 3.089805841445923, "learning_rate": 9.99342128794327e-06, "loss": 1.3562, "step": 73 }, { "epoch": 0.20815752461322082, "grad_norm": 3.4402778148651123, "learning_rate": 9.992624754033377e-06, "loss": 1.7436, "step": 74 }, { "epoch": 0.2109704641350211, "grad_norm": 2.948519706726074, "learning_rate": 9.991782749921601e-06, "loss": 1.5302, "step": 75 }, { "epoch": 0.21378340365682139, "grad_norm": 6.839716911315918, "learning_rate": 9.990895283276472e-06, "loss": 1.6953, "step": 76 }, { "epoch": 0.21659634317862167, "grad_norm": 4.01812219619751, "learning_rate": 9.98996236218057e-06, "loss": 1.7822, "step": 77 }, { "epoch": 0.21940928270042195, "grad_norm": 4.928662300109863, "learning_rate": 9.98898399513045e-06, "loss": 1.4248, "step": 78 }, { "epoch": 0.2222222222222222, "grad_norm": 3.146573305130005, "learning_rate": 9.987960191036564e-06, "loss": 1.6365, "step": 79 }, { "epoch": 0.2250351617440225, "grad_norm": 4.380753993988037, "learning_rate": 9.986890959223181e-06, "loss": 1.7186, "step": 80 }, { "epoch": 0.22784810126582278, "grad_norm": 2.831251621246338, "learning_rate": 9.985776309428306e-06, "loss": 1.4852, "step": 81 }, { "epoch": 0.23066104078762306, "grad_norm": 3.742809772491455, "learning_rate": 9.984616251803577e-06, "loss": 1.5631, "step": 82 }, { "epoch": 0.23347398030942335, "grad_norm": 3.9068987369537354, "learning_rate": 9.983410796914197e-06, "loss": 1.482, "step": 83 }, { "epoch": 0.23628691983122363, "grad_norm": 3.327174663543701, "learning_rate": 9.982159955738808e-06, "loss": 1.608, "step": 84 }, { "epoch": 0.2390998593530239, "grad_norm": 3.083757162094116, "learning_rate": 9.980863739669419e-06, "loss": 1.5167, "step": 85 }, { "epoch": 0.2419127988748242, "grad_norm": 2.9441981315612793, "learning_rate": 9.979522160511282e-06, "loss": 1.6137, "step": 86 }, { "epoch": 0.24472573839662448, "grad_norm": 2.8649449348449707, "learning_rate": 9.978135230482797e-06, "loss": 1.665, "step": 87 }, { "epoch": 0.24753867791842477, "grad_norm": 3.0601882934570312, "learning_rate": 9.97670296221539e-06, "loss": 1.5845, "step": 88 }, { "epoch": 0.25035161744022505, "grad_norm": 4.856632232666016, "learning_rate": 9.975225368753412e-06, "loss": 1.5959, "step": 89 }, { "epoch": 0.25316455696202533, "grad_norm": 3.0896317958831787, "learning_rate": 9.973702463554004e-06, "loss": 1.2724, "step": 90 }, { "epoch": 0.2559774964838256, "grad_norm": 2.862079381942749, "learning_rate": 9.972134260486989e-06, "loss": 1.73, "step": 91 }, { "epoch": 0.2587904360056259, "grad_norm": 2.281548500061035, "learning_rate": 9.970520773834734e-06, "loss": 1.4366, "step": 92 }, { "epoch": 0.2616033755274262, "grad_norm": 2.9218814373016357, "learning_rate": 9.968862018292025e-06, "loss": 1.7853, "step": 93 }, { "epoch": 0.26441631504922647, "grad_norm": 3.361042022705078, "learning_rate": 9.967158008965942e-06, "loss": 1.5868, "step": 94 }, { "epoch": 0.2672292545710267, "grad_norm": 2.6090950965881348, "learning_rate": 9.965408761375702e-06, "loss": 1.6479, "step": 95 }, { "epoch": 0.270042194092827, "grad_norm": 2.4182980060577393, "learning_rate": 9.963614291452532e-06, "loss": 1.4854, "step": 96 }, { "epoch": 0.27285513361462727, "grad_norm": 2.7494289875030518, "learning_rate": 9.961774615539523e-06, "loss": 1.6097, "step": 97 }, { "epoch": 0.27566807313642755, "grad_norm": 3.082038402557373, "learning_rate": 9.959889750391474e-06, "loss": 1.3752, "step": 98 }, { "epoch": 0.27848101265822783, "grad_norm": 3.282862663269043, "learning_rate": 9.957959713174748e-06, "loss": 1.3782, "step": 99 }, { "epoch": 0.2812939521800281, "grad_norm": 2.0881476402282715, "learning_rate": 9.955984521467108e-06, "loss": 1.3952, "step": 100 }, { "epoch": 0.2812939521800281, "eval_loss": 0.7734614014625549, "eval_runtime": 2.8846, "eval_samples_per_second": 9.013, "eval_steps_per_second": 1.387, "step": 100 }, { "epoch": 0.2812939521800281, "eval_active_sample_count": 30, "eval_avg_loss": 638.5, "eval_avg_mem_token_accuracy": 0.24822695035460993, "eval_avg_mem_token_gt_count": 9.4, "eval_avg_mem_token_precision": 0.008693492300049677, "eval_avg_mem_token_rate": 0.5356216324087009, "eval_avg_mem_token_recall(Accuracy)": 0.24822695035460993, "eval_avg_slot_norm_mean": 197.63333333333333, "eval_avg_slot_sim_mean": 0.996875, "eval_global_step": 100, "eval_loss": 0.7734614014625549, "eval_num_samples": 30, "eval_runtime": 2.8846, "eval_samples_per_second": 9.013, "eval_sim_active_sample_count": 30, "eval_steps_per_second": 1.387, "eval_total_correct_count": 70, "eval_total_gt_mem_token_count": 282, "eval_total_positions": 15033, "eval_total_pred_mem_token_count": 8052, "step": 100 }, { "epoch": 0.2841068917018284, "grad_norm": 2.6222341060638428, "learning_rate": 9.953964193257563e-06, "loss": 1.5721, "step": 101 }, { "epoch": 0.2869198312236287, "grad_norm": 2.3316454887390137, "learning_rate": 9.951898746946201e-06, "loss": 1.3596, "step": 102 }, { "epoch": 0.28973277074542897, "grad_norm": 2.174182176589966, "learning_rate": 9.949788201344019e-06, "loss": 1.2779, "step": 103 }, { "epoch": 0.29254571026722925, "grad_norm": 2.538205862045288, "learning_rate": 9.947632575672758e-06, "loss": 1.3406, "step": 104 }, { "epoch": 0.29535864978902954, "grad_norm": 1.902901291847229, "learning_rate": 9.945431889564724e-06, "loss": 1.1408, "step": 105 }, { "epoch": 0.2981715893108298, "grad_norm": 2.382870674133301, "learning_rate": 9.943186163062607e-06, "loss": 1.3498, "step": 106 }, { "epoch": 0.3009845288326301, "grad_norm": 2.490842342376709, "learning_rate": 9.940895416619308e-06, "loss": 1.401, "step": 107 }, { "epoch": 0.3037974683544304, "grad_norm": 2.9286532402038574, "learning_rate": 9.938559671097739e-06, "loss": 1.5762, "step": 108 }, { "epoch": 0.3066104078762307, "grad_norm": 2.838031530380249, "learning_rate": 9.93617894777064e-06, "loss": 1.5001, "step": 109 }, { "epoch": 0.30942334739803096, "grad_norm": 2.0874297618865967, "learning_rate": 9.933753268320391e-06, "loss": 1.3123, "step": 110 }, { "epoch": 0.31223628691983124, "grad_norm": 2.5237607955932617, "learning_rate": 9.931282654838803e-06, "loss": 1.2764, "step": 111 }, { "epoch": 0.3150492264416315, "grad_norm": 2.4033403396606445, "learning_rate": 9.928767129826929e-06, "loss": 1.3374, "step": 112 }, { "epoch": 0.3178621659634318, "grad_norm": 2.2955803871154785, "learning_rate": 9.926206716194842e-06, "loss": 1.3878, "step": 113 }, { "epoch": 0.3206751054852321, "grad_norm": 3.3657052516937256, "learning_rate": 9.92360143726145e-06, "loss": 1.288, "step": 114 }, { "epoch": 0.3234880450070324, "grad_norm": 3.1771109104156494, "learning_rate": 9.920951316754259e-06, "loss": 1.4854, "step": 115 }, { "epoch": 0.3263009845288326, "grad_norm": 2.6639983654022217, "learning_rate": 9.918256378809178e-06, "loss": 1.5049, "step": 116 }, { "epoch": 0.3291139240506329, "grad_norm": 2.107646942138672, "learning_rate": 9.915516647970283e-06, "loss": 1.2783, "step": 117 }, { "epoch": 0.3319268635724332, "grad_norm": 2.307697296142578, "learning_rate": 9.9127321491896e-06, "loss": 1.3444, "step": 118 }, { "epoch": 0.33473980309423346, "grad_norm": 2.160855293273926, "learning_rate": 9.909902907826884e-06, "loss": 1.112, "step": 119 }, { "epoch": 0.33755274261603374, "grad_norm": 2.300719976425171, "learning_rate": 9.907028949649376e-06, "loss": 1.3957, "step": 120 }, { "epoch": 0.340365682137834, "grad_norm": 2.3513684272766113, "learning_rate": 9.904110300831577e-06, "loss": 1.224, "step": 121 }, { "epoch": 0.3431786216596343, "grad_norm": 2.0586118698120117, "learning_rate": 9.901146987955008e-06, "loss": 1.1874, "step": 122 }, { "epoch": 0.3459915611814346, "grad_norm": 2.517422676086426, "learning_rate": 9.898139038007962e-06, "loss": 1.2165, "step": 123 }, { "epoch": 0.3488045007032349, "grad_norm": 2.1542768478393555, "learning_rate": 9.895086478385267e-06, "loss": 1.3451, "step": 124 }, { "epoch": 0.35161744022503516, "grad_norm": 2.022313356399536, "learning_rate": 9.891989336888033e-06, "loss": 1.2169, "step": 125 }, { "epoch": 0.35443037974683544, "grad_norm": 2.6460540294647217, "learning_rate": 9.888847641723394e-06, "loss": 1.4583, "step": 126 }, { "epoch": 0.35724331926863573, "grad_norm": 2.2727549076080322, "learning_rate": 9.88566142150426e-06, "loss": 1.2032, "step": 127 }, { "epoch": 0.360056258790436, "grad_norm": 2.1075050830841064, "learning_rate": 9.88243070524905e-06, "loss": 1.1943, "step": 128 }, { "epoch": 0.3628691983122363, "grad_norm": 2.352522611618042, "learning_rate": 9.87915552238143e-06, "loss": 1.3522, "step": 129 }, { "epoch": 0.3656821378340366, "grad_norm": 2.469947338104248, "learning_rate": 9.87583590273004e-06, "loss": 1.1493, "step": 130 }, { "epoch": 0.36849507735583686, "grad_norm": 2.1671838760375977, "learning_rate": 9.872471876528235e-06, "loss": 1.3792, "step": 131 }, { "epoch": 0.37130801687763715, "grad_norm": 2.235957622528076, "learning_rate": 9.869063474413798e-06, "loss": 1.3672, "step": 132 }, { "epoch": 0.37412095639943743, "grad_norm": 2.241083860397339, "learning_rate": 9.865610727428661e-06, "loss": 1.1784, "step": 133 }, { "epoch": 0.3769338959212377, "grad_norm": 2.1455912590026855, "learning_rate": 9.862113667018628e-06, "loss": 1.2497, "step": 134 }, { "epoch": 0.379746835443038, "grad_norm": 2.49971342086792, "learning_rate": 9.858572325033089e-06, "loss": 1.4471, "step": 135 }, { "epoch": 0.38255977496483823, "grad_norm": 2.6926071643829346, "learning_rate": 9.854986733724724e-06, "loss": 1.1595, "step": 136 }, { "epoch": 0.3853727144866385, "grad_norm": 2.2876596450805664, "learning_rate": 9.851356925749218e-06, "loss": 1.1668, "step": 137 }, { "epoch": 0.3881856540084388, "grad_norm": 2.018536329269409, "learning_rate": 9.847682934164948e-06, "loss": 1.1446, "step": 138 }, { "epoch": 0.3909985935302391, "grad_norm": 2.660203456878662, "learning_rate": 9.843964792432701e-06, "loss": 1.3112, "step": 139 }, { "epoch": 0.39381153305203936, "grad_norm": 2.4841043949127197, "learning_rate": 9.840202534415358e-06, "loss": 1.3684, "step": 140 }, { "epoch": 0.39662447257383965, "grad_norm": 2.1534616947174072, "learning_rate": 9.836396194377587e-06, "loss": 1.2795, "step": 141 }, { "epoch": 0.39943741209563993, "grad_norm": 2.2963688373565674, "learning_rate": 9.832545806985532e-06, "loss": 1.298, "step": 142 }, { "epoch": 0.4022503516174402, "grad_norm": 2.911456346511841, "learning_rate": 9.828651407306495e-06, "loss": 1.3186, "step": 143 }, { "epoch": 0.4050632911392405, "grad_norm": 3.0715761184692383, "learning_rate": 9.824713030808626e-06, "loss": 1.378, "step": 144 }, { "epoch": 0.4078762306610408, "grad_norm": 2.150747537612915, "learning_rate": 9.820730713360585e-06, "loss": 1.1809, "step": 145 }, { "epoch": 0.41068917018284107, "grad_norm": 2.1824264526367188, "learning_rate": 9.816704491231226e-06, "loss": 1.0561, "step": 146 }, { "epoch": 0.41350210970464135, "grad_norm": 2.2817230224609375, "learning_rate": 9.812634401089265e-06, "loss": 1.2782, "step": 147 }, { "epoch": 0.41631504922644164, "grad_norm": 2.196108341217041, "learning_rate": 9.808520480002942e-06, "loss": 1.1196, "step": 148 }, { "epoch": 0.4191279887482419, "grad_norm": 2.3351998329162598, "learning_rate": 9.804362765439688e-06, "loss": 1.4752, "step": 149 }, { "epoch": 0.4219409282700422, "grad_norm": 1.8851360082626343, "learning_rate": 9.800161295265782e-06, "loss": 1.1407, "step": 150 }, { "epoch": 0.4219409282700422, "eval_loss": 0.7094771862030029, "eval_runtime": 2.855, "eval_samples_per_second": 9.107, "eval_steps_per_second": 1.401, "step": 150 }, { "epoch": 0.4219409282700422, "eval_active_sample_count": 30, "eval_avg_loss": 615.25, "eval_avg_mem_token_accuracy": 0.23404255319148937, "eval_avg_mem_token_gt_count": 9.4, "eval_avg_mem_token_precision": 0.007896625987078248, "eval_avg_mem_token_rate": 0.5559768509279585, "eval_avg_mem_token_recall(Accuracy)": 0.23404255319148937, "eval_avg_slot_norm_mean": 197.63333333333333, "eval_avg_slot_sim_mean": 0.996875, "eval_global_step": 150, "eval_loss": 0.7094771862030029, "eval_num_samples": 30, "eval_runtime": 2.855, "eval_samples_per_second": 9.107, "eval_sim_active_sample_count": 30, "eval_steps_per_second": 1.401, "eval_total_correct_count": 66, "eval_total_gt_mem_token_count": 282, "eval_total_positions": 15033, "eval_total_pred_mem_token_count": 8358, "step": 150 }, { "epoch": 0.4247538677918425, "grad_norm": 2.1879961490631104, "learning_rate": 9.795916107746009e-06, "loss": 1.1632, "step": 151 }, { "epoch": 0.42756680731364277, "grad_norm": 2.7381277084350586, "learning_rate": 9.7916272415433e-06, "loss": 1.3305, "step": 152 }, { "epoch": 0.43037974683544306, "grad_norm": 2.1921334266662598, "learning_rate": 9.787294735718397e-06, "loss": 1.1759, "step": 153 }, { "epoch": 0.43319268635724334, "grad_norm": 2.2524077892303467, "learning_rate": 9.782918629729486e-06, "loss": 1.1278, "step": 154 }, { "epoch": 0.4360056258790436, "grad_norm": 2.3991479873657227, "learning_rate": 9.778498963431838e-06, "loss": 1.2304, "step": 155 }, { "epoch": 0.4388185654008439, "grad_norm": 2.4503281116485596, "learning_rate": 9.774035777077452e-06, "loss": 1.3168, "step": 156 }, { "epoch": 0.44163150492264414, "grad_norm": 2.1630754470825195, "learning_rate": 9.769529111314683e-06, "loss": 1.1698, "step": 157 }, { "epoch": 0.4444444444444444, "grad_norm": 2.1806483268737793, "learning_rate": 9.764979007187874e-06, "loss": 1.1485, "step": 158 }, { "epoch": 0.4472573839662447, "grad_norm": 2.1980652809143066, "learning_rate": 9.760385506136982e-06, "loss": 1.3419, "step": 159 }, { "epoch": 0.450070323488045, "grad_norm": 4.968358039855957, "learning_rate": 9.755748649997197e-06, "loss": 1.19, "step": 160 }, { "epoch": 0.45288326300984527, "grad_norm": 2.15004563331604, "learning_rate": 9.751068480998572e-06, "loss": 1.2162, "step": 161 }, { "epoch": 0.45569620253164556, "grad_norm": 2.2927024364471436, "learning_rate": 9.746345041765624e-06, "loss": 1.2539, "step": 162 }, { "epoch": 0.45850914205344584, "grad_norm": 2.2658493518829346, "learning_rate": 9.741578375316953e-06, "loss": 1.4352, "step": 163 }, { "epoch": 0.4613220815752461, "grad_norm": 2.3411777019500732, "learning_rate": 9.736768525064852e-06, "loss": 1.4317, "step": 164 }, { "epoch": 0.4641350210970464, "grad_norm": 2.0097508430480957, "learning_rate": 9.731915534814912e-06, "loss": 1.1761, "step": 165 }, { "epoch": 0.4669479606188467, "grad_norm": 2.312138080596924, "learning_rate": 9.727019448765613e-06, "loss": 1.2183, "step": 166 }, { "epoch": 0.469760900140647, "grad_norm": 2.3369953632354736, "learning_rate": 9.722080311507938e-06, "loss": 1.3209, "step": 167 }, { "epoch": 0.47257383966244726, "grad_norm": 2.1543290615081787, "learning_rate": 9.717098168024948e-06, "loss": 1.2806, "step": 168 }, { "epoch": 0.47538677918424754, "grad_norm": 2.3597400188446045, "learning_rate": 9.712073063691388e-06, "loss": 1.2461, "step": 169 }, { "epoch": 0.4781997187060478, "grad_norm": 2.410320520401001, "learning_rate": 9.707005044273268e-06, "loss": 1.3153, "step": 170 }, { "epoch": 0.4810126582278481, "grad_norm": 2.5447475910186768, "learning_rate": 9.701894155927445e-06, "loss": 1.3782, "step": 171 }, { "epoch": 0.4838255977496484, "grad_norm": 2.600811004638672, "learning_rate": 9.696740445201202e-06, "loss": 1.5061, "step": 172 }, { "epoch": 0.4866385372714487, "grad_norm": 2.225473642349243, "learning_rate": 9.691543959031831e-06, "loss": 1.3204, "step": 173 }, { "epoch": 0.48945147679324896, "grad_norm": 2.2354350090026855, "learning_rate": 9.68630474474619e-06, "loss": 1.3342, "step": 174 }, { "epoch": 0.49226441631504925, "grad_norm": 2.4795658588409424, "learning_rate": 9.681022850060297e-06, "loss": 1.2004, "step": 175 }, { "epoch": 0.49507735583684953, "grad_norm": 2.111879348754883, "learning_rate": 9.675698323078865e-06, "loss": 1.0086, "step": 176 }, { "epoch": 0.4978902953586498, "grad_norm": 2.0163023471832275, "learning_rate": 9.67033121229489e-06, "loss": 1.0946, "step": 177 }, { "epoch": 0.5007032348804501, "grad_norm": 2.2219393253326416, "learning_rate": 9.664921566589195e-06, "loss": 1.3935, "step": 178 }, { "epoch": 0.5035161744022504, "grad_norm": 2.128089189529419, "learning_rate": 9.659469435229992e-06, "loss": 1.1659, "step": 179 }, { "epoch": 0.5063291139240507, "grad_norm": 2.5307302474975586, "learning_rate": 9.653974867872424e-06, "loss": 1.1473, "step": 180 }, { "epoch": 0.509142053445851, "grad_norm": 2.2050728797912598, "learning_rate": 9.648437914558126e-06, "loss": 1.3126, "step": 181 }, { "epoch": 0.5119549929676512, "grad_norm": 2.1602675914764404, "learning_rate": 9.642858625714753e-06, "loss": 1.0508, "step": 182 }, { "epoch": 0.5147679324894515, "grad_norm": 2.3411359786987305, "learning_rate": 9.637237052155541e-06, "loss": 1.2805, "step": 183 }, { "epoch": 0.5175808720112518, "grad_norm": 2.3061892986297607, "learning_rate": 9.631573245078823e-06, "loss": 1.324, "step": 184 }, { "epoch": 0.5203938115330521, "grad_norm": 2.0462026596069336, "learning_rate": 9.625867256067577e-06, "loss": 1.2376, "step": 185 }, { "epoch": 0.5232067510548524, "grad_norm": 2.2104408740997314, "learning_rate": 9.620119137088954e-06, "loss": 1.2963, "step": 186 }, { "epoch": 0.5260196905766527, "grad_norm": 2.5065929889678955, "learning_rate": 9.614328940493797e-06, "loss": 1.3735, "step": 187 }, { "epoch": 0.5288326300984529, "grad_norm": 2.349320888519287, "learning_rate": 9.608496719016176e-06, "loss": 1.2742, "step": 188 }, { "epoch": 0.5316455696202531, "grad_norm": 2.519850730895996, "learning_rate": 9.602622525772895e-06, "loss": 1.4212, "step": 189 }, { "epoch": 0.5344585091420534, "grad_norm": 2.0543527603149414, "learning_rate": 9.596706414263022e-06, "loss": 1.1391, "step": 190 }, { "epoch": 0.5372714486638537, "grad_norm": 2.289496898651123, "learning_rate": 9.59074843836739e-06, "loss": 1.2401, "step": 191 }, { "epoch": 0.540084388185654, "grad_norm": 2.350924491882324, "learning_rate": 9.584748652348107e-06, "loss": 1.3712, "step": 192 }, { "epoch": 0.5428973277074542, "grad_norm": 2.23681640625, "learning_rate": 9.578707110848077e-06, "loss": 1.1505, "step": 193 }, { "epoch": 0.5457102672292545, "grad_norm": 2.008516788482666, "learning_rate": 9.572623868890482e-06, "loss": 1.0241, "step": 194 }, { "epoch": 0.5485232067510548, "grad_norm": 2.3972671031951904, "learning_rate": 9.566498981878289e-06, "loss": 1.4334, "step": 195 }, { "epoch": 0.5513361462728551, "grad_norm": 1.9378750324249268, "learning_rate": 9.560332505593754e-06, "loss": 1.0679, "step": 196 }, { "epoch": 0.5541490857946554, "grad_norm": 2.3928143978118896, "learning_rate": 9.554124496197899e-06, "loss": 1.0903, "step": 197 }, { "epoch": 0.5569620253164557, "grad_norm": 2.4164905548095703, "learning_rate": 9.547875010230009e-06, "loss": 1.3779, "step": 198 }, { "epoch": 0.559774964838256, "grad_norm": 2.0729787349700928, "learning_rate": 9.54158410460712e-06, "loss": 1.114, "step": 199 }, { "epoch": 0.5625879043600562, "grad_norm": 1.9305024147033691, "learning_rate": 9.535251836623491e-06, "loss": 1.1579, "step": 200 }, { "epoch": 0.5625879043600562, "eval_loss": 0.6872708797454834, "eval_runtime": 2.8553, "eval_samples_per_second": 9.106, "eval_steps_per_second": 1.401, "step": 200 }, { "epoch": 0.5625879043600562, "eval_active_sample_count": 30, "eval_avg_loss": 608.5, "eval_avg_mem_token_accuracy": 0.2198581560283688, "eval_avg_mem_token_gt_count": 9.4, "eval_avg_mem_token_precision": 0.007418930238123729, "eval_avg_mem_token_rate": 0.5559103306060001, "eval_avg_mem_token_recall(Accuracy)": 0.2198581560283688, "eval_avg_slot_norm_mean": 197.63333333333333, "eval_avg_slot_sim_mean": 0.996875, "eval_global_step": 200, "eval_loss": 0.6872708797454834, "eval_num_samples": 30, "eval_runtime": 2.8553, "eval_samples_per_second": 9.106, "eval_sim_active_sample_count": 30, "eval_steps_per_second": 1.401, "eval_total_correct_count": 62, "eval_total_gt_mem_token_count": 282, "eval_total_positions": 15033, "eval_total_pred_mem_token_count": 8357, "step": 200 }, { "epoch": 0.5654008438818565, "grad_norm": 2.2860162258148193, "learning_rate": 9.528878263950094e-06, "loss": 1.2892, "step": 201 }, { "epoch": 0.5682137834036568, "grad_norm": 2.314282178878784, "learning_rate": 9.522463444634075e-06, "loss": 1.0782, "step": 202 }, { "epoch": 0.5710267229254571, "grad_norm": 15.191813468933105, "learning_rate": 9.516007437098238e-06, "loss": 1.2559, "step": 203 }, { "epoch": 0.5738396624472574, "grad_norm": 1.9443162679672241, "learning_rate": 9.509510300140506e-06, "loss": 0.8679, "step": 204 }, { "epoch": 0.5766526019690577, "grad_norm": 2.5310826301574707, "learning_rate": 9.502972092933384e-06, "loss": 1.2779, "step": 205 }, { "epoch": 0.5794655414908579, "grad_norm": 2.4394469261169434, "learning_rate": 9.496392875023433e-06, "loss": 1.1331, "step": 206 }, { "epoch": 0.5822784810126582, "grad_norm": 2.40698504447937, "learning_rate": 9.489772706330707e-06, "loss": 1.4669, "step": 207 }, { "epoch": 0.5850914205344585, "grad_norm": 2.0934903621673584, "learning_rate": 9.483111647148223e-06, "loss": 1.2372, "step": 208 }, { "epoch": 0.5879043600562588, "grad_norm": 2.2789113521575928, "learning_rate": 9.476409758141404e-06, "loss": 1.3838, "step": 209 }, { "epoch": 0.5907172995780591, "grad_norm": 2.0439610481262207, "learning_rate": 9.469667100347539e-06, "loss": 1.1897, "step": 210 }, { "epoch": 0.5935302390998594, "grad_norm": 2.5594871044158936, "learning_rate": 9.462883735175205e-06, "loss": 1.2361, "step": 211 }, { "epoch": 0.5963431786216596, "grad_norm": 2.417461395263672, "learning_rate": 9.45605972440373e-06, "loss": 1.3818, "step": 212 }, { "epoch": 0.5991561181434599, "grad_norm": 2.030989170074463, "learning_rate": 9.449195130182614e-06, "loss": 1.2072, "step": 213 }, { "epoch": 0.6019690576652602, "grad_norm": 1.9220385551452637, "learning_rate": 9.442290015030974e-06, "loss": 1.1057, "step": 214 }, { "epoch": 0.6047819971870605, "grad_norm": 2.4362001419067383, "learning_rate": 9.43534444183697e-06, "loss": 1.3472, "step": 215 }, { "epoch": 0.6075949367088608, "grad_norm": 1.9925367832183838, "learning_rate": 9.42835847385723e-06, "loss": 1.2851, "step": 216 }, { "epoch": 0.6104078762306611, "grad_norm": 2.3182199001312256, "learning_rate": 9.42133217471628e-06, "loss": 1.2026, "step": 217 }, { "epoch": 0.6132208157524613, "grad_norm": 2.7779831886291504, "learning_rate": 9.414265608405956e-06, "loss": 1.2488, "step": 218 }, { "epoch": 0.6160337552742616, "grad_norm": 2.6299376487731934, "learning_rate": 9.407158839284836e-06, "loss": 1.3019, "step": 219 }, { "epoch": 0.6188466947960619, "grad_norm": 3.4749839305877686, "learning_rate": 9.40001193207763e-06, "loss": 1.4892, "step": 220 }, { "epoch": 0.6216596343178622, "grad_norm": 2.2574360370635986, "learning_rate": 9.392824951874618e-06, "loss": 1.2897, "step": 221 }, { "epoch": 0.6244725738396625, "grad_norm": 2.16740083694458, "learning_rate": 9.385597964131033e-06, "loss": 1.2792, "step": 222 }, { "epoch": 0.6272855133614628, "grad_norm": 2.0155792236328125, "learning_rate": 9.378331034666483e-06, "loss": 1.2584, "step": 223 }, { "epoch": 0.630098452883263, "grad_norm": 2.4452121257781982, "learning_rate": 9.371024229664342e-06, "loss": 1.4524, "step": 224 }, { "epoch": 0.6329113924050633, "grad_norm": 2.295438766479492, "learning_rate": 9.363677615671148e-06, "loss": 1.2677, "step": 225 }, { "epoch": 0.6357243319268636, "grad_norm": 2.1375696659088135, "learning_rate": 9.356291259596e-06, "loss": 1.265, "step": 226 }, { "epoch": 0.6385372714486639, "grad_norm": 2.3946800231933594, "learning_rate": 9.348865228709947e-06, "loss": 1.3528, "step": 227 }, { "epoch": 0.6413502109704642, "grad_norm": 2.332805871963501, "learning_rate": 9.341399590645373e-06, "loss": 1.3119, "step": 228 }, { "epoch": 0.6441631504922645, "grad_norm": 2.3480770587921143, "learning_rate": 9.333894413395388e-06, "loss": 1.33, "step": 229 }, { "epoch": 0.6469760900140648, "grad_norm": 2.432349681854248, "learning_rate": 9.326349765313199e-06, "loss": 1.1957, "step": 230 }, { "epoch": 0.6497890295358649, "grad_norm": 2.0219781398773193, "learning_rate": 9.318765715111497e-06, "loss": 1.2202, "step": 231 }, { "epoch": 0.6526019690576652, "grad_norm": 2.8865296840667725, "learning_rate": 9.311142331861821e-06, "loss": 1.5149, "step": 232 }, { "epoch": 0.6554149085794655, "grad_norm": 2.1823160648345947, "learning_rate": 9.303479684993943e-06, "loss": 1.2677, "step": 233 }, { "epoch": 0.6582278481012658, "grad_norm": 2.011133909225464, "learning_rate": 9.295777844295219e-06, "loss": 1.0202, "step": 234 }, { "epoch": 0.6610407876230661, "grad_norm": 2.2680437564849854, "learning_rate": 9.288036879909967e-06, "loss": 1.2755, "step": 235 }, { "epoch": 0.6638537271448663, "grad_norm": 2.297574520111084, "learning_rate": 9.280256862338822e-06, "loss": 1.2567, "step": 236 }, { "epoch": 0.6666666666666666, "grad_norm": 2.2774109840393066, "learning_rate": 9.272437862438095e-06, "loss": 1.1645, "step": 237 }, { "epoch": 0.6694796061884669, "grad_norm": 2.4613051414489746, "learning_rate": 9.264579951419126e-06, "loss": 1.3841, "step": 238 }, { "epoch": 0.6722925457102672, "grad_norm": 2.2511165142059326, "learning_rate": 9.256683200847638e-06, "loss": 1.2692, "step": 239 }, { "epoch": 0.6751054852320675, "grad_norm": 2.209132432937622, "learning_rate": 9.248747682643085e-06, "loss": 1.2905, "step": 240 }, { "epoch": 0.6779184247538678, "grad_norm": 2.3346107006073, "learning_rate": 9.240773469077994e-06, "loss": 1.189, "step": 241 }, { "epoch": 0.680731364275668, "grad_norm": 2.3697586059570312, "learning_rate": 9.232760632777311e-06, "loss": 1.236, "step": 242 }, { "epoch": 0.6835443037974683, "grad_norm": 2.7163619995117188, "learning_rate": 9.22470924671774e-06, "loss": 1.3411, "step": 243 }, { "epoch": 0.6863572433192686, "grad_norm": 2.210554838180542, "learning_rate": 9.216619384227068e-06, "loss": 1.2791, "step": 244 }, { "epoch": 0.6891701828410689, "grad_norm": 2.2112317085266113, "learning_rate": 9.208491118983515e-06, "loss": 1.2984, "step": 245 }, { "epoch": 0.6919831223628692, "grad_norm": 2.247898817062378, "learning_rate": 9.200324525015046e-06, "loss": 1.2766, "step": 246 }, { "epoch": 0.6947960618846695, "grad_norm": 2.2993924617767334, "learning_rate": 9.192119676698703e-06, "loss": 1.1908, "step": 247 }, { "epoch": 0.6976090014064698, "grad_norm": 2.4729530811309814, "learning_rate": 9.183876648759937e-06, "loss": 1.364, "step": 248 }, { "epoch": 0.70042194092827, "grad_norm": 2.201533794403076, "learning_rate": 9.175595516271911e-06, "loss": 1.344, "step": 249 }, { "epoch": 0.7032348804500703, "grad_norm": 2.3106961250305176, "learning_rate": 9.167276354654827e-06, "loss": 1.313, "step": 250 }, { "epoch": 0.7032348804500703, "eval_loss": 0.6741299033164978, "eval_runtime": 2.8499, "eval_samples_per_second": 9.123, "eval_steps_per_second": 1.404, "step": 250 }, { "epoch": 0.7032348804500703, "eval_active_sample_count": 30, "eval_avg_loss": 590.625, "eval_avg_mem_token_accuracy": 0.23049645390070922, "eval_avg_mem_token_gt_count": 9.4, "eval_avg_mem_token_precision": 0.007840772014475271, "eval_avg_mem_token_rate": 0.5514534690347901, "eval_avg_mem_token_recall(Accuracy)": 0.23049645390070922, "eval_avg_slot_norm_mean": 197.63333333333333, "eval_avg_slot_sim_mean": 0.996875, "eval_global_step": 250, "eval_loss": 0.6741299033164978, "eval_num_samples": 30, "eval_runtime": 2.8499, "eval_samples_per_second": 9.123, "eval_sim_active_sample_count": 30, "eval_steps_per_second": 1.404, "eval_total_correct_count": 65, "eval_total_gt_mem_token_count": 282, "eval_total_positions": 15033, "eval_total_pred_mem_token_count": 8290, "step": 250 }, { "epoch": 0.7060478199718706, "grad_norm": 2.2657763957977295, "learning_rate": 9.158919239675237e-06, "loss": 0.9924, "step": 251 }, { "epoch": 0.7088607594936709, "grad_norm": 2.8294458389282227, "learning_rate": 9.150524247445346e-06, "loss": 1.5447, "step": 252 }, { "epoch": 0.7116736990154712, "grad_norm": 2.327502489089966, "learning_rate": 9.14209145442234e-06, "loss": 1.3784, "step": 253 }, { "epoch": 0.7144866385372715, "grad_norm": 2.2193102836608887, "learning_rate": 9.133620937407656e-06, "loss": 1.2874, "step": 254 }, { "epoch": 0.7172995780590717, "grad_norm": 2.400413990020752, "learning_rate": 9.125112773546315e-06, "loss": 1.2711, "step": 255 }, { "epoch": 0.720112517580872, "grad_norm": 2.1976544857025146, "learning_rate": 9.1165670403262e-06, "loss": 1.399, "step": 256 }, { "epoch": 0.7229254571026723, "grad_norm": 2.2996156215667725, "learning_rate": 9.107983815577359e-06, "loss": 1.4082, "step": 257 }, { "epoch": 0.7257383966244726, "grad_norm": 2.307288408279419, "learning_rate": 9.09936317747129e-06, "loss": 1.275, "step": 258 }, { "epoch": 0.7285513361462729, "grad_norm": 2.204585552215576, "learning_rate": 9.090705204520231e-06, "loss": 1.3542, "step": 259 }, { "epoch": 0.7313642756680732, "grad_norm": 2.3391809463500977, "learning_rate": 9.082009975576452e-06, "loss": 1.231, "step": 260 }, { "epoch": 0.7341772151898734, "grad_norm": 2.5154929161071777, "learning_rate": 9.073277569831526e-06, "loss": 1.3549, "step": 261 }, { "epoch": 0.7369901547116737, "grad_norm": 2.1306750774383545, "learning_rate": 9.064508066815614e-06, "loss": 1.1, "step": 262 }, { "epoch": 0.739803094233474, "grad_norm": 1.9493396282196045, "learning_rate": 9.05570154639674e-06, "loss": 1.0767, "step": 263 }, { "epoch": 0.7426160337552743, "grad_norm": 2.2229723930358887, "learning_rate": 9.046858088780064e-06, "loss": 1.1945, "step": 264 }, { "epoch": 0.7454289732770746, "grad_norm": 2.0410044193267822, "learning_rate": 9.03797777450715e-06, "loss": 1.2284, "step": 265 }, { "epoch": 0.7482419127988749, "grad_norm": 2.533954381942749, "learning_rate": 9.02906068445523e-06, "loss": 1.4345, "step": 266 }, { "epoch": 0.7510548523206751, "grad_norm": 2.324066162109375, "learning_rate": 9.020106899836471e-06, "loss": 1.2716, "step": 267 }, { "epoch": 0.7538677918424754, "grad_norm": 2.0535366535186768, "learning_rate": 9.011116502197243e-06, "loss": 1.1823, "step": 268 }, { "epoch": 0.7566807313642757, "grad_norm": 2.3328094482421875, "learning_rate": 9.002089573417356e-06, "loss": 1.2959, "step": 269 }, { "epoch": 0.759493670886076, "grad_norm": 2.3262429237365723, "learning_rate": 8.993026195709337e-06, "loss": 0.965, "step": 270 }, { "epoch": 0.7623066104078763, "grad_norm": 2.247913122177124, "learning_rate": 8.983926451617664e-06, "loss": 1.291, "step": 271 }, { "epoch": 0.7651195499296765, "grad_norm": 2.140726089477539, "learning_rate": 8.974790424018022e-06, "loss": 1.2708, "step": 272 }, { "epoch": 0.7679324894514767, "grad_norm": 2.0828731060028076, "learning_rate": 8.96561819611655e-06, "loss": 1.2937, "step": 273 }, { "epoch": 0.770745428973277, "grad_norm": 2.237555742263794, "learning_rate": 8.956409851449076e-06, "loss": 1.1241, "step": 274 }, { "epoch": 0.7735583684950773, "grad_norm": 1.906575083732605, "learning_rate": 8.947165473880364e-06, "loss": 1.0149, "step": 275 }, { "epoch": 0.7763713080168776, "grad_norm": 2.204448699951172, "learning_rate": 8.937885147603345e-06, "loss": 1.2036, "step": 276 }, { "epoch": 0.7791842475386779, "grad_norm": 2.151160717010498, "learning_rate": 8.928568957138356e-06, "loss": 1.2992, "step": 277 }, { "epoch": 0.7819971870604782, "grad_norm": 2.286642551422119, "learning_rate": 8.919216987332357e-06, "loss": 1.2701, "step": 278 }, { "epoch": 0.7848101265822784, "grad_norm": 3.3560984134674072, "learning_rate": 8.909829323358177e-06, "loss": 1.3486, "step": 279 }, { "epoch": 0.7876230661040787, "grad_norm": 1.9844144582748413, "learning_rate": 8.900406050713723e-06, "loss": 0.967, "step": 280 }, { "epoch": 0.790436005625879, "grad_norm": 2.1631999015808105, "learning_rate": 8.89094725522121e-06, "loss": 1.2139, "step": 281 }, { "epoch": 0.7932489451476793, "grad_norm": 2.1446194648742676, "learning_rate": 8.881453023026373e-06, "loss": 1.2743, "step": 282 }, { "epoch": 0.7960618846694796, "grad_norm": 1.9020416736602783, "learning_rate": 8.871923440597694e-06, "loss": 1.0834, "step": 283 }, { "epoch": 0.7988748241912799, "grad_norm": 2.1618247032165527, "learning_rate": 8.862358594725595e-06, "loss": 1.151, "step": 284 }, { "epoch": 0.8016877637130801, "grad_norm": 2.3456199169158936, "learning_rate": 8.852758572521666e-06, "loss": 1.206, "step": 285 }, { "epoch": 0.8045007032348804, "grad_norm": 2.2839531898498535, "learning_rate": 8.843123461417864e-06, "loss": 1.248, "step": 286 }, { "epoch": 0.8073136427566807, "grad_norm": 2.277515411376953, "learning_rate": 8.833453349165713e-06, "loss": 1.3061, "step": 287 }, { "epoch": 0.810126582278481, "grad_norm": 2.3145205974578857, "learning_rate": 8.823748323835517e-06, "loss": 1.4309, "step": 288 }, { "epoch": 0.8129395218002813, "grad_norm": 2.298470973968506, "learning_rate": 8.814008473815542e-06, "loss": 1.1581, "step": 289 }, { "epoch": 0.8157524613220816, "grad_norm": 2.4578652381896973, "learning_rate": 8.804233887811224e-06, "loss": 1.328, "step": 290 }, { "epoch": 0.8185654008438819, "grad_norm": 2.162040948867798, "learning_rate": 8.794424654844352e-06, "loss": 1.041, "step": 291 }, { "epoch": 0.8213783403656821, "grad_norm": 2.1940865516662598, "learning_rate": 8.784580864252266e-06, "loss": 1.2024, "step": 292 }, { "epoch": 0.8241912798874824, "grad_norm": 2.127418041229248, "learning_rate": 8.774702605687036e-06, "loss": 1.1357, "step": 293 }, { "epoch": 0.8270042194092827, "grad_norm": 2.259040355682373, "learning_rate": 8.764789969114647e-06, "loss": 1.2494, "step": 294 }, { "epoch": 0.829817158931083, "grad_norm": 2.398115634918213, "learning_rate": 8.754843044814183e-06, "loss": 1.3409, "step": 295 }, { "epoch": 0.8326300984528833, "grad_norm": 1.94135320186615, "learning_rate": 8.744861923377e-06, "loss": 1.0011, "step": 296 }, { "epoch": 0.8354430379746836, "grad_norm": 2.3360581398010254, "learning_rate": 8.734846695705912e-06, "loss": 1.3973, "step": 297 }, { "epoch": 0.8382559774964838, "grad_norm": 2.0555343627929688, "learning_rate": 8.724797453014342e-06, "loss": 1.0796, "step": 298 }, { "epoch": 0.8410689170182841, "grad_norm": 2.26999831199646, "learning_rate": 8.714714286825512e-06, "loss": 1.2569, "step": 299 }, { "epoch": 0.8438818565400844, "grad_norm": 2.004324197769165, "learning_rate": 8.704597288971598e-06, "loss": 1.1934, "step": 300 }, { "epoch": 0.8438818565400844, "eval_loss": 0.6666268110275269, "eval_runtime": 2.761, "eval_samples_per_second": 9.417, "eval_steps_per_second": 1.449, "step": 300 }, { "epoch": 0.8438818565400844, "eval_active_sample_count": 30, "eval_avg_loss": 590.375, "eval_avg_mem_token_accuracy": 0.22340425531914893, "eval_avg_mem_token_gt_count": 9.4, "eval_avg_mem_token_precision": 0.007591276057356308, "eval_avg_mem_token_rate": 0.5520521519324153, "eval_avg_mem_token_recall(Accuracy)": 0.22340425531914893, "eval_avg_slot_norm_mean": 197.63333333333333, "eval_avg_slot_sim_mean": 0.996875, "eval_global_step": 300, "eval_loss": 0.6666268110275269, "eval_num_samples": 30, "eval_runtime": 2.761, "eval_samples_per_second": 9.417, "eval_sim_active_sample_count": 30, "eval_steps_per_second": 1.449, "eval_total_correct_count": 63, "eval_total_gt_mem_token_count": 282, "eval_total_positions": 15033, "eval_total_pred_mem_token_count": 8299, "step": 300 }, { "epoch": 0.8466947960618847, "grad_norm": 2.1731441020965576, "learning_rate": 8.6944465515929e-06, "loss": 1.1642, "step": 301 }, { "epoch": 0.849507735583685, "grad_norm": 1.9805549383163452, "learning_rate": 8.684262167136999e-06, "loss": 1.1963, "step": 302 }, { "epoch": 0.8523206751054853, "grad_norm": 1.985160231590271, "learning_rate": 8.674044228357915e-06, "loss": 1.0271, "step": 303 }, { "epoch": 0.8551336146272855, "grad_norm": 2.233934164047241, "learning_rate": 8.663792828315259e-06, "loss": 1.3379, "step": 304 }, { "epoch": 0.8579465541490858, "grad_norm": 2.1742870807647705, "learning_rate": 8.6535080603734e-06, "loss": 1.2982, "step": 305 }, { "epoch": 0.8607594936708861, "grad_norm": 2.2393639087677, "learning_rate": 8.643190018200595e-06, "loss": 1.2925, "step": 306 }, { "epoch": 0.8635724331926864, "grad_norm": 2.395679473876953, "learning_rate": 8.632838795768149e-06, "loss": 1.3027, "step": 307 }, { "epoch": 0.8663853727144867, "grad_norm": 1.976331353187561, "learning_rate": 8.622454487349556e-06, "loss": 1.1242, "step": 308 }, { "epoch": 0.869198312236287, "grad_norm": 2.1286044120788574, "learning_rate": 8.612037187519635e-06, "loss": 1.1868, "step": 309 }, { "epoch": 0.8720112517580872, "grad_norm": 2.2224793434143066, "learning_rate": 8.601586991153681e-06, "loss": 1.2595, "step": 310 }, { "epoch": 0.8748241912798875, "grad_norm": 2.282410144805908, "learning_rate": 8.591103993426588e-06, "loss": 1.1068, "step": 311 }, { "epoch": 0.8776371308016878, "grad_norm": 2.000074625015259, "learning_rate": 8.580588289811987e-06, "loss": 1.1547, "step": 312 }, { "epoch": 0.8804500703234881, "grad_norm": 2.108109474182129, "learning_rate": 8.570039976081382e-06, "loss": 1.1654, "step": 313 }, { "epoch": 0.8832630098452883, "grad_norm": 2.2698593139648438, "learning_rate": 8.559459148303268e-06, "loss": 1.0082, "step": 314 }, { "epoch": 0.8860759493670886, "grad_norm": 2.04703426361084, "learning_rate": 8.548845902842264e-06, "loss": 1.2114, "step": 315 }, { "epoch": 0.8888888888888888, "grad_norm": 1.9669705629348755, "learning_rate": 8.538200336358227e-06, "loss": 1.0822, "step": 316 }, { "epoch": 0.8917018284106891, "grad_norm": 2.058732271194458, "learning_rate": 8.527522545805386e-06, "loss": 1.056, "step": 317 }, { "epoch": 0.8945147679324894, "grad_norm": 2.1475107669830322, "learning_rate": 8.51681262843144e-06, "loss": 1.2073, "step": 318 }, { "epoch": 0.8973277074542897, "grad_norm": 1.9537756443023682, "learning_rate": 8.50607068177669e-06, "loss": 1.026, "step": 319 }, { "epoch": 0.90014064697609, "grad_norm": 2.14225172996521, "learning_rate": 8.495296803673138e-06, "loss": 1.3038, "step": 320 }, { "epoch": 0.9029535864978903, "grad_norm": 2.2561981678009033, "learning_rate": 8.484491092243603e-06, "loss": 1.0576, "step": 321 }, { "epoch": 0.9057665260196905, "grad_norm": 1.9777567386627197, "learning_rate": 8.473653645900825e-06, "loss": 1.1675, "step": 322 }, { "epoch": 0.9085794655414908, "grad_norm": 2.2552154064178467, "learning_rate": 8.462784563346567e-06, "loss": 1.2568, "step": 323 }, { "epoch": 0.9113924050632911, "grad_norm": 2.19797945022583, "learning_rate": 8.451883943570722e-06, "loss": 1.1247, "step": 324 }, { "epoch": 0.9142053445850914, "grad_norm": 2.176769971847534, "learning_rate": 8.440951885850402e-06, "loss": 1.0333, "step": 325 }, { "epoch": 0.9170182841068917, "grad_norm": 2.011472463607788, "learning_rate": 8.429988489749045e-06, "loss": 1.2882, "step": 326 }, { "epoch": 0.919831223628692, "grad_norm": 2.276411294937134, "learning_rate": 8.418993855115498e-06, "loss": 1.2682, "step": 327 }, { "epoch": 0.9226441631504922, "grad_norm": 1.9374414682388306, "learning_rate": 8.407968082083116e-06, "loss": 1.198, "step": 328 }, { "epoch": 0.9254571026722925, "grad_norm": 2.0080978870391846, "learning_rate": 8.396911271068842e-06, "loss": 1.0495, "step": 329 }, { "epoch": 0.9282700421940928, "grad_norm": 2.410945415496826, "learning_rate": 8.385823522772299e-06, "loss": 1.3558, "step": 330 }, { "epoch": 0.9310829817158931, "grad_norm": 2.205632448196411, "learning_rate": 8.37470493817487e-06, "loss": 1.1552, "step": 331 }, { "epoch": 0.9338959212376934, "grad_norm": 1.9957945346832275, "learning_rate": 8.36355561853878e-06, "loss": 1.2074, "step": 332 }, { "epoch": 0.9367088607594937, "grad_norm": 1.889917254447937, "learning_rate": 8.352375665406171e-06, "loss": 0.8613, "step": 333 }, { "epoch": 0.939521800281294, "grad_norm": 2.4653337001800537, "learning_rate": 8.341165180598182e-06, "loss": 1.3945, "step": 334 }, { "epoch": 0.9423347398030942, "grad_norm": 2.15743088722229, "learning_rate": 8.32992426621401e-06, "loss": 1.1899, "step": 335 }, { "epoch": 0.9451476793248945, "grad_norm": 2.014369010925293, "learning_rate": 8.318653024629999e-06, "loss": 1.2004, "step": 336 }, { "epoch": 0.9479606188466948, "grad_norm": 2.475370168685913, "learning_rate": 8.307351558498692e-06, "loss": 1.0919, "step": 337 }, { "epoch": 0.9507735583684951, "grad_norm": 2.288590669631958, "learning_rate": 8.296019970747901e-06, "loss": 1.054, "step": 338 }, { "epoch": 0.9535864978902954, "grad_norm": 2.0414512157440186, "learning_rate": 8.284658364579771e-06, "loss": 1.2336, "step": 339 }, { "epoch": 0.9563994374120957, "grad_norm": 2.192631483078003, "learning_rate": 8.27326684346984e-06, "loss": 1.2078, "step": 340 }, { "epoch": 0.9592123769338959, "grad_norm": 2.109923839569092, "learning_rate": 8.261845511166092e-06, "loss": 1.2295, "step": 341 }, { "epoch": 0.9620253164556962, "grad_norm": 1.7825968265533447, "learning_rate": 8.250394471688018e-06, "loss": 1.1074, "step": 342 }, { "epoch": 0.9648382559774965, "grad_norm": 1.9041146039962769, "learning_rate": 8.23891382932567e-06, "loss": 1.1283, "step": 343 }, { "epoch": 0.9676511954992968, "grad_norm": 2.0874454975128174, "learning_rate": 8.2274036886387e-06, "loss": 1.1228, "step": 344 }, { "epoch": 0.9704641350210971, "grad_norm": 1.9520052671432495, "learning_rate": 8.215864154455421e-06, "loss": 1.2209, "step": 345 }, { "epoch": 0.9732770745428974, "grad_norm": 2.6171762943267822, "learning_rate": 8.204295331871844e-06, "loss": 1.6231, "step": 346 }, { "epoch": 0.9760900140646976, "grad_norm": 2.0320959091186523, "learning_rate": 8.192697326250722e-06, "loss": 1.153, "step": 347 }, { "epoch": 0.9789029535864979, "grad_norm": 1.8297227621078491, "learning_rate": 8.1810702432206e-06, "loss": 0.9717, "step": 348 }, { "epoch": 0.9817158931082982, "grad_norm": 2.077699661254883, "learning_rate": 8.169414188674829e-06, "loss": 0.9804, "step": 349 }, { "epoch": 0.9845288326300985, "grad_norm": 2.002263069152832, "learning_rate": 8.157729268770636e-06, "loss": 1.1233, "step": 350 }, { "epoch": 0.9845288326300985, "eval_loss": 0.6594013571739197, "eval_runtime": 2.8213, "eval_samples_per_second": 9.216, "eval_steps_per_second": 1.418, "step": 350 }, { "epoch": 0.9845288326300985, "eval_active_sample_count": 30, "eval_avg_loss": 591.0, "eval_avg_mem_token_accuracy": 0.22340425531914893, "eval_avg_mem_token_gt_count": 9.4, "eval_avg_mem_token_precision": 0.007495538370017847, "eval_avg_mem_token_rate": 0.5591033060600014, "eval_avg_mem_token_recall(Accuracy)": 0.22340425531914893, "eval_avg_slot_norm_mean": 197.63333333333333, "eval_avg_slot_sim_mean": 0.996875, "eval_global_step": 350, "eval_loss": 0.6594013571739197, "eval_num_samples": 30, "eval_runtime": 2.8213, "eval_samples_per_second": 9.216, "eval_sim_active_sample_count": 30, "eval_steps_per_second": 1.418, "eval_total_correct_count": 63, "eval_total_gt_mem_token_count": 282, "eval_total_positions": 15033, "eval_total_pred_mem_token_count": 8405, "step": 350 }, { "epoch": 0.9873417721518988, "grad_norm": 2.112032890319824, "learning_rate": 8.146015589928123e-06, "loss": 1.1559, "step": 351 }, { "epoch": 0.9901547116736991, "grad_norm": 2.227578639984131, "learning_rate": 8.134273258829322e-06, "loss": 1.2947, "step": 352 }, { "epoch": 0.9929676511954993, "grad_norm": 2.0214011669158936, "learning_rate": 8.122502382417211e-06, "loss": 1.3415, "step": 353 }, { "epoch": 0.9957805907172996, "grad_norm": 2.176740884780884, "learning_rate": 8.110703067894747e-06, "loss": 1.3129, "step": 354 }, { "epoch": 0.9985935302390999, "grad_norm": 1.9748849868774414, "learning_rate": 8.098875422723884e-06, "loss": 1.0268, "step": 355 }, { "epoch": 1.0, "grad_norm": 1.5412670373916626, "learning_rate": 8.087019554624595e-06, "loss": 0.657, "step": 356 }, { "epoch": 1.0028129395218002, "grad_norm": 2.013446092605591, "learning_rate": 8.075135571573898e-06, "loss": 1.1009, "step": 357 }, { "epoch": 1.0056258790436006, "grad_norm": 2.034468412399292, "learning_rate": 8.06322358180486e-06, "loss": 1.1514, "step": 358 }, { "epoch": 1.0084388185654007, "grad_norm": 2.1513798236846924, "learning_rate": 8.051283693805624e-06, "loss": 1.1312, "step": 359 }, { "epoch": 1.0112517580872011, "grad_norm": 1.8825079202651978, "learning_rate": 8.039316016318415e-06, "loss": 0.9748, "step": 360 }, { "epoch": 1.0140646976090013, "grad_norm": 2.040106773376465, "learning_rate": 8.027320658338547e-06, "loss": 1.2061, "step": 361 }, { "epoch": 1.0168776371308017, "grad_norm": 2.0149614810943604, "learning_rate": 8.015297729113436e-06, "loss": 1.0372, "step": 362 }, { "epoch": 1.0196905766526019, "grad_norm": 1.8744758367538452, "learning_rate": 8.0032473381416e-06, "loss": 1.1538, "step": 363 }, { "epoch": 1.0225035161744023, "grad_norm": 2.2196671962738037, "learning_rate": 7.991169595171669e-06, "loss": 1.1131, "step": 364 }, { "epoch": 1.0253164556962024, "grad_norm": 2.2530996799468994, "learning_rate": 7.979064610201372e-06, "loss": 1.3786, "step": 365 }, { "epoch": 1.0281293952180028, "grad_norm": 2.0854427814483643, "learning_rate": 7.966932493476554e-06, "loss": 1.0615, "step": 366 }, { "epoch": 1.030942334739803, "grad_norm": 2.3596975803375244, "learning_rate": 7.954773355490155e-06, "loss": 1.366, "step": 367 }, { "epoch": 1.0337552742616034, "grad_norm": 1.9892560243606567, "learning_rate": 7.942587306981213e-06, "loss": 1.0439, "step": 368 }, { "epoch": 1.0365682137834036, "grad_norm": 1.8899530172348022, "learning_rate": 7.930374458933852e-06, "loss": 1.0212, "step": 369 }, { "epoch": 1.039381153305204, "grad_norm": 2.1707684993743896, "learning_rate": 7.918134922576271e-06, "loss": 1.1767, "step": 370 }, { "epoch": 1.0421940928270041, "grad_norm": 2.041611671447754, "learning_rate": 7.905868809379735e-06, "loss": 1.2155, "step": 371 }, { "epoch": 1.0450070323488045, "grad_norm": 1.939260482788086, "learning_rate": 7.893576231057553e-06, "loss": 1.0179, "step": 372 }, { "epoch": 1.0478199718706047, "grad_norm": 1.9848639965057373, "learning_rate": 7.88125729956407e-06, "loss": 1.0099, "step": 373 }, { "epoch": 1.0506329113924051, "grad_norm": 2.0023953914642334, "learning_rate": 7.868912127093638e-06, "loss": 1.119, "step": 374 }, { "epoch": 1.0534458509142053, "grad_norm": 1.7961069345474243, "learning_rate": 7.856540826079595e-06, "loss": 0.7417, "step": 375 }, { "epoch": 1.0562587904360057, "grad_norm": 1.8289830684661865, "learning_rate": 7.844143509193252e-06, "loss": 1.0566, "step": 376 }, { "epoch": 1.0590717299578059, "grad_norm": 1.8681098222732544, "learning_rate": 7.831720289342853e-06, "loss": 0.9817, "step": 377 }, { "epoch": 1.0618846694796062, "grad_norm": 1.9967904090881348, "learning_rate": 7.819271279672553e-06, "loss": 0.9361, "step": 378 }, { "epoch": 1.0646976090014064, "grad_norm": 1.7474114894866943, "learning_rate": 7.806796593561389e-06, "loss": 0.9923, "step": 379 }, { "epoch": 1.0675105485232068, "grad_norm": 2.514089822769165, "learning_rate": 7.794296344622246e-06, "loss": 1.2647, "step": 380 }, { "epoch": 1.070323488045007, "grad_norm": 2.2486379146575928, "learning_rate": 7.78177064670082e-06, "loss": 1.1741, "step": 381 }, { "epoch": 1.0731364275668074, "grad_norm": 2.0108935832977295, "learning_rate": 7.769219613874581e-06, "loss": 1.0724, "step": 382 }, { "epoch": 1.0759493670886076, "grad_norm": 2.316124677658081, "learning_rate": 7.756643360451744e-06, "loss": 1.2943, "step": 383 }, { "epoch": 1.078762306610408, "grad_norm": 2.3428173065185547, "learning_rate": 7.744042000970207e-06, "loss": 1.2522, "step": 384 }, { "epoch": 1.0815752461322081, "grad_norm": 2.087315797805786, "learning_rate": 7.731415650196535e-06, "loss": 1.0241, "step": 385 }, { "epoch": 1.0843881856540085, "grad_norm": 2.1546409130096436, "learning_rate": 7.718764423124892e-06, "loss": 1.2256, "step": 386 }, { "epoch": 1.0872011251758087, "grad_norm": 2.559561252593994, "learning_rate": 7.706088434976e-06, "loss": 1.4538, "step": 387 }, { "epoch": 1.090014064697609, "grad_norm": 2.023336410522461, "learning_rate": 7.6933878011961e-06, "loss": 1.1043, "step": 388 }, { "epoch": 1.0928270042194093, "grad_norm": 2.1914350986480713, "learning_rate": 7.68066263745589e-06, "loss": 1.1997, "step": 389 }, { "epoch": 1.0956399437412097, "grad_norm": 1.8683468103408813, "learning_rate": 7.667913059649468e-06, "loss": 1.0576, "step": 390 }, { "epoch": 1.0984528832630098, "grad_norm": 2.2158288955688477, "learning_rate": 7.65513918389329e-06, "loss": 1.2133, "step": 391 }, { "epoch": 1.1012658227848102, "grad_norm": 2.4496500492095947, "learning_rate": 7.6423411265251e-06, "loss": 1.309, "step": 392 }, { "epoch": 1.1040787623066104, "grad_norm": 2.3594353199005127, "learning_rate": 7.629519004102876e-06, "loss": 1.2893, "step": 393 }, { "epoch": 1.1068917018284108, "grad_norm": 2.0072391033172607, "learning_rate": 7.616672933403772e-06, "loss": 0.9854, "step": 394 }, { "epoch": 1.109704641350211, "grad_norm": 2.1165082454681396, "learning_rate": 7.603803031423046e-06, "loss": 1.0648, "step": 395 }, { "epoch": 1.1125175808720114, "grad_norm": 2.136019229888916, "learning_rate": 7.590909415373e-06, "loss": 1.2763, "step": 396 }, { "epoch": 1.1153305203938115, "grad_norm": 2.089963912963867, "learning_rate": 7.577992202681912e-06, "loss": 1.1498, "step": 397 }, { "epoch": 1.1181434599156117, "grad_norm": 2.0347511768341064, "learning_rate": 7.565051510992964e-06, "loss": 1.0931, "step": 398 }, { "epoch": 1.120956399437412, "grad_norm": 1.902830958366394, "learning_rate": 7.552087458163177e-06, "loss": 1.0382, "step": 399 }, { "epoch": 1.1237693389592125, "grad_norm": 2.3222129344940186, "learning_rate": 7.539100162262325e-06, "loss": 1.3173, "step": 400 }, { "epoch": 1.1237693389592125, "eval_loss": 0.6541261672973633, "eval_runtime": 2.7652, "eval_samples_per_second": 9.403, "eval_steps_per_second": 1.447, "step": 400 }, { "epoch": 1.1237693389592125, "eval_active_sample_count": 30, "eval_avg_loss": 579.625, "eval_avg_mem_token_accuracy": 0.23049645390070922, "eval_avg_mem_token_gt_count": 9.4, "eval_avg_mem_token_precision": 0.007830381881701, "eval_avg_mem_token_rate": 0.552185192576332, "eval_avg_mem_token_recall(Accuracy)": 0.23049645390070922, "eval_avg_slot_norm_mean": 197.63333333333333, "eval_avg_slot_sim_mean": 0.996875, "eval_global_step": 400, "eval_loss": 0.6541261672973633, "eval_num_samples": 30, "eval_runtime": 2.7652, "eval_samples_per_second": 9.403, "eval_sim_active_sample_count": 30, "eval_steps_per_second": 1.447, "eval_total_correct_count": 65, "eval_total_gt_mem_token_count": 282, "eval_total_positions": 15033, "eval_total_pred_mem_token_count": 8301, "step": 400 }, { "epoch": 1.1265822784810127, "grad_norm": 2.1172144412994385, "learning_rate": 7.526089741571876e-06, "loss": 1.2135, "step": 401 }, { "epoch": 1.1293952180028128, "grad_norm": 2.117197036743164, "learning_rate": 7.5130563145838994e-06, "loss": 1.2903, "step": 402 }, { "epoch": 1.1322081575246132, "grad_norm": 1.8641384840011597, "learning_rate": 7.500000000000001e-06, "loss": 0.902, "step": 403 }, { "epoch": 1.1350210970464134, "grad_norm": 2.043870449066162, "learning_rate": 7.486920916730228e-06, "loss": 1.14, "step": 404 }, { "epoch": 1.1378340365682138, "grad_norm": 2.371406078338623, "learning_rate": 7.473819183891997e-06, "loss": 1.168, "step": 405 }, { "epoch": 1.140646976090014, "grad_norm": 2.017378807067871, "learning_rate": 7.460694920809004e-06, "loss": 1.2308, "step": 406 }, { "epoch": 1.1434599156118144, "grad_norm": 3.638538122177124, "learning_rate": 7.447548247010137e-06, "loss": 1.1636, "step": 407 }, { "epoch": 1.1462728551336145, "grad_norm": 1.9470067024230957, "learning_rate": 7.434379282228393e-06, "loss": 1.1502, "step": 408 }, { "epoch": 1.149085794655415, "grad_norm": 2.1175174713134766, "learning_rate": 7.421188146399776e-06, "loss": 1.0217, "step": 409 }, { "epoch": 1.1518987341772151, "grad_norm": 1.9489398002624512, "learning_rate": 7.407974959662222e-06, "loss": 1.223, "step": 410 }, { "epoch": 1.1547116736990155, "grad_norm": 2.227391242980957, "learning_rate": 7.394739842354489e-06, "loss": 1.1757, "step": 411 }, { "epoch": 1.1575246132208157, "grad_norm": 1.961480736732483, "learning_rate": 7.381482915015068e-06, "loss": 1.1204, "step": 412 }, { "epoch": 1.160337552742616, "grad_norm": 1.8854504823684692, "learning_rate": 7.368204298381085e-06, "loss": 1.0732, "step": 413 }, { "epoch": 1.1631504922644162, "grad_norm": 2.4665989875793457, "learning_rate": 7.3549041133872004e-06, "loss": 1.2208, "step": 414 }, { "epoch": 1.1659634317862166, "grad_norm": 2.293067216873169, "learning_rate": 7.341582481164508e-06, "loss": 1.0995, "step": 415 }, { "epoch": 1.1687763713080168, "grad_norm": 1.636135458946228, "learning_rate": 7.328239523039431e-06, "loss": 1.0113, "step": 416 }, { "epoch": 1.1715893108298172, "grad_norm": 2.080463171005249, "learning_rate": 7.314875360532618e-06, "loss": 1.2187, "step": 417 }, { "epoch": 1.1744022503516174, "grad_norm": 2.316681146621704, "learning_rate": 7.301490115357837e-06, "loss": 1.0254, "step": 418 }, { "epoch": 1.1772151898734178, "grad_norm": 1.9154740571975708, "learning_rate": 7.288083909420866e-06, "loss": 1.0994, "step": 419 }, { "epoch": 1.180028129395218, "grad_norm": 2.2701125144958496, "learning_rate": 7.274656864818379e-06, "loss": 1.193, "step": 420 }, { "epoch": 1.1828410689170183, "grad_norm": 2.259373188018799, "learning_rate": 7.261209103836843e-06, "loss": 1.2083, "step": 421 }, { "epoch": 1.1856540084388185, "grad_norm": 2.170278787612915, "learning_rate": 7.247740748951394e-06, "loss": 1.108, "step": 422 }, { "epoch": 1.188466947960619, "grad_norm": 2.3180534839630127, "learning_rate": 7.234251922824731e-06, "loss": 1.0838, "step": 423 }, { "epoch": 1.191279887482419, "grad_norm": 2.200087308883667, "learning_rate": 7.220742748305989e-06, "loss": 1.2188, "step": 424 }, { "epoch": 1.1940928270042195, "grad_norm": 2.148313045501709, "learning_rate": 7.20721334842963e-06, "loss": 1.1162, "step": 425 }, { "epoch": 1.1969057665260197, "grad_norm": 2.109539270401001, "learning_rate": 7.193663846414318e-06, "loss": 1.126, "step": 426 }, { "epoch": 1.19971870604782, "grad_norm": 2.3250086307525635, "learning_rate": 7.180094365661793e-06, "loss": 1.216, "step": 427 }, { "epoch": 1.2025316455696202, "grad_norm": 2.1778461933135986, "learning_rate": 7.166505029755753e-06, "loss": 1.1582, "step": 428 }, { "epoch": 1.2053445850914206, "grad_norm": 2.0346758365631104, "learning_rate": 7.152895962460727e-06, "loss": 1.0597, "step": 429 }, { "epoch": 1.2081575246132208, "grad_norm": 2.2523462772369385, "learning_rate": 7.139267287720945e-06, "loss": 1.3096, "step": 430 }, { "epoch": 1.2109704641350212, "grad_norm": 2.1248557567596436, "learning_rate": 7.125619129659215e-06, "loss": 1.2255, "step": 431 }, { "epoch": 1.2137834036568214, "grad_norm": 2.402777671813965, "learning_rate": 7.111951612575783e-06, "loss": 1.2178, "step": 432 }, { "epoch": 1.2165963431786218, "grad_norm": 2.1899073123931885, "learning_rate": 7.0982648609472135e-06, "loss": 1.1086, "step": 433 }, { "epoch": 1.219409282700422, "grad_norm": 2.306647777557373, "learning_rate": 7.084558999425245e-06, "loss": 1.2791, "step": 434 }, { "epoch": 1.2222222222222223, "grad_norm": 2.1083829402923584, "learning_rate": 7.0708341528356585e-06, "loss": 1.2203, "step": 435 }, { "epoch": 1.2250351617440225, "grad_norm": 1.9246402978897095, "learning_rate": 7.0570904461771426e-06, "loss": 1.1293, "step": 436 }, { "epoch": 1.2278481012658227, "grad_norm": 2.0863969326019287, "learning_rate": 7.043328004620154e-06, "loss": 1.1112, "step": 437 }, { "epoch": 1.230661040787623, "grad_norm": 2.237459421157837, "learning_rate": 7.029546953505776e-06, "loss": 1.1374, "step": 438 }, { "epoch": 1.2334739803094235, "grad_norm": 1.9015916585922241, "learning_rate": 7.015747418344578e-06, "loss": 1.0886, "step": 439 }, { "epoch": 1.2362869198312236, "grad_norm": 2.1524229049682617, "learning_rate": 7.0019295248154714e-06, "loss": 1.1271, "step": 440 }, { "epoch": 1.2390998593530238, "grad_norm": 2.171227216720581, "learning_rate": 6.98809339876457e-06, "loss": 1.2677, "step": 441 }, { "epoch": 1.2419127988748242, "grad_norm": 2.0763444900512695, "learning_rate": 6.974239166204034e-06, "loss": 1.0989, "step": 442 }, { "epoch": 1.2447257383966246, "grad_norm": 2.1066906452178955, "learning_rate": 6.960366953310931e-06, "loss": 1.2027, "step": 443 }, { "epoch": 1.2475386779184248, "grad_norm": 2.748056650161743, "learning_rate": 6.946476886426087e-06, "loss": 1.0004, "step": 444 }, { "epoch": 1.250351617440225, "grad_norm": 1.90733003616333, "learning_rate": 6.932569092052927e-06, "loss": 0.9063, "step": 445 }, { "epoch": 1.2531645569620253, "grad_norm": 2.3296380043029785, "learning_rate": 6.918643696856333e-06, "loss": 1.2053, "step": 446 }, { "epoch": 1.2559774964838257, "grad_norm": 2.194408416748047, "learning_rate": 6.904700827661484e-06, "loss": 1.2663, "step": 447 }, { "epoch": 1.258790436005626, "grad_norm": 2.2270679473876953, "learning_rate": 6.890740611452705e-06, "loss": 1.1718, "step": 448 }, { "epoch": 1.261603375527426, "grad_norm": 1.8598543405532837, "learning_rate": 6.876763175372306e-06, "loss": 0.958, "step": 449 }, { "epoch": 1.2644163150492265, "grad_norm": 2.112734079360962, "learning_rate": 6.862768646719425e-06, "loss": 1.2674, "step": 450 }, { "epoch": 1.2644163150492265, "eval_loss": 0.6488014459609985, "eval_runtime": 2.7966, "eval_samples_per_second": 9.297, "eval_steps_per_second": 1.43, "step": 450 }, { "epoch": 1.2644163150492265, "eval_active_sample_count": 30, "eval_avg_loss": 588.0, "eval_avg_mem_token_accuracy": 0.23404255319148937, "eval_avg_mem_token_gt_count": 9.4, "eval_avg_mem_token_precision": 0.007794048181388758, "eval_avg_mem_token_rate": 0.5632940863433779, "eval_avg_mem_token_recall(Accuracy)": 0.23404255319148937, "eval_avg_slot_norm_mean": 197.63333333333333, "eval_avg_slot_sim_mean": 0.996875, "eval_global_step": 450, "eval_loss": 0.6488014459609985, "eval_num_samples": 30, "eval_runtime": 2.7966, "eval_samples_per_second": 9.297, "eval_sim_active_sample_count": 30, "eval_steps_per_second": 1.43, "eval_total_correct_count": 66, "eval_total_gt_mem_token_count": 282, "eval_total_positions": 15033, "eval_total_pred_mem_token_count": 8468, "step": 450 }, { "epoch": 1.2672292545710266, "grad_norm": 1.943136215209961, "learning_rate": 6.848757152948876e-06, "loss": 1.0877, "step": 451 }, { "epoch": 1.270042194092827, "grad_norm": 2.015427589416504, "learning_rate": 6.834728821669978e-06, "loss": 1.0226, "step": 452 }, { "epoch": 1.2728551336146272, "grad_norm": 2.0203545093536377, "learning_rate": 6.820683780645397e-06, "loss": 1.0537, "step": 453 }, { "epoch": 1.2756680731364276, "grad_norm": 1.9082456827163696, "learning_rate": 6.806622157789989e-06, "loss": 1.0811, "step": 454 }, { "epoch": 1.2784810126582278, "grad_norm": 2.0107004642486572, "learning_rate": 6.7925440811696165e-06, "loss": 1.1643, "step": 455 }, { "epoch": 1.2812939521800282, "grad_norm": 1.968511700630188, "learning_rate": 6.778449679000006e-06, "loss": 0.9849, "step": 456 }, { "epoch": 1.2841068917018283, "grad_norm": 2.0401535034179688, "learning_rate": 6.764339079645561e-06, "loss": 1.1488, "step": 457 }, { "epoch": 1.2869198312236287, "grad_norm": 1.788967251777649, "learning_rate": 6.7502124116182066e-06, "loss": 0.8775, "step": 458 }, { "epoch": 1.289732770745429, "grad_norm": 1.8958114385604858, "learning_rate": 6.736069803576205e-06, "loss": 1.1991, "step": 459 }, { "epoch": 1.2925457102672293, "grad_norm": 2.1174044609069824, "learning_rate": 6.721911384323e-06, "loss": 1.2373, "step": 460 }, { "epoch": 1.2953586497890295, "grad_norm": 2.2091267108917236, "learning_rate": 6.7077372828060294e-06, "loss": 1.1511, "step": 461 }, { "epoch": 1.2981715893108299, "grad_norm": 1.904528021812439, "learning_rate": 6.693547628115561e-06, "loss": 0.9815, "step": 462 }, { "epoch": 1.30098452883263, "grad_norm": 2.0216708183288574, "learning_rate": 6.67934254948351e-06, "loss": 1.0773, "step": 463 }, { "epoch": 1.3037974683544304, "grad_norm": 2.3458025455474854, "learning_rate": 6.6651221762822635e-06, "loss": 1.2122, "step": 464 }, { "epoch": 1.3066104078762306, "grad_norm": 2.210007905960083, "learning_rate": 6.650886638023508e-06, "loss": 1.2001, "step": 465 }, { "epoch": 1.309423347398031, "grad_norm": 2.168041229248047, "learning_rate": 6.636636064357045e-06, "loss": 1.1748, "step": 466 }, { "epoch": 1.3122362869198312, "grad_norm": 2.1177752017974854, "learning_rate": 6.622370585069605e-06, "loss": 1.1441, "step": 467 }, { "epoch": 1.3150492264416316, "grad_norm": 4.13400411605835, "learning_rate": 6.608090330083677e-06, "loss": 1.0154, "step": 468 }, { "epoch": 1.3178621659634318, "grad_norm": 1.8855236768722534, "learning_rate": 6.593795429456317e-06, "loss": 1.1638, "step": 469 }, { "epoch": 1.3206751054852321, "grad_norm": 2.1128952503204346, "learning_rate": 6.579486013377963e-06, "loss": 1.2435, "step": 470 }, { "epoch": 1.3234880450070323, "grad_norm": 2.091977119445801, "learning_rate": 6.565162212171257e-06, "loss": 1.1948, "step": 471 }, { "epoch": 1.3263009845288325, "grad_norm": 1.8725004196166992, "learning_rate": 6.550824156289852e-06, "loss": 0.9448, "step": 472 }, { "epoch": 1.3291139240506329, "grad_norm": 2.134361982345581, "learning_rate": 6.536471976317223e-06, "loss": 1.1985, "step": 473 }, { "epoch": 1.3319268635724333, "grad_norm": 2.0700531005859375, "learning_rate": 6.5221058029654815e-06, "loss": 1.1321, "step": 474 }, { "epoch": 1.3347398030942335, "grad_norm": 2.336487054824829, "learning_rate": 6.507725767074181e-06, "loss": 1.2447, "step": 475 }, { "epoch": 1.3375527426160336, "grad_norm": 2.1936490535736084, "learning_rate": 6.493331999609132e-06, "loss": 1.3264, "step": 476 }, { "epoch": 1.340365682137834, "grad_norm": 1.8957630395889282, "learning_rate": 6.4789246316612e-06, "loss": 1.0029, "step": 477 }, { "epoch": 1.3431786216596344, "grad_norm": 2.329432249069214, "learning_rate": 6.464503794445121e-06, "loss": 1.3139, "step": 478 }, { "epoch": 1.3459915611814346, "grad_norm": 2.2381882667541504, "learning_rate": 6.450069619298299e-06, "loss": 1.0446, "step": 479 }, { "epoch": 1.3488045007032348, "grad_norm": 2.235319137573242, "learning_rate": 6.435622237679615e-06, "loss": 1.1327, "step": 480 }, { "epoch": 1.3516174402250352, "grad_norm": 2.1776840686798096, "learning_rate": 6.421161781168226e-06, "loss": 1.0707, "step": 481 }, { "epoch": 1.3544303797468356, "grad_norm": 2.003654956817627, "learning_rate": 6.4066883814623674e-06, "loss": 1.0294, "step": 482 }, { "epoch": 1.3572433192686357, "grad_norm": 2.2653419971466064, "learning_rate": 6.3922021703781574e-06, "loss": 1.1558, "step": 483 }, { "epoch": 1.360056258790436, "grad_norm": 1.8952243328094482, "learning_rate": 6.377703279848393e-06, "loss": 1.1621, "step": 484 }, { "epoch": 1.3628691983122363, "grad_norm": 1.818117618560791, "learning_rate": 6.363191841921345e-06, "loss": 1.1758, "step": 485 }, { "epoch": 1.3656821378340367, "grad_norm": 2.188119411468506, "learning_rate": 6.3486679887595635e-06, "loss": 1.4035, "step": 486 }, { "epoch": 1.3684950773558369, "grad_norm": 2.2680625915527344, "learning_rate": 6.334131852638669e-06, "loss": 1.3802, "step": 487 }, { "epoch": 1.371308016877637, "grad_norm": 2.239824056625366, "learning_rate": 6.319583565946147e-06, "loss": 0.978, "step": 488 }, { "epoch": 1.3741209563994374, "grad_norm": 2.084578275680542, "learning_rate": 6.305023261180146e-06, "loss": 1.1592, "step": 489 }, { "epoch": 1.3769338959212378, "grad_norm": 2.074716329574585, "learning_rate": 6.290451070948269e-06, "loss": 1.1417, "step": 490 }, { "epoch": 1.379746835443038, "grad_norm": 2.2187070846557617, "learning_rate": 6.275867127966364e-06, "loss": 1.3134, "step": 491 }, { "epoch": 1.3825597749648382, "grad_norm": 1.9704614877700806, "learning_rate": 6.261271565057318e-06, "loss": 1.2947, "step": 492 }, { "epoch": 1.3853727144866386, "grad_norm": 2.0791146755218506, "learning_rate": 6.246664515149845e-06, "loss": 1.1796, "step": 493 }, { "epoch": 1.3881856540084387, "grad_norm": 2.070108413696289, "learning_rate": 6.232046111277277e-06, "loss": 1.016, "step": 494 }, { "epoch": 1.3909985935302391, "grad_norm": 2.40295147895813, "learning_rate": 6.217416486576354e-06, "loss": 1.247, "step": 495 }, { "epoch": 1.3938115330520393, "grad_norm": 1.9346283674240112, "learning_rate": 6.202775774286007e-06, "loss": 1.0943, "step": 496 }, { "epoch": 1.3966244725738397, "grad_norm": 1.88413667678833, "learning_rate": 6.188124107746148e-06, "loss": 1.0378, "step": 497 }, { "epoch": 1.3994374120956399, "grad_norm": 2.3754115104675293, "learning_rate": 6.173461620396453e-06, "loss": 1.1976, "step": 498 }, { "epoch": 1.4022503516174403, "grad_norm": 2.2472076416015625, "learning_rate": 6.158788445775151e-06, "loss": 1.348, "step": 499 }, { "epoch": 1.4050632911392404, "grad_norm": 2.299577474594116, "learning_rate": 6.1441047175178025e-06, "loss": 1.3543, "step": 500 }, { "epoch": 1.4050632911392404, "eval_loss": 0.6480849385261536, "eval_runtime": 2.7664, "eval_samples_per_second": 9.398, "eval_steps_per_second": 1.446, "step": 500 }, { "epoch": 1.4050632911392404, "eval_active_sample_count": 30, "eval_avg_loss": 593.75, "eval_avg_mem_token_accuracy": 0.2375886524822695, "eval_avg_mem_token_gt_count": 9.4, "eval_avg_mem_token_precision": 0.007852789498359119, "eval_avg_mem_token_rate": 0.5675513869487129, "eval_avg_mem_token_recall(Accuracy)": 0.2375886524822695, "eval_avg_slot_norm_mean": 197.63333333333333, "eval_avg_slot_sim_mean": 0.996875, "eval_global_step": 500, "eval_loss": 0.6480849385261536, "eval_num_samples": 30, "eval_runtime": 2.7664, "eval_samples_per_second": 9.398, "eval_sim_active_sample_count": 30, "eval_steps_per_second": 1.446, "eval_total_correct_count": 67, "eval_total_gt_mem_token_count": 282, "eval_total_positions": 15033, "eval_total_pred_mem_token_count": 8532, "step": 500 }, { "epoch": 1.4078762306610408, "grad_norm": 2.4926252365112305, "learning_rate": 6.129410569356086e-06, "loss": 1.1548, "step": 501 }, { "epoch": 1.410689170182841, "grad_norm": 1.9530552625656128, "learning_rate": 6.11470613511658e-06, "loss": 0.9438, "step": 502 }, { "epoch": 1.4135021097046414, "grad_norm": 2.046297788619995, "learning_rate": 6.0999915487195395e-06, "loss": 1.0105, "step": 503 }, { "epoch": 1.4163150492264416, "grad_norm": 2.359480619430542, "learning_rate": 6.085266944177686e-06, "loss": 1.2237, "step": 504 }, { "epoch": 1.419127988748242, "grad_norm": 2.0814826488494873, "learning_rate": 6.070532455594974e-06, "loss": 1.3641, "step": 505 }, { "epoch": 1.4219409282700421, "grad_norm": 2.5021960735321045, "learning_rate": 6.055788217165384e-06, "loss": 1.1271, "step": 506 }, { "epoch": 1.4247538677918425, "grad_norm": 2.1782703399658203, "learning_rate": 6.0410343631716865e-06, "loss": 1.1237, "step": 507 }, { "epoch": 1.4275668073136427, "grad_norm": 1.9032992124557495, "learning_rate": 6.0262710279842305e-06, "loss": 1.2318, "step": 508 }, { "epoch": 1.4303797468354431, "grad_norm": 1.969860315322876, "learning_rate": 6.011498346059712e-06, "loss": 1.0196, "step": 509 }, { "epoch": 1.4331926863572433, "grad_norm": 2.1782121658325195, "learning_rate": 5.99671645193995e-06, "loss": 1.1725, "step": 510 }, { "epoch": 1.4360056258790437, "grad_norm": 2.0659401416778564, "learning_rate": 5.98192548025067e-06, "loss": 1.1655, "step": 511 }, { "epoch": 1.4388185654008439, "grad_norm": 2.1270692348480225, "learning_rate": 5.967125565700266e-06, "loss": 0.9583, "step": 512 }, { "epoch": 1.4416315049226442, "grad_norm": 2.146409034729004, "learning_rate": 5.952316843078579e-06, "loss": 1.1295, "step": 513 }, { "epoch": 1.4444444444444444, "grad_norm": 2.323197364807129, "learning_rate": 5.9374994472556715e-06, "loss": 1.1557, "step": 514 }, { "epoch": 1.4472573839662446, "grad_norm": 2.1008739471435547, "learning_rate": 5.922673513180596e-06, "loss": 1.24, "step": 515 }, { "epoch": 1.450070323488045, "grad_norm": 2.4466872215270996, "learning_rate": 5.9078391758801646e-06, "loss": 1.2434, "step": 516 }, { "epoch": 1.4528832630098454, "grad_norm": 2.210320234298706, "learning_rate": 5.8929965704577275e-06, "loss": 1.136, "step": 517 }, { "epoch": 1.4556962025316456, "grad_norm": 2.259718894958496, "learning_rate": 5.878145832091929e-06, "loss": 1.3789, "step": 518 }, { "epoch": 1.4585091420534457, "grad_norm": 2.305795431137085, "learning_rate": 5.863287096035491e-06, "loss": 1.0189, "step": 519 }, { "epoch": 1.4613220815752461, "grad_norm": 2.283437967300415, "learning_rate": 5.848420497613969e-06, "loss": 1.1944, "step": 520 }, { "epoch": 1.4641350210970465, "grad_norm": 2.0504446029663086, "learning_rate": 5.833546172224527e-06, "loss": 1.22, "step": 521 }, { "epoch": 1.4669479606188467, "grad_norm": 2.018839120864868, "learning_rate": 5.818664255334702e-06, "loss": 1.0634, "step": 522 }, { "epoch": 1.4697609001406469, "grad_norm": 2.3706552982330322, "learning_rate": 5.803774882481171e-06, "loss": 1.1355, "step": 523 }, { "epoch": 1.4725738396624473, "grad_norm": 2.355933427810669, "learning_rate": 5.788878189268516e-06, "loss": 1.2492, "step": 524 }, { "epoch": 1.4753867791842477, "grad_norm": 2.439201831817627, "learning_rate": 5.773974311367987e-06, "loss": 1.3196, "step": 525 }, { "epoch": 1.4781997187060478, "grad_norm": 2.0663866996765137, "learning_rate": 5.759063384516271e-06, "loss": 1.1885, "step": 526 }, { "epoch": 1.481012658227848, "grad_norm": 2.264146327972412, "learning_rate": 5.7441455445142505e-06, "loss": 1.2146, "step": 527 }, { "epoch": 1.4838255977496484, "grad_norm": 1.8687844276428223, "learning_rate": 5.729220927225769e-06, "loss": 0.9485, "step": 528 }, { "epoch": 1.4866385372714488, "grad_norm": 2.1123878955841064, "learning_rate": 5.714289668576401e-06, "loss": 1.0617, "step": 529 }, { "epoch": 1.489451476793249, "grad_norm": 2.460676670074463, "learning_rate": 5.699351904552196e-06, "loss": 1.5609, "step": 530 }, { "epoch": 1.4922644163150491, "grad_norm": 2.3636927604675293, "learning_rate": 5.68440777119846e-06, "loss": 1.2612, "step": 531 }, { "epoch": 1.4950773558368495, "grad_norm": 1.9600480794906616, "learning_rate": 5.669457404618502e-06, "loss": 0.9536, "step": 532 }, { "epoch": 1.49789029535865, "grad_norm": 1.95573091506958, "learning_rate": 5.654500940972405e-06, "loss": 1.0379, "step": 533 }, { "epoch": 1.50070323488045, "grad_norm": 1.8376390933990479, "learning_rate": 5.639538516475775e-06, "loss": 1.1431, "step": 534 }, { "epoch": 1.5035161744022503, "grad_norm": 1.8683063983917236, "learning_rate": 5.624570267398511e-06, "loss": 1.0917, "step": 535 }, { "epoch": 1.5063291139240507, "grad_norm": 2.060288906097412, "learning_rate": 5.6095963300635585e-06, "loss": 1.0954, "step": 536 }, { "epoch": 1.509142053445851, "grad_norm": 2.148991107940674, "learning_rate": 5.594616840845666e-06, "loss": 1.0198, "step": 537 }, { "epoch": 1.5119549929676512, "grad_norm": 2.234832286834717, "learning_rate": 5.579631936170147e-06, "loss": 1.1007, "step": 538 }, { "epoch": 1.5147679324894514, "grad_norm": 2.1892640590667725, "learning_rate": 5.564641752511637e-06, "loss": 1.0431, "step": 539 }, { "epoch": 1.5175808720112518, "grad_norm": 2.029608726501465, "learning_rate": 5.54964642639285e-06, "loss": 1.0874, "step": 540 }, { "epoch": 1.5203938115330522, "grad_norm": 2.019705057144165, "learning_rate": 5.534646094383333e-06, "loss": 1.0566, "step": 541 }, { "epoch": 1.5232067510548524, "grad_norm": 2.067397117614746, "learning_rate": 5.519640893098227e-06, "loss": 1.1467, "step": 542 }, { "epoch": 1.5260196905766525, "grad_norm": 2.2218313217163086, "learning_rate": 5.504630959197014e-06, "loss": 1.2784, "step": 543 }, { "epoch": 1.528832630098453, "grad_norm": 2.1426005363464355, "learning_rate": 5.489616429382285e-06, "loss": 1.217, "step": 544 }, { "epoch": 1.5316455696202531, "grad_norm": 2.0496666431427, "learning_rate": 5.474597440398483e-06, "loss": 1.1561, "step": 545 }, { "epoch": 1.5344585091420533, "grad_norm": 1.9886417388916016, "learning_rate": 5.459574129030669e-06, "loss": 1.2286, "step": 546 }, { "epoch": 1.5372714486638537, "grad_norm": 1.9588450193405151, "learning_rate": 5.444546632103262e-06, "loss": 1.1474, "step": 547 }, { "epoch": 1.540084388185654, "grad_norm": 2.0006983280181885, "learning_rate": 5.429515086478805e-06, "loss": 1.1519, "step": 548 }, { "epoch": 1.5428973277074542, "grad_norm": 2.1134023666381836, "learning_rate": 5.414479629056717e-06, "loss": 1.1426, "step": 549 }, { "epoch": 1.5457102672292544, "grad_norm": 2.110901355743408, "learning_rate": 5.3994403967720366e-06, "loss": 1.0726, "step": 550 }, { "epoch": 1.5457102672292544, "eval_loss": 0.6454769372940063, "eval_runtime": 2.82, "eval_samples_per_second": 9.22, "eval_steps_per_second": 1.418, "step": 550 }, { "epoch": 1.5457102672292544, "eval_active_sample_count": 30, "eval_avg_loss": 591.125, "eval_avg_mem_token_accuracy": 0.24113475177304963, "eval_avg_mem_token_gt_count": 9.4, "eval_avg_mem_token_precision": 0.008006593665371483, "eval_avg_mem_token_rate": 0.5649570943923369, "eval_avg_mem_token_recall(Accuracy)": 0.24113475177304963, "eval_avg_slot_norm_mean": 197.63333333333333, "eval_avg_slot_sim_mean": 0.996875, "eval_global_step": 550, "eval_loss": 0.6454769372940063, "eval_num_samples": 30, "eval_runtime": 2.82, "eval_samples_per_second": 9.22, "eval_sim_active_sample_count": 30, "eval_steps_per_second": 1.418, "eval_total_correct_count": 68, "eval_total_gt_mem_token_count": 282, "eval_total_positions": 15033, "eval_total_pred_mem_token_count": 8493, "step": 550 }, { "epoch": 1.5485232067510548, "grad_norm": 1.9120993614196777, "learning_rate": 5.3843975265941896e-06, "loss": 1.1199, "step": 551 }, { "epoch": 1.5513361462728552, "grad_norm": 2.0266835689544678, "learning_rate": 5.369351155525729e-06, "loss": 1.1231, "step": 552 }, { "epoch": 1.5541490857946554, "grad_norm": 2.3950095176696777, "learning_rate": 5.354301420601095e-06, "loss": 1.2016, "step": 553 }, { "epoch": 1.5569620253164556, "grad_norm": 2.245199680328369, "learning_rate": 5.33924845888536e-06, "loss": 1.1973, "step": 554 }, { "epoch": 1.559774964838256, "grad_norm": 2.302870988845825, "learning_rate": 5.3241924074729865e-06, "loss": 1.1057, "step": 555 }, { "epoch": 1.5625879043600563, "grad_norm": 2.439229726791382, "learning_rate": 5.30913340348658e-06, "loss": 1.0278, "step": 556 }, { "epoch": 1.5654008438818565, "grad_norm": 2.243025779724121, "learning_rate": 5.294071584075628e-06, "loss": 1.2353, "step": 557 }, { "epoch": 1.5682137834036567, "grad_norm": 2.1339046955108643, "learning_rate": 5.279007086415268e-06, "loss": 1.2753, "step": 558 }, { "epoch": 1.571026722925457, "grad_norm": 2.055248260498047, "learning_rate": 5.263940047705026e-06, "loss": 1.0207, "step": 559 }, { "epoch": 1.5738396624472575, "grad_norm": 2.2932729721069336, "learning_rate": 5.24887060516757e-06, "loss": 1.0904, "step": 560 }, { "epoch": 1.5766526019690577, "grad_norm": 2.3540918827056885, "learning_rate": 5.233798896047461e-06, "loss": 1.045, "step": 561 }, { "epoch": 1.5794655414908578, "grad_norm": 1.941489338874817, "learning_rate": 5.218725057609901e-06, "loss": 0.9543, "step": 562 }, { "epoch": 1.5822784810126582, "grad_norm": 1.9541575908660889, "learning_rate": 5.2036492271394915e-06, "loss": 0.9803, "step": 563 }, { "epoch": 1.5850914205344586, "grad_norm": 2.066892147064209, "learning_rate": 5.188571541938968e-06, "loss": 1.1598, "step": 564 }, { "epoch": 1.5879043600562588, "grad_norm": 2.207688093185425, "learning_rate": 5.1734921393279644e-06, "loss": 1.14, "step": 565 }, { "epoch": 1.590717299578059, "grad_norm": 2.2512924671173096, "learning_rate": 5.158411156641752e-06, "loss": 1.2269, "step": 566 }, { "epoch": 1.5935302390998594, "grad_norm": 1.9499599933624268, "learning_rate": 5.143328731229994e-06, "loss": 0.9949, "step": 567 }, { "epoch": 1.5963431786216598, "grad_norm": 2.176727056503296, "learning_rate": 5.128245000455493e-06, "loss": 1.1866, "step": 568 }, { "epoch": 1.59915611814346, "grad_norm": 2.0169143676757812, "learning_rate": 5.113160101692939e-06, "loss": 1.1554, "step": 569 }, { "epoch": 1.60196905766526, "grad_norm": 2.1123158931732178, "learning_rate": 5.098074172327661e-06, "loss": 0.9758, "step": 570 }, { "epoch": 1.6047819971870605, "grad_norm": 1.8653483390808105, "learning_rate": 5.082987349754376e-06, "loss": 1.009, "step": 571 }, { "epoch": 1.6075949367088609, "grad_norm": 2.3386378288269043, "learning_rate": 5.0678997713759305e-06, "loss": 1.1193, "step": 572 }, { "epoch": 1.610407876230661, "grad_norm": 2.200810432434082, "learning_rate": 5.052811574602059e-06, "loss": 1.2255, "step": 573 }, { "epoch": 1.6132208157524612, "grad_norm": 2.702786922454834, "learning_rate": 5.0377228968481274e-06, "loss": 1.2351, "step": 574 }, { "epoch": 1.6160337552742616, "grad_norm": 2.252342462539673, "learning_rate": 5.022633875533879e-06, "loss": 1.095, "step": 575 }, { "epoch": 1.618846694796062, "grad_norm": 2.326218605041504, "learning_rate": 5.00754464808219e-06, "loss": 1.1578, "step": 576 }, { "epoch": 1.6216596343178622, "grad_norm": 2.0061216354370117, "learning_rate": 4.992455351917812e-06, "loss": 0.974, "step": 577 }, { "epoch": 1.6244725738396624, "grad_norm": 2.0241732597351074, "learning_rate": 4.977366124466122e-06, "loss": 1.0518, "step": 578 }, { "epoch": 1.6272855133614628, "grad_norm": 2.2035324573516846, "learning_rate": 4.962277103151876e-06, "loss": 1.0806, "step": 579 }, { "epoch": 1.6300984528832632, "grad_norm": 1.9597488641738892, "learning_rate": 4.947188425397942e-06, "loss": 0.9929, "step": 580 }, { "epoch": 1.6329113924050633, "grad_norm": 1.8797650337219238, "learning_rate": 4.932100228624072e-06, "loss": 1.0142, "step": 581 }, { "epoch": 1.6357243319268635, "grad_norm": 2.195955514907837, "learning_rate": 4.917012650245626e-06, "loss": 1.2481, "step": 582 }, { "epoch": 1.638537271448664, "grad_norm": 2.0398526191711426, "learning_rate": 4.901925827672341e-06, "loss": 0.9249, "step": 583 }, { "epoch": 1.6413502109704643, "grad_norm": 2.003324508666992, "learning_rate": 4.886839898307062e-06, "loss": 1.0438, "step": 584 }, { "epoch": 1.6441631504922645, "grad_norm": 1.6683696508407593, "learning_rate": 4.8717549995445105e-06, "loss": 0.8833, "step": 585 }, { "epoch": 1.6469760900140646, "grad_norm": 2.1678078174591064, "learning_rate": 4.856671268770007e-06, "loss": 1.1291, "step": 586 }, { "epoch": 1.649789029535865, "grad_norm": 1.9070981740951538, "learning_rate": 4.841588843358251e-06, "loss": 0.9658, "step": 587 }, { "epoch": 1.6526019690576652, "grad_norm": 1.897820234298706, "learning_rate": 4.826507860672036e-06, "loss": 0.9903, "step": 588 }, { "epoch": 1.6554149085794654, "grad_norm": 2.141012668609619, "learning_rate": 4.811428458061033e-06, "loss": 1.3183, "step": 589 }, { "epoch": 1.6582278481012658, "grad_norm": 1.9511604309082031, "learning_rate": 4.796350772860511e-06, "loss": 1.2011, "step": 590 }, { "epoch": 1.6610407876230662, "grad_norm": 2.517437696456909, "learning_rate": 4.7812749423901e-06, "loss": 1.1229, "step": 591 }, { "epoch": 1.6638537271448663, "grad_norm": 1.9676152467727661, "learning_rate": 4.7662011039525416e-06, "loss": 1.1357, "step": 592 }, { "epoch": 1.6666666666666665, "grad_norm": 1.9041470289230347, "learning_rate": 4.7511293948324325e-06, "loss": 1.0166, "step": 593 }, { "epoch": 1.669479606188467, "grad_norm": 2.15259051322937, "learning_rate": 4.736059952294975e-06, "loss": 1.011, "step": 594 }, { "epoch": 1.6722925457102673, "grad_norm": 2.361236333847046, "learning_rate": 4.720992913584732e-06, "loss": 1.3296, "step": 595 }, { "epoch": 1.6751054852320675, "grad_norm": 2.3137876987457275, "learning_rate": 4.7059284159243725e-06, "loss": 1.3602, "step": 596 }, { "epoch": 1.6779184247538677, "grad_norm": 2.085984230041504, "learning_rate": 4.690866596513421e-06, "loss": 1.247, "step": 597 }, { "epoch": 1.680731364275668, "grad_norm": 2.2906124591827393, "learning_rate": 4.675807592527014e-06, "loss": 1.2777, "step": 598 }, { "epoch": 1.6835443037974684, "grad_norm": 2.461681842803955, "learning_rate": 4.660751541114641e-06, "loss": 1.3176, "step": 599 }, { "epoch": 1.6863572433192686, "grad_norm": 2.259167194366455, "learning_rate": 4.645698579398907e-06, "loss": 1.145, "step": 600 }, { "epoch": 1.6863572433192686, "eval_loss": 0.6439154744148254, "eval_runtime": 2.7846, "eval_samples_per_second": 9.337, "eval_steps_per_second": 1.436, "step": 600 }, { "epoch": 1.6863572433192686, "eval_active_sample_count": 30, "eval_avg_loss": 596.375, "eval_avg_mem_token_accuracy": 0.24822695035460993, "eval_avg_mem_token_gt_count": 9.4, "eval_avg_mem_token_precision": 0.008178525528683258, "eval_avg_mem_token_rate": 0.5693474356415885, "eval_avg_mem_token_recall(Accuracy)": 0.24822695035460993, "eval_avg_slot_norm_mean": 197.63333333333333, "eval_avg_slot_sim_mean": 0.996875, "eval_global_step": 600, "eval_loss": 0.6439154744148254, "eval_num_samples": 30, "eval_runtime": 2.7846, "eval_samples_per_second": 9.337, "eval_sim_active_sample_count": 30, "eval_steps_per_second": 1.436, "eval_total_correct_count": 70, "eval_total_gt_mem_token_count": 282, "eval_total_positions": 15033, "eval_total_pred_mem_token_count": 8559, "step": 600 }, { "epoch": 1.6891701828410688, "grad_norm": 2.261350154876709, "learning_rate": 4.630648844474271e-06, "loss": 1.3461, "step": 601 }, { "epoch": 1.6919831223628692, "grad_norm": 2.463414192199707, "learning_rate": 4.615602473405812e-06, "loss": 1.1112, "step": 602 }, { "epoch": 1.6947960618846696, "grad_norm": 2.262482166290283, "learning_rate": 4.600559603227963e-06, "loss": 1.208, "step": 603 }, { "epoch": 1.6976090014064698, "grad_norm": 2.235854387283325, "learning_rate": 4.585520370943285e-06, "loss": 0.8357, "step": 604 }, { "epoch": 1.70042194092827, "grad_norm": 2.0354301929473877, "learning_rate": 4.570484913521196e-06, "loss": 0.9843, "step": 605 }, { "epoch": 1.7032348804500703, "grad_norm": 2.3465640544891357, "learning_rate": 4.55545336789674e-06, "loss": 1.2206, "step": 606 }, { "epoch": 1.7060478199718707, "grad_norm": 1.846433162689209, "learning_rate": 4.540425870969332e-06, "loss": 0.9545, "step": 607 }, { "epoch": 1.7088607594936709, "grad_norm": 2.3210694789886475, "learning_rate": 4.5254025596015175e-06, "loss": 1.2733, "step": 608 }, { "epoch": 1.711673699015471, "grad_norm": 2.5384347438812256, "learning_rate": 4.510383570617716e-06, "loss": 1.2064, "step": 609 }, { "epoch": 1.7144866385372715, "grad_norm": 2.0778439044952393, "learning_rate": 4.495369040802988e-06, "loss": 1.1119, "step": 610 }, { "epoch": 1.7172995780590719, "grad_norm": 2.212078332901001, "learning_rate": 4.480359106901775e-06, "loss": 1.1948, "step": 611 }, { "epoch": 1.720112517580872, "grad_norm": 2.3751208782196045, "learning_rate": 4.465353905616668e-06, "loss": 1.2253, "step": 612 }, { "epoch": 1.7229254571026722, "grad_norm": 2.196316957473755, "learning_rate": 4.4503535736071505e-06, "loss": 1.159, "step": 613 }, { "epoch": 1.7257383966244726, "grad_norm": 2.1474740505218506, "learning_rate": 4.435358247488365e-06, "loss": 1.143, "step": 614 }, { "epoch": 1.728551336146273, "grad_norm": 2.5476577281951904, "learning_rate": 4.420368063829854e-06, "loss": 1.157, "step": 615 }, { "epoch": 1.7313642756680732, "grad_norm": 2.186852216720581, "learning_rate": 4.405383159154337e-06, "loss": 1.1052, "step": 616 }, { "epoch": 1.7341772151898733, "grad_norm": 2.162107467651367, "learning_rate": 4.390403669936443e-06, "loss": 1.1342, "step": 617 }, { "epoch": 1.7369901547116737, "grad_norm": 2.093745470046997, "learning_rate": 4.37542973260149e-06, "loss": 0.9557, "step": 618 }, { "epoch": 1.7398030942334741, "grad_norm": 1.8521722555160522, "learning_rate": 4.3604614835242255e-06, "loss": 1.0542, "step": 619 }, { "epoch": 1.7426160337552743, "grad_norm": 2.1983838081359863, "learning_rate": 4.3454990590275966e-06, "loss": 0.7818, "step": 620 }, { "epoch": 1.7454289732770745, "grad_norm": 2.261500597000122, "learning_rate": 4.3305425953814985e-06, "loss": 1.1948, "step": 621 }, { "epoch": 1.7482419127988749, "grad_norm": 2.4740712642669678, "learning_rate": 4.315592228801543e-06, "loss": 1.3438, "step": 622 }, { "epoch": 1.7510548523206753, "grad_norm": 2.277127981185913, "learning_rate": 4.300648095447806e-06, "loss": 1.2477, "step": 623 }, { "epoch": 1.7538677918424754, "grad_norm": 2.1069774627685547, "learning_rate": 4.285710331423603e-06, "loss": 1.208, "step": 624 }, { "epoch": 1.7566807313642756, "grad_norm": 2.1714632511138916, "learning_rate": 4.2707790727742315e-06, "loss": 1.2219, "step": 625 }, { "epoch": 1.759493670886076, "grad_norm": 2.2100682258605957, "learning_rate": 4.255854455485753e-06, "loss": 1.284, "step": 626 }, { "epoch": 1.7623066104078764, "grad_norm": 2.0882930755615234, "learning_rate": 4.24093661548373e-06, "loss": 1.1695, "step": 627 }, { "epoch": 1.7651195499296763, "grad_norm": 2.3131346702575684, "learning_rate": 4.226025688632013e-06, "loss": 1.1353, "step": 628 }, { "epoch": 1.7679324894514767, "grad_norm": 2.0631368160247803, "learning_rate": 4.211121810731485e-06, "loss": 1.175, "step": 629 }, { "epoch": 1.7707454289732771, "grad_norm": 2.4987428188323975, "learning_rate": 4.196225117518828e-06, "loss": 1.2522, "step": 630 }, { "epoch": 1.7735583684950773, "grad_norm": 1.8051552772521973, "learning_rate": 4.181335744665299e-06, "loss": 1.0842, "step": 631 }, { "epoch": 1.7763713080168775, "grad_norm": 2.0841329097747803, "learning_rate": 4.166453827775474e-06, "loss": 1.331, "step": 632 }, { "epoch": 1.7791842475386779, "grad_norm": 2.309027910232544, "learning_rate": 4.1515795023860325e-06, "loss": 1.2727, "step": 633 }, { "epoch": 1.7819971870604783, "grad_norm": 2.1550230979919434, "learning_rate": 4.136712903964511e-06, "loss": 1.2984, "step": 634 }, { "epoch": 1.7848101265822784, "grad_norm": 1.9745640754699707, "learning_rate": 4.121854167908072e-06, "loss": 0.8655, "step": 635 }, { "epoch": 1.7876230661040786, "grad_norm": 1.838762879371643, "learning_rate": 4.107003429542273e-06, "loss": 0.8657, "step": 636 }, { "epoch": 1.790436005625879, "grad_norm": 3.8649277687072754, "learning_rate": 4.092160824119836e-06, "loss": 1.0927, "step": 637 }, { "epoch": 1.7932489451476794, "grad_norm": 1.946352481842041, "learning_rate": 4.077326486819405e-06, "loss": 0.922, "step": 638 }, { "epoch": 1.7960618846694796, "grad_norm": 1.9564697742462158, "learning_rate": 4.06250055274433e-06, "loss": 1.1767, "step": 639 }, { "epoch": 1.7988748241912798, "grad_norm": 2.0671567916870117, "learning_rate": 4.047683156921422e-06, "loss": 1.1347, "step": 640 }, { "epoch": 1.8016877637130801, "grad_norm": 2.086289167404175, "learning_rate": 4.0328744342997355e-06, "loss": 1.2172, "step": 641 }, { "epoch": 1.8045007032348805, "grad_norm": 1.74513578414917, "learning_rate": 4.0180745197493295e-06, "loss": 1.1084, "step": 642 }, { "epoch": 1.8073136427566807, "grad_norm": 2.2042808532714844, "learning_rate": 4.0032835480600516e-06, "loss": 1.3802, "step": 643 }, { "epoch": 1.810126582278481, "grad_norm": 2.1729772090911865, "learning_rate": 3.9885016539402896e-06, "loss": 1.1866, "step": 644 }, { "epoch": 1.8129395218002813, "grad_norm": 2.0441439151763916, "learning_rate": 3.973728972015771e-06, "loss": 1.1282, "step": 645 }, { "epoch": 1.8157524613220817, "grad_norm": 2.044088125228882, "learning_rate": 3.958965636828314e-06, "loss": 1.0972, "step": 646 }, { "epoch": 1.8185654008438819, "grad_norm": 2.1738321781158447, "learning_rate": 3.944211782834618e-06, "loss": 1.1018, "step": 647 }, { "epoch": 1.821378340365682, "grad_norm": 2.4498589038848877, "learning_rate": 3.929467544405027e-06, "loss": 1.1727, "step": 648 }, { "epoch": 1.8241912798874824, "grad_norm": 2.110391616821289, "learning_rate": 3.9147330558223175e-06, "loss": 1.2465, "step": 649 }, { "epoch": 1.8270042194092828, "grad_norm": 2.273608684539795, "learning_rate": 3.900008451280462e-06, "loss": 1.1749, "step": 650 }, { "epoch": 1.8270042194092828, "eval_loss": 0.6407925486564636, "eval_runtime": 2.8075, "eval_samples_per_second": 9.261, "eval_steps_per_second": 1.425, "step": 650 }, { "epoch": 1.8270042194092828, "eval_active_sample_count": 30, "eval_avg_loss": 601.5, "eval_avg_mem_token_accuracy": 0.25177304964539005, "eval_avg_mem_token_gt_count": 9.4, "eval_avg_mem_token_precision": 0.0082385704339754, "eval_avg_mem_token_rate": 0.5732721346371317, "eval_avg_mem_token_recall(Accuracy)": 0.25177304964539005, "eval_avg_slot_norm_mean": 197.63333333333333, "eval_avg_slot_sim_mean": 0.996875, "eval_global_step": 650, "eval_loss": 0.6407925486564636, "eval_num_samples": 30, "eval_runtime": 2.8075, "eval_samples_per_second": 9.261, "eval_sim_active_sample_count": 30, "eval_steps_per_second": 1.425, "eval_total_correct_count": 71, "eval_total_gt_mem_token_count": 282, "eval_total_positions": 15033, "eval_total_pred_mem_token_count": 8618, "step": 650 }, { "epoch": 1.829817158931083, "grad_norm": 2.3315672874450684, "learning_rate": 3.885293864883423e-06, "loss": 1.1839, "step": 651 }, { "epoch": 1.8326300984528832, "grad_norm": 2.203946828842163, "learning_rate": 3.870589430643915e-06, "loss": 1.1069, "step": 652 }, { "epoch": 1.8354430379746836, "grad_norm": 2.159895896911621, "learning_rate": 3.8558952824822e-06, "loss": 1.147, "step": 653 }, { "epoch": 1.838255977496484, "grad_norm": 2.023045301437378, "learning_rate": 3.84121155422485e-06, "loss": 0.888, "step": 654 }, { "epoch": 1.8410689170182841, "grad_norm": 2.383005380630493, "learning_rate": 3.826538379603549e-06, "loss": 1.4156, "step": 655 }, { "epoch": 1.8438818565400843, "grad_norm": 2.3636224269866943, "learning_rate": 3.8118758922538533e-06, "loss": 1.0916, "step": 656 }, { "epoch": 1.8466947960618847, "grad_norm": 2.039092779159546, "learning_rate": 3.7972242257139953e-06, "loss": 1.2214, "step": 657 }, { "epoch": 1.849507735583685, "grad_norm": 2.0451226234436035, "learning_rate": 3.782583513423647e-06, "loss": 1.3025, "step": 658 }, { "epoch": 1.8523206751054853, "grad_norm": 2.2477307319641113, "learning_rate": 3.7679538887227247e-06, "loss": 1.3284, "step": 659 }, { "epoch": 1.8551336146272854, "grad_norm": 2.366098165512085, "learning_rate": 3.753335484850157e-06, "loss": 1.2683, "step": 660 }, { "epoch": 1.8579465541490858, "grad_norm": 2.1643450260162354, "learning_rate": 3.738728434942684e-06, "loss": 1.1879, "step": 661 }, { "epoch": 1.8607594936708862, "grad_norm": 2.3253345489501953, "learning_rate": 3.7241328720336377e-06, "loss": 1.2502, "step": 662 }, { "epoch": 1.8635724331926864, "grad_norm": 1.8580361604690552, "learning_rate": 3.709548929051732e-06, "loss": 0.9708, "step": 663 }, { "epoch": 1.8663853727144866, "grad_norm": 2.173644542694092, "learning_rate": 3.6949767388198554e-06, "loss": 1.2449, "step": 664 }, { "epoch": 1.869198312236287, "grad_norm": 1.964975357055664, "learning_rate": 3.680416434053854e-06, "loss": 1.1799, "step": 665 }, { "epoch": 1.8720112517580874, "grad_norm": 2.169707775115967, "learning_rate": 3.6658681473613333e-06, "loss": 1.2694, "step": 666 }, { "epoch": 1.8748241912798875, "grad_norm": 1.9698622226715088, "learning_rate": 3.651332011240437e-06, "loss": 1.1431, "step": 667 }, { "epoch": 1.8776371308016877, "grad_norm": 2.4650795459747314, "learning_rate": 3.636808158078656e-06, "loss": 1.3374, "step": 668 }, { "epoch": 1.880450070323488, "grad_norm": 1.978132724761963, "learning_rate": 3.622296720151608e-06, "loss": 0.9086, "step": 669 }, { "epoch": 1.8832630098452883, "grad_norm": 1.8494510650634766, "learning_rate": 3.607797829621843e-06, "loss": 1.0412, "step": 670 }, { "epoch": 1.8860759493670884, "grad_norm": 2.31000018119812, "learning_rate": 3.5933116185376325e-06, "loss": 1.2616, "step": 671 }, { "epoch": 1.8888888888888888, "grad_norm": 2.1177399158477783, "learning_rate": 3.578838218831776e-06, "loss": 1.0584, "step": 672 }, { "epoch": 1.8917018284106892, "grad_norm": 2.711202621459961, "learning_rate": 3.5643777623203857e-06, "loss": 1.4235, "step": 673 }, { "epoch": 1.8945147679324894, "grad_norm": 2.0394771099090576, "learning_rate": 3.5499303807017018e-06, "loss": 1.0978, "step": 674 }, { "epoch": 1.8973277074542896, "grad_norm": 1.9236093759536743, "learning_rate": 3.5354962055548802e-06, "loss": 1.0943, "step": 675 }, { "epoch": 1.90014064697609, "grad_norm": 2.159970283508301, "learning_rate": 3.5210753683388014e-06, "loss": 1.1188, "step": 676 }, { "epoch": 1.9029535864978904, "grad_norm": 2.201075315475464, "learning_rate": 3.5066680003908695e-06, "loss": 1.0096, "step": 677 }, { "epoch": 1.9057665260196905, "grad_norm": 2.2006876468658447, "learning_rate": 3.4922742329258207e-06, "loss": 1.2433, "step": 678 }, { "epoch": 1.9085794655414907, "grad_norm": 2.1321656703948975, "learning_rate": 3.47789419703452e-06, "loss": 1.2714, "step": 679 }, { "epoch": 1.9113924050632911, "grad_norm": 2.141841173171997, "learning_rate": 3.463528023682779e-06, "loss": 1.0148, "step": 680 }, { "epoch": 1.9142053445850915, "grad_norm": 2.4476535320281982, "learning_rate": 3.4491758437101487e-06, "loss": 1.2952, "step": 681 }, { "epoch": 1.9170182841068917, "grad_norm": 2.855252742767334, "learning_rate": 3.4348377878287443e-06, "loss": 1.0821, "step": 682 }, { "epoch": 1.9198312236286919, "grad_norm": 2.2479875087738037, "learning_rate": 3.4205139866220384e-06, "loss": 0.9025, "step": 683 }, { "epoch": 1.9226441631504922, "grad_norm": 1.734316349029541, "learning_rate": 3.4062045705436863e-06, "loss": 0.9917, "step": 684 }, { "epoch": 1.9254571026722926, "grad_norm": 1.7392464876174927, "learning_rate": 3.391909669916324e-06, "loss": 0.6617, "step": 685 }, { "epoch": 1.9282700421940928, "grad_norm": 2.1003048419952393, "learning_rate": 3.3776294149303956e-06, "loss": 1.2154, "step": 686 }, { "epoch": 1.931082981715893, "grad_norm": 2.3303074836730957, "learning_rate": 3.3633639356429564e-06, "loss": 1.2461, "step": 687 }, { "epoch": 1.9338959212376934, "grad_norm": 2.2976810932159424, "learning_rate": 3.3491133619764925e-06, "loss": 1.3707, "step": 688 }, { "epoch": 1.9367088607594938, "grad_norm": 1.9439120292663574, "learning_rate": 3.334877823717737e-06, "loss": 0.9291, "step": 689 }, { "epoch": 1.939521800281294, "grad_norm": 2.5753273963928223, "learning_rate": 3.3206574505164934e-06, "loss": 1.0634, "step": 690 }, { "epoch": 1.9423347398030941, "grad_norm": 2.2259931564331055, "learning_rate": 3.306452371884441e-06, "loss": 1.1333, "step": 691 }, { "epoch": 1.9451476793248945, "grad_norm": 2.0289406776428223, "learning_rate": 3.2922627171939726e-06, "loss": 1.138, "step": 692 }, { "epoch": 1.947960618846695, "grad_norm": 2.4240784645080566, "learning_rate": 3.2780886156770016e-06, "loss": 1.1418, "step": 693 }, { "epoch": 1.950773558368495, "grad_norm": 2.215083122253418, "learning_rate": 3.263930196423797e-06, "loss": 1.42, "step": 694 }, { "epoch": 1.9535864978902953, "grad_norm": 2.2829818725585938, "learning_rate": 3.2497875883817955e-06, "loss": 1.1413, "step": 695 }, { "epoch": 1.9563994374120957, "grad_norm": 2.153489828109741, "learning_rate": 3.2356609203544387e-06, "loss": 1.2167, "step": 696 }, { "epoch": 1.959212376933896, "grad_norm": 1.974264144897461, "learning_rate": 3.2215503209999952e-06, "loss": 1.1241, "step": 697 }, { "epoch": 1.9620253164556962, "grad_norm": 1.9400849342346191, "learning_rate": 3.207455918830384e-06, "loss": 1.036, "step": 698 }, { "epoch": 1.9648382559774964, "grad_norm": 2.141404628753662, "learning_rate": 3.193377842210014e-06, "loss": 1.1286, "step": 699 }, { "epoch": 1.9676511954992968, "grad_norm": 2.2581005096435547, "learning_rate": 3.179316219354602e-06, "loss": 1.385, "step": 700 }, { "epoch": 1.9676511954992968, "eval_loss": 0.6409177184104919, "eval_runtime": 2.8079, "eval_samples_per_second": 9.26, "eval_steps_per_second": 1.425, "step": 700 }, { "epoch": 1.9676511954992968, "eval_active_sample_count": 30, "eval_avg_loss": 603.5, "eval_avg_mem_token_accuracy": 0.25177304964539005, "eval_avg_mem_token_gt_count": 9.4, "eval_avg_mem_token_precision": 0.008195775135634306, "eval_avg_mem_token_rate": 0.5762655491252577, "eval_avg_mem_token_recall(Accuracy)": 0.25177304964539005, "eval_avg_slot_norm_mean": 197.63333333333333, "eval_avg_slot_sim_mean": 0.996875, "eval_global_step": 700, "eval_loss": 0.6409177184104919, "eval_num_samples": 30, "eval_runtime": 2.8079, "eval_samples_per_second": 9.26, "eval_sim_active_sample_count": 30, "eval_steps_per_second": 1.425, "eval_total_correct_count": 71, "eval_total_gt_mem_token_count": 282, "eval_total_positions": 15033, "eval_total_pred_mem_token_count": 8663, "step": 700 }, { "epoch": 1.9704641350210972, "grad_norm": 2.2288899421691895, "learning_rate": 3.1652711783300234e-06, "loss": 1.3147, "step": 701 }, { "epoch": 1.9732770745428974, "grad_norm": 2.327530860900879, "learning_rate": 3.1512428470511257e-06, "loss": 1.2538, "step": 702 }, { "epoch": 1.9760900140646975, "grad_norm": 1.8935436010360718, "learning_rate": 3.1372313532805766e-06, "loss": 0.8867, "step": 703 }, { "epoch": 1.978902953586498, "grad_norm": 2.1964917182922363, "learning_rate": 3.1232368246276956e-06, "loss": 1.1226, "step": 704 }, { "epoch": 1.9817158931082983, "grad_norm": 2.11517333984375, "learning_rate": 3.1092593885472965e-06, "loss": 1.1076, "step": 705 }, { "epoch": 1.9845288326300985, "grad_norm": 2.36454439163208, "learning_rate": 3.0952991723385152e-06, "loss": 1.1308, "step": 706 }, { "epoch": 1.9873417721518987, "grad_norm": 4.362302780151367, "learning_rate": 3.0813563031436676e-06, "loss": 1.3241, "step": 707 }, { "epoch": 1.990154711673699, "grad_norm": 2.1657958030700684, "learning_rate": 3.067430907947073e-06, "loss": 1.1269, "step": 708 }, { "epoch": 1.9929676511954995, "grad_norm": 1.7424006462097168, "learning_rate": 3.053523113573914e-06, "loss": 0.9743, "step": 709 }, { "epoch": 1.9957805907172996, "grad_norm": 2.1487817764282227, "learning_rate": 3.039633046689069e-06, "loss": 1.0117, "step": 710 }, { "epoch": 1.9985935302390998, "grad_norm": 2.059786319732666, "learning_rate": 3.0257608337959683e-06, "loss": 1.0671, "step": 711 }, { "epoch": 2.0, "grad_norm": 1.65206778049469, "learning_rate": 3.0119066012354316e-06, "loss": 0.5849, "step": 712 }, { "epoch": 2.0028129395218004, "grad_norm": 1.849442720413208, "learning_rate": 2.9980704751845302e-06, "loss": 1.0254, "step": 713 }, { "epoch": 2.0056258790436003, "grad_norm": 2.224947690963745, "learning_rate": 2.9842525816554237e-06, "loss": 1.3435, "step": 714 }, { "epoch": 2.0084388185654007, "grad_norm": 2.0207643508911133, "learning_rate": 2.9704530464942254e-06, "loss": 1.1889, "step": 715 }, { "epoch": 2.011251758087201, "grad_norm": 1.9327627420425415, "learning_rate": 2.9566719953798474e-06, "loss": 0.9725, "step": 716 }, { "epoch": 2.0140646976090015, "grad_norm": 2.2062811851501465, "learning_rate": 2.942909553822859e-06, "loss": 1.1318, "step": 717 }, { "epoch": 2.0168776371308015, "grad_norm": 1.9610023498535156, "learning_rate": 2.929165847164343e-06, "loss": 1.02, "step": 718 }, { "epoch": 2.019690576652602, "grad_norm": 2.012442111968994, "learning_rate": 2.9154410005747586e-06, "loss": 1.073, "step": 719 }, { "epoch": 2.0225035161744023, "grad_norm": 1.9642077684402466, "learning_rate": 2.901735139052787e-06, "loss": 1.0427, "step": 720 }, { "epoch": 2.0253164556962027, "grad_norm": 2.19358491897583, "learning_rate": 2.888048387424218e-06, "loss": 1.1162, "step": 721 }, { "epoch": 2.0281293952180026, "grad_norm": 1.9871453046798706, "learning_rate": 2.8743808703407866e-06, "loss": 1.1066, "step": 722 }, { "epoch": 2.030942334739803, "grad_norm": 2.278085947036743, "learning_rate": 2.8607327122790555e-06, "loss": 1.1253, "step": 723 }, { "epoch": 2.0337552742616034, "grad_norm": 1.7093780040740967, "learning_rate": 2.8471040375392745e-06, "loss": 1.0754, "step": 724 }, { "epoch": 2.036568213783404, "grad_norm": 2.088590621948242, "learning_rate": 2.833494970244248e-06, "loss": 1.2312, "step": 725 }, { "epoch": 2.0393811533052038, "grad_norm": 1.8987199068069458, "learning_rate": 2.819905634338208e-06, "loss": 0.9913, "step": 726 }, { "epoch": 2.042194092827004, "grad_norm": 2.069563627243042, "learning_rate": 2.8063361535856838e-06, "loss": 1.1635, "step": 727 }, { "epoch": 2.0450070323488045, "grad_norm": 2.440237522125244, "learning_rate": 2.7927866515703705e-06, "loss": 1.2113, "step": 728 }, { "epoch": 2.047819971870605, "grad_norm": 2.0094406604766846, "learning_rate": 2.7792572516940108e-06, "loss": 0.9271, "step": 729 }, { "epoch": 2.050632911392405, "grad_norm": 2.2327640056610107, "learning_rate": 2.765748077175272e-06, "loss": 1.1026, "step": 730 }, { "epoch": 2.0534458509142053, "grad_norm": 2.1008453369140625, "learning_rate": 2.752259251048606e-06, "loss": 1.1666, "step": 731 }, { "epoch": 2.0562587904360057, "grad_norm": 1.8837400674819946, "learning_rate": 2.7387908961631597e-06, "loss": 0.8817, "step": 732 }, { "epoch": 2.059071729957806, "grad_norm": 1.993558645248413, "learning_rate": 2.725343135181622e-06, "loss": 1.0745, "step": 733 }, { "epoch": 2.061884669479606, "grad_norm": 2.122399091720581, "learning_rate": 2.711916090579137e-06, "loss": 1.1435, "step": 734 }, { "epoch": 2.0646976090014064, "grad_norm": 2.0384397506713867, "learning_rate": 2.698509884642163e-06, "loss": 1.181, "step": 735 }, { "epoch": 2.067510548523207, "grad_norm": 2.315969944000244, "learning_rate": 2.6851246394673822e-06, "loss": 0.9172, "step": 736 }, { "epoch": 2.070323488045007, "grad_norm": 1.8696023225784302, "learning_rate": 2.67176047696057e-06, "loss": 0.9634, "step": 737 }, { "epoch": 2.073136427566807, "grad_norm": 2.3400771617889404, "learning_rate": 2.6584175188354934e-06, "loss": 1.1388, "step": 738 }, { "epoch": 2.0759493670886076, "grad_norm": 2.0902152061462402, "learning_rate": 2.6450958866128e-06, "loss": 0.9649, "step": 739 }, { "epoch": 2.078762306610408, "grad_norm": 1.8135625123977661, "learning_rate": 2.6317957016189155e-06, "loss": 1.1267, "step": 740 }, { "epoch": 2.0815752461322083, "grad_norm": 1.949086308479309, "learning_rate": 2.618517084984933e-06, "loss": 1.056, "step": 741 }, { "epoch": 2.0843881856540083, "grad_norm": 2.1474437713623047, "learning_rate": 2.6052601576455116e-06, "loss": 1.1126, "step": 742 }, { "epoch": 2.0872011251758087, "grad_norm": 2.2054314613342285, "learning_rate": 2.592025040337779e-06, "loss": 1.1921, "step": 743 }, { "epoch": 2.090014064697609, "grad_norm": 1.9321085214614868, "learning_rate": 2.578811853600226e-06, "loss": 0.9129, "step": 744 }, { "epoch": 2.0928270042194095, "grad_norm": 2.050908327102661, "learning_rate": 2.5656207177716107e-06, "loss": 1.0466, "step": 745 }, { "epoch": 2.0956399437412094, "grad_norm": 2.335043430328369, "learning_rate": 2.552451752989865e-06, "loss": 0.9907, "step": 746 }, { "epoch": 2.09845288326301, "grad_norm": 2.1719613075256348, "learning_rate": 2.539305079190999e-06, "loss": 1.1855, "step": 747 }, { "epoch": 2.1012658227848102, "grad_norm": 2.2501490116119385, "learning_rate": 2.5261808161080047e-06, "loss": 1.1693, "step": 748 }, { "epoch": 2.1040787623066106, "grad_norm": 2.1329755783081055, "learning_rate": 2.513079083269774e-06, "loss": 1.1507, "step": 749 }, { "epoch": 2.1068917018284106, "grad_norm": 1.9924427270889282, "learning_rate": 2.5000000000000015e-06, "loss": 1.035, "step": 750 }, { "epoch": 2.1068917018284106, "eval_loss": 0.6396089792251587, "eval_runtime": 2.846, "eval_samples_per_second": 9.136, "eval_steps_per_second": 1.405, "step": 750 }, { "epoch": 2.1068917018284106, "eval_active_sample_count": 30, "eval_avg_loss": 596.5, "eval_avg_mem_token_accuracy": 0.2553191489361702, "eval_avg_mem_token_gt_count": 9.4, "eval_avg_mem_token_precision": 0.00837404047452896, "eval_avg_mem_token_rate": 0.5719417281979645, "eval_avg_mem_token_recall(Accuracy)": 0.2553191489361702, "eval_avg_slot_norm_mean": 197.63333333333333, "eval_avg_slot_sim_mean": 0.996875, "eval_global_step": 750, "eval_loss": 0.6396089792251587, "eval_num_samples": 30, "eval_runtime": 2.846, "eval_samples_per_second": 9.136, "eval_sim_active_sample_count": 30, "eval_steps_per_second": 1.405, "eval_total_correct_count": 72, "eval_total_gt_mem_token_count": 282, "eval_total_positions": 15033, "eval_total_pred_mem_token_count": 8598, "step": 750 }, { "epoch": 2.109704641350211, "grad_norm": 1.8336857557296753, "learning_rate": 2.4869436854161e-06, "loss": 0.9486, "step": 751 }, { "epoch": 2.1125175808720114, "grad_norm": 2.0312447547912598, "learning_rate": 2.4739102584281268e-06, "loss": 1.2113, "step": 752 }, { "epoch": 2.1153305203938118, "grad_norm": 2.0355682373046875, "learning_rate": 2.4608998377376752e-06, "loss": 1.1002, "step": 753 }, { "epoch": 2.1181434599156117, "grad_norm": 2.6331751346588135, "learning_rate": 2.447912541836826e-06, "loss": 1.1891, "step": 754 }, { "epoch": 2.120956399437412, "grad_norm": 2.6497652530670166, "learning_rate": 2.4349484890070357e-06, "loss": 1.0924, "step": 755 }, { "epoch": 2.1237693389592125, "grad_norm": 1.9082729816436768, "learning_rate": 2.4220077973180906e-06, "loss": 1.0445, "step": 756 }, { "epoch": 2.1265822784810124, "grad_norm": 1.8643864393234253, "learning_rate": 2.4090905846270006e-06, "loss": 1.0385, "step": 757 }, { "epoch": 2.129395218002813, "grad_norm": 2.0666754245758057, "learning_rate": 2.396196968576957e-06, "loss": 1.2737, "step": 758 }, { "epoch": 2.1322081575246132, "grad_norm": 2.5806944370269775, "learning_rate": 2.3833270665962293e-06, "loss": 0.9353, "step": 759 }, { "epoch": 2.1350210970464136, "grad_norm": 2.0371792316436768, "learning_rate": 2.370480995897127e-06, "loss": 1.1003, "step": 760 }, { "epoch": 2.1378340365682136, "grad_norm": 1.9753756523132324, "learning_rate": 2.3576588734749022e-06, "loss": 0.9872, "step": 761 }, { "epoch": 2.140646976090014, "grad_norm": 2.2429325580596924, "learning_rate": 2.3448608161067117e-06, "loss": 1.0195, "step": 762 }, { "epoch": 2.1434599156118144, "grad_norm": 1.8056210279464722, "learning_rate": 2.3320869403505324e-06, "loss": 0.9248, "step": 763 }, { "epoch": 2.1462728551336148, "grad_norm": 1.9145182371139526, "learning_rate": 2.3193373625441113e-06, "loss": 0.9601, "step": 764 }, { "epoch": 2.1490857946554147, "grad_norm": 2.0845413208007812, "learning_rate": 2.3066121988038996e-06, "loss": 1.1699, "step": 765 }, { "epoch": 2.151898734177215, "grad_norm": 1.9216276407241821, "learning_rate": 2.2939115650240008e-06, "loss": 1.0108, "step": 766 }, { "epoch": 2.1547116736990155, "grad_norm": 2.0462570190429688, "learning_rate": 2.2812355768751106e-06, "loss": 0.8837, "step": 767 }, { "epoch": 2.157524613220816, "grad_norm": 2.385082721710205, "learning_rate": 2.268584349803464e-06, "loss": 1.1446, "step": 768 }, { "epoch": 2.160337552742616, "grad_norm": 2.243379592895508, "learning_rate": 2.2559579990297943e-06, "loss": 1.2207, "step": 769 }, { "epoch": 2.1631504922644162, "grad_norm": 2.170370101928711, "learning_rate": 2.2433566395482577e-06, "loss": 1.3006, "step": 770 }, { "epoch": 2.1659634317862166, "grad_norm": 2.1776270866394043, "learning_rate": 2.2307803861254207e-06, "loss": 1.1889, "step": 771 }, { "epoch": 2.168776371308017, "grad_norm": 2.114034652709961, "learning_rate": 2.218229353299181e-06, "loss": 1.2131, "step": 772 }, { "epoch": 2.171589310829817, "grad_norm": 2.2640528678894043, "learning_rate": 2.2057036553777565e-06, "loss": 1.3633, "step": 773 }, { "epoch": 2.1744022503516174, "grad_norm": 1.7782313823699951, "learning_rate": 2.1932034064386113e-06, "loss": 0.9327, "step": 774 }, { "epoch": 2.1772151898734178, "grad_norm": 2.046961545944214, "learning_rate": 2.1807287203274504e-06, "loss": 1.2086, "step": 775 }, { "epoch": 2.180028129395218, "grad_norm": 2.103487491607666, "learning_rate": 2.168279710657149e-06, "loss": 1.0986, "step": 776 }, { "epoch": 2.182841068917018, "grad_norm": 2.1570355892181396, "learning_rate": 2.1558564908067497e-06, "loss": 1.0043, "step": 777 }, { "epoch": 2.1856540084388185, "grad_norm": 1.9457972049713135, "learning_rate": 2.1434591739204062e-06, "loss": 1.067, "step": 778 }, { "epoch": 2.188466947960619, "grad_norm": 2.141794204711914, "learning_rate": 2.1310878729063645e-06, "loss": 1.1144, "step": 779 }, { "epoch": 2.1912798874824193, "grad_norm": 1.9879792928695679, "learning_rate": 2.118742700435931e-06, "loss": 1.0625, "step": 780 }, { "epoch": 2.1940928270042193, "grad_norm": 2.3529539108276367, "learning_rate": 2.1064237689424483e-06, "loss": 1.2867, "step": 781 }, { "epoch": 2.1969057665260197, "grad_norm": 2.0593795776367188, "learning_rate": 2.0941311906202672e-06, "loss": 1.3383, "step": 782 }, { "epoch": 2.19971870604782, "grad_norm": 2.1530141830444336, "learning_rate": 2.081865077423731e-06, "loss": 1.2258, "step": 783 }, { "epoch": 2.2025316455696204, "grad_norm": 1.9634898900985718, "learning_rate": 2.06962554106615e-06, "loss": 1.1629, "step": 784 }, { "epoch": 2.2053445850914204, "grad_norm": 2.2565033435821533, "learning_rate": 2.0574126930187882e-06, "loss": 1.3058, "step": 785 }, { "epoch": 2.208157524613221, "grad_norm": 2.420267105102539, "learning_rate": 2.0452266445098457e-06, "loss": 1.2447, "step": 786 }, { "epoch": 2.210970464135021, "grad_norm": 2.2069785594940186, "learning_rate": 2.0330675065234466e-06, "loss": 1.1835, "step": 787 }, { "epoch": 2.2137834036568216, "grad_norm": 2.1070237159729004, "learning_rate": 2.0209353897986288e-06, "loss": 1.1873, "step": 788 }, { "epoch": 2.2165963431786215, "grad_norm": 1.9886164665222168, "learning_rate": 2.0088304048283337e-06, "loss": 1.0022, "step": 789 }, { "epoch": 2.219409282700422, "grad_norm": 2.1714046001434326, "learning_rate": 1.9967526618584016e-06, "loss": 1.1458, "step": 790 }, { "epoch": 2.2222222222222223, "grad_norm": 2.1026611328125, "learning_rate": 1.984702270886566e-06, "loss": 1.1671, "step": 791 }, { "epoch": 2.2250351617440227, "grad_norm": 2.3853933811187744, "learning_rate": 1.9726793416614532e-06, "loss": 1.2162, "step": 792 }, { "epoch": 2.2278481012658227, "grad_norm": 2.1531338691711426, "learning_rate": 1.9606839836815872e-06, "loss": 1.2844, "step": 793 }, { "epoch": 2.230661040787623, "grad_norm": 2.198315143585205, "learning_rate": 1.948716306194376e-06, "loss": 1.1015, "step": 794 }, { "epoch": 2.2334739803094235, "grad_norm": 1.9941608905792236, "learning_rate": 1.9367764181951403e-06, "loss": 0.9099, "step": 795 }, { "epoch": 2.2362869198312234, "grad_norm": 2.1348161697387695, "learning_rate": 1.924864428426103e-06, "loss": 0.9096, "step": 796 }, { "epoch": 2.239099859353024, "grad_norm": 2.182652235031128, "learning_rate": 1.9129804453754053e-06, "loss": 1.2748, "step": 797 }, { "epoch": 2.241912798874824, "grad_norm": 2.1464662551879883, "learning_rate": 1.9011245772761173e-06, "loss": 1.2931, "step": 798 }, { "epoch": 2.2447257383966246, "grad_norm": 2.15000319480896, "learning_rate": 1.889296932105254e-06, "loss": 0.9775, "step": 799 }, { "epoch": 2.247538677918425, "grad_norm": 2.1729373931884766, "learning_rate": 1.8774976175827898e-06, "loss": 1.182, "step": 800 }, { "epoch": 2.247538677918425, "eval_loss": 0.6389347910881042, "eval_runtime": 2.7883, "eval_samples_per_second": 9.325, "eval_steps_per_second": 1.435, "step": 800 }, { "epoch": 2.247538677918425, "eval_active_sample_count": 30, "eval_avg_loss": 600.75, "eval_avg_mem_token_accuracy": 0.24822695035460993, "eval_avg_mem_token_gt_count": 9.4, "eval_avg_mem_token_precision": 0.008117824423054622, "eval_avg_mem_token_rate": 0.5736047362469234, "eval_avg_mem_token_recall(Accuracy)": 0.24822695035460993, "eval_avg_slot_norm_mean": 197.63333333333333, "eval_avg_slot_sim_mean": 0.996875, "eval_global_step": 800, "eval_loss": 0.6389347910881042, "eval_num_samples": 30, "eval_runtime": 2.7883, "eval_samples_per_second": 9.325, "eval_sim_active_sample_count": 30, "eval_steps_per_second": 1.435, "eval_total_correct_count": 70, "eval_total_gt_mem_token_count": 282, "eval_total_positions": 15033, "eval_total_pred_mem_token_count": 8623, "step": 800 }, { "epoch": 2.250351617440225, "grad_norm": 2.0819458961486816, "learning_rate": 1.8657267411706802e-06, "loss": 1.0104, "step": 801 }, { "epoch": 2.2531645569620253, "grad_norm": 2.394252061843872, "learning_rate": 1.853984410071879e-06, "loss": 1.249, "step": 802 }, { "epoch": 2.2559774964838257, "grad_norm": 2.0108907222747803, "learning_rate": 1.8422707312293663e-06, "loss": 1.0054, "step": 803 }, { "epoch": 2.2587904360056257, "grad_norm": 2.035367488861084, "learning_rate": 1.8305858113251717e-06, "loss": 1.052, "step": 804 }, { "epoch": 2.261603375527426, "grad_norm": 2.199094772338867, "learning_rate": 1.8189297567794029e-06, "loss": 1.2031, "step": 805 }, { "epoch": 2.2644163150492265, "grad_norm": 2.0634264945983887, "learning_rate": 1.8073026737492783e-06, "loss": 1.1867, "step": 806 }, { "epoch": 2.267229254571027, "grad_norm": 2.314810037612915, "learning_rate": 1.7957046681281582e-06, "loss": 1.2492, "step": 807 }, { "epoch": 2.270042194092827, "grad_norm": 2.0201666355133057, "learning_rate": 1.7841358455445807e-06, "loss": 1.1079, "step": 808 }, { "epoch": 2.272855133614627, "grad_norm": 2.239051342010498, "learning_rate": 1.7725963113612998e-06, "loss": 1.1677, "step": 809 }, { "epoch": 2.2756680731364276, "grad_norm": 2.3143956661224365, "learning_rate": 1.7610861706743316e-06, "loss": 1.1724, "step": 810 }, { "epoch": 2.278481012658228, "grad_norm": 2.3886356353759766, "learning_rate": 1.7496055283119812e-06, "loss": 1.2109, "step": 811 }, { "epoch": 2.281293952180028, "grad_norm": 2.2909440994262695, "learning_rate": 1.7381544888339103e-06, "loss": 1.0614, "step": 812 }, { "epoch": 2.2841068917018283, "grad_norm": 2.069227695465088, "learning_rate": 1.726733156530161e-06, "loss": 1.0202, "step": 813 }, { "epoch": 2.2869198312236287, "grad_norm": 2.242708683013916, "learning_rate": 1.7153416354202307e-06, "loss": 1.0972, "step": 814 }, { "epoch": 2.289732770745429, "grad_norm": 2.0846173763275146, "learning_rate": 1.7039800292520997e-06, "loss": 1.1095, "step": 815 }, { "epoch": 2.292545710267229, "grad_norm": 1.924421787261963, "learning_rate": 1.69264844150131e-06, "loss": 1.1585, "step": 816 }, { "epoch": 2.2953586497890295, "grad_norm": 1.7929229736328125, "learning_rate": 1.6813469753700013e-06, "loss": 0.9856, "step": 817 }, { "epoch": 2.29817158931083, "grad_norm": 1.9918988943099976, "learning_rate": 1.6700757337859907e-06, "loss": 1.0617, "step": 818 }, { "epoch": 2.3009845288326303, "grad_norm": 2.357882499694824, "learning_rate": 1.6588348194018205e-06, "loss": 1.0826, "step": 819 }, { "epoch": 2.3037974683544302, "grad_norm": 2.163602828979492, "learning_rate": 1.6476243345938293e-06, "loss": 1.342, "step": 820 }, { "epoch": 2.3066104078762306, "grad_norm": 1.7069376707077026, "learning_rate": 1.6364443814612207e-06, "loss": 0.933, "step": 821 }, { "epoch": 2.309423347398031, "grad_norm": 2.1436493396759033, "learning_rate": 1.6252950618251311e-06, "loss": 1.2028, "step": 822 }, { "epoch": 2.3122362869198314, "grad_norm": 2.0016818046569824, "learning_rate": 1.614176477227703e-06, "loss": 1.1039, "step": 823 }, { "epoch": 2.3150492264416314, "grad_norm": 2.098785400390625, "learning_rate": 1.6030887289311604e-06, "loss": 1.0678, "step": 824 }, { "epoch": 2.3178621659634318, "grad_norm": 2.156809091567993, "learning_rate": 1.5920319179168859e-06, "loss": 1.2103, "step": 825 }, { "epoch": 2.320675105485232, "grad_norm": 2.111753463745117, "learning_rate": 1.5810061448845028e-06, "loss": 1.1346, "step": 826 }, { "epoch": 2.3234880450070325, "grad_norm": 2.18839693069458, "learning_rate": 1.5700115102509562e-06, "loss": 1.1966, "step": 827 }, { "epoch": 2.3263009845288325, "grad_norm": 2.2580389976501465, "learning_rate": 1.5590481141495988e-06, "loss": 1.2102, "step": 828 }, { "epoch": 2.329113924050633, "grad_norm": 2.530665874481201, "learning_rate": 1.5481160564292802e-06, "loss": 1.3096, "step": 829 }, { "epoch": 2.3319268635724333, "grad_norm": 2.008321523666382, "learning_rate": 1.5372154366534325e-06, "loss": 1.0493, "step": 830 }, { "epoch": 2.3347398030942337, "grad_norm": 1.8788542747497559, "learning_rate": 1.5263463540991769e-06, "loss": 1.1453, "step": 831 }, { "epoch": 2.3375527426160336, "grad_norm": 2.1390604972839355, "learning_rate": 1.5155089077563968e-06, "loss": 0.9813, "step": 832 }, { "epoch": 2.340365682137834, "grad_norm": 2.1308085918426514, "learning_rate": 1.5047031963268617e-06, "loss": 1.3274, "step": 833 }, { "epoch": 2.3431786216596344, "grad_norm": 2.2323601245880127, "learning_rate": 1.49392931822331e-06, "loss": 1.1762, "step": 834 }, { "epoch": 2.3459915611814344, "grad_norm": 2.2134149074554443, "learning_rate": 1.4831873715685597e-06, "loss": 1.1039, "step": 835 }, { "epoch": 2.3488045007032348, "grad_norm": 1.834775686264038, "learning_rate": 1.4724774541946145e-06, "loss": 0.9826, "step": 836 }, { "epoch": 2.351617440225035, "grad_norm": 1.9355462789535522, "learning_rate": 1.461799663641773e-06, "loss": 1.0111, "step": 837 }, { "epoch": 2.3544303797468356, "grad_norm": 2.2236545085906982, "learning_rate": 1.4511540971577377e-06, "loss": 1.1159, "step": 838 }, { "epoch": 2.357243319268636, "grad_norm": 2.29103946685791, "learning_rate": 1.440540851696733e-06, "loss": 1.3618, "step": 839 }, { "epoch": 2.360056258790436, "grad_norm": 2.335484743118286, "learning_rate": 1.429960023918619e-06, "loss": 1.165, "step": 840 }, { "epoch": 2.3628691983122363, "grad_norm": 2.207131862640381, "learning_rate": 1.4194117101880134e-06, "loss": 1.11, "step": 841 }, { "epoch": 2.3656821378340367, "grad_norm": 1.7570301294326782, "learning_rate": 1.4088960065734137e-06, "loss": 0.9707, "step": 842 }, { "epoch": 2.3684950773558366, "grad_norm": 2.027989149093628, "learning_rate": 1.3984130088463204e-06, "loss": 1.1416, "step": 843 }, { "epoch": 2.371308016877637, "grad_norm": 2.0788614749908447, "learning_rate": 1.3879628124803662e-06, "loss": 1.0461, "step": 844 }, { "epoch": 2.3741209563994374, "grad_norm": 1.9784637689590454, "learning_rate": 1.3775455126504466e-06, "loss": 1.0517, "step": 845 }, { "epoch": 2.376933895921238, "grad_norm": 1.6520678997039795, "learning_rate": 1.3671612042318527e-06, "loss": 0.8804, "step": 846 }, { "epoch": 2.379746835443038, "grad_norm": 2.11843204498291, "learning_rate": 1.3568099817994068e-06, "loss": 1.0982, "step": 847 }, { "epoch": 2.382559774964838, "grad_norm": 2.0866153240203857, "learning_rate": 1.3464919396266018e-06, "loss": 1.1652, "step": 848 }, { "epoch": 2.3853727144866386, "grad_norm": 2.224863052368164, "learning_rate": 1.3362071716847424e-06, "loss": 1.2356, "step": 849 }, { "epoch": 2.388185654008439, "grad_norm": 2.0457394123077393, "learning_rate": 1.3259557716420868e-06, "loss": 1.2145, "step": 850 }, { "epoch": 2.388185654008439, "eval_loss": 0.6384085416793823, "eval_runtime": 2.8184, "eval_samples_per_second": 9.225, "eval_steps_per_second": 1.419, "step": 850 }, { "epoch": 2.388185654008439, "eval_active_sample_count": 30, "eval_avg_loss": 600.875, "eval_avg_mem_token_accuracy": 0.25177304964539005, "eval_avg_mem_token_gt_count": 9.4, "eval_avg_mem_token_precision": 0.008220446914437884, "eval_avg_mem_token_rate": 0.5745360207543404, "eval_avg_mem_token_recall(Accuracy)": 0.25177304964539005, "eval_avg_slot_norm_mean": 197.63333333333333, "eval_avg_slot_sim_mean": 0.996875, "eval_global_step": 850, "eval_loss": 0.6384085416793823, "eval_num_samples": 30, "eval_runtime": 2.8184, "eval_samples_per_second": 9.225, "eval_sim_active_sample_count": 30, "eval_steps_per_second": 1.419, "eval_total_correct_count": 71, "eval_total_gt_mem_token_count": 282, "eval_total_positions": 15033, "eval_total_pred_mem_token_count": 8637, "step": 850 }, { "epoch": 2.390998593530239, "grad_norm": 2.302175760269165, "learning_rate": 1.3157378328630027e-06, "loss": 1.2569, "step": 851 }, { "epoch": 2.3938115330520393, "grad_norm": 2.0771360397338867, "learning_rate": 1.3055534484070997e-06, "loss": 1.0361, "step": 852 }, { "epoch": 2.3966244725738397, "grad_norm": 2.1782445907592773, "learning_rate": 1.2954027110284035e-06, "loss": 1.1286, "step": 853 }, { "epoch": 2.39943741209564, "grad_norm": 2.210466146469116, "learning_rate": 1.285285713174489e-06, "loss": 1.0967, "step": 854 }, { "epoch": 2.40225035161744, "grad_norm": 2.1318819522857666, "learning_rate": 1.2752025469856598e-06, "loss": 1.1318, "step": 855 }, { "epoch": 2.4050632911392404, "grad_norm": 2.405397653579712, "learning_rate": 1.2651533042940883e-06, "loss": 1.1057, "step": 856 }, { "epoch": 2.407876230661041, "grad_norm": 1.8459330797195435, "learning_rate": 1.2551380766230003e-06, "loss": 0.9308, "step": 857 }, { "epoch": 2.4106891701828412, "grad_norm": 2.1533725261688232, "learning_rate": 1.2451569551858183e-06, "loss": 1.1996, "step": 858 }, { "epoch": 2.413502109704641, "grad_norm": 2.2185754776000977, "learning_rate": 1.2352100308853548e-06, "loss": 1.3325, "step": 859 }, { "epoch": 2.4163150492264416, "grad_norm": 1.8294565677642822, "learning_rate": 1.225297394312966e-06, "loss": 0.8245, "step": 860 }, { "epoch": 2.419127988748242, "grad_norm": 2.1881840229034424, "learning_rate": 1.2154191357477352e-06, "loss": 1.1655, "step": 861 }, { "epoch": 2.4219409282700424, "grad_norm": 1.8707904815673828, "learning_rate": 1.205575345155649e-06, "loss": 0.9647, "step": 862 }, { "epoch": 2.4247538677918423, "grad_norm": 1.8865529298782349, "learning_rate": 1.1957661121887782e-06, "loss": 0.972, "step": 863 }, { "epoch": 2.4275668073136427, "grad_norm": 2.1275415420532227, "learning_rate": 1.1859915261844596e-06, "loss": 0.9982, "step": 864 }, { "epoch": 2.430379746835443, "grad_norm": 2.7815465927124023, "learning_rate": 1.1762516761644831e-06, "loss": 0.9779, "step": 865 }, { "epoch": 2.4331926863572435, "grad_norm": 2.201364517211914, "learning_rate": 1.1665466508342876e-06, "loss": 1.1864, "step": 866 }, { "epoch": 2.4360056258790435, "grad_norm": 1.9111566543579102, "learning_rate": 1.1568765385821373e-06, "loss": 1.1079, "step": 867 }, { "epoch": 2.438818565400844, "grad_norm": 2.0928750038146973, "learning_rate": 1.147241427478336e-06, "loss": 0.8893, "step": 868 }, { "epoch": 2.4416315049226442, "grad_norm": 2.2094082832336426, "learning_rate": 1.1376414052744055e-06, "loss": 1.1135, "step": 869 }, { "epoch": 2.4444444444444446, "grad_norm": 2.2001736164093018, "learning_rate": 1.128076559402308e-06, "loss": 1.0784, "step": 870 }, { "epoch": 2.4472573839662446, "grad_norm": 2.1906962394714355, "learning_rate": 1.1185469769736262e-06, "loss": 1.0625, "step": 871 }, { "epoch": 2.450070323488045, "grad_norm": 2.111055612564087, "learning_rate": 1.1090527447787924e-06, "loss": 1.0759, "step": 872 }, { "epoch": 2.4528832630098454, "grad_norm": 2.1977760791778564, "learning_rate": 1.0995939492862783e-06, "loss": 1.156, "step": 873 }, { "epoch": 2.4556962025316453, "grad_norm": 2.4149186611175537, "learning_rate": 1.0901706766418247e-06, "loss": 1.0938, "step": 874 }, { "epoch": 2.4585091420534457, "grad_norm": 1.9314627647399902, "learning_rate": 1.0807830126676444e-06, "loss": 0.8718, "step": 875 }, { "epoch": 2.461322081575246, "grad_norm": 2.219050168991089, "learning_rate": 1.0714310428616464e-06, "loss": 0.9997, "step": 876 }, { "epoch": 2.4641350210970465, "grad_norm": 1.7131034135818481, "learning_rate": 1.0621148523966552e-06, "loss": 0.8264, "step": 877 }, { "epoch": 2.466947960618847, "grad_norm": 2.0101089477539062, "learning_rate": 1.052834526119637e-06, "loss": 1.0334, "step": 878 }, { "epoch": 2.469760900140647, "grad_norm": 2.2573459148406982, "learning_rate": 1.0435901485509254e-06, "loss": 1.2282, "step": 879 }, { "epoch": 2.4725738396624473, "grad_norm": 2.196690797805786, "learning_rate": 1.0343818038834513e-06, "loss": 1.013, "step": 880 }, { "epoch": 2.4753867791842477, "grad_norm": 2.2671730518341064, "learning_rate": 1.0252095759819785e-06, "loss": 1.1514, "step": 881 }, { "epoch": 2.4781997187060476, "grad_norm": 2.392235279083252, "learning_rate": 1.016073548382337e-06, "loss": 1.2227, "step": 882 }, { "epoch": 2.481012658227848, "grad_norm": 2.245374917984009, "learning_rate": 1.0069738042906635e-06, "loss": 1.2656, "step": 883 }, { "epoch": 2.4838255977496484, "grad_norm": 1.7064595222473145, "learning_rate": 9.979104265826438e-07, "loss": 0.9954, "step": 884 }, { "epoch": 2.486638537271449, "grad_norm": 1.9993723630905151, "learning_rate": 9.888834978027589e-07, "loss": 1.0137, "step": 885 }, { "epoch": 2.489451476793249, "grad_norm": 2.405082941055298, "learning_rate": 9.798931001635298e-07, "loss": 1.1, "step": 886 }, { "epoch": 2.492264416315049, "grad_norm": 2.263054132461548, "learning_rate": 9.709393155447734e-07, "loss": 1.1043, "step": 887 }, { "epoch": 2.4950773558368495, "grad_norm": 2.4851043224334717, "learning_rate": 9.62022225492853e-07, "loss": 1.4185, "step": 888 }, { "epoch": 2.49789029535865, "grad_norm": 2.131120443344116, "learning_rate": 9.531419112199375e-07, "loss": 1.0574, "step": 889 }, { "epoch": 2.50070323488045, "grad_norm": 2.3178141117095947, "learning_rate": 9.442984536032612e-07, "loss": 1.1726, "step": 890 }, { "epoch": 2.5035161744022503, "grad_norm": 2.0481185913085938, "learning_rate": 9.354919331843865e-07, "loss": 1.1169, "step": 891 }, { "epoch": 2.5063291139240507, "grad_norm": 2.4421157836914062, "learning_rate": 9.267224301684763e-07, "loss": 1.27, "step": 892 }, { "epoch": 2.509142053445851, "grad_norm": 2.13606333732605, "learning_rate": 9.17990024423549e-07, "loss": 1.2005, "step": 893 }, { "epoch": 2.5119549929676515, "grad_norm": 2.085256576538086, "learning_rate": 9.09294795479771e-07, "loss": 0.9328, "step": 894 }, { "epoch": 2.5147679324894514, "grad_norm": 1.9264284372329712, "learning_rate": 9.006368225287116e-07, "loss": 0.8267, "step": 895 }, { "epoch": 2.517580872011252, "grad_norm": 1.8938343524932861, "learning_rate": 8.920161844226416e-07, "loss": 0.9883, "step": 896 }, { "epoch": 2.520393811533052, "grad_norm": 2.379265308380127, "learning_rate": 8.834329596737995e-07, "loss": 1.2038, "step": 897 }, { "epoch": 2.523206751054852, "grad_norm": 1.9732309579849243, "learning_rate": 8.748872264536856e-07, "loss": 1.0939, "step": 898 }, { "epoch": 2.5260196905766525, "grad_norm": 1.9441081285476685, "learning_rate": 8.663790625923451e-07, "loss": 1.1116, "step": 899 }, { "epoch": 2.528832630098453, "grad_norm": 1.8037775754928589, "learning_rate": 8.57908545577662e-07, "loss": 0.9497, "step": 900 }, { "epoch": 2.528832630098453, "eval_loss": 0.6382944583892822, "eval_runtime": 2.8103, "eval_samples_per_second": 9.252, "eval_steps_per_second": 1.423, "step": 900 }, { "epoch": 2.528832630098453, "eval_active_sample_count": 30, "eval_avg_loss": 600.5, "eval_avg_mem_token_accuracy": 0.24822695035460993, "eval_avg_mem_token_gt_count": 9.4, "eval_avg_mem_token_precision": 0.008113120074177098, "eval_avg_mem_token_rate": 0.5739373378567152, "eval_avg_mem_token_recall(Accuracy)": 0.24822695035460993, "eval_avg_slot_norm_mean": 197.63333333333333, "eval_avg_slot_sim_mean": 0.996875, "eval_global_step": 900, "eval_loss": 0.6382944583892822, "eval_num_samples": 30, "eval_runtime": 2.8103, "eval_samples_per_second": 9.252, "eval_sim_active_sample_count": 30, "eval_steps_per_second": 1.423, "eval_total_correct_count": 70, "eval_total_gt_mem_token_count": 282, "eval_total_positions": 15033, "eval_total_pred_mem_token_count": 8628, "step": 900 }, { "epoch": 2.5316455696202533, "grad_norm": 2.0587706565856934, "learning_rate": 8.494757525546538e-07, "loss": 0.989, "step": 901 }, { "epoch": 2.5344585091420533, "grad_norm": 2.0397393703460693, "learning_rate": 8.410807603247656e-07, "loss": 0.9581, "step": 902 }, { "epoch": 2.5372714486638537, "grad_norm": 1.872904896736145, "learning_rate": 8.327236453451743e-07, "loss": 0.9432, "step": 903 }, { "epoch": 2.540084388185654, "grad_norm": 2.3130741119384766, "learning_rate": 8.244044837280901e-07, "loss": 1.2045, "step": 904 }, { "epoch": 2.542897327707454, "grad_norm": 2.1820616722106934, "learning_rate": 8.161233512400641e-07, "loss": 1.1755, "step": 905 }, { "epoch": 2.5457102672292544, "grad_norm": 1.8425172567367554, "learning_rate": 8.078803233012966e-07, "loss": 0.8806, "step": 906 }, { "epoch": 2.548523206751055, "grad_norm": 2.0481603145599365, "learning_rate": 7.996754749849567e-07, "loss": 1.0307, "step": 907 }, { "epoch": 2.551336146272855, "grad_norm": 1.9898444414138794, "learning_rate": 7.915088810164856e-07, "loss": 0.9326, "step": 908 }, { "epoch": 2.5541490857946556, "grad_norm": 2.014399766921997, "learning_rate": 7.833806157729329e-07, "loss": 1.0494, "step": 909 }, { "epoch": 2.5569620253164556, "grad_norm": 1.9588618278503418, "learning_rate": 7.752907532822613e-07, "loss": 1.0299, "step": 910 }, { "epoch": 2.559774964838256, "grad_norm": 2.05118465423584, "learning_rate": 7.672393672226902e-07, "loss": 1.2032, "step": 911 }, { "epoch": 2.5625879043600563, "grad_norm": 2.032313585281372, "learning_rate": 7.592265309220071e-07, "loss": 1.2313, "step": 912 }, { "epoch": 2.5654008438818563, "grad_norm": 2.2414844036102295, "learning_rate": 7.512523173569175e-07, "loss": 1.1436, "step": 913 }, { "epoch": 2.5682137834036567, "grad_norm": 2.186063289642334, "learning_rate": 7.433167991523632e-07, "loss": 1.1053, "step": 914 }, { "epoch": 2.571026722925457, "grad_norm": 2.098294734954834, "learning_rate": 7.354200485808749e-07, "loss": 1.1406, "step": 915 }, { "epoch": 2.5738396624472575, "grad_norm": 2.103463888168335, "learning_rate": 7.275621375619058e-07, "loss": 1.2908, "step": 916 }, { "epoch": 2.576652601969058, "grad_norm": 2.070359706878662, "learning_rate": 7.197431376611785e-07, "loss": 0.9896, "step": 917 }, { "epoch": 2.579465541490858, "grad_norm": 1.8880215883255005, "learning_rate": 7.11963120090034e-07, "loss": 0.9669, "step": 918 }, { "epoch": 2.5822784810126582, "grad_norm": 1.9502841234207153, "learning_rate": 7.042221557047823e-07, "loss": 0.9554, "step": 919 }, { "epoch": 2.5850914205344586, "grad_norm": 2.4192519187927246, "learning_rate": 6.96520315006059e-07, "loss": 1.4215, "step": 920 }, { "epoch": 2.5879043600562586, "grad_norm": 2.0227794647216797, "learning_rate": 6.888576681381798e-07, "loss": 1.0162, "step": 921 }, { "epoch": 2.590717299578059, "grad_norm": 2.049302101135254, "learning_rate": 6.81234284888505e-07, "loss": 1.1344, "step": 922 }, { "epoch": 2.5935302390998594, "grad_norm": 2.3195278644561768, "learning_rate": 6.736502346868018e-07, "loss": 1.1883, "step": 923 }, { "epoch": 2.5963431786216598, "grad_norm": 1.9605528116226196, "learning_rate": 6.661055866046134e-07, "loss": 0.9725, "step": 924 }, { "epoch": 2.59915611814346, "grad_norm": 2.021388530731201, "learning_rate": 6.586004093546277e-07, "loss": 1.1272, "step": 925 }, { "epoch": 2.60196905766526, "grad_norm": 1.7564787864685059, "learning_rate": 6.511347712900545e-07, "loss": 0.9292, "step": 926 }, { "epoch": 2.6047819971870605, "grad_norm": 1.886629581451416, "learning_rate": 6.437087404040016e-07, "loss": 1.027, "step": 927 }, { "epoch": 2.607594936708861, "grad_norm": 2.0022552013397217, "learning_rate": 6.363223843288535e-07, "loss": 1.0797, "step": 928 }, { "epoch": 2.610407876230661, "grad_norm": 2.084672451019287, "learning_rate": 6.289757703356597e-07, "loss": 1.164, "step": 929 }, { "epoch": 2.6132208157524612, "grad_norm": 2.0323879718780518, "learning_rate": 6.216689653335184e-07, "loss": 1.172, "step": 930 }, { "epoch": 2.6160337552742616, "grad_norm": 1.9796019792556763, "learning_rate": 6.144020358689679e-07, "loss": 1.1588, "step": 931 }, { "epoch": 2.618846694796062, "grad_norm": 2.1912734508514404, "learning_rate": 6.071750481253835e-07, "loss": 1.0916, "step": 932 }, { "epoch": 2.6216596343178624, "grad_norm": 2.242549419403076, "learning_rate": 5.999880679223702e-07, "loss": 1.1584, "step": 933 }, { "epoch": 2.6244725738396624, "grad_norm": 2.412274122238159, "learning_rate": 5.928411607151651e-07, "loss": 1.2867, "step": 934 }, { "epoch": 2.6272855133614628, "grad_norm": 2.416025161743164, "learning_rate": 5.857343915940434e-07, "loss": 1.2418, "step": 935 }, { "epoch": 2.630098452883263, "grad_norm": 2.027195453643799, "learning_rate": 5.786678252837213e-07, "loss": 1.1176, "step": 936 }, { "epoch": 2.632911392405063, "grad_norm": 1.915125846862793, "learning_rate": 5.71641526142771e-07, "loss": 1.0964, "step": 937 }, { "epoch": 2.6357243319268635, "grad_norm": 1.882155179977417, "learning_rate": 5.646555581630319e-07, "loss": 0.9061, "step": 938 }, { "epoch": 2.638537271448664, "grad_norm": 2.08971905708313, "learning_rate": 5.577099849690276e-07, "loss": 1.0459, "step": 939 }, { "epoch": 2.6413502109704643, "grad_norm": 2.2240920066833496, "learning_rate": 5.508048698173879e-07, "loss": 1.283, "step": 940 }, { "epoch": 2.6441631504922647, "grad_norm": 2.1256864070892334, "learning_rate": 5.439402755962719e-07, "loss": 0.9836, "step": 941 }, { "epoch": 2.6469760900140646, "grad_norm": 2.5735840797424316, "learning_rate": 5.371162648247957e-07, "loss": 1.3213, "step": 942 }, { "epoch": 2.649789029535865, "grad_norm": 2.2286038398742676, "learning_rate": 5.303328996524626e-07, "loss": 1.2165, "step": 943 }, { "epoch": 2.652601969057665, "grad_norm": 1.9804893732070923, "learning_rate": 5.235902418585958e-07, "loss": 1.0179, "step": 944 }, { "epoch": 2.6554149085794654, "grad_norm": 2.038052797317505, "learning_rate": 5.168883528517793e-07, "loss": 1.0582, "step": 945 }, { "epoch": 2.6582278481012658, "grad_norm": 2.0677716732025146, "learning_rate": 5.102272936692948e-07, "loss": 1.2318, "step": 946 }, { "epoch": 2.661040787623066, "grad_norm": 2.240928888320923, "learning_rate": 5.036071249765673e-07, "loss": 0.9381, "step": 947 }, { "epoch": 2.6638537271448666, "grad_norm": 2.2003684043884277, "learning_rate": 4.970279070666162e-07, "loss": 1.1822, "step": 948 }, { "epoch": 2.6666666666666665, "grad_norm": 2.238095998764038, "learning_rate": 4.904896998594955e-07, "loss": 1.2912, "step": 949 }, { "epoch": 2.669479606188467, "grad_norm": 2.166447639465332, "learning_rate": 4.839925629017638e-07, "loss": 1.1712, "step": 950 }, { "epoch": 2.669479606188467, "eval_loss": 0.6378054022789001, "eval_runtime": 2.8903, "eval_samples_per_second": 8.996, "eval_steps_per_second": 1.384, "step": 950 }, { "epoch": 2.669479606188467, "eval_active_sample_count": 30, "eval_avg_loss": 600.375, "eval_avg_mem_token_accuracy": 0.2553191489361702, "eval_avg_mem_token_gt_count": 9.4, "eval_avg_mem_token_precision": 0.008346858335265477, "eval_avg_mem_token_rate": 0.5738042972127985, "eval_avg_mem_token_recall(Accuracy)": 0.2553191489361702, "eval_avg_slot_norm_mean": 197.63333333333333, "eval_avg_slot_sim_mean": 0.996875, "eval_global_step": 950, "eval_loss": 0.6378054022789001, "eval_num_samples": 30, "eval_runtime": 2.8903, "eval_samples_per_second": 8.996, "eval_sim_active_sample_count": 30, "eval_steps_per_second": 1.384, "eval_total_correct_count": 72, "eval_total_gt_mem_token_count": 282, "eval_total_positions": 15033, "eval_total_pred_mem_token_count": 8626, "step": 950 }, { "epoch": 2.6722925457102673, "grad_norm": 2.2033162117004395, "learning_rate": 4.775365553659256e-07, "loss": 1.0523, "step": 951 }, { "epoch": 2.6751054852320673, "grad_norm": 2.277907133102417, "learning_rate": 4.711217360499082e-07, "loss": 1.0803, "step": 952 }, { "epoch": 2.6779184247538677, "grad_norm": 1.9675984382629395, "learning_rate": 4.6474816337650883e-07, "loss": 1.258, "step": 953 }, { "epoch": 2.680731364275668, "grad_norm": 2.1231744289398193, "learning_rate": 4.5841589539288187e-07, "loss": 1.0332, "step": 954 }, { "epoch": 2.6835443037974684, "grad_norm": 2.0946061611175537, "learning_rate": 4.5212498976999196e-07, "loss": 1.1456, "step": 955 }, { "epoch": 2.686357243319269, "grad_norm": 2.3890576362609863, "learning_rate": 4.458755038021029e-07, "loss": 1.2698, "step": 956 }, { "epoch": 2.689170182841069, "grad_norm": 1.8794134855270386, "learning_rate": 4.3966749440624736e-07, "loss": 0.9727, "step": 957 }, { "epoch": 2.691983122362869, "grad_norm": 2.3660783767700195, "learning_rate": 4.3350101812171143e-07, "loss": 1.1163, "step": 958 }, { "epoch": 2.6947960618846696, "grad_norm": 2.015714168548584, "learning_rate": 4.2737613110951924e-07, "loss": 1.1079, "step": 959 }, { "epoch": 2.6976090014064695, "grad_norm": 2.051121234893799, "learning_rate": 4.2129288915192355e-07, "loss": 1.1844, "step": 960 }, { "epoch": 2.70042194092827, "grad_norm": 2.295501708984375, "learning_rate": 4.152513476518927e-07, "loss": 1.2118, "step": 961 }, { "epoch": 2.7032348804500703, "grad_norm": 1.991119623184204, "learning_rate": 4.092515616326126e-07, "loss": 1.1834, "step": 962 }, { "epoch": 2.7060478199718707, "grad_norm": 1.856577754020691, "learning_rate": 4.0329358573697906e-07, "loss": 0.972, "step": 963 }, { "epoch": 2.708860759493671, "grad_norm": 2.042525291442871, "learning_rate": 3.973774742271047e-07, "loss": 1.1083, "step": 964 }, { "epoch": 2.711673699015471, "grad_norm": 1.8524376153945923, "learning_rate": 3.9150328098382593e-07, "loss": 0.9043, "step": 965 }, { "epoch": 2.7144866385372715, "grad_norm": 2.0273165702819824, "learning_rate": 3.8567105950620353e-07, "loss": 0.9573, "step": 966 }, { "epoch": 2.717299578059072, "grad_norm": 2.551295757293701, "learning_rate": 3.798808629110479e-07, "loss": 1.0811, "step": 967 }, { "epoch": 2.720112517580872, "grad_norm": 2.2737653255462646, "learning_rate": 3.7413274393242327e-07, "loss": 1.1984, "step": 968 }, { "epoch": 2.722925457102672, "grad_norm": 2.330913543701172, "learning_rate": 3.68426754921179e-07, "loss": 1.223, "step": 969 }, { "epoch": 2.7257383966244726, "grad_norm": 2.24187970161438, "learning_rate": 3.6276294784446e-07, "loss": 1.0989, "step": 970 }, { "epoch": 2.728551336146273, "grad_norm": 2.3575563430786133, "learning_rate": 3.5714137428524754e-07, "loss": 1.2727, "step": 971 }, { "epoch": 2.7313642756680734, "grad_norm": 2.3462178707122803, "learning_rate": 3.5156208544187554e-07, "loss": 1.2697, "step": 972 }, { "epoch": 2.7341772151898733, "grad_norm": 2.2106142044067383, "learning_rate": 3.460251321275759e-07, "loss": 0.9519, "step": 973 }, { "epoch": 2.7369901547116737, "grad_norm": 1.885840654373169, "learning_rate": 3.4053056477000856e-07, "loss": 0.8887, "step": 974 }, { "epoch": 2.739803094233474, "grad_norm": 1.8733952045440674, "learning_rate": 3.350784334108048e-07, "loss": 1.1189, "step": 975 }, { "epoch": 2.742616033755274, "grad_norm": 2.0802693367004395, "learning_rate": 3.2966878770511025e-07, "loss": 1.0736, "step": 976 }, { "epoch": 2.7454289732770745, "grad_norm": 2.003995656967163, "learning_rate": 3.24301676921136e-07, "loss": 0.9954, "step": 977 }, { "epoch": 2.748241912798875, "grad_norm": 1.968119740486145, "learning_rate": 3.189771499397043e-07, "loss": 1.0114, "step": 978 }, { "epoch": 2.7510548523206753, "grad_norm": 2.2957983016967773, "learning_rate": 3.136952552538092e-07, "loss": 1.1369, "step": 979 }, { "epoch": 2.7538677918424757, "grad_norm": 2.131643772125244, "learning_rate": 3.084560409681703e-07, "loss": 1.2212, "step": 980 }, { "epoch": 2.7566807313642756, "grad_norm": 1.8769854307174683, "learning_rate": 3.0325955479879765e-07, "loss": 0.94, "step": 981 }, { "epoch": 2.759493670886076, "grad_norm": 1.8766363859176636, "learning_rate": 2.981058440725559e-07, "loss": 0.9704, "step": 982 }, { "epoch": 2.7623066104078764, "grad_norm": 2.0633304119110107, "learning_rate": 2.929949557267331e-07, "loss": 0.9554, "step": 983 }, { "epoch": 2.7651195499296763, "grad_norm": 2.1459577083587646, "learning_rate": 2.8792693630861345e-07, "loss": 1.0209, "step": 984 }, { "epoch": 2.7679324894514767, "grad_norm": 2.0213375091552734, "learning_rate": 2.829018319750543e-07, "loss": 1.0121, "step": 985 }, { "epoch": 2.770745428973277, "grad_norm": 2.148283004760742, "learning_rate": 2.779196884920643e-07, "loss": 1.1324, "step": 986 }, { "epoch": 2.7735583684950775, "grad_norm": 2.2942779064178467, "learning_rate": 2.729805512343875e-07, "loss": 1.3349, "step": 987 }, { "epoch": 2.7763713080168775, "grad_norm": 1.860045075416565, "learning_rate": 2.6808446518508835e-07, "loss": 0.9753, "step": 988 }, { "epoch": 2.779184247538678, "grad_norm": 2.135307550430298, "learning_rate": 2.632314749351483e-07, "loss": 1.2426, "step": 989 }, { "epoch": 2.7819971870604783, "grad_norm": 2.502941131591797, "learning_rate": 2.5842162468304845e-07, "loss": 1.3143, "step": 990 }, { "epoch": 2.7848101265822782, "grad_norm": 1.8326023817062378, "learning_rate": 2.5365495823437834e-07, "loss": 1.0144, "step": 991 }, { "epoch": 2.7876230661040786, "grad_norm": 2.351020574569702, "learning_rate": 2.489315190014291e-07, "loss": 1.2042, "step": 992 }, { "epoch": 2.790436005625879, "grad_norm": 1.9044114351272583, "learning_rate": 2.4425135000280374e-07, "loss": 1.0554, "step": 993 }, { "epoch": 2.7932489451476794, "grad_norm": 2.1605467796325684, "learning_rate": 2.3961449386302017e-07, "loss": 1.1091, "step": 994 }, { "epoch": 2.79606188466948, "grad_norm": 1.9160940647125244, "learning_rate": 2.3502099281212775e-07, "loss": 0.9543, "step": 995 }, { "epoch": 2.7988748241912798, "grad_norm": 2.0379810333251953, "learning_rate": 2.3047088868531796e-07, "loss": 1.0654, "step": 996 }, { "epoch": 2.80168776371308, "grad_norm": 2.0998106002807617, "learning_rate": 2.2596422292254893e-07, "loss": 1.1908, "step": 997 }, { "epoch": 2.8045007032348805, "grad_norm": 2.1208677291870117, "learning_rate": 2.2150103656816357e-07, "loss": 1.0795, "step": 998 }, { "epoch": 2.8073136427566805, "grad_norm": 2.2069194316864014, "learning_rate": 2.1708137027051601e-07, "loss": 1.1354, "step": 999 }, { "epoch": 2.810126582278481, "grad_norm": 2.2347195148468018, "learning_rate": 2.1270526428160466e-07, "loss": 1.3928, "step": 1000 }, { "epoch": 2.810126582278481, "eval_loss": 0.63798987865448, "eval_runtime": 2.8525, "eval_samples_per_second": 9.115, "eval_steps_per_second": 1.402, "step": 1000 }, { "epoch": 2.810126582278481, "eval_active_sample_count": 30, "eval_avg_loss": 599.75, "eval_avg_mem_token_accuracy": 0.24113475177304963, "eval_avg_mem_token_gt_count": 9.4, "eval_avg_mem_token_precision": 0.00790054606715464, "eval_avg_mem_token_rate": 0.5725404110955897, "eval_avg_mem_token_recall(Accuracy)": 0.24113475177304963, "eval_avg_slot_norm_mean": 197.63333333333333, "eval_avg_slot_sim_mean": 0.996875, "eval_global_step": 1000, "eval_loss": 0.63798987865448, "eval_num_samples": 30, "eval_runtime": 2.8525, "eval_samples_per_second": 9.115, "eval_sim_active_sample_count": 30, "eval_steps_per_second": 1.402, "eval_total_correct_count": 68, "eval_total_gt_mem_token_count": 282, "eval_total_positions": 15033, "eval_total_pred_mem_token_count": 8607, "step": 1000 }, { "epoch": 2.8129395218002813, "grad_norm": 2.0307729244232178, "learning_rate": 2.0837275845670135e-07, "loss": 1.2427, "step": 1001 }, { "epoch": 2.8157524613220817, "grad_norm": 2.4855947494506836, "learning_rate": 2.0408389225399339e-07, "loss": 1.1572, "step": 1002 }, { "epoch": 2.818565400843882, "grad_norm": 2.137430429458618, "learning_rate": 1.9983870473421761e-07, "loss": 1.1247, "step": 1003 }, { "epoch": 2.821378340365682, "grad_norm": 1.7523655891418457, "learning_rate": 1.9563723456031303e-07, "loss": 1.1162, "step": 1004 }, { "epoch": 2.8241912798874824, "grad_norm": 2.1431448459625244, "learning_rate": 1.9147951999705928e-07, "loss": 1.2084, "step": 1005 }, { "epoch": 2.827004219409283, "grad_norm": 2.178713798522949, "learning_rate": 1.8736559891073703e-07, "loss": 1.2073, "step": 1006 }, { "epoch": 2.8298171589310828, "grad_norm": 2.0820088386535645, "learning_rate": 1.8329550876877488e-07, "loss": 1.1191, "step": 1007 }, { "epoch": 2.832630098452883, "grad_norm": 2.0419578552246094, "learning_rate": 1.7926928663941635e-07, "loss": 1.0641, "step": 1008 }, { "epoch": 2.8354430379746836, "grad_norm": 2.2004177570343018, "learning_rate": 1.7528696919137444e-07, "loss": 1.3558, "step": 1009 }, { "epoch": 2.838255977496484, "grad_norm": 2.3024518489837646, "learning_rate": 1.7134859269350546e-07, "loss": 1.2914, "step": 1010 }, { "epoch": 2.8410689170182843, "grad_norm": 2.0407681465148926, "learning_rate": 1.6745419301446962e-07, "loss": 1.0491, "step": 1011 }, { "epoch": 2.8438818565400843, "grad_norm": 2.028738498687744, "learning_rate": 1.6360380562241428e-07, "loss": 1.2034, "step": 1012 }, { "epoch": 2.8466947960618847, "grad_norm": 2.436655044555664, "learning_rate": 1.5979746558464237e-07, "loss": 1.4506, "step": 1013 }, { "epoch": 2.849507735583685, "grad_norm": 2.0717296600341797, "learning_rate": 1.5603520756729885e-07, "loss": 1.1103, "step": 1014 }, { "epoch": 2.852320675105485, "grad_norm": 2.195970058441162, "learning_rate": 1.5231706583505256e-07, "loss": 1.2775, "step": 1015 }, { "epoch": 2.8551336146272854, "grad_norm": 2.2911033630371094, "learning_rate": 1.486430742507833e-07, "loss": 1.1482, "step": 1016 }, { "epoch": 2.857946554149086, "grad_norm": 2.503101348876953, "learning_rate": 1.4501326627527513e-07, "loss": 1.4186, "step": 1017 }, { "epoch": 2.8607594936708862, "grad_norm": 1.9371217489242554, "learning_rate": 1.4142767496691135e-07, "loss": 0.9705, "step": 1018 }, { "epoch": 2.8635724331926866, "grad_norm": 2.0493252277374268, "learning_rate": 1.3788633298137288e-07, "loss": 0.9959, "step": 1019 }, { "epoch": 2.8663853727144866, "grad_norm": 1.987891674041748, "learning_rate": 1.3438927257134083e-07, "loss": 0.9549, "step": 1020 }, { "epoch": 2.869198312236287, "grad_norm": 2.177379608154297, "learning_rate": 1.3093652558620384e-07, "loss": 1.1057, "step": 1021 }, { "epoch": 2.8720112517580874, "grad_norm": 1.7878172397613525, "learning_rate": 1.2752812347176514e-07, "loss": 0.865, "step": 1022 }, { "epoch": 2.8748241912798873, "grad_norm": 2.258223295211792, "learning_rate": 1.2416409726996037e-07, "loss": 1.1227, "step": 1023 }, { "epoch": 2.8776371308016877, "grad_norm": 2.103666067123413, "learning_rate": 1.2084447761857244e-07, "loss": 1.1573, "step": 1024 }, { "epoch": 2.880450070323488, "grad_norm": 1.982913851737976, "learning_rate": 1.1756929475095103e-07, "loss": 1.0078, "step": 1025 }, { "epoch": 2.8832630098452885, "grad_norm": 1.9436091184616089, "learning_rate": 1.143385784957407e-07, "loss": 1.0486, "step": 1026 }, { "epoch": 2.8860759493670884, "grad_norm": 2.438931465148926, "learning_rate": 1.111523582766072e-07, "loss": 1.2295, "step": 1027 }, { "epoch": 2.888888888888889, "grad_norm": 1.8638874292373657, "learning_rate": 1.0801066311196872e-07, "loss": 1.06, "step": 1028 }, { "epoch": 2.8917018284106892, "grad_norm": 1.9490095376968384, "learning_rate": 1.0491352161473345e-07, "loss": 1.0883, "step": 1029 }, { "epoch": 2.894514767932489, "grad_norm": 2.201900005340576, "learning_rate": 1.018609619920391e-07, "loss": 0.9764, "step": 1030 }, { "epoch": 2.8973277074542896, "grad_norm": 2.4178552627563477, "learning_rate": 9.885301204499321e-08, "loss": 1.2852, "step": 1031 }, { "epoch": 2.90014064697609, "grad_norm": 2.231503486633301, "learning_rate": 9.588969916842272e-08, "loss": 1.1528, "step": 1032 }, { "epoch": 2.9029535864978904, "grad_norm": 1.870887041091919, "learning_rate": 9.297105035062426e-08, "loss": 1.0726, "step": 1033 }, { "epoch": 2.9057665260196908, "grad_norm": 2.3219852447509766, "learning_rate": 9.009709217311702e-08, "loss": 1.1784, "step": 1034 }, { "epoch": 2.9085794655414907, "grad_norm": 2.1292107105255127, "learning_rate": 8.72678508104008e-08, "loss": 1.2251, "step": 1035 }, { "epoch": 2.911392405063291, "grad_norm": 2.016449451446533, "learning_rate": 8.448335202971891e-08, "loss": 0.9478, "step": 1036 }, { "epoch": 2.9142053445850915, "grad_norm": 2.08313250541687, "learning_rate": 8.174362119082291e-08, "loss": 1.0649, "step": 1037 }, { "epoch": 2.9170182841068915, "grad_norm": 2.0640265941619873, "learning_rate": 7.9048683245741e-08, "loss": 1.1765, "step": 1038 }, { "epoch": 2.919831223628692, "grad_norm": 2.1048390865325928, "learning_rate": 7.639856273855106e-08, "loss": 1.0642, "step": 1039 }, { "epoch": 2.9226441631504922, "grad_norm": 2.1916463375091553, "learning_rate": 7.379328380515805e-08, "loss": 1.2419, "step": 1040 }, { "epoch": 2.9254571026722926, "grad_norm": 2.252420425415039, "learning_rate": 7.123287017307302e-08, "loss": 1.3343, "step": 1041 }, { "epoch": 2.928270042194093, "grad_norm": 2.1169185638427734, "learning_rate": 6.871734516119721e-08, "loss": 1.129, "step": 1042 }, { "epoch": 2.931082981715893, "grad_norm": 2.2315621376037598, "learning_rate": 6.624673167961004e-08, "loss": 1.1125, "step": 1043 }, { "epoch": 2.9338959212376934, "grad_norm": 1.8748716115951538, "learning_rate": 6.382105222936085e-08, "loss": 1.049, "step": 1044 }, { "epoch": 2.9367088607594938, "grad_norm": 1.9676600694656372, "learning_rate": 6.144032890226304e-08, "loss": 1.1791, "step": 1045 }, { "epoch": 2.9395218002812937, "grad_norm": 1.765437126159668, "learning_rate": 5.910458338069192e-08, "loss": 0.9795, "step": 1046 }, { "epoch": 2.942334739803094, "grad_norm": 2.3168399333953857, "learning_rate": 5.6813836937392175e-08, "loss": 1.1186, "step": 1047 }, { "epoch": 2.9451476793248945, "grad_norm": 2.183238983154297, "learning_rate": 5.456811043527632e-08, "loss": 1.1833, "step": 1048 }, { "epoch": 2.947960618846695, "grad_norm": 1.8787195682525635, "learning_rate": 5.236742432724262e-08, "loss": 0.9953, "step": 1049 }, { "epoch": 2.9507735583684953, "grad_norm": 2.0316836833953857, "learning_rate": 5.021179865598136e-08, "loss": 1.0088, "step": 1050 }, { "epoch": 2.9507735583684953, "eval_loss": 0.6373986005783081, "eval_runtime": 2.8523, "eval_samples_per_second": 9.115, "eval_steps_per_second": 1.402, "step": 1050 }, { "epoch": 2.9507735583684953, "eval_active_sample_count": 30, "eval_avg_loss": 599.625, "eval_avg_mem_token_accuracy": 0.2553191489361702, "eval_avg_mem_token_gt_count": 9.4, "eval_avg_mem_token_precision": 0.008363340689975607, "eval_avg_mem_token_rate": 0.5726734517395065, "eval_avg_mem_token_recall(Accuracy)": 0.2553191489361702, "eval_avg_slot_norm_mean": 197.63333333333333, "eval_avg_slot_sim_mean": 0.996875, "eval_global_step": 1050, "eval_loss": 0.6373986005783081, "eval_num_samples": 30, "eval_runtime": 2.8523, "eval_samples_per_second": 9.115, "eval_sim_active_sample_count": 30, "eval_steps_per_second": 1.402, "eval_total_correct_count": 72, "eval_total_gt_mem_token_count": 282, "eval_total_positions": 15033, "eval_total_pred_mem_token_count": 8609, "step": 1050 }, { "epoch": 2.9535864978902953, "grad_norm": 2.193411111831665, "learning_rate": 4.810125305379998e-08, "loss": 1.086, "step": 1051 }, { "epoch": 2.9563994374120957, "grad_norm": 1.7261470556259155, "learning_rate": 4.6035806742436575e-08, "loss": 1.004, "step": 1052 }, { "epoch": 2.959212376933896, "grad_norm": 1.943182110786438, "learning_rate": 4.4015478532891675e-08, "loss": 1.1523, "step": 1053 }, { "epoch": 2.962025316455696, "grad_norm": 2.992014169692993, "learning_rate": 4.20402868252523e-08, "loss": 1.1195, "step": 1054 }, { "epoch": 2.9648382559774964, "grad_norm": 2.0633037090301514, "learning_rate": 4.01102496085265e-08, "loss": 1.1554, "step": 1055 }, { "epoch": 2.967651195499297, "grad_norm": 5.867424964904785, "learning_rate": 3.822538446047852e-08, "loss": 1.1499, "step": 1056 }, { "epoch": 2.970464135021097, "grad_norm": 2.3555386066436768, "learning_rate": 3.6385708547468925e-08, "loss": 1.296, "step": 1057 }, { "epoch": 2.9732770745428976, "grad_norm": 2.298612594604492, "learning_rate": 3.4591238624299696e-08, "loss": 1.1622, "step": 1058 }, { "epoch": 2.9760900140646975, "grad_norm": 2.095074415206909, "learning_rate": 3.284199103405883e-08, "loss": 1.0392, "step": 1059 }, { "epoch": 2.978902953586498, "grad_norm": 1.7967655658721924, "learning_rate": 3.113798170797489e-08, "loss": 0.8557, "step": 1060 }, { "epoch": 2.9817158931082983, "grad_norm": 2.187788963317871, "learning_rate": 2.9479226165268216e-08, "loss": 1.2315, "step": 1061 }, { "epoch": 2.9845288326300983, "grad_norm": 2.0555531978607178, "learning_rate": 2.7865739513012746e-08, "loss": 1.0719, "step": 1062 }, { "epoch": 2.9873417721518987, "grad_norm": 2.1727023124694824, "learning_rate": 2.629753644599664e-08, "loss": 1.0655, "step": 1063 }, { "epoch": 2.990154711673699, "grad_norm": 2.1658568382263184, "learning_rate": 2.4774631246589075e-08, "loss": 1.0773, "step": 1064 }, { "epoch": 2.9929676511954995, "grad_norm": 2.12109112739563, "learning_rate": 2.3297037784609787e-08, "loss": 1.1639, "step": 1065 }, { "epoch": 2.9957805907173, "grad_norm": 2.118447780609131, "learning_rate": 2.1864769517204177e-08, "loss": 1.1426, "step": 1066 }, { "epoch": 2.9985935302391, "grad_norm": 1.9243059158325195, "learning_rate": 2.0477839488718398e-08, "loss": 0.9786, "step": 1067 }, { "epoch": 3.0, "grad_norm": 1.6388542652130127, "learning_rate": 1.913626033058169e-08, "loss": 0.5664, "step": 1068 }, { "epoch": 3.0028129395218004, "grad_norm": 1.8314422369003296, "learning_rate": 1.784004426119257e-08, "loss": 1.0312, "step": 1069 }, { "epoch": 3.0056258790436003, "grad_norm": 2.122387170791626, "learning_rate": 1.6589203085804473e-08, "loss": 1.0936, "step": 1070 }, { "epoch": 3.0084388185654007, "grad_norm": 2.0820372104644775, "learning_rate": 1.538374819642252e-08, "loss": 1.0541, "step": 1071 }, { "epoch": 3.011251758087201, "grad_norm": 1.9248408079147339, "learning_rate": 1.4223690571695815e-08, "loss": 0.9005, "step": 1072 }, { "epoch": 3.0140646976090015, "grad_norm": 1.9669166803359985, "learning_rate": 1.3109040776819181e-08, "loss": 1.1376, "step": 1073 }, { "epoch": 3.0168776371308015, "grad_norm": 1.9701210260391235, "learning_rate": 1.2039808963437705e-08, "loss": 1.0197, "step": 1074 }, { "epoch": 3.019690576652602, "grad_norm": 2.451758623123169, "learning_rate": 1.1016004869551788e-08, "loss": 1.2066, "step": 1075 }, { "epoch": 3.0225035161744023, "grad_norm": 1.9009047746658325, "learning_rate": 1.0037637819431123e-08, "loss": 1.0529, "step": 1076 }, { "epoch": 3.0253164556962027, "grad_norm": 2.054837465286255, "learning_rate": 9.10471672352864e-09, "loss": 1.1907, "step": 1077 }, { "epoch": 3.0281293952180026, "grad_norm": 2.181744337081909, "learning_rate": 8.217250078400018e-09, "loss": 1.1479, "step": 1078 }, { "epoch": 3.030942334739803, "grad_norm": 2.066051721572876, "learning_rate": 7.375245966623757e-09, "loss": 1.2419, "step": 1079 }, { "epoch": 3.0337552742616034, "grad_norm": 2.2346465587615967, "learning_rate": 6.5787120567317734e-09, "loss": 0.9984, "step": 1080 }, { "epoch": 3.036568213783404, "grad_norm": 1.9933655261993408, "learning_rate": 5.827655603135585e-09, "loss": 1.0698, "step": 1081 }, { "epoch": 3.0393811533052038, "grad_norm": 2.1959750652313232, "learning_rate": 5.122083446062464e-09, "loss": 1.1049, "step": 1082 }, { "epoch": 3.042194092827004, "grad_norm": 2.2590200901031494, "learning_rate": 4.462002011493271e-09, "loss": 1.1198, "step": 1083 }, { "epoch": 3.0450070323488045, "grad_norm": 2.1988589763641357, "learning_rate": 3.847417311102497e-09, "loss": 1.1142, "step": 1084 }, { "epoch": 3.047819971870605, "grad_norm": 2.254117727279663, "learning_rate": 3.2783349422044197e-09, "loss": 1.199, "step": 1085 }, { "epoch": 3.050632911392405, "grad_norm": 1.9562636613845825, "learning_rate": 2.7547600877020355e-09, "loss": 1.0887, "step": 1086 }, { "epoch": 3.0534458509142053, "grad_norm": 1.9559649229049683, "learning_rate": 2.276697516039872e-09, "loss": 1.0819, "step": 1087 }, { "epoch": 3.0562587904360057, "grad_norm": 2.017869472503662, "learning_rate": 1.8441515811612465e-09, "loss": 0.9884, "step": 1088 }, { "epoch": 3.059071729957806, "grad_norm": 1.8643865585327148, "learning_rate": 1.4571262224666315e-09, "loss": 0.9771, "step": 1089 }, { "epoch": 3.061884669479606, "grad_norm": 2.1424920558929443, "learning_rate": 1.1156249647797934e-09, "loss": 1.2107, "step": 1090 }, { "epoch": 3.0646976090014064, "grad_norm": 2.071485757827759, "learning_rate": 8.196509183139301e-10, "loss": 0.8257, "step": 1091 }, { "epoch": 3.067510548523207, "grad_norm": 1.8392572402954102, "learning_rate": 5.692067786455813e-10, "loss": 1.119, "step": 1092 }, { "epoch": 3.070323488045007, "grad_norm": 2.0427193641662598, "learning_rate": 3.6429482668853824e-10, "loss": 1.0698, "step": 1093 }, { "epoch": 3.073136427566807, "grad_norm": 2.2885656356811523, "learning_rate": 2.0491692867330438e-10, "loss": 1.4175, "step": 1094 }, { "epoch": 3.0759493670886076, "grad_norm": 2.181267499923706, "learning_rate": 9.107453612933192e-11, "loss": 1.0596, "step": 1095 }, { "epoch": 3.078762306610408, "grad_norm": 2.340491533279419, "learning_rate": 2.2768685873364448e-11, "loss": 1.1616, "step": 1096 } ], "logging_steps": 1, "max_steps": 1096, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }